diff options
Diffstat (limited to 'final/libomptarget')
67 files changed, 12028 insertions, 0 deletions
diff --git a/final/libomptarget/CMakeLists.txt b/final/libomptarget/CMakeLists.txt new file mode 100644 index 0000000..7c57edd --- /dev/null +++ b/final/libomptarget/CMakeLists.txt @@ -0,0 +1,81 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build offloading library and related plugins. +# +##===----------------------------------------------------------------------===## + +if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") + message(FATAL_ERROR "Direct configuration not supported, please use parent directory!") +endif() + +# Add cmake directory to search for custom cmake functions. +set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules ${CMAKE_MODULE_PATH}) + +if(OPENMP_STANDALONE_BUILD) + # Build all libraries into a common place so that tests can find them. + set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +endif() + +# Message utilities. +include(LibomptargetUtils) + +# Get dependencies for the different components of the project. +include(LibomptargetGetDependencies) + +# This is a list of all the targets that are supported/tested right now. 
+set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} aarch64-unknown-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64le-ibm-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda") + +# Once the plugins for the different targets are validated, they will be added to +# the list of supported targets in the current system. +set (LIBOMPTARGET_SYSTEM_TARGETS "") + +# If building this library in debug mode, we define a macro to enable +# dumping progress messages at runtime. +string( TOLOWER "${CMAKE_BUILD_TYPE}" LIBOMPTARGET_CMAKE_BUILD_TYPE) +if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug) + add_definitions(-DOMPTARGET_DEBUG) + add_definitions(-g) + add_definitions(-O0) +endif() + +include_directories(include) + +# Build target agnostic offloading library. +add_subdirectory(src) + +# Retrieve the path to the resulting library so that it can be used for +# testing. +get_target_property(LIBOMPTARGET_LIBRARY_DIR omptarget LIBRARY_OUTPUT_DIRECTORY) +if(NOT LIBOMPTARGET_LIBRARY_DIR) + set(LIBOMPTARGET_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) +endif() + +# Definitions for testing, for reuse when testing libomptarget-nvptx. +if(OPENMP_STANDALONE_BUILD) + set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src" CACHE STRING + "Path to folder containing omp.h") + set(LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src" CACHE STRING + "Path to folder containing libomp.so") +else() + set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src") +endif() + + +# Build offloading plugins and device RTLs if they are available. +add_subdirectory(plugins) +add_subdirectory(deviceRTLs) + +# Add tests. 
+add_subdirectory(test) diff --git a/final/libomptarget/README.txt b/final/libomptarget/README.txt new file mode 100644 index 0000000..8c0a837 --- /dev/null +++ b/final/libomptarget/README.txt @@ -0,0 +1,73 @@ + + README for the LLVM* OpenMP* Offloading Runtime Library (libomptarget) + ====================================================================== + +How to Build the LLVM* OpenMP* Offloading Runtime Library (libomptarget) +======================================================================== +In-tree build: + +$ cd where-you-want-to-live +Check out openmp (libomptarget lives under ./libomptarget) into llvm/projects +$ cd where-you-want-to-build +$ mkdir build && cd build +$ cmake path/to/llvm -DCMAKE_C_COMPILER=<C compiler> -DCMAKE_CXX_COMPILER=<C++ compiler> +$ make omptarget + +Out-of-tree build: + +$ cd where-you-want-to-live +Check out openmp (libomptarget lives under ./libomptarget) +$ cd where-you-want-to-live/openmp/libomptarget +$ mkdir build && cd build +$ cmake path/to/openmp -DCMAKE_C_COMPILER=<C compiler> -DCMAKE_CXX_COMPILER=<C++ compiler> +$ make + +For details about building, please look at README.rst in the parent directory. 
+ +Architectures Supported +======================= +The current library has only been tested on the Linux operating system and the +following host architectures: +* Intel(R) 64 architecture +* IBM(R) Power architecture (big endian) +* IBM(R) Power architecture (little endian) +* ARM(R) AArch64 architecture (little endian) + +The currently supported offloading device architectures are: +* Intel(R) 64 architecture (generic 64-bit plugin - mostly for testing purposes) +* IBM(R) Power architecture (big endian) (generic 64-bit plugin - mostly for testing purposes) +* IBM(R) Power architecture (little endian) (generic 64-bit plugin - mostly for testing purposes) +* ARM(R) AArch64 architecture (little endian) (generic 64-bit plugin - mostly for testing purposes) +* CUDA(R) enabled 64-bit NVIDIA(R) GPU architectures + +Supported RTL Build Configurations +================================== +Supported Architectures: Intel(R) 64, IBM(R) Power 7 and Power 8 + + --------------------------- + | gcc | clang | +--------------|------------|------------| +| Linux* OS | Yes(1) | Yes(2) | +----------------------------------------- + +(1) gcc version 4.8.2 or later is supported. +(2) clang version 3.7 or later is supported. + + +Front-end Compilers that work with this RTL +=========================================== + +The following compilers are known to do compatible code generation for +this RTL: + - clang (from https://github.com/clang-ykt ) + - clang (development branch at http://clang.llvm.org - several features still + under development) + +----------------------------------------------------------------------- + +Notices +======= +This library and related compiler support is still under development, so the +employed interface is likely to change in the future. + +*Other names and brands may be claimed as the property of others. 
diff --git a/final/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake b/final/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake new file mode 100644 index 0000000..3ef727f --- /dev/null +++ b/final/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake @@ -0,0 +1,193 @@ +# +#//===----------------------------------------------------------------------===// +#// +#// The LLVM Compiler Infrastructure +#// +#// This file is dual licensed under the MIT and the University of Illinois Open +#// Source Licenses. See LICENSE.txt for details. +#// +#//===----------------------------------------------------------------------===// +# + +# Try to detect in the system several dependencies required by the different +# components of libomptarget. These are the dependencies we have: +# +# libelf : required by some targets to handle the ELF files at runtime. +# libffi : required to launch target kernels given function and argument +# pointers. +# CUDA : required to control offloading to NVIDIA GPUs. + +include (FindPackageHandleStandardArgs) + +################################################################################ +# Looking for libelf... 
+################################################################################ + +find_path ( + LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR + NAMES + libelf.h + PATHS + /usr/include + /usr/local/include + /opt/local/include + /sw/include + ENV CPATH + PATH_SUFFIXES + libelf) + +find_library ( + LIBOMPTARGET_DEP_LIBELF_LIBRARIES + NAMES + elf + PATHS + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ENV LIBRARY_PATH + ENV LD_LIBRARY_PATH) + +set(LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) +find_package_handle_standard_args( + LIBOMPTARGET_DEP_LIBELF + DEFAULT_MSG + LIBOMPTARGET_DEP_LIBELF_LIBRARIES + LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS) + +mark_as_advanced( + LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS + LIBOMPTARGET_DEP_LIBELF_LIBRARIES) + +################################################################################ +# Looking for libffi... +################################################################################ +find_package(PkgConfig) + +pkg_check_modules(LIBOMPTARGET_SEARCH_LIBFFI QUIET libffi) + +find_path ( + LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR + NAMES + ffi.h + HINTS + ${LIBOMPTARGET_SEARCH_LIBFFI_INCLUDEDIR} + ${LIBOMPTARGET_SEARCH_LIBFFI_INCLUDE_DIRS} + PATHS + /usr/include + /usr/local/include + /opt/local/include + /sw/include + ENV CPATH) + +# Don't bother look for the library if the header files were not found. 
+if (LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR) + find_library ( + LIBOMPTARGET_DEP_LIBFFI_LIBRARIES + NAMES + ffi + HINTS + ${LIBOMPTARGET_SEARCH_LIBFFI_LIBDIR} + ${LIBOMPTARGET_SEARCH_LIBFFI_LIBRARY_DIRS} + PATHS + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ENV LIBRARY_PATH + ENV LD_LIBRARY_PATH) +endif() + +set(LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) +find_package_handle_standard_args( + LIBOMPTARGET_DEP_LIBFFI + DEFAULT_MSG + LIBOMPTARGET_DEP_LIBFFI_LIBRARIES + LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS) + +mark_as_advanced( + LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS + LIBOMPTARGET_DEP_LIBFFI_LIBRARIES) + +################################################################################ +# Looking for CUDA... +################################################################################ +if (CUDA_TOOLKIT_ROOT_DIR) + set(LIBOMPTARGET_CUDA_TOOLKIT_ROOT_DIR_PRESET TRUE) +endif() +find_package(CUDA QUIET) + +set(LIBOMPTARGET_DEP_CUDA_FOUND ${CUDA_FOUND}) +set(LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) + +mark_as_advanced( + LIBOMPTARGET_DEP_CUDA_FOUND + LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS) + +################################################################################ +# Looking for CUDA Driver API... (needed for CUDA plugin) +################################################################################ + +find_library ( + LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES + NAMES + cuda + PATHS + /lib64) + +# There is a libcuda.so in lib64/stubs that can be used for linking. +if (NOT LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES AND CUDA_FOUND) + # Since CMake 3.3 FindCUDA.cmake defaults to using static libraries. In this + # case CUDA_LIBRARIES contains additional linker arguments which breaks + # get_filename_component below. Fortunately, since that change the module + # exports CUDA_cudart_static_LIBRARY which points to a single file in the + # right directory. 
+ set(cuda_library ${CUDA_LIBRARIES}) + if (DEFINED CUDA_cudart_static_LIBRARY) + set(cuda_library ${CUDA_cudart_static_LIBRARY}) + endif() + get_filename_component(CUDA_LIBDIR ${cuda_library} DIRECTORY) + find_library ( + LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES + NAMES + cuda + HINTS + "${CUDA_LIBDIR}/stubs") +endif() + +find_package_handle_standard_args( + LIBOMPTARGET_DEP_CUDA_DRIVER + DEFAULT_MSG + LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES) + +mark_as_advanced(LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES) + +################################################################################ +# Looking for CUDA libdevice subdirectory +# +# Special case for Debian/Ubuntu to have nvidia-cuda-toolkit work +# out of the box. More info on http://bugs.debian.org/882505 +################################################################################ + +set(LIBOMPTARGET_CUDA_LIBDEVICE_SUBDIR nvvm/libdevice) + +# Don't alter CUDA_TOOLKIT_ROOT_DIR if the user specified it, if a value was +# already cached for it, or if it already has libdevice. Otherwise, on +# Debian/Ubuntu, look where the nvidia-cuda-toolkit package normally installs +# libdevice. +if (NOT LIBOMPTARGET_CUDA_TOOLKIT_ROOT_DIR_PRESET AND + NOT EXISTS + "${CUDA_TOOLKIT_ROOT_DIR}/${LIBOMPTARGET_CUDA_LIBDEVICE_SUBDIR}") + find_program(LSB_RELEASE lsb_release) + if (LSB_RELEASE) + execute_process(COMMAND ${LSB_RELEASE} -is + OUTPUT_VARIABLE LSB_RELEASE_ID + OUTPUT_STRIP_TRAILING_WHITESPACE) + set(candidate_dir /usr/lib/cuda) + if ((LSB_RELEASE_ID STREQUAL "Debian" OR LSB_RELEASE_ID STREQUAL "Ubuntu") + AND EXISTS "${candidate_dir}/${LIBOMPTARGET_CUDA_LIBDEVICE_SUBDIR}") + set(CUDA_TOOLKIT_ROOT_DIR "${candidate_dir}" CACHE PATH + "Toolkit location." 
FORCE) + endif() + endif() +endif() diff --git a/final/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake b/final/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake new file mode 100644 index 0000000..5c69340 --- /dev/null +++ b/final/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake @@ -0,0 +1,112 @@ +# +#//===----------------------------------------------------------------------===// +#// +#// The LLVM Compiler Infrastructure +#// +#// This file is dual licensed under the MIT and the University of Illinois Open +#// Source Licenses. See LICENSE.txt for details. +#// +#//===----------------------------------------------------------------------===// +# + +# We use the compiler and linker provided by the user, attempt to use the one +# used to build libomptarget or just fail. +set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED FALSE) + +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) +elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER}) +else() + return() +endif() + +# Get compiler directory to try to locate a suitable linker. +get_filename_component(compiler_dir ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} DIRECTORY) +set(llvm_link "${compiler_dir}/llvm-link") + +if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "") + set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER}) +elseif (EXISTS "${llvm_link}") + # Use llvm-link from the compiler directory. + set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER "${llvm_link}") +else() + return() +endif() + +function(try_compile_bitcode output source) + set(srcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/src.cu) + file(WRITE ${srcfile} "${source}\n") + set(bcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/out.bc) + + # The remaining arguments are the flags to be tested. + # FIXME: Don't hardcode GPU version. 
This is currently required because + # Clang refuses to compile its default of sm_20 with CUDA 9. + execute_process( + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${ARGN} + --cuda-gpu-arch=sm_35 -c ${srcfile} -o ${bcfile} + RESULT_VARIABLE result + OUTPUT_QUIET ERROR_QUIET) + if (result EQUAL 0) + set(${output} TRUE PARENT_SCOPE) + else() + set(${output} FALSE PARENT_SCOPE) + endif() +endfunction() + +# Save for which compiler we are going to do the following checks so that we +# can discard cached values if the user specifies a different value. +set(discard_cached FALSE) +if (DEFINED LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER AND + NOT("${LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER}" STREQUAL "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}")) + set(discard_cached TRUE) +endif() +set(LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}" CACHE INTERNAL "" FORCE) + +function(check_bitcode_compilation output source) + if (${discard_cached} OR NOT DEFINED ${output}) + message(STATUS "Performing Test ${output}") + # Forward additional arguments which contain the flags. + try_compile_bitcode(result "${source}" ${ARGN}) + set(${output} ${result} CACHE INTERNAL "" FORCE) + if(${result}) + message(STATUS "Performing Test ${output} - Success") + else() + message(STATUS "Performing Test ${output} - Failed") + endif() + endif() +endfunction() + +# These flags are required to emit LLVM Bitcode. We check them together because +# if any of them are not supported, there is no point in finding out which are. 
+set(compiler_flags_required -emit-llvm -O1 --cuda-device-only --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}) +set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }") +check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required}) + +# It makes no sense to continue given that the compiler doesn't support +# emitting basic LLVM Bitcode +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED) + return() +endif() + +set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS ${compiler_flags_required}) + +# Declaring external shared device variables might need an additional flag +# since Clang 7.0 and was entirely unsupported since version 4.0. +set(extern_device_shared_src "extern __device__ __shared__ int test;") + +check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED "${extern_device_shared_src}" ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS}) +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED) + set(compiler_flag_fcuda_rdc -fcuda-rdc) + set(compiler_flag_fcuda_rdc_full ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} ${compiler_flag_fcuda_rdc}) + check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC "${extern_device_shared_src}" ${compiler_flag_fcuda_rdc_full}) + + if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC) + return() + endif() + + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS "${compiler_flag_fcuda_rdc_full}") +endif() + +# We can compile LLVM Bitcode from CUDA source code! 
+set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED TRUE) diff --git a/final/libomptarget/cmake/Modules/LibomptargetUtils.cmake b/final/libomptarget/cmake/Modules/LibomptargetUtils.cmake new file mode 100644 index 0000000..d964903 --- /dev/null +++ b/final/libomptarget/cmake/Modules/LibomptargetUtils.cmake @@ -0,0 +1,28 @@ +# +#//===----------------------------------------------------------------------===// +#// +#// The LLVM Compiler Infrastructure +#// +#// This file is dual licensed under the MIT and the University of Illinois Open +#// Source Licenses. See LICENSE.txt for details. +#// +#//===----------------------------------------------------------------------===// +# + +# void libomptarget_say(string message_to_user); +# - prints out message_to_user +macro(libomptarget_say message_to_user) + message(STATUS "LIBOMPTARGET: ${message_to_user}") +endmacro() + +# void libomptarget_warning_say(string message_to_user); +# - prints out message_to_user with a warning +macro(libomptarget_warning_say message_to_user) + message(WARNING "LIBOMPTARGET: ${message_to_user}") +endmacro() + +# void libomptarget_error_say(string message_to_user); +# - prints out message_to_user with an error and exits cmake +macro(libomptarget_error_say message_to_user) + message(FATAL_ERROR "LIBOMPTARGET: ${message_to_user}") +endmacro() diff --git a/final/libomptarget/deviceRTLs/CMakeLists.txt b/final/libomptarget/deviceRTLs/CMakeLists.txt new file mode 100644 index 0000000..7c75387 --- /dev/null +++ b/final/libomptarget/deviceRTLs/CMakeLists.txt @@ -0,0 +1,14 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +# ##===----------------------------------------------------------------------===## +# +# Build a device RTL for each available machine available. 
+# +##===----------------------------------------------------------------------===## + +add_subdirectory(nvptx) diff --git a/final/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/final/libomptarget/deviceRTLs/nvptx/CMakeLists.txt new file mode 100644 index 0000000..1af0679 --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -0,0 +1,186 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build the NVPTX (CUDA) Device RTL if the CUDA tools are available +# +##===----------------------------------------------------------------------===## + +set(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER "" CACHE STRING + "Path to alternate NVCC host compiler to be used by the NVPTX device RTL.") + +if(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER) + find_program(ALTERNATE_CUDA_HOST_COMPILER NAMES ${LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER}) + if(NOT ALTERNATE_CUDA_HOST_COMPILER) + libomptarget_say("Not building CUDA offloading device RTL: invalid NVPTX alternate host compiler.") + endif() + set(CUDA_HOST_COMPILER ${ALTERNATE_CUDA_HOST_COMPILER} CACHE FILEPATH "" FORCE) +endif() + +# We can't use clang as nvcc host preprocessor, so we attempt to replace it with +# gcc. 
+if(CUDA_HOST_COMPILER MATCHES clang) + + find_program(LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER NAMES gcc) + + if(NOT LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER) + libomptarget_say("Not building CUDA offloading device RTL: clang is not supported as NVCC host compiler.") + libomptarget_say("Please include gcc in your path or set LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER to the full path of of valid compiler.") + return() + endif() + set(CUDA_HOST_COMPILER "${LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER}" CACHE FILEPATH "" FORCE) +endif() + +if(LIBOMPTARGET_DEP_CUDA_FOUND) + libomptarget_say("Building CUDA offloading device RTL.") + + # We really don't have any host code, so we don't need to care about + # propagating host flags. + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + + set(cuda_src_files + src/cancel.cu + src/critical.cu + src/data_sharing.cu + src/libcall.cu + src/loop.cu + src/omptarget-nvptx.cu + src/parallel.cu + src/reduction.cu + src/sync.cu + src/task.cu + ) + + set(omp_data_objects src/omp_data.cu) + + # Get the compute capability the user requested or use SM_35 by default. + # SM_35 is what clang uses by default. + set(default_capabilities 35) + if (DEFINED LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY) + set(default_capabilities ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY}) + libomptarget_warning_say("LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY is deprecated, please use LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES") + endif() + set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${default_capabilities} CACHE STRING + "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.") + string(REPLACE "," ";" nvptx_sm_list ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}) + + foreach(sm ${nvptx_sm_list}) + set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm}) + endforeach() + + # Activate RTL message dumps if requested by the user. 
+ set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL + "Activate NVPTX device RTL debug messages.") + if(${LIBOMPTARGET_NVPTX_DEBUG}) + set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g --ptxas-options=-v) + endif() + + # NVPTX runtime library has to be statically linked. Dynamic linking is not + # yet supported by the CUDA toolchain on the device. + set(BUILD_SHARED_LIBS OFF) + set(CUDA_SEPARABLE_COMPILATION ON) + + cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects} + OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG}) + + # Install device RTL under the lib destination folder. + install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}") + + target_link_libraries(omptarget-nvptx ${CUDA_LIBRARIES}) + + + # Check if we can create an LLVM bitcode implementation of the runtime library + # that could be inlined in the user application. For that we need to find + # a Clang compiler capable of compiling our CUDA files to LLVM bitcode and + # an LLVM linker. + set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING + "Location of a CUDA compiler capable of emitting LLVM bitcode.") + set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING + "Location of a linker capable of linking LLVM bitcode objects.") + + include(LibomptargetNVPTXBitcodeLibrary) + + set(bclib_default FALSE) + if (${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED}) + set(bclib_default TRUE) + endif() + set(LIBOMPTARGET_NVPTX_ENABLE_BCLIB ${bclib_default} CACHE BOOL + "Enable CUDA LLVM bitcode offloading device RTL.") + if (${LIBOMPTARGET_NVPTX_ENABLE_BCLIB}) + if (NOT ${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED}) + libomptarget_error_say("Cannot build CUDA LLVM bitcode offloading device RTL!") + endif() + libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.") + + # Set flags for LLVM Bitcode compilation. 
+ set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS}) + if(${LIBOMPTARGET_NVPTX_DEBUG}) + set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1) + else() + set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0) + endif() + + # CUDA 9 header files use the nv_weak attribute which clang is not yet prepared + # to handle. Therefore, we use 'weak' instead. We are compiling only for the + # device, so it should be equivalent. + if(CUDA_VERSION_MAJOR GREATER 8) + set(bc_flags ${bc_flags} -Dnv_weak=weak) + endif() + + # Create target to build all Bitcode libraries. + add_custom_target(omptarget-nvptx-bc) + + # Generate a Bitcode library for all the compute capabilities the user requested. + foreach(sm ${nvptx_sm_list}) + set(cuda_arch --cuda-gpu-arch=sm_${sm}) + + # Compile CUDA files to bitcode. + set(bc_files "") + foreach(src ${cuda_src_files}) + get_filename_component(infile ${src} ABSOLUTE) + get_filename_component(outfile ${src} NAME) + + add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} + -c ${infile} -o ${outfile}-sm_${sm}.bc + DEPENDS ${infile} + IMPLICIT_DEPENDS CXX ${infile} + COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc" + VERBATIM + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc) + + list(APPEND bc_files ${outfile}-sm_${sm}.bc) + endforeach() + + # Link to a bitcode library. 
+ add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} + -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files} + DEPENDS ${bc_files} + COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc" + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc) + + add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc) + add_dependencies(omptarget-nvptx-bc omptarget-nvptx-${sm}-bc) + + # Copy library to destination. + add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc + $<TARGET_FILE_DIR:omptarget-nvptx>) + + # Install bitcode library under the lib destination folder. + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}") + endforeach() + endif() + + add_subdirectory(test) +else() + libomptarget_say("Not building CUDA offloading device RTL: CUDA tools not found in the system.") +endif() diff --git a/final/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt b/final/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt new file mode 100644 index 0000000..989a01f --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt @@ -0,0 +1,523 @@ + +**Design document for OpenMP reductions on the GPU** + +//Abstract: //In this document we summarize the new design for an OpenMP +implementation of reductions on NVIDIA GPUs. This document comprises +* a succinct background review, +* an introduction to the decoupling of reduction algorithm and + data-structure-specific processing routines, +* detailed illustrations of reduction algorithms used and +* a brief overview of steps we have made beyond the last implementation. 
+ +**Problem Review** + +Consider a typical OpenMP program with a reduction pragma. + +``` + double foo, bar; + #pragma omp parallel for reduction(+:foo, bar) + for (int i = 0; i < N; i++) { + foo+=A[i]; bar+=B[i]; + } +``` +where 'foo' and 'bar' are reduced across all threads in the parallel region. +Our primary goal is to efficiently aggregate the values of foo and bar in +a manner that +* makes the compiler logically concise. +* efficiently reduces within warps, threads, blocks and the device. + +**Introduction to Decoupling** +In this section we address the problem of making the compiler +//logically concise// by partitioning the task of reduction into two broad +categories: data-structure specific routines and algorithmic routines. + +The previous reduction implementation was highly coupled with +the specificity of the reduction element data structures (e.g., sizes, data +types) and operators of the reduction (e.g., addition, multiplication). In +our implementation we strive to decouple them. In our final implementations, +we could remove all template functions in our runtime system. + +The (simplified) pseudo code generated by LLVM is as follows: + +``` + 1. Create private copies of variables: foo_p, bar_p + 2. Each thread reduces the chunk of A and B assigned to it and writes + to foo_p and bar_p respectively. + 3. ret = kmpc_nvptx_reduce_nowait(..., reduceData, shuffleReduceFn, + interWarpCpyFn) + where: + struct ReduceData { + double *foo; + double *bar; + } reduceData + reduceData.foo = &foo_p + reduceData.bar = &bar_p + + shuffleReduceFn and interWarpCpyFn are two auxiliary functions + generated to aid the runtime in performing algorithmic steps + while being data-structure agnostic about ReduceData. + + In particular, shuffleReduceFn is a function that takes the following + inputs: + a. local copy of ReduceData + b. its lane_id + c. the offset of the lane_id which hosts a remote ReduceData + relative to the current one + d. 
an algorithm version parameter determining which reduction + algorithm to use. + This shuffleReduceFn retrieves the remote ReduceData through shuffle + intrinsics and reduces, using the algorithm specified by the 4th + parameter, the local ReduceData with the remote ReduceData element + wise, and places the resultant values into the local ReduceData. + + Different reduction algorithms are implemented with different runtime + functions, but they all make calls to this same shuffleReduceFn to + perform the essential reduction step. Therefore, based on the 4th + parameter, this shuffleReduceFn will behave slightly differently to + cooperate with the runtime function to ensure correctness under + different circumstances. + + InterWarpCpyFn, as the name suggests, is a function that copies data + across warps. Its function is to tunnel all the thread private + ReduceData that is already reduced within a warp to a lane in the first + warp with minimal shared memory footprint. This is an essential step to + prepare for the last step of a block reduction. + + (Warp, block, device level reduction routines that utilize these + auxiliary functions will be discussed in the next section.) + + 4. if ret == 1: + The master thread stores the reduced result in the globals. + foo += reduceData.foo; bar += reduceData.bar +``` + +**Reduction Algorithms** + +On the warp level, we have three versions of the algorithm: + +1. Full Warp Reduction + +``` +gpu_regular_warp_reduce(void *reduce_data, + kmp_ShuffleReductFctPtr ShuffleReduceFn) { + for (int offset = WARPSIZE/2; offset > 0; offset /= 2) + ShuffleReduceFn(reduce_data, 0, offset, 0); +} +``` +ShuffleReduceFn is used here with lane_id set to 0 because it is not used; +therefore we save instructions by not retrieving lane_id from the corresponding +special registers. The 4th parameter, which represents the version of the +algorithm being used here, is set to 0 to signify full warp reduction. 
+ +In this version specified (=0), the ShuffleReduceFn behaves, per element, as +follows: + +``` +//reduce_elem refers to an element in the local ReduceData +//remote_elem is retrieved from a remote lane +remote_elem = shuffle_down(reduce_elem, offset, 32); +reduce_elem = reduce_elem @ remote_elem; + +``` + +An illustration of this algorithm operating on a hypothetical 8-lane full-warp +would be: +{F74} +The coloring invariant follows that elements with the same color will be +combined and reduced in the next reduction step. As can be observed, no overhead +is present; exactly log(2, N) steps are needed. + +2. Contiguous Partial Warp Reduction +``` +gpu_irregular_warp_reduce(void *reduce_data, + kmp_ShuffleReductFctPtr ShuffleReduceFn, int size, + int lane_id) { + int curr_size; + int offset; + curr_size = size; + offset = curr_size/2; + while (offset>0) { + ShuffleReduceFn(reduce_data, lane_id, offset, 1); + curr_size = (curr_size+1)/2; + offset = curr_size/2; + } +} +``` + +In this version specified (=1), the ShuffleReduceFn behaves, per element, as +follows: +``` +//reduce_elem refers to an element in the local ReduceData +//remote_elem is retrieved from a remote lane +remote_elem = shuffle_down(reduce_elem, offset, 32); +if (lane_id < offset) { + reduce_elem = reduce_elem @ remote_elem +} else { + reduce_elem = remote_elem +} +``` + +An important invariant (also a restriction on the starting state of the +reduction) is that this algorithm assumes that all unused ReduceData are +located in a contiguous subset of threads in a warp starting from lane 0. + +With the presence of a trailing active lane with an odd-numbered lane +id, its value will not be aggregated with any other lane. Therefore, +in order to preserve the invariant, such ReduceData is copied to the first lane +whose thread-local ReduceData has already been used in a previous reduction +and would therefore be useless otherwise.
+ +An illustration of this algorithm operating on a hypothetical 8-lane partial +warp would be: +{F75} + +As illustrated, this version of the algorithm introduces overhead whenever +we have an odd number of participating lanes in any reduction step to +copy data between lanes. + +3. Dispersed Partial Warp Reduction +``` +gpu_irregular_simt_reduce(void *reduce_data, + kmp_ShuffleReductFctPtr ShuffleReduceFn) { + int size, remote_id; + int logical_lane_id = find_number_of_dispersed_active_lanes_before_me() * 2; + do { + remote_id = find_the_next_active_lane_id_right_after_me(); + // the above function returns 0 if no active lane + // is present right after the current thread. + size = get_number_of_active_lanes_in_this_warp(); + logical_lane_id /= 2; + ShuffleReduceFn(reduce_data, logical_lane_id, remote_id-1-threadIdx.x, 2); + } while (logical_lane_id % 2 == 0 && size > 1); +} +``` + +There is no assumption made about the initial state of the reduction. +Any number of lanes (>=1) could be active at any position. The reduction +result is kept in the first active lane. + +In this version specified (=2), the ShuffleReduceFn behaves, per element, as +follows: +``` +//reduce_elem refers to an element in the local ReduceData +//remote_elem is retrieved from a remote lane +remote_elem = shuffle_down(reduce_elem, offset, 32); +if (LaneId % 2 == 0 && Offset > 0) { + reduce_elem = reduce_elem @ remote_elem +} else { + reduce_elem = remote_elem +} +``` +We will proceed with a brief explanation for some arguments passed in. +It is important to notice that, in this section, we will introduce the +concept of logical_lane_id, and it is important to distinguish it +from physical lane_id as defined by nvidia. +1. //logical_lane_id//: as the name suggests, it refers to the calculated + lane_id (instead of the physical one defined by nvidia) that would make + our algorithm logically concise. A thread with logical_lane_id k means + there are (k-1) threads before it. +2.
//remote_id-1-threadIdx.x//: remote_id is indeed the nvidia-defined lane + id of the remote lane from which we will retrieve the ReduceData. We + subtract (threadIdx.x+1) from it because we would like to maintain only one + underlying shuffle intrinsic (which is used to communicate among lanes in a + warp). This particular version of shuffle intrinsic we take accepts only + offsets, instead of absolute lane_id. Therefore the subtraction is performed + on the absolute lane_id we calculated to obtain the offset. + +This algorithm is slightly different in 2 ways and it is not, conceptually, a +generalization of the above algorithms. +1. It reduces elements close to each other. For instance, values in the 0th lane + are to be combined with those of the 1st lane; values in the 2nd lane are to be + combined with those of the 3rd lane. We did not use the previous algorithm + where the first half of the (partial) warp is reduced with the second half + of the (partial) warp. This is because the mapping + f(x): logical_lane_id -> physical_lane_id; + can be easily calculated whereas its inverse + f^-1(x): physical_lane_id -> logical_lane_id + cannot, and performing such reduction requires the inverse to be known. +2. Because this algorithm is agnostic about the positions of the lanes that are + active, we do not need to perform the copying step as in the second + algorithm. +An illustrative run would look like +{F76} +As observed, overhead is high because in each and every step of reduction, +logical_lane_id is recalculated; so is the remote_id.
+ +On a block level, we have implemented the following block reduce algorithm: + +``` +gpu_irregular_block_reduce(void *reduce_data, + kmp_ShuffleReductFctPtr shuflReduceFn, + kmp_InterWarpCopyFctPtr interWarpCpyFn, + int size) { + + int wid = threadIdx.x/WARPSIZE; + int lane_id = threadIdx.x%WARPSIZE; + + int warp_needed = (size+WARPSIZE-1)/WARPSIZE; //ceiling of division + + unsigned tnum = __ballot(1); + int thread_num = __popc(tnum); + + //full warp reduction + if (thread_num == WARPSIZE) { + gpu_regular_warp_reduce(reduce_data, shuflReduceFn); + } + //partial warp reduction + if (thread_num < WARPSIZE) { + gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, thread_num, + lane_id); + } + //Gather all the reduced values from each warp + //to the first warp + //named_barrier inside this function to ensure + //correctness. It is effectively a sync_thread + //that won't deadlock. + interWarpCpyFn(reduce_data, warp_needed); + + //This is to reduce data gathered from each "warp master". + if (wid==0) { + gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, warp_needed, + lane_id); + } + + return; +} +``` +In this function, no ShuffleReduceFn is directly called as it makes calls +to various versions of the warp-reduction functions. It first reduces +ReduceData warp by warp; in the end, we end up with the number of +ReduceData equal to the number of warps present in this thread +block. We then proceed to gather all such ReduceData to the first warp. + +As observed, in this algorithm we make use of the function InterWarpCpyFn, +which copies data from each of the "warp master" (0th lane of each warp, where +a warp-reduced ReduceData is held) to the 0th warp. This step reduces (in a +mathematical sense) the problem of reduction across warp masters in a block to +the problem of warp reduction which we already have solutions to. + +We can thus completely avoid the use of atomics to reduce in a threadblock. 
+ +**Efficient Cross Block Reduce** + +The next challenge is to reduce values across threadblocks. We aim to do this +without atomics or critical sections. + +Let a kernel be started with TB threadblocks. +Let the GPU have S SMs. +There can be at most N active threadblocks per SM at any time. + +Consider a threadblock tb (tb < TB) running on SM s (s < S). 'tb' is one of +at most 'N' active threadblocks on SM s. Let each threadblock active on an SM +be given an instance identifier id (0 <= id < N). Therefore, the tuple (s, id) +uniquely identifies an active threadblock on the GPU. + +To efficiently implement cross block reduce, we first allocate an array for +each value to be reduced of size S*N (which is the maximum number of active +threadblocks at any time on the device). + +Each threadblock reduces its value to slot [s][id]. This can be done without +locking since no other threadblock can write to the same slot concurrently. + +As a final stage, we reduce the values in the array as follows: + +``` +// Compiler generated wrapper function for each target region with a reduction +clause. +target_function_wrapper(map_args, reduction_array) <--- start with 1 team and 1 + thread. + // Use dynamic parallelism to launch M teams, N threads as requested by the + user to execute the target region. + + target_function<<M, N>>(map_args) + + Reduce values in reduction_array + +``` + +**Comparison with Last Version** + + +The (simplified) pseudo code generated by LLVM on the host is as follows: + + +``` + 1. Create private copies of variables: foo_p, bar_p + 2. Each thread reduces the chunk of A and B assigned to it and writes + to foo_p and bar_p respectively. + 3.
ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock) + where: + struct ReduceData { + double *foo; + double *bar; + } reduceData + reduceData.foo = &foo_p + reduceData.bar = &bar_p + + reduceFn is a pointer to a function that takes in two inputs + of type ReduceData, "reduces" them element wise, and places the + result in the first input: + reduceFn(ReduceData *a, ReduceData *b) + a = a @ b + + Every thread in the parallel region calls kmpc_reduce_nowait with + its private copy of reduceData. The runtime reduces across the + threads (using tree reduction on the operator 'reduceFn') and stores + the final result in the master thread if successful. + 4. if ret == 1: + The master thread stores the reduced result in the globals. + foo += reduceData.foo; bar += reduceData.bar + 5. else if ret == 2: + In this case kmpc_reduce_nowait() could not use tree reduction, + so use atomics instead: + each thread atomically writes to foo + each thread atomically writes to bar +``` + +On a GPU, a similar reduction may need to be performed across SIMT threads, +warps, and threadblocks. The challenge is to do so efficiently in a fashion +that is compatible with the LLVM OpenMP implementation. + +In the previously released 0.1 version of the LLVM OpenMP compiler for GPUs, +the salient steps of the code generated are as follows: + + +``` + 1. Create private copies of variables: foo_p, bar_p + 2. Each thread reduces the chunk of A and B assigned to it and writes + to foo_p and bar_p respectively. + 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock) + status = can_block_reduce() + if status == 1: + reduce efficiently to thread 0 using shuffles and shared memory. + return 1 + else + cannot use efficient block reduction, fallback to atomics + return 2 + 4. if ret == 1: + The master thread stores the reduced result in the globals. + foo += reduceData.foo; bar += reduceData.bar + 5.
else if ret == 2: + In this case kmpc_reduce_nowait() could not use tree reduction, + so use atomics instead: + each thread atomically writes to foo + each thread atomically writes to bar +``` + +The function can_block_reduce() is defined as follows: + + +``` +int32_t can_block_reduce() { + int tid = GetThreadIdInTeam(); + int nt = GetNumberOfOmpThreads(tid); + if (nt != blockDim.x) + return 0; + unsigned tnum = __ballot(1); + if (tnum != (~0x0)) { + return 0; + } + return 1; +} +``` + +This function permits the use of the efficient block reduction algorithm +using shuffles and shared memory (return 1) only if (a) all SIMT threads in +a warp are active (i.e., number of threads in the parallel region is a +multiple of 32) and (b) the number of threads in the parallel region +(set by the num_threads clause) equals blockDim.x. + +If either of these preconditions is not true, each thread in the threadblock +updates the global value using atomics. + +Atomics and compare-and-swap operations are expensive on many threaded +architectures such as GPUs and we must avoid them completely. + + +**Appendix: Implementation Details** + + +``` +// Compiler generated function. +reduceFn(ReduceData *a, ReduceData *b) + a->foo = a->foo + b->foo + a->bar = a->bar + b->bar + +// Compiler generated function. +swapAndReduceFn(ReduceData *thread_private, int lane) + ReduceData *remote = new ReduceData() + remote->foo = shuffle_double(thread_private->foo, lane) + remote->bar = shuffle_double(thread_private->bar, lane) + reduceFn(thread_private, remote) + +// OMP runtime function. +warpReduce_regular(ReduceData *thread_private, Fn *swapAndReduceFn): + offset = 16 + while (offset > 0) + swapAndReduceFn(thread_private, offset) + offset /= 2 + +// OMP runtime function. +warpReduce_irregular(): + ... + +// OMP runtime function. 
+kmpc_reduce_warp(reduceData, swapAndReduceFn) + if all_lanes_active: + warpReduce_regular(reduceData, swapAndReduceFn) + else: + warpReduce_irregular(reduceData, swapAndReduceFn) + if in_simd_region: + // all done, reduce to global in simd lane 0 + return 1 + else if in_parallel_region: + // done reducing to one value per warp, now reduce across warps + return 3 + +// OMP runtime function; one for each basic type. +kmpc_reduce_block_double(double *a) + if lane == 0: + shared[wid] = *a + named_barrier(1, num_threads) + if wid == 0 + block_reduce(shared) + if lane == 0 + *a = shared[0] + named_barrier(1, num_threads) + if wid == 0 and lane == 0 + return 1 // write back reduced result + else + return 0 // don't do anything + +``` + + + +``` +// Compiler generated code. + 1. Create private copies of variables: foo_p, bar_p + 2. Each thread reduces the chunk of A and B assigned to it and writes + to foo_p and bar_p respectively. + 3. ret = kmpc_reduce_warp(reduceData, swapAndReduceFn) + 4. if ret == 1: + The master thread stores the reduced result in the globals. + foo += reduceData.foo; bar += reduceData.bar + 5. else if ret == 3: + ret = block_reduce_double(reduceData.foo) + if ret == 1: + foo += reduceData.foo + ret = block_reduce_double(reduceData.bar) + if ret == 1: + bar += reduceData.bar +``` + +**Notes** + + 1. This scheme requires that the CUDA OMP runtime can call llvm generated + functions. This functionality now works. + 2. If the user inlines the CUDA OMP runtime bitcode, all of the machinery + (including calls through function pointers) is optimized away. + 3. If we are reducing multiple variables in a parallel region, + the reduce operations are all performed in warpReduce_[ir]regular(). This + results in more instructions in the loop and should result in fewer + stalls due to data dependencies. Unfortunately we cannot do the same in + kmpc_reduce_block_double() without increasing shared memory usage.
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/cancel.cu b/final/libomptarget/deviceRTLs/nvptx/src/cancel.cu
new file mode 100644
index 0000000..9f92e2d
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/cancel.cu
@@ -0,0 +1,28 @@
//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// Interface to be used in the implementation of OpenMP cancel.
//
//===----------------------------------------------------------------------===//

#include "omptarget-nvptx.h"

// Cancellation-point entry point. Cancellation is not implemented here: the
// call is traced through PRINT and FALSE is returned unconditionally.
// `loc` and `global_tid` are not read.
EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
                                        int32_t cancelVal) {
  PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal);
  // disabled
  return FALSE;
}

// Cancel entry point. Like __kmpc_cancellationpoint, this only traces the
// request and always returns FALSE. `loc` and `global_tid` are not read.
EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
                             int32_t cancelVal) {
  PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", (int)cancelVal);
  // disabled
  return FALSE;
}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/critical.cu b/final/libomptarget/deviceRTLs/nvptx/src/critical.cu
new file mode 100644
index 0000000..9bf2a30
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/critical.cu
@@ -0,0 +1,30 @@
//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of critical with KMPC interface
//
//===----------------------------------------------------------------------===//

#include <stdio.h>

#include "omptarget-nvptx.h"

// Enter a critical section: the storage behind `lck` is reinterpreted as an
// omp_lock_t and acquired with omp_set_lock. `loc` and `global_tid` are not
// read.
EXTERN
void __kmpc_critical(kmp_Ident *loc, int32_t global_tid,
                     kmp_CriticalName *lck) {
  PRINT0(LD_IO, "call to kmpc_critical()\n");
  omp_set_lock((omp_lock_t *)lck);
}

// Leave a critical section: releases the lock stored behind `lck` with
// omp_unset_lock. `loc` and `global_tid` are not read.
EXTERN
void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid,
                         kmp_CriticalName *lck) {
  PRINT0(LD_IO, "call to kmpc_end_critical()\n");
  omp_unset_lock((omp_lock_t *)lck);
}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/final/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
new file mode 100644
index 0000000..fb4e8ea
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -0,0 +1,582 @@
//===----- data_sharing.cu - NVPTX OpenMP data sharing ----------- CUDA -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of data sharing environments.
//
//===----------------------------------------------------------------------===//
#include "omptarget-nvptx.h"
#include <stdio.h>

// Warp ID in the CUDA block: threadIdx.x divided by WARPSIZE.
INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
// Lane ID in the CUDA warp: threadIdx.x modulo WARPSIZE.
INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }

// Return true if this is the first active thread in the warp.
INLINE static bool IsWarpMasterActiveThread() {
  unsigned long long Mask = __ACTIVEMASK();
  // Shifting left by (WARPSIZE - lane) moves the bits of all lanes *below*
  // this one into the low bits and pushes this lane and everything above it
  // past them. NOTE(review): the "32 lower bits" truncation below assumes
  // WARPSIZE is 32 -- confirm against its definition.
  unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE);
  unsigned long long Sh = Mask << ShNum;
  // Truncate Sh to the 32 lower bits. A zero result means no lower-numbered
  // lane is active, i.e. this thread is the first active lane.
  return (unsigned)Sh == 0;
}
// Return true if this is the master thread: never in SPMD mode, otherwise
// when this thread's block-local id equals GetMasterThreadID().
INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
  return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
}

/// Return the provided size aligned to the size of a pointer.
/// Values that are already a multiple of sizeof(void *) are returned
/// unchanged; others are rounded up to the next multiple.
INLINE static size_t AlignVal(size_t Val) {
  const size_t Align = (size_t)sizeof(void *);
  if (Val & (Align - 1)) {
    Val += Align;
    Val &= ~(Align - 1);
  }
  return Val;
}

// Debug tracing switches and helpers for this file. The printf calls inside
// both macros are commented out, so DSPRINT/DSPRINT0 currently expand to an
// empty conditional whatever the flag value is.
#define DSFLAG 0
#define DSFLAG_INIT 0
#define DSPRINT(_flag, _str, _args...)                                         \
  {                                                                            \
    if (_flag) {                                                               \
      /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/          \
    }                                                                          \
  }
#define DSPRINT0(_flag, _str)                                                  \
  {                                                                            \
    if (_flag) {                                                               \
      /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/                 \
    }                                                                          \
  }

// Initialize the shared data structures. This is expected to be called for the
// master thread and warp masters. \param RootS: A pointer to the root of the
// data sharing stack. \param InitialDataSize: The initial size of the data in
// the slot.
//
// Note that the `rootS` argument is not read by the body: the root slot that
// is installed comes from the team descriptor (the local `RootS`), and
// `InitialDataSize` is only passed to the debug trace.
EXTERN void
__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
                                           size_t InitialDataSize) {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
  DSPRINT0(DSFLAG_INIT,
           "Entering __kmpc_initialize_data_sharing_environment\n");

  unsigned WID = getWarpId();
  DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID);

  omptarget_nvptx_TeamDescr *teamDescr =
      &omptarget_nvptx_threadPrivateContext->TeamContext();
  __kmpc_data_sharing_slot *RootS =
      teamDescr->RootS(WID, IsMasterThread(isSPMDMode()));

  // Point this warp's slot at the root slot and its stack at the start of the
  // root slot's data area.
  DataSharingState.SlotPtr[WID] = RootS;
  DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];

  // We don't need to initialize the frame and active threads.

  DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", (unsigned)InitialDataSize);
  DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (unsigned long long)RootS);
  DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n",
          (unsigned long long)RootS->DataEnd);
  DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n",
          (unsigned long long)RootS->Next);
  DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n",
          (unsigned long long)DataSharingState.SlotPtr[WID]);
  DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n",
          (unsigned long long)DataSharingState.StackPtr[WID]);

  DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n");
}

// Open a data sharing environment for the calling warp and return the frame
// the warp should use.
//
// The warp's current slot, stack, frame and active-thread mask are written to
// the four Saved* out-parameters so that the matching
// __kmpc_data_sharing_environment_end call can restore them. The first active
// lane of the warp then reserves SharingDataSize bytes (rounded up by
// AlignVal): in the current slot if it fits, otherwise in the next slot, which
// is reused when large enough or replaced by a malloc'ed slot of at least
// SharingDefaultDataSize bytes.
//
// When IsOMPRuntimeInitialized is zero nothing is saved or reserved and the
// address of DataSharingState itself is returned.
EXTERN void *__kmpc_data_sharing_environment_begin(
    __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
    void **SavedSharedFrame, int32_t *SavedActiveThreads,
    size_t SharingDataSize, size_t SharingDefaultDataSize,
    int16_t IsOMPRuntimeInitialized) {

  DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n");

  // If the runtime has been elided, used __shared__ memory for master-worker
  // data sharing.
  if (!IsOMPRuntimeInitialized)
    return (void *)&DataSharingState;

  DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
  DSPRINT(DSFLAG, "Default Data Size %016llx\n",
          (unsigned long long)SharingDefaultDataSize);

  unsigned WID = getWarpId();
  unsigned CurActiveThreads = __ACTIVEMASK();

  // References into the per-warp bookkeeping, so the assignments below update
  // DataSharingState in place.
  __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
  void *&StackP = DataSharingState.StackPtr[WID];
  void * volatile &FrameP = DataSharingState.FramePtr[WID];
  int32_t &ActiveT = DataSharingState.ActiveThreads[WID];

  DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
  // Save the current values.
  *SavedSharedSlot = SlotP;
  *SavedSharedStack = StackP;
  *SavedSharedFrame = FrameP;
  *SavedActiveThreads = ActiveT;

  DSPRINT(DSFLAG, "Warp ID: %u\n", WID);
  DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (unsigned long long)SlotP);
  DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (unsigned long long)StackP);
  DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP);
  DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);

  // Only the warp active master needs to grow the stack.
  if (IsWarpMasterActiveThread()) {
    // Save the current active threads.
    ActiveT = CurActiveThreads;

    // Make sure we use aligned sizes to avoid rematerialization of data.
    SharingDataSize = AlignVal(SharingDataSize);
    // FIXME: The default data size can be assumed to be aligned?
    SharingDefaultDataSize = AlignVal(SharingDefaultDataSize);

    // Check if we have room for the data in the current slot.
    const uintptr_t CurrentStartAddress = (uintptr_t)StackP;
    const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd;
    const uintptr_t RequiredEndAddress =
        CurrentStartAddress + (uintptr_t)SharingDataSize;

    DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
    DSPRINT(DSFLAG, "Default Data Size %016llx\n",
            (unsigned long long)SharingDefaultDataSize);
    DSPRINT(DSFLAG, "Current Start Address %016llx\n",
            (unsigned long long)CurrentStartAddress);
    DSPRINT(DSFLAG, "Current End Address %016llx\n",
            (unsigned long long)CurrentEndAddress);
    DSPRINT(DSFLAG, "Required End Address %016llx\n",
            (unsigned long long)RequiredEndAddress);
    DSPRINT(DSFLAG, "Active Threads %08x\n", (unsigned)ActiveT);

    // If we require a new slot, allocate it and initialize it (or attempt to
    // reuse one). Also, set the shared stack and slot pointers to the new
    // place. If we do not need to grow the stack, just adapt the stack and
    // frame pointers.
    if (CurrentEndAddress < RequiredEndAddress) {
      // The new slot holds at least the default size.
      size_t NewSize = (SharingDataSize > SharingDefaultDataSize)
                           ? SharingDataSize
                           : SharingDefaultDataSize;
      __kmpc_data_sharing_slot *NewSlot = 0;

      // Attempt to reuse an existing slot.
      if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
        uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
                                     (uintptr_t)(&ExistingSlot->Data[0]);
        if (ExistingSlotSize >= NewSize) {
          DSPRINT(DSFLAG, "Reusing stack slot %016llx\n",
                  (unsigned long long)ExistingSlot);
          NewSlot = ExistingSlot;
        } else {
          // Too small to reuse: release it. SlotP->Next is overwritten below
          // once the replacement slot exists.
          DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n",
                  (unsigned long long)SlotP->Next);
          free(ExistingSlot);
        }
      }

      if (!NewSlot) {
        // NOTE(review): the malloc result is used without a null check.
        NewSlot = (__kmpc_data_sharing_slot *)malloc(
            sizeof(__kmpc_data_sharing_slot) + NewSize);
        DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n",
                (unsigned long long)NewSlot, NewSize);
      }

      NewSlot->Next = 0;
      NewSlot->DataEnd = &NewSlot->Data[NewSize];

      // Link the slot in and make it current; the frame starts at the
      // beginning of its data and the stack pointer sits just past the frame.
      SlotP->Next = NewSlot;
      SlotP = NewSlot;
      StackP = &NewSlot->Data[SharingDataSize];
      FrameP = &NewSlot->Data[0];
    } else {

      // Clean up any old slot that we may still have. The slot producers, do
      // not eliminate them because that may be used to return data.
      if (SlotP->Next) {
        DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n",
                (unsigned long long)SlotP->Next);
        free(SlotP->Next);
        SlotP->Next = 0;
      }

      // The frame starts where the stack used to end; bump the stack past it.
      FrameP = StackP;
      StackP = (void *)RequiredEndAddress;
    }
  }

  // FIXME: Need to see the impact of doing it here.
  __threadfence_block();

  DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n");

  // All the threads in this warp get the frame they should work with.
  return FrameP;
}

// Close a data sharing environment opened by
// __kmpc_data_sharing_environment_begin.
//
// With IsEntryPoint set, the first active lane only frees the slot chained
// after the relevant slot (the saved slot for the master thread, the warp's
// current slot otherwise) and the function returns. Otherwise the first active
// lane clears the currently active lanes from the warp's recorded mask and,
// once the mask reaches zero, restores slot, stack, frame and mask from the
// Saved* values.
EXTERN void __kmpc_data_sharing_environment_end(
    __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
    void **SavedSharedFrame, int32_t *SavedActiveThreads,
    int32_t IsEntryPoint) {

  DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n");

  unsigned WID = getWarpId();

  if (IsEntryPoint) {
    if (IsWarpMasterActiveThread()) {
      DSPRINT0(DSFLAG, "Doing clean up\n");

      // The master thread cleans the saved slot, because this is an environment
      // only for the master.
      __kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode())
                                        ? *SavedSharedSlot
                                        : DataSharingState.SlotPtr[WID];

      if (S->Next) {
        free(S->Next);
        S->Next = 0;
      }
    }

    DSPRINT0(DSFLAG, "Exiting Exiting __kmpc_data_sharing_environment_end\n");
    return;
  }

  int32_t CurActive = __ACTIVEMASK();

  // Only the warp master can restore the stack and frame information, and only
  // if there are no other threads left behind in this environment (i.e. the
  // warp diverged and returns in different places). This only works if we
  // assume that threads will converge right after the call site that started
  // the environment.
  if (IsWarpMasterActiveThread()) {
    int32_t &ActiveT = DataSharingState.ActiveThreads[WID];

    DSPRINT0(DSFLAG, "Before restoring the stack\n");
    // Zero the bits in the mask. If it is still different from zero, then we
    // have other threads that will return after the current ones.
    ActiveT &= ~CurActive;

    DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n",
            (unsigned)CurActive, (unsigned)ActiveT);

    if (!ActiveT) {
      // No other active threads? Great, lets restore the stack.

      __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
      void *&StackP = DataSharingState.StackPtr[WID];
      void * volatile &FrameP = DataSharingState.FramePtr[WID];

      SlotP = *SavedSharedSlot;
      StackP = *SavedSharedStack;
      FrameP = *SavedSharedFrame;
      ActiveT = *SavedActiveThreads;

      DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n",
              (unsigned long long)SlotP);
      DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n",
              (unsigned long long)StackP);
      DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n",
              (unsigned long long)FrameP);
      DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);
    }
  }

  // FIXME: Need to see the impact of doing it here.
  __threadfence_block();

  DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n");
  return;
}

// Return the frame pointer currently recorded for the warp that contains
// thread SourceThreadID (SourceThreadID / WARPSIZE). When
// IsOMPRuntimeInitialized is zero the address of DataSharingState is returned
// instead.
EXTERN void *
__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
                                          int16_t IsOMPRuntimeInitialized) {
  DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n");

  // If the runtime has been elided, use __shared__ memory for master-worker
  // data sharing.  We're reusing the statically allocated data structure
  // that is used for standard data sharing.
  if (!IsOMPRuntimeInitialized)
    return (void *)&DataSharingState;

  // Get the frame used by the requested thread.

  unsigned SourceWID = SourceThreadID / WARPSIZE;

  DSPRINT(DSFLAG, "Source warp: %u\n", SourceWID);

  void * volatile P = DataSharingState.FramePtr[SourceWID];
  DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
  return P;
}

////////////////////////////////////////////////////////////////////////////////
// Runtime functions for trunk data sharing scheme.
////////////////////////////////////////////////////////////////////////////////

// Point every per-warp slot/stack entry of DataSharingState at the team's
// preallocated slot for that index, with the stack pointer at the start of
// the slot's data.
INLINE static void data_sharing_init_stack_common() {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
  omptarget_nvptx_TeamDescr *teamDescr =
      &omptarget_nvptx_threadPrivateContext->TeamContext();

  // NOTE(review): the loop bound is WARPSIZE although WID indexes warps;
  // presumably this relies on a block holding at most WARPSIZE warps --
  // confirm.
  for (int WID = 0; WID < WARPSIZE; WID++) {
    __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID);
    DataSharingState.SlotPtr[WID] = RootS;
    DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
  }
}

// Initialize data sharing data structure. This function needs to be called
// once at the beginning of a data sharing context (coincides with the kernel
// initialization). This function is called only by the MASTER thread of each
// team in non-SPMD mode.
EXTERN void __kmpc_data_sharing_init_stack() {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
  // This function initializes the stack pointer with the pointer to the
  // statically allocated shared memory slots. The size of a shared memory
  // slot is pre-determined to be 256 bytes.
  data_sharing_init_stack_common();
  omptarget_nvptx_globalArgs.Init();
}

// Initialize data sharing data structure. This function needs to be called
// once at the beginning of a data sharing context (coincides with the kernel
// initialization). This function is called in SPMD mode only.
EXTERN void __kmpc_data_sharing_init_stack_spmd() {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
  // This function initializes the stack pointer with the pointer to the
  // statically allocated shared memory slots. The size of a shared memory
  // slot is pre-determined to be 256 bytes.
  // Only thread 0 of the block performs the initialization.
  if (threadIdx.x == 0)
    data_sharing_init_stack_common();

  // NOTE(review): this is a memory fence, not a barrier; whether other
  // threads are otherwise synchronized with thread 0 is not visible here.
  __threadfence_block();
}

// Reserve PushSize bytes (rounded up to a multiple of 8) on the calling warp's
// data sharing stack and return the start of the new frame to every calling
// lane. Lane 0 of the warp does the bookkeeping; when the current slot cannot
// hold the request a new slot of at least DS_Worker_Warp_Slot_Size bytes is
// obtained with SafeMalloc and chained after the current one.
INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

  // Only warp active master threads manage the stack.
  // NOTE(review): the test below selects lane 0 specifically, not the first
  // *active* lane.
  bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0;

  // Add worst-case padding to DataSize so that future stack allocations are
  // correctly aligned.
  const size_t Alignment = 8;
  PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;

  // Frame pointer must be visible to all workers in the same warp.
  const unsigned WID = getWarpId();
  void *FrameP = 0;
  int32_t CurActive = __ACTIVEMASK();

  if (IsWarpMaster) {
    // SlotP will point to either the shared memory slot or an existing
    // global memory slot.
    __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
    void *&StackP = DataSharingState.StackPtr[WID];

    // Check if we have room for the data in the current slot.
    const uintptr_t StartAddress = (uintptr_t)StackP;
    const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
    const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize;

    // If we requested more data than there is room for in the rest
    // of the slot then we need to either re-use the next slot, if one exists,
    // or create a new slot.
    if (EndAddress < RequestedEndAddress) {
      __kmpc_data_sharing_slot *NewSlot = 0;
      size_t NewSize = PushSize;

      // Allocate at least the default size for each type of slot.
      // Master is a special case and even though there is only one thread,
      // it can share more things with the workers. For uniformity, it uses
      // the full size of a worker warp slot.
      size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size;
      if (DefaultSlotSize > NewSize)
        NewSize = DefaultSlotSize;
      NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc(
          sizeof(__kmpc_data_sharing_slot) + NewSize,
          "Global memory slot allocation.");

      // Remember the previous slot and its stack position so that a later
      // pop can return to them.
      NewSlot->Next = 0;
      NewSlot->Prev = SlotP;
      NewSlot->PrevSlotStackPtr = StackP;
      NewSlot->DataEnd = &NewSlot->Data[0] + NewSize;

      // Make previous slot point to the newly allocated slot.
      SlotP->Next = NewSlot;
      // The current slot becomes the new slot.
      SlotP = NewSlot;
      // The stack pointer always points to the next free stack frame.
      StackP = &NewSlot->Data[0] + PushSize;
      // The frame pointer always points to the beginning of the frame.
      FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0];
    } else {
      // Add the data chunk to the current slot. The frame pointer is set to
      // point to the start of the new frame held in StackP.
      FrameP = DataSharingState.FramePtr[WID] = StackP;
      // Reset stack pointer to the requested address.
      StackP = (void *)RequestedEndAddress;
    }
  }
  // Get address from lane 0. The pointer is broadcast as one or two int-sized
  // pieces depending on sizeof(FrameP).
  ((int *)&FrameP)[0] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[0], 0);
  if (sizeof(FrameP) == 8)
    ((int *)&FrameP)[1] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[1], 0);

  return FrameP;
}

// Push a single frame of DataSize bytes for the whole warp and return its
// start address. `UseSharedMemory` is not read.
EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
                                                      int16_t UseSharedMemory) {
  return data_sharing_push_stack_common(DataSize);
}

// Called at the time of the kernel initialization. This is used to initialize
// the list of references to shared variables and to pre-allocate global storage
// for holding the globalized variables.
//
// By default the globalized variables are stored in global memory. If the
// UseSharedMemory is set to true, the runtime will attempt to use shared memory
// as long as the size requested fits the pre-allocated size.
+EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize, + int16_t UseSharedMemory) { + // Compute the total memory footprint of the requested data. + // The master thread requires a stack only for itself. A worker + // thread (which at this point is a warp master) will require + // space for the variables of each thread in the warp, + // i.e. one DataSize chunk per warp lane. + // TODO: change WARPSIZE to the number of active threads in the warp. + size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode())) + ? DataSize + : WARPSIZE * DataSize; + + // Compute the start address of the frame of each thread in the warp. + uintptr_t FrameStartAddress = + (uintptr_t) data_sharing_push_stack_common(PushSize); + FrameStartAddress += (uintptr_t) (getLaneId() * DataSize); + return (void *)FrameStartAddress; +} + +// Pop the stack and free any memory which can be reclaimed. +// +// When the pop operation removes the last global memory slot, +// reclaim all outstanding global memory slots since it is +// likely we have reached the end of the kernel. +EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); + + __threadfence_block(); + + if (GetThreadIdInBlock() % WARPSIZE == 0) { + unsigned WID = getWarpId(); + + // Current slot + __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; + + // Pointer to next available stack. + void *&StackP = DataSharingState.StackPtr[WID]; + + // Pop the frame. + StackP = FrameStart; + + // If the current slot is empty, we need to free the slot after the + // pop. + bool SlotEmpty = (StackP == &SlotP->Data[0]); + + if (SlotEmpty && SlotP->Prev) { + // Before removing the slot we need to reset StackP. + StackP = SlotP->PrevSlotStackPtr; + + // Remove the slot. + SlotP = SlotP->Prev; + SafeFree(SlotP->Next, "Free slot."); + SlotP->Next = 0; + } + } +} + +// Begin a data sharing context. 
Maintain a list of references to shared +// variables. This list of references to shared variables will be passed +// to one or more threads. +// In L0 data sharing this is called by master thread. +// In L1 data sharing this is called by active warp master thread. +EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) { + omptarget_nvptx_globalArgs.EnsureSize(nArgs); + *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); +} + +// End a data sharing context. There is no need to have a list of refs +// to shared variables because the context in which those variables were +// shared has now ended. This should clean-up the list of references only +// without affecting the actual global storage of the variables. +// In L0 data sharing this is called by master thread. +// In L1 data sharing this is called by active warp master thread. +EXTERN void __kmpc_end_sharing_variables() { + omptarget_nvptx_globalArgs.DeInit(); +} + +// This function will return a list of references to global variables. This +// is how the workers will get a reference to the globalized variable. The +// members of this list will be passed to the outlined parallel function +// preserving the order. +// Called by all workers. +EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) { + *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); +} + +// This function is used to init static memory manager. This manager is used to +// manage statically allocated global memory. This memory is allocated by the +// compiler and used to correctly implement globalization of the variables in +// target, teams and distribute regions. 
+EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, + const void *buf, size_t size, + int16_t is_shared, + const void **frame) { + if (is_shared) { + *frame = buf; + return; + } + if (isSPMDExecutionMode) { + if (GetThreadIdInBlock() == 0) { + *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); + } + // FIXME: use __syncthreads instead when the function copy is fixed in LLVM. + __SYNCTHREADS(); + return; + } + ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), + "Must be called only in the target master thread."); + *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); + __threadfence(); +} + +EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, + int16_t is_shared) { + if (is_shared) + return; + if (isSPMDExecutionMode) { + // FIXME: use __syncthreads instead when the function copy is fixed in LLVM. + __SYNCTHREADS(); + if (GetThreadIdInBlock() == 0) { + omptarget_nvptx_simpleMemoryManager.Release(); + } + return; + } + __threadfence(); + ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), + "Must be called only in the target master thread."); + omptarget_nvptx_simpleMemoryManager.Release(); +} + diff --git a/final/libomptarget/deviceRTLs/nvptx/src/debug.h b/final/libomptarget/deviceRTLs/nvptx/src/debug.h new file mode 100644 index 0000000..d40cf3f --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/src/debug.h @@ -0,0 +1,289 @@ +//===------------- debug.h - NVPTX OpenMP debug macros ----------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains debug macros to be used in the application. 
//
// Usage guide
//
// PRINT0(flag, str)             : if debug flag is on, print (no arguments)
// PRINT(flag, str, args)        : if debug flag is on, print (arguments)
// DON(flag)                     : return true if debug flag is on
//
// ASSERT0(flag, cond, str)      : like ASSERT, for a message without arguments
// ASSERT(flag, cond, str, args) : if test flag is on, test the condition
//                                 if the condition is false, print str+args
//                                 and assert.
//                                 CAUTION: cond must not have side effects; it
//                                 is not evaluated when the test flag is off
// TON(flag)                     : return true if test flag is on
//
// WARNING0(flag, str)           : like WARNING, for a message without arguments
// WARNING(flag, str, args)      : if warning flag is on, print the warning
// WON(flag)                     : return true if warning flag is on
//
//===----------------------------------------------------------------------===//

#ifndef _OMPTARGET_NVPTX_DEBUG_H_
#define _OMPTARGET_NVPTX_DEBUG_H_

////////////////////////////////////////////////////////////////////////////////
// set desired level of debugging
////////////////////////////////////////////////////////////////////////////////

// Each LD_SET_* value is a single bit of the debug mask. "pos N" groups the
// bits that share the N-th hexadecimal digit of the mask.
#define LD_SET_NONE 0ULL /* none */
#define LD_SET_ALL -1ULL /* all */

// pos 1
#define LD_SET_LOOP 0x1ULL  /* basic loop */
#define LD_SET_LOOPD 0x2ULL /* basic loop */
#define LD_SET_PAR 0x4ULL   /* basic parallel */
#define LD_SET_PARD 0x8ULL  /* basic parallel */

// pos 2
#define LD_SET_SYNC 0x10ULL  /* sync info */
#define LD_SET_SYNCD 0x20ULL /* sync info */
#define LD_SET_WAIT 0x40ULL  /* state when waiting */
#define LD_SET_TASK 0x80ULL  /* print task info (high level) */

// pos 3
#define LD_SET_IO 0x100ULL     /* big region io (excl atomic) */
#define LD_SET_IOD 0x200ULL    /* big region io (excl atomic) */
#define LD_SET_ENV 0x400ULL    /* env info */
#define LD_SET_CANCEL 0x800ULL /* print cancel info */

// pos 4
#define LD_SET_MEM 0x1000ULL /* malloc / free */

////////////////////////////////////////////////////////////////////////////////
// set the desired flags to print selected output.

// these are some examples of possible definitions that can be used for
// debugging.
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_ALL)
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_LOOP) // limit to loop printfs to save
// on cuda buffer
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO)
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO | LD_SET_ENV)
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_PAR)

// Debug output is off unless OMPTARGET_NVPTX_DEBUG was defined beforehand; a
// non-zero value set beforehand triggers the #warning below.
#ifndef OMPTARGET_NVPTX_DEBUG
#define OMPTARGET_NVPTX_DEBUG LD_SET_NONE
#elif OMPTARGET_NVPTX_DEBUG
#warning debug is used, not good for measurements
#endif

////////////////////////////////////////////////////////////////////////////////
// set desired level of asserts
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// available flags

#define LT_SET_NONE 0x0 /* unsafe */
#define LT_SET_SAFETY                                                          \
  0x1 /* check malloc type of stuff, input at creation, cheap */
#define LT_SET_INPUT 0x2 /* check also all runtime inputs */
#define LT_SET_FUSSY 0x4 /* fussy checks, expensive */

////////////////////////////////////////////////////////////////////////////////
// set the desired flags

// Unless the test level was chosen beforehand, it follows the debug setting:
// fussy checks when debugging is on, safety checks only otherwise.
#ifndef OMPTARGET_NVPTX_TEST
#if OMPTARGET_NVPTX_DEBUG
#define OMPTARGET_NVPTX_TEST (LT_SET_FUSSY)
#else
#define OMPTARGET_NVPTX_TEST (LT_SET_SAFETY)
#endif
#endif

////////////////////////////////////////////////////////////////////////////////
// set desired level of warnings
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// available flags

#define LW_SET_ALL -1
#define LW_SET_NONE 0x0
#define LW_SET_ENV 0x1
#define LW_SET_INPUT 0x2
#define LW_SET_FUSSY 0x4

////////////////////////////////////////////////////////////////////////////////
// set the desired flags

// NOTE(review): warnings are switched off when debugging is on and set to the
// fussy level when it is off -- confirm that this direction is intended.
#if OMPTARGET_NVPTX_DEBUG
#define OMPTARGET_NVPTX_WARNING (LW_SET_NONE)
#else
#define OMPTARGET_NVPTX_WARNING (LW_SET_FUSSY)
#endif

+//////////////////////////////////////////////////////////////////////////////// +// implemtation for debug +//////////////////////////////////////////////////////////////////////////////// + +#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING +#include <stdio.h> +#include "option.h" + +template <typename... Arguments> +NOINLINE static void log(const char *fmt, Arguments... parameters) { + printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), + (int)(threadIdx.x & 0x1F), parameters...); +} + +#endif +#if OMPTARGET_NVPTX_TEST +#include <assert.h> + +template <typename... Arguments> +NOINLINE static void check(bool cond, const char *fmt, + Arguments... parameters) { + if (!cond) + printf(fmt, (int)blockIdx.x, (int)threadIdx.x, + (int)(threadIdx.x / WARPSIZE), (int)(threadIdx.x & 0x1F), + parameters...); + assert(cond); +} + +NOINLINE static void check(bool cond) { assert(cond); } +#endif + +// set flags that are tested (inclusion properties) + +#define LD_ALL (LD_SET_ALL) + +#define LD_LOOP (LD_SET_LOOP | LD_SET_LOOPD) +#define LD_LOOPD (LD_SET_LOOPD) +#define LD_PAR (LD_SET_PAR | LD_SET_PARD) +#define LD_PARD (LD_SET_PARD) + +// pos 2 +#define LD_SYNC (LD_SET_SYNC | LD_SET_SYNCD) +#define LD_SYNCD (LD_SET_SYNCD) +#define LD_WAIT (LD_SET_WAIT) +#define LD_TASK (LD_SET_TASK) + +// pos 3 +#define LD_IO (LD_SET_IO | LD_SET_IOD) +#define LD_IOD (LD_SET_IOD) +#define LD_ENV (LD_SET_ENV) +#define LD_CANCEL (LD_SET_CANCEL) + +// pos 3 +#define LD_MEM (LD_SET_MEM) + +// implement +#if OMPTARGET_NVPTX_DEBUG + +#define DON(_flag) ((unsigned)(OMPTARGET_NVPTX_DEBUG) & (_flag)) + +#define PRINT0(_flag, _str) \ + { \ + if (omptarget_device_environment.debug_level && DON(_flag)) { \ + log("<b %2d, t %4d, w %2d, l %2d>: " _str); \ + } \ + } + +#define PRINT(_flag, _str, _args...) 
\ + { \ + if (omptarget_device_environment.debug_level && DON(_flag)) { \ + log("<b %2d, t %4d, w %2d, l %2d>: " _str, _args); \ + } \ + } +#else + +#define DON(_flag) (FALSE) +#define PRINT0(flag, str) +#define PRINT(flag, str, _args...) + +#endif + +// for printing without worring about precision, pointers... +#define P64(_x) ((unsigned long long)(_x)) + +//////////////////////////////////////////////////////////////////////////////// +// early defs for test +//////////////////////////////////////////////////////////////////////////////// + +#define LT_SAFETY (LT_SET_SAFETY | LT_SET_INPUT | LT_SET_FUSSY) +#define LT_INPUT (LT_SET_INPUT | LT_SET_FUSSY) +#define LT_FUSSY (LT_SET_FUSSY) + +#if OMPTARGET_NVPTX_TEST == LT_SET_SAFETY + +#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) +#define ASSERT0(_flag, _cond, _str) \ + { \ + if (TON(_flag)) { \ + check(_cond); \ + } \ + } +#define ASSERT(_flag, _cond, _str, _args...) \ + { \ + if (TON(_flag)) { \ + check(_cond); \ + } \ + } + +#elif OMPTARGET_NVPTX_TEST >= LT_SET_INPUT + +#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) +#define ASSERT0(_flag, _cond, _str) \ + { \ + if (TON(_flag)) { \ + check((_cond), "<b %3d, t %4d, w %2d, l %2d> ASSERT: " _str "\n"); \ + } \ + } +#define ASSERT(_flag, _cond, _str, _args...) \ + { \ + if (TON(_flag)) { \ + check((_cond), "<b %3d, t %4d, w %2d, l %d2> ASSERT: " _str "\n", \ + _args); \ + } \ + } + +#else + +#define TON(_flag) (FALSE) +#define ASSERT0(_flag, _cond, _str) +#define ASSERT(_flag, _cond, _str, _args...) 
+ +#endif + +//////////////////////////////////////////////////////////////////////////////// +// early defs for warning + +#define LW_ALL (LW_SET_ALL) +#define LW_ENV (LW_SET_FUSSY | LW_SET_INPUT | LW_SET_ENV) +#define LW_INPUT (LW_SET_FUSSY | LW_SET_INPUT) +#define LW_FUSSY (LW_SET_FUSSY) + +#if OMPTARGET_NVPTX_WARNING + +#define WON(_flag) ((OMPTARGET_NVPTX_WARNING) & (_flag)) +#define WARNING0(_flag, _str) \ + { \ + if (WON(_flag)) { \ + log("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str); \ + } \ + } +#define WARNING(_flag, _str, _args...) \ + { \ + if (WON(_flag)) { \ + log("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str, _args); \ + } \ + } + +#else + +#define WON(_flag) (FALSE) +#define WARNING0(_flag, _str) +#define WARNING(_flag, _str, _args...) + +#endif + +#endif diff --git a/final/libomptarget/deviceRTLs/nvptx/src/interface.h b/final/libomptarget/deviceRTLs/nvptx/src/interface.h new file mode 100644 index 0000000..558860b --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/src/interface.h @@ -0,0 +1,564 @@ +//===------- interface.h - NVPTX OpenMP interface definitions ---- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains debug macros to be used in the application. +// +// This file contains all the definitions that are relevant to +// the interface. The first section contains the interface as +// declared by OpenMP. A second section includes library private calls +// (mostly debug, temporary?) The third section includes the compiler +// specific interfaces. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _INTERFACES_H_ +#define _INTERFACES_H_ + +#include "option.h" + +//////////////////////////////////////////////////////////////////////////////// +// OpenMP interface +//////////////////////////////////////////////////////////////////////////////// + +typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ +typedef uint64_t omp_nest_lock_t; /* arbitrary type of the right length */ + +typedef enum omp_sched_t { + omp_sched_static = 1, /* chunkSize >0 */ + omp_sched_dynamic = 2, /* chunkSize >0 */ + omp_sched_guided = 3, /* chunkSize >0 */ + omp_sched_auto = 4, /* no chunkSize */ +} omp_sched_t; + +typedef enum omp_proc_bind_t { + omp_proc_bind_false = 0, + omp_proc_bind_true = 1, + omp_proc_bind_master = 2, + omp_proc_bind_close = 3, + omp_proc_bind_spread = 4 +} omp_proc_bind_t; + +EXTERN double omp_get_wtick(void); +EXTERN double omp_get_wtime(void); + +EXTERN void omp_set_num_threads(int num); +EXTERN int omp_get_num_threads(void); +EXTERN int omp_get_max_threads(void); +EXTERN int omp_get_thread_limit(void); +EXTERN int omp_get_thread_num(void); +EXTERN int omp_get_num_procs(void); +EXTERN int omp_in_parallel(void); +EXTERN int omp_in_final(void); +EXTERN void omp_set_dynamic(int flag); +EXTERN int omp_get_dynamic(void); +EXTERN void omp_set_nested(int flag); +EXTERN int omp_get_nested(void); +EXTERN void omp_set_max_active_levels(int level); +EXTERN int omp_get_max_active_levels(void); +EXTERN int omp_get_level(void); +EXTERN int omp_get_active_level(void); +EXTERN int omp_get_ancestor_thread_num(int level); +EXTERN int omp_get_team_size(int level); + +EXTERN void omp_init_lock(omp_lock_t *lock); +EXTERN void omp_init_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_destroy_lock(omp_lock_t *lock); +EXTERN void omp_destroy_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_set_lock(omp_lock_t *lock); +EXTERN void omp_set_nest_lock(omp_nest_lock_t 
*lock); +EXTERN void omp_unset_lock(omp_lock_t *lock); +EXTERN void omp_unset_nest_lock(omp_nest_lock_t *lock); +EXTERN int omp_test_lock(omp_lock_t *lock); +EXTERN int omp_test_nest_lock(omp_nest_lock_t *lock); + +EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier); +EXTERN void omp_set_schedule(omp_sched_t kind, int modifier); +EXTERN omp_proc_bind_t omp_get_proc_bind(void); +EXTERN int omp_get_cancellation(void); +EXTERN void omp_set_default_device(int deviceId); +EXTERN int omp_get_default_device(void); +EXTERN int omp_get_num_devices(void); +EXTERN int omp_get_num_teams(void); +EXTERN int omp_get_team_num(void); +EXTERN int omp_is_initial_device(void); +EXTERN int omp_get_initial_device(void); +EXTERN int omp_get_max_task_priority(void); + +//////////////////////////////////////////////////////////////////////////////// +// file below is swiped from kmpc host interface +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// kmp specifc types +//////////////////////////////////////////////////////////////////////////////// + +typedef enum kmp_sched_t { + kmp_sched_static_chunk = 33, + kmp_sched_static_nochunk = 34, + kmp_sched_dynamic = 35, + kmp_sched_guided = 36, + kmp_sched_runtime = 37, + kmp_sched_auto = 38, + + kmp_sched_static_balanced_chunk = 45, + + kmp_sched_static_ordered = 65, + kmp_sched_static_nochunk_ordered = 66, + kmp_sched_dynamic_ordered = 67, + kmp_sched_guided_ordered = 68, + kmp_sched_runtime_ordered = 69, + kmp_sched_auto_ordered = 70, + + kmp_sched_distr_static_chunk = 91, + kmp_sched_distr_static_nochunk = 92, + kmp_sched_distr_static_chunk_sched_static_chunkone = 93, + + kmp_sched_default = kmp_sched_static_nochunk, + kmp_sched_unordered_first = kmp_sched_static_chunk, + kmp_sched_unordered_last = kmp_sched_auto, + kmp_sched_ordered_first = kmp_sched_static_ordered, + kmp_sched_ordered_last = 
kmp_sched_auto_ordered, + kmp_sched_distribute_first = kmp_sched_distr_static_chunk, + kmp_sched_distribute_last = + kmp_sched_distr_static_chunk_sched_static_chunkone, + + /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. + * Since we need to distinguish the three possible cases (no modifier, + * monotonic modifier, nonmonotonic modifier), we need separate bits for + * each modifier. The absence of monotonic does not imply nonmonotonic, + * especially since 4.5 says that the behaviour of the "no modifier" case + * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0. + * + * Since we're passing a full 32 bit value, we can use a couple of high + * bits for these flags; out of paranoia we avoid the sign bit. + * + * These modifiers can be or-ed into non-static schedules by the compiler + * to pass the additional information. They will be stripped early in the + * processing in __kmp_dispatch_init when setting up schedules, so + * most of the code won't ever see schedules with these bits set. + */ + kmp_sched_modifier_monotonic = (1 << 29), + /**< Set if the monotonic schedule modifier was present */ + kmp_sched_modifier_nonmonotonic = (1 << 30), +/**< Set if the nonmonotonic schedule modifier was present */ + +#define SCHEDULE_WITHOUT_MODIFIERS(s) \ + (enum kmp_sched_t)( \ + (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) +#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0) +#define SCHEDULE_HAS_NONMONOTONIC(s) \ + (((s)&kmp_sched_modifier_nonmonotonic) != 0) +#define SCHEDULE_HAS_NO_MODIFIERS(s) \ + (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \ + 0) + +} kmp_sched_t; + +/*! + * Enum for accesseing the reserved_2 field of the ident_t struct below. + */ +enum { + /*! Bit set to 1 when in SPMD mode. */ + KMP_IDENT_SPMD_MODE = 0x01, + /*! Bit set to 1 when a simplified runtime is used. */ + KMP_IDENT_SIMPLE_RT_MODE = 0x02, +}; + +/*! 
+ * The ident structure that describes a source location. + * The struct is identical to the one in the kmp.h file. + * We maintain the same data structure for compatibility. + */ +typedef int kmp_int32; +typedef struct ident { + kmp_int32 reserved_1; /**< might be used in Fortran; see above */ + kmp_int32 flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC + identifies this union member */ + kmp_int32 reserved_2; /**< not really used in Fortran any more; see above */ + kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++ */ + char const *psource; /**< String describing the source location. + The string is composed of semi-colon separated fields + which describe the source file, the function and a pair + of line numbers that delimit the construct. */ +} ident_t; + +// parallel defs +typedef ident_t kmp_Ident; +typedef void (*kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...); +typedef void (*kmp_ReductFctPtr)(void *lhsData, void *rhsData); +typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num); +typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, + int16_t lane_offset, + int16_t shortCircuit); +typedef void (*kmp_CopyToScratchpadFctPtr)(void *reduceData, void *scratchpad, + int32_t index, int32_t width); +typedef void (*kmp_LoadReduceFctPtr)(void *reduceData, void *scratchpad, + int32_t index, int32_t width, + int32_t reduce); + +// task defs +typedef struct kmp_TaskDescr kmp_TaskDescr; +typedef int32_t (*kmp_TaskFctPtr)(int32_t global_tid, kmp_TaskDescr *taskDescr); +typedef struct kmp_TaskDescr { + void *sharedPointerTable; // ptr to a table of shared var ptrs + kmp_TaskFctPtr sub; // task subroutine + int32_t partId; // unused + kmp_TaskFctPtr destructors; // destructor of c++ first private +} kmp_TaskDescr; +// task dep defs +#define KMP_TASKDEP_IN 0x1u +#define KMP_TASKDEP_OUT 0x2u +typedef struct kmp_TaskDep_Public { + void *addr; + size_t len; + uint8_t flags; // bit 0: in, bit 1: out +} 
kmp_TaskDep_Public; + +// flags that interpret the interface part of tasking flags +#define KMP_TASK_IS_TIED 0x1 +#define KMP_TASK_FINAL 0x2 +#define KMP_TASK_MERGED_IF0 0x4 /* unused */ +#define KMP_TASK_DESTRUCTOR_THUNK 0x8 + +// flags for task setup return +#define KMP_CURRENT_TASK_NOT_SUSPENDED 0 +#define KMP_CURRENT_TASK_SUSPENDED 1 + +// sync defs +typedef int32_t kmp_CriticalName[8]; + +//////////////////////////////////////////////////////////////////////////////// +// flags for kstate (all bits initially off) +//////////////////////////////////////////////////////////////////////////////// + +// first 2 bits used by kmp_Reduction (defined in kmp_reduction.cpp) +#define KMP_REDUCTION_MASK 0x3 +#define KMP_SKIP_NEXT_CALL 0x4 +#define KMP_SKIP_NEXT_CANCEL_BARRIER 0x8 + +//////////////////////////////////////////////////////////////////////////////// +// data +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// external interface +//////////////////////////////////////////////////////////////////////////////// + +// query +EXTERN int32_t __kmpc_global_num_threads(kmp_Ident *loc); // missing +EXTERN int32_t __kmpc_bound_thread_num(kmp_Ident *loc); // missing +EXTERN int32_t __kmpc_bound_num_threads(kmp_Ident *loc); // missing +EXTERN int32_t __kmpc_in_parallel(kmp_Ident *loc); // missing + +// parallel +EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc); +EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t global_tid, + int32_t num_threads); +// simd +EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t global_tid, + int32_t simd_limit); +// aee ... 
not supported +// EXTERN void __kmpc_fork_call(kmp_Ident *loc, int32_t argc, kmp_ParFctPtr +// microtask, ...); +EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid); +EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, + uint32_t global_tid); +EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid); + +// proc bind +EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t global_tid, + int proc_bind); +EXTERN int omp_get_num_places(void); +EXTERN int omp_get_place_num_procs(int place_num); +EXTERN void omp_get_place_proc_ids(int place_num, int *ids); +EXTERN int omp_get_place_num(void); +EXTERN int omp_get_partition_num_places(void); +EXTERN void omp_get_partition_place_nums(int *place_nums); + +// for static (no chunk or chunk) +EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter1, + uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, + int32_t *pstride, int32_t 
incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter1, + uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_4_simple_generic(kmp_Ident *loc, + int32_t global_tid, int32_t sched, + int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_4u_simple_generic( + kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_8_simple_generic(kmp_Ident *loc, + int32_t global_tid, int32_t sched, + int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_8u_simple_generic( + kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1, + uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, + int64_t chunk); + +EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid); + +// for dynamic +EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t lower, int32_t upper, + int32_t incr, int32_t chunk); +EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t global_tid, + int32_t sched, uint32_t lower, + uint32_t upper, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int64_t lower, int64_t upper, + int64_t incr, int64_t chunk); +EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t global_tid, + int32_t sched, uint64_t lower, + uint64_t upper, 
int64_t incr,
    int64_t chunk);

// Dynamic dispatch: fetch the next chunk. One variant per loop-variable type
// (4/8 = 32/64 bit, 'u' = unsigned).
EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t global_tid,
                                  int32_t *plastiter, int32_t *plower,
                                  int32_t *pupper, int32_t *pstride);
EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t global_tid,
                                   int32_t *plastiter, uint32_t *plower,
                                   uint32_t *pupper, int32_t *pstride);
EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t global_tid,
                                  int32_t *plastiter, int64_t *plower,
                                  int64_t *pupper, int64_t *pstride);
EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t global_tid,
                                   int32_t *plastiter, uint64_t *plower,
                                   uint64_t *pupper, int64_t *pstride);

// Dynamic dispatch: finalization, same type variants as above.
EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid);

// Support for reducing conditional lastprivate variables
EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc,
                                                  int32_t global_tid,
                                                  int32_t varNum, void *array);

// reduction
EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid);
EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
// Deprecated entry point; the _v2 variant below additionally takes `loc`.
EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
    void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_simd_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
                                                       int32_t global_tid,
                                                       kmp_CriticalName *crit);
EXTERN void __kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc,
                                                        int32_t global_tid,
                                                        kmp_CriticalName *crit);
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);

// sync barrier
EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid);
EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid);
EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid);
EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid);

// single
EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid);

// sync
EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_critical(kmp_Ident *loc, int32_t global_tid,
                            kmp_CriticalName *crit);
EXTERN void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid,
                                kmp_CriticalName *crit);
EXTERN void __kmpc_flush(kmp_Ident *loc);

// vote
EXTERN int32_t __kmpc_warp_active_thread_mask();

// tasks
// NOTE(review): the task entry points take global_tid as uint32_t whereas the
// rest of this interface uses int32_t -- confirm this is intentional.
EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Ident *loc,
                                            uint32_t global_tid, int32_t flag,
                                            size_t sizeOfTaskInclPrivate,
                                            size_t sizeOfSharedTable,
                                            kmp_TaskFctPtr sub);
EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid,
                               kmp_TaskDescr *newLegacyTaskDescr);
EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
                                         kmp_TaskDescr *newLegacyTaskDescr,
                                         int32_t depNum, void *depList,
                                         int32_t noAliasDepNum,
                                         void *noAliasDepList);
EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
                                      kmp_TaskDescr *newLegacyTaskDescr);
EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
                                         kmp_TaskDescr *newLegacyTaskDescr);
EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid,
                                 int32_t depNum, void *depList,
                                 int32_t noAliasDepNum, void *noAliasDepList);
EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid);
EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid);
EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid,
                                    int end_part);
EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid);
EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid,
                            kmp_TaskDescr *newKmpTaskDescr, int if_val,
                            uint64_t *lb, uint64_t *ub, int64_t st, int nogroup,
                            int32_t sched, uint64_t grainsize, void *task_dup);

// cancel
EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
                                        int32_t cancelVal);
EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
                             int32_t cancelVal);

// non standard
EXTERN void __kmpc_kernel_init_params(void *ReductionScratchpadPtr);
EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime);
EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
                                    int16_t RequiresDataSharing);
// Deprecated entry point; the _v2 variant below takes RequiresOMPRuntime.
EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit();
EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
                                           int16_t IsOMPRuntimeInitialized);
EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
                                   int16_t IsOMPRuntimeInitialized);
EXTERN void __kmpc_kernel_end_parallel();
EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
                                              bool *IsFinal,
                                              int32_t *LaneSource);
EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer);
EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
                                          bool *IsFinal, int32_t *LaneSource,
                                          int32_t *LaneId, int32_t *NumLanes);
EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer);


// data sharing
EXTERN void __kmpc_data_sharing_init_stack();
EXTERN void __kmpc_data_sharing_init_stack_spmd();
EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
                                                      int16_t UseSharedMemory);
EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
EXTERN void __kmpc_data_sharing_pop_stack(void *a);
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
EXTERN void __kmpc_end_sharing_variables();
EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs);

// The slot used for data sharing by the master and worker threads. We use a
// complete (default size) version and an incomplete one, so that we allow
// sizes greater than the default.
struct __kmpc_data_sharing_slot {
  // Links to the neighbouring slots (presumably a doubly linked list of
  // slots -- confirm against the data sharing implementation).
  __kmpc_data_sharing_slot *Next;
  __kmpc_data_sharing_slot *Prev;
  void *PrevSlotStackPtr;
  void *DataEnd;
  // Flexible array member: the payload follows the header, so the slot size
  // is decided by whoever allocates it.
  char Data[];
};
EXTERN void
__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *RootS,
                                           size_t InitialDataSize);
EXTERN void *__kmpc_data_sharing_environment_begin(
    __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
    void **SavedSharedFrame, int32_t *SavedActiveThreads,
    size_t SharingDataSize, size_t SharingDefaultDataSize,
    int16_t IsOMPRuntimeInitialized);
EXTERN void __kmpc_data_sharing_environment_end(
    __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
    void **SavedSharedFrame, int32_t *SavedActiveThreads, int32_t IsEntryPoint);

EXTERN void *
__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
                                          int16_t IsOMPRuntimeInitialized);

// SPMD execution mode interrogation function.
EXTERN int8_t __kmpc_is_spmd_exec_mode();

EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
                                          const void *buf, size_t size,
                                          int16_t is_shared, const void **res);

EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
                                              int16_t is_shared);

#endif
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/final/libomptarget/deviceRTLs/nvptx/src/libcall.cu
new file mode 100644
index 0000000..63bf6b4
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/libcall.cu
@@ -0,0 +1,515 @@
//===------------ libcall.cu - NVPTX OpenMP user calls ----------- CUDA -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
+// +//===----------------------------------------------------------------------===// +// +// This file implements the OpenMP runtime functions that can be +// invoked by the user in an OpenMP region +// +//===----------------------------------------------------------------------===// + +#include "omptarget-nvptx.h" + +// Timer precision is 1ns +#define TIMER_PRECISION ((double)1E-9) + +EXTERN double omp_get_wtick(void) { + PRINT(LD_IO, "omp_get_wtick() returns %g\n", TIMER_PRECISION); + return TIMER_PRECISION; +} + +EXTERN double omp_get_wtime(void) { + unsigned long long nsecs; + asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs)); + double rc = (double)nsecs * TIMER_PRECISION; + PRINT(LD_IO, "call omp_get_wtime() returns %g\n", rc); + return rc; +} + +EXTERN void omp_set_num_threads(int num) { + // Ignore it for SPMD mode. + if (isSPMDMode()) + return; + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); + PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num); + if (num <= 0) { + WARNING0(LW_INPUT, "expected positive num; ignore\n"); + } else { + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false); + currTaskDescr->NThreads() = num; + } +} + +EXTERN int omp_get_num_threads(void) { + bool isSPMDExecutionMode = isSPMDMode(); + int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); + int rc = + GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized()); + PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc); + return rc; +} + +EXTERN int omp_get_max_threads(void) { + if (isRuntimeUninitialized()) { + ASSERT0(LT_FUSSY, isSPMDMode(), + "Expected SPMD mode only with uninitialized runtime."); + // We're already in parallel region. 
+ return 1; // default is 1 thread avail + } + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); + int rc = 1; // default is 1 thread avail + if (!currTaskDescr->InParallelRegion()) { + // Not currently in a parallel region, return what was set. + rc = currTaskDescr->NThreads(); + ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads"); + } + PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc); + return rc; +} + +EXTERN int omp_get_thread_limit(void) { + if (isRuntimeUninitialized()) { + ASSERT0(LT_FUSSY, isSPMDMode(), + "Expected SPMD mode only with uninitialized runtime."); + return 0; // default is 0 + } + // per contention group.. meaning threads in current team + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); + int rc = currTaskDescr->ThreadLimit(); + PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc); + return rc; +} + +EXTERN int omp_get_thread_num() { + bool isSPMDExecutionMode = isSPMDMode(); + int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); + int rc = GetOmpThreadId(tid, isSPMDExecutionMode, isRuntimeUninitialized()); + PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc); + return rc; +} + +EXTERN int omp_get_num_procs(void) { + int rc = GetNumberOfProcsInDevice(isSPMDMode()); + PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc); + return rc; +} + +EXTERN int omp_in_parallel(void) { + int rc = 0; + if (isRuntimeUninitialized()) { + ASSERT0(LT_FUSSY, isSPMDMode(), + "Expected SPMD mode only with uninitialized runtime."); + rc = 1; // SPMD mode is always in parallel. + } else { + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); + if (currTaskDescr->InParallelRegion()) { + rc = 1; + } + } + PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc); + return rc; +} + +EXTERN int omp_in_final(void) { + // treat all tasks as final... 
Specs may expect runtime to keep + // track more precisely if a task was actively set by users... This + // is not explicitely specified; will treat as if runtime can + // actively decide to put a non-final task into a final one. + int rc = 1; + PRINT(LD_IO, "call omp_in_final() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_dynamic(int flag) { + PRINT(LD_IO, "call omp_set_dynamic(%d) is ignored (no support)\n", flag); +} + +EXTERN int omp_get_dynamic(void) { + int rc = 0; + PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_nested(int flag) { + PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n", + flag); +} + +EXTERN int omp_get_nested(void) { + int rc = 0; + PRINT(LD_IO, "call omp_get_nested() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_max_active_levels(int level) { + PRINT(LD_IO, + "call omp_set_max_active_levels(%d) is ignored (no nested support)\n", + level); +} + +EXTERN int omp_get_max_active_levels(void) { + int rc = 1; + PRINT(LD_IO, "call omp_get_max_active_levels() returns %d\n", rc); + return rc; +} + +EXTERN int omp_get_level(void) { + if (isRuntimeUninitialized()) { + ASSERT0(LT_FUSSY, isSPMDMode(), + "Expected SPMD mode only with uninitialized runtime."); + return parallelLevel; + } + int level = 0; + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); + ASSERT0(LT_FUSSY, currTaskDescr, + "do not expect fct to be called in a non-active thread"); + do { + if (currTaskDescr->IsParallelConstruct()) { + level++; + } + currTaskDescr = currTaskDescr->GetPrevTaskDescr(); + } while (currTaskDescr); + PRINT(LD_IO, "call omp_get_level() returns %d\n", level); + return level; +} + +EXTERN int omp_get_active_level(void) { + if (isRuntimeUninitialized()) { + ASSERT0(LT_FUSSY, isSPMDMode(), + "Expected SPMD mode only with uninitialized runtime."); + return 1; + } + int level = 0; // no active level parallelism + omptarget_nvptx_TaskDescr 
*currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); + ASSERT0(LT_FUSSY, currTaskDescr, + "do not expect fct to be called in a non-active thread"); + do { + if (currTaskDescr->ThreadsInTeam() > 1) { + // has a parallel with more than one thread in team + level = 1; + break; + } + currTaskDescr = currTaskDescr->GetPrevTaskDescr(); + } while (currTaskDescr); + PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level) + return level; +} + +EXTERN int omp_get_ancestor_thread_num(int level) { + if (isRuntimeUninitialized()) { + ASSERT0(LT_FUSSY, isSPMDMode(), + "Expected SPMD mode only with uninitialized runtime."); + return level == 1 ? GetThreadIdInBlock() : 0; + } + int rc = -1; + if (level == 0) { + rc = 0; + } else if (level > 0) { + int totLevel = omp_get_level(); + if (level <= totLevel) { + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); + int steps = totLevel - level; + PRINT(LD_IO, "backtrack %d steps\n", steps); + ASSERT0(LT_FUSSY, currTaskDescr, + "do not expect fct to be called in a non-active thread"); + do { + if (DON(LD_IOD)) { + // print current state + omp_sched_t sched = currTaskDescr->GetRuntimeSched(); + PRINT(LD_ALL, + "task descr %s %d: %s, in par %d, rt sched %d," + " chunk %" PRIu64 "; tid %d, tnum %d, nthreads %d\n", + "ancestor", steps, + (currTaskDescr->IsParallelConstruct() ? 
"par" : "task"), + (int)currTaskDescr->InParallelRegion(), (int)sched, + currTaskDescr->RuntimeChunkSize(), + (int)currTaskDescr->ThreadId(), + (int)currTaskDescr->ThreadsInTeam(), + (int)currTaskDescr->NThreads()); + } + + if (currTaskDescr->IsParallelConstruct()) { + // found the level + if (!steps) { + rc = currTaskDescr->ThreadId(); + break; + } + steps--; + } + currTaskDescr = currTaskDescr->GetPrevTaskDescr(); + } while (currTaskDescr); + ASSERT0(LT_FUSSY, !steps, "expected to find all steps"); + } + } + PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level, + rc) + return rc; +} + +EXTERN int omp_get_team_size(int level) { + if (isRuntimeUninitialized()) { + ASSERT0(LT_FUSSY, isSPMDMode(), + "Expected SPMD mode only with uninitialized runtime."); + return level == 1 ? GetNumberOfThreadsInBlock() : 1; + } + int rc = -1; + if (level == 0) { + rc = 1; + } else if (level > 0) { + int totLevel = omp_get_level(); + if (level <= totLevel) { + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); + int steps = totLevel - level; + ASSERT0(LT_FUSSY, currTaskDescr, + "do not expect fct to be called in a non-active thread"); + do { + if (currTaskDescr->IsParallelConstruct()) { + if (!steps) { + // found the level + rc = currTaskDescr->ThreadsInTeam(); + break; + } + steps--; + } + currTaskDescr = currTaskDescr->GetPrevTaskDescr(); + } while (currTaskDescr); + ASSERT0(LT_FUSSY, !steps, "expected to find all steps"); + } + } + PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc) + return rc; +} + +EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) { + if (isRuntimeUninitialized()) { + ASSERT0(LT_FUSSY, isSPMDMode(), + "Expected SPMD mode only with uninitialized runtime."); + *kind = omp_sched_static; + *modifier = 1; + } else { + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); + *kind = currTaskDescr->GetRuntimeSched(); + *modifier = 
currTaskDescr->RuntimeChunkSize(); + } + PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n", + (int)*kind, *modifier); +} + +EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) { + PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind, + modifier); + if (isRuntimeUninitialized()) { + ASSERT0(LT_FUSSY, isSPMDMode(), + "Expected SPMD mode only with uninitialized runtime."); + return; + } + if (kind >= omp_sched_static && kind < omp_sched_auto) { + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); + currTaskDescr->SetRuntimeSched(kind); + currTaskDescr->RuntimeChunkSize() = modifier; + PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %" PRIu64 "\n", + (int)currTaskDescr->GetRuntimeSched(), + currTaskDescr->RuntimeChunkSize()); + } +} + +EXTERN omp_proc_bind_t omp_get_proc_bind(void) { + PRINT0(LD_IO, "call omp_get_proc_bin() is true, regardless on state\n"); + return omp_proc_bind_true; +} + +EXTERN int omp_get_num_places(void) { + PRINT0(LD_IO, "call omp_get_num_places() returns 0\n"); + return 0; +} + +EXTERN int omp_get_place_num_procs(int place_num) { + PRINT0(LD_IO, "call omp_get_place_num_procs() returns 0\n"); + return 0; +} + +EXTERN void omp_get_place_proc_ids(int place_num, int *ids) { + PRINT0(LD_IO, "call to omp_get_place_proc_ids()\n"); +} + +EXTERN int omp_get_place_num(void) { + PRINT0(LD_IO, "call to omp_get_place_num() returns 0\n"); + return 0; +} + +EXTERN int omp_get_partition_num_places(void) { + PRINT0(LD_IO, "call to omp_get_partition_num_places() returns 0\n"); + return 0; +} + +EXTERN void omp_get_partition_place_nums(int *place_nums) { + PRINT0(LD_IO, "call to omp_get_partition_place_nums()\n"); +} + +EXTERN int omp_get_cancellation(void) { + int rc = FALSE; // currently false only + PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_default_device(int deviceId) { + PRINT0(LD_IO, "call omp_get_default_device() 
is undef on device\n"); +} + +EXTERN int omp_get_default_device(void) { + PRINT0(LD_IO, + "call omp_get_default_device() is undef on device, returns 0\n"); + return 0; +} + +EXTERN int omp_get_num_devices(void) { + PRINT0(LD_IO, "call omp_get_num_devices() is undef on device, returns 0\n"); + return 0; +} + +EXTERN int omp_get_num_teams(void) { + int rc = GetNumberOfOmpTeams(); + PRINT(LD_IO, "call omp_get_num_teams() returns %d\n", rc); + return rc; +} + +EXTERN int omp_get_team_num() { + int rc = GetOmpTeamId(); + PRINT(LD_IO, "call omp_get_team_num() returns %d\n", rc); + return rc; +} + +EXTERN int omp_is_initial_device(void) { + PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); + return 0; // 0 by def on device +} + +// Unspecified on the device. +EXTERN int omp_get_initial_device(void) { + PRINT0(LD_IO, "call omp_get_initial_device() returns 0\n"); + return 0; +} + +// Unused for now. +EXTERN int omp_get_max_task_priority(void) { + PRINT0(LD_IO, "call omp_get_max_task_priority() returns 0\n"); + return 0; +} + +//////////////////////////////////////////////////////////////////////////////// +// locks +//////////////////////////////////////////////////////////////////////////////// + +#define __OMP_SPIN 1000 +#define UNSET 0 +#define SET 1 + +EXTERN void omp_init_lock(omp_lock_t *lock) { + omp_unset_lock(lock); + PRINT0(LD_IO, "call omp_init_lock()\n"); +} + +EXTERN void omp_destroy_lock(omp_lock_t *lock) { + omp_unset_lock(lock); + PRINT0(LD_IO, "call omp_destroy_lock()\n"); +} + +EXTERN void omp_set_lock(omp_lock_t *lock) { + // int atomicCAS(int* address, int compare, int val); + // (old == compare ? val : old) + + // TODO: not sure spinning is a good idea here.. + while (atomicCAS(lock, UNSET, SET) != UNSET) { + clock_t start = clock(); + clock_t now; + for (;;) { + now = clock(); + clock_t cycles = now > start ? 
now - start : now + (0xffffffff - start); + if (cycles >= __OMP_SPIN * blockIdx.x) { + break; + } + } + } // wait for 0 to be the read value + + PRINT0(LD_IO, "call omp_set_lock()\n"); +} + +EXTERN void omp_unset_lock(omp_lock_t *lock) { + (void)atomicExch(lock, UNSET); + + PRINT0(LD_IO, "call omp_unset_lock()\n"); +} + +EXTERN int omp_test_lock(omp_lock_t *lock) { + // int atomicCAS(int* address, int compare, int val); + // (old == compare ? val : old) + int ret = atomicAdd(lock, 0); + + PRINT(LD_IO, "call omp_test_lock() return %d\n", ret); + + return ret; +} + +// for xlf Fotran +// Fotran, the return is LOGICAL type + +#define FLOGICAL long +EXTERN FLOGICAL __xlf_omp_is_initial_device_i8() { + int ret = omp_is_initial_device(); + if (ret == 0) + return (FLOGICAL)0; + else + return (FLOGICAL)1; +} + +EXTERN int __xlf_omp_is_initial_device_i4() { + int ret = omp_is_initial_device(); + if (ret == 0) + return 0; + else + return 1; +} + +EXTERN long __xlf_omp_get_team_num_i4() { + int ret = omp_get_team_num(); + return (long)ret; +} + +EXTERN long __xlf_omp_get_num_teams_i4() { + int ret = omp_get_num_teams(); + return (long)ret; +} + +EXTERN void xlf_debug_print_int(int *p) { + printf("xlf DEBUG %d): %p %d\n", omp_get_team_num(), p, p == 0 ? 0 : *p); +} + +EXTERN void xlf_debug_print_long(long *p) { + printf("xlf DEBUG %d): %p %ld\n", omp_get_team_num(), p, p == 0 ? 0 : *p); +} + +EXTERN void xlf_debug_print_float(float *p) { + printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p); +} + +EXTERN void xlf_debug_print_double(double *p) { + printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 
0 : *p); +} + +EXTERN void xlf_debug_print_addr(void *p) { + printf("xlf DEBUG %d): %p \n", omp_get_team_num(), p); +} diff --git a/final/libomptarget/deviceRTLs/nvptx/src/loop.cu b/final/libomptarget/deviceRTLs/nvptx/src/loop.cu new file mode 100644 index 0000000..998ce54 --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -0,0 +1,806 @@ +//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the KMPC interface +// for the loop construct plus other worksharing constructs that use the same +// interface as loops. +// +//===----------------------------------------------------------------------===// + +#include "omptarget-nvptx.h" + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// +// template class that encapsulate all the helper functions +// +// T is loop iteration type (32 | 64) (unsigned | signed) +// ST is the signed version of T +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +template <typename T, typename ST> class omptarget_nvptx_LoopSupport { +public: + //////////////////////////////////////////////////////////////////////////////// + // Loop with static scheduling with chunk + + // Generic implementation of OMP loop scheduling with static policy + /*! 
\brief Calculate initial bounds for static loop and stride + * @param[in] loc location in code of the call (not used here) + * @param[in] global_tid global thread id + * @param[in] schetype type of scheduling (see omptarget-nvptx.h) + * @param[in] plastiter pointer to last iteration + * @param[in,out] pointer to loop lower bound. it will contain value of + * lower bound of first chunk + * @param[in,out] pointer to loop upper bound. It will contain value of + * upper bound of first chunk + * @param[in,out] pointer to loop stride. It will contain value of stride + * between two successive chunks executed by the same thread + * @param[in] loop increment bump + * @param[in] chunk size + */ + + // helper function for static chunk + INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, + ST chunk, T entityId, T numberOfEntities) { + // each thread executes multiple chunks all of the same size, except + // the last one + + // distance between two successive chunks + stride = numberOfEntities * chunk; + lb = lb + entityId * chunk; + T inputUb = ub; + ub = lb + chunk - 1; // Clang uses i <= ub + // Say ub' is the begining of the last chunk. Then who ever has a + // lower bound plus a multiple of the increment equal to ub' is + // the last one. + T beginingLastChunk = inputUb - (inputUb % chunk); + last = ((beginingLastChunk - lb) % stride) == 0; + } + + //////////////////////////////////////////////////////////////////////////////// + // Loop with static scheduling without chunk + + // helper function for static no chunk + INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, + ST &chunk, T entityId, + T numberOfEntities) { + // No chunk size specified. 
Each thread or warp gets at most one + // chunk; chunks are all almost of equal size + T loopSize = ub - lb + 1; + + chunk = loopSize / numberOfEntities; + T leftOver = loopSize - chunk * numberOfEntities; + + if (entityId < leftOver) { + chunk++; + lb = lb + entityId * chunk; + } else { + lb = lb + entityId * chunk + leftOver; + } + + T inputUb = ub; + ub = lb + chunk - 1; // Clang uses i <= ub + last = lb <= inputUb && inputUb <= ub; + stride = loopSize; // make sure we only do 1 chunk per warp + } + + //////////////////////////////////////////////////////////////////////////////// + // Support for Static Init + + INLINE static void for_static_init(int32_t gtid, int32_t schedtype, + int32_t *plastiter, T *plower, T *pupper, + ST *pstride, ST chunk, + bool IsSPMDExecutionMode, + bool IsRuntimeUninitialized) { + // When IsRuntimeUninitialized is true, we assume that the caller is + // in an L0 parallel region and that all worker threads participate. + + int tid = GetLogicalThreadIdInBlock(IsSPMDExecutionMode); + + // Assume we are in teams region or that we use a single block + // per target region + ST numberOfActiveOMPThreads = GetNumberOfOmpThreads( + tid, IsSPMDExecutionMode, IsRuntimeUninitialized); + + // All warps that are in excess of the maximum requested, do + // not execute the loop + PRINT(LD_LOOP, + "OMP Thread %d: schedule type %d, chunk size = %lld, mytid " + "%d, num tids %d\n", + (int)gtid, (int)schedtype, (long long)chunk, (int)gtid, + (int)numberOfActiveOMPThreads); + ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads, + "current thread is not needed here; error"); + + // copy + int lastiter = 0; + T lb = *plower; + T ub = *pupper; + ST stride = *pstride; + // init + switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) { + case kmp_sched_static_chunk: { + if (chunk > 0) { + ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + break; + } + } // note: if chunk <=0, use nochunk + case kmp_sched_static_balanced_chunk: { + if 
(chunk > 0) { + // round up to make sure the chunk is enough to cover all iterations + T tripCount = ub - lb + 1; // +1 because ub is inclusive + T span = (tripCount + numberOfActiveOMPThreads - 1) / + numberOfActiveOMPThreads; + // perform chunk adjustment + chunk = (span + chunk - 1) & ~(chunk - 1); + + ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); + T oldUb = ub; + ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + if (ub > oldUb) + ub = oldUb; + break; + } + } // note: if chunk <=0, use nochunk + case kmp_sched_static_nochunk: { + ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + break; + } + case kmp_sched_distr_static_chunk: { + if (chunk > 0) { + ForStaticChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(), + GetNumberOfOmpTeams()); + break; + } // note: if chunk <=0, use nochunk + } + case kmp_sched_distr_static_nochunk: { + ForStaticNoChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(), + GetNumberOfOmpTeams()); + break; + } + case kmp_sched_distr_static_chunk_sched_static_chunkone: { + ForStaticChunk(lastiter, lb, ub, stride, chunk, + numberOfActiveOMPThreads * GetOmpTeamId() + gtid, + GetNumberOfOmpTeams() * numberOfActiveOMPThreads); + break; + } + default: { + ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", (int)schedtype); + PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", + (int)schedtype); + ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + break; + } + } + // copy back + *plastiter = lastiter; + *plower = lb; + *pupper = ub; + *pstride = stride; + PRINT(LD_LOOP, + "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last " + "%d\n", + (int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(), + (long long)(*plower), (long long)(*pupper), (long long)(*pstride), + (int)lastiter); + } + + //////////////////////////////////////////////////////////////////////////////// + // Support for dispatch Init + + 
// Returns nonzero when `schedule` lies in the range of ordered schedule kinds.
  INLINE static int OrderedSchedule(kmp_sched_t schedule) {
    return schedule >= kmp_sched_ordered_first &&
           schedule <= kmp_sched_ordered_last;
  }

  // Sets up the per-thread dispatch state for a dynamically dispatched loop
  // over the inclusive range [lb, ub].
  //
  // The requested schedule is first reduced to static chunk, static balanced
  // chunk, static nochunk, or dynamic/guided. The schedule type, chunk size,
  // loop upper bound, next lower bound and (for the static kinds) stride are
  // then stored in omptarget_nvptx_threadPrivateContext at index `tid`.
  //
  // NOTE(review): `st` is not referenced anywhere in this function.
  INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId,
                                   kmp_sched_t schedule, T lb, T ub, ST st,
                                   ST chunk) {
    ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
            "Expected non-SPMD mode + initialized runtime.");
    int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
    omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
    T tnum = currTaskDescr->ThreadsInTeam();
    T tripCount = ub - lb + 1; // +1 because ub is inclusive
    ASSERT0(LT_FUSSY, threadId < tnum,
            "current thread is not needed here; error");

    /* Currently just ignore the monotonic and non-monotonic modifiers
     * (the compiler isn't producing them yet anyway).
     * When it is we'll want to look at them somewhere here and use that
     * information to add to our schedule choice. We shouldn't need to pass
     * them on, they merely affect which schedule we can legally choose for
     * various dynamic cases. (In particular, whether or not a stealing scheme
     * is legal).
     */
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

    // Process schedule.
    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
      // Single thread, at most one iteration, or an ordered schedule: run the
      // loop as one static chunk spanning the whole trip count.
      if (OrderedSchedule(schedule))
        __kmpc_barrier(loc, threadId);
      PRINT(LD_LOOP,
            "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
            (long)tnum, (long long)tripCount, (int)schedule);
      schedule = kmp_sched_static_chunk;
      chunk = tripCount; // one thread gets the whole loop
    } else if (schedule == kmp_sched_runtime) {
      // process runtime
      // schedule(runtime): take kind and chunk from the task descriptor.
      omp_sched_t rtSched = currTaskDescr->GetRuntimeSched();
      chunk = currTaskDescr->RuntimeChunkSize();
      switch (rtSched) {
      case omp_sched_static: {
        if (chunk > 0)
          schedule = kmp_sched_static_chunk;
        else
          schedule = kmp_sched_static_nochunk;
        break;
      }
      case omp_sched_auto: {
        schedule = kmp_sched_static_chunk;
        chunk = 1;
        break;
      }
      // Guided is mapped onto dynamic here.
      case omp_sched_dynamic:
      case omp_sched_guided: {
        schedule = kmp_sched_dynamic;
        break;
      }
      }
      PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule,
            (long long)chunk);
    } else if (schedule == kmp_sched_auto) {
      // schedule(auto) is implemented as static with a chunk of one.
      schedule = kmp_sched_static_chunk;
      chunk = 1;
      PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule,
            (long long)chunk);
    } else {
      PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule,
            (long long)chunk);
      ASSERT(LT_FUSSY,
             schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
             "unknown schedule %d & chunk %lld\n", (int)schedule,
             (long long)chunk);
    }

    // init schedules
    if (schedule == kmp_sched_static_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
      // save ub
      // (stored before ForStaticChunk overwrites the local `ub`)
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
      omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
      PRINT(LD_LOOP,
            "dispatch init (static chunk) : num threads = %d, ub = %" PRId64
            ", next lower bound = %llu, stride = %llu\n",
            (int)tnum,
            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
            (unsigned long long)
                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
            (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
                tid));
    } else if (schedule == kmp_sched_static_balanced_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
      // save ub
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      // round up to make sure the chunk is enough to cover all iterations
      T span = (tripCount + tnum - 1) / tnum;
      // perform chunk adjustment
      chunk = (span + chunk - 1) & ~(chunk - 1);

      T oldUb = ub;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // NOTE(review): this check runs after ForStaticChunk has already
      // replaced lb/ub with the bounds of the first chunk.
      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
      // NOTE(review): the clamped `ub` is not stored anywhere below; the
      // saved LoopUpperBound is the value captured above.
      if (ub > oldUb)
        ub = oldUb;
      // save computed params
      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
      omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
      PRINT(LD_LOOP,
            "dispatch init (static chunk) : num threads = %d, ub = %" PRId64
            ", next lower bound = %llu, stride = %llu\n",
            (int)tnum,
            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
            (unsigned long long)
                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
            (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
                tid));
    } else if (schedule == kmp_sched_static_nochunk) {
      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
      // save sched state
      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
      // save ub
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      // ForStaticNoChunk also writes the chunk size it computed into `chunk`.
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
      omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
      PRINT(LD_LOOP,
            "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
            ", next lower bound = %llu, stride = %llu\n",
            (int)tnum,
            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
            (unsigned long long)
                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
            (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
                tid));
    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
      // save data
      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
      // A chunk size below one is raised to one.
      if (chunk < 1)
        chunk = 1;
      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
      // Thread 0 resets the counter between two barriers, so every thread
      // has passed the first barrier before the reset and sees the reset
      // value after the second one.
      __kmpc_barrier(loc, threadId);
      if (tid == 0) {
        omptarget_nvptx_threadPrivateContext->Cnt() = 0;
        __threadfence_block();
      }
      __kmpc_barrier(loc, threadId);
      PRINT(LD_LOOP,
            "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
            ", chunk %" PRIu64 "\n",
            (int)tnum,
            (unsigned long long)
                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
            omptarget_nvptx_threadPrivateContext->Chunk(tid));
    }
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch next

  INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
    int lo, hi;
    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
    hi = __SHFL_SYNC(active, hi, leader);
    lo = __SHFL_SYNC(active, lo, leader);
    asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo),
"r"(hi)); + return val; + } + + INLINE static uint64_t NextIter() { + unsigned int active = __ACTIVEMASK(); + int leader = __ffs(active) - 1; + int change = __popc(active); + unsigned lane_mask_lt; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt)); + unsigned int rank = __popc(active & lane_mask_lt); + uint64_t warp_res; + if (rank == 0) { + warp_res = atomicAdd( + (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(), + change); + } + warp_res = Shuffle(active, warp_res, leader); + return warp_res + rank; + } + + INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize, + T loopLowerBound, T loopUpperBound) { + T N = NextIter(); + lb = loopLowerBound + N * chunkSize; + ub = lb + chunkSize - 1; // Clang uses i <= ub + + // 3 result cases: + // a. lb and ub < loopUpperBound --> NOT_FINISHED + // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> + // NOT_FINISHED + // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED + // a. + if (lb <= loopUpperBound && ub < loopUpperBound) { + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", + (long long)lb, (long long)ub, (long long)loopUpperBound); + return NOT_FINISHED; + } + // b. + if (lb <= loopUpperBound) { + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n", + (long long)lb, (long long)ub, (long long)loopUpperBound); + ub = loopUpperBound; + return LAST_CHUNK; + } + // c. 
if we are here, we are in case 'c' + lb = loopUpperBound + 2; + ub = loopUpperBound + 1; + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb, + (long long)ub, (long long)loopUpperBound); + return FINISHED; + } + + INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast, + T *plower, T *pupper, ST *pstride) { + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), + "Expected non-SPMD mode + initialized runtime."); + // ID of a thread in its own warp + + // automatically selects thread or warp ID based on selected implementation + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + ASSERT0(LT_FUSSY, + gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc), + checkRuntimeUninitialized(loc)), + "current thread is not needed here; error"); + // retrieve schedule + kmp_sched_t schedule = + omptarget_nvptx_threadPrivateContext->ScheduleType(tid); + + // xxx reduce to one + if (schedule == kmp_sched_static_chunk || + schedule == kmp_sched_static_nochunk) { + T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid); + T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid); + // finished? 
+ if (myLb > ub) { + PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n", + (long long)myLb, (long long)ub); + return DISPATCH_FINISHED; + } + // not finished, save current bounds + ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid); + *plower = myLb; + T myUb = myLb + chunk - 1; // Clang uses i <= ub + if (myUb > ub) + myUb = ub; + *pupper = myUb; + *plast = (int32_t)(myUb == ub); + + // increment next lower bound by the stride + ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid); + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride; + PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n", + (long long)*plower, (long long)*pupper); + return DISPATCH_NOTFINISHED; + } + ASSERT0(LT_FUSSY, + schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, + "bad sched"); + T myLb, myUb; + int finished = DynamicNextChunk( + myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid), + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); + + if (finished == FINISHED) + return DISPATCH_FINISHED; + + // not finished (either not finished or last chunk) + *plast = (int32_t)(finished == LAST_CHUNK); + *plower = myLb; + *pupper = myUb; + *pstride = 1; + + PRINT( + LD_LOOP, + "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, " + "last %d\n", + (int)GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + (int)GetNumberOfWorkersInTeam(), (long long)*plower, (long long)*pupper, + (long long)*pstride, (int)*plast); + return DISPATCH_NOTFINISHED; + } + + INLINE static void dispatch_fini() { + // nothing + } + + //////////////////////////////////////////////////////////////////////////////// + // end of template class that encapsulate all the helper functions + //////////////////////////////////////////////////////////////////////////////// +}; + 
+//////////////////////////////////////////////////////////////////////////////// +// KMP interface implementation (dyn loops) +//////////////////////////////////////////////////////////////////////////////// + +// init +EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid, + int32_t schedule, int32_t lb, int32_t ub, + int32_t st, int32_t chunk) { + PRINT0(LD_IO, "call kmpc_dispatch_init_4\n"); + omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init( + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); +} + +EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid, + int32_t schedule, uint32_t lb, uint32_t ub, + int32_t st, int32_t chunk) { + PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n"); + omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init( + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); +} + +EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid, + int32_t schedule, int64_t lb, int64_t ub, + int64_t st, int64_t chunk) { + PRINT0(LD_IO, "call kmpc_dispatch_init_8\n"); + omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init( + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); +} + +EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid, + int32_t schedule, uint64_t lb, uint64_t ub, + int64_t st, int64_t chunk) { + PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n"); + omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init( + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); +} + +// next +EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last, + int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { + PRINT0(LD_IO, "call kmpc_dispatch_next_4\n"); + return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next( + loc, tid, p_last, p_lb, p_ub, p_st); +} + +EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid, + int32_t *p_last, uint32_t *p_lb, + uint32_t *p_ub, int32_t *p_st) { + PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n"); + return 
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next( + loc, tid, p_last, p_lb, p_ub, p_st); +} + +EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last, + int64_t *p_lb, int64_t *p_ub, int64_t *p_st) { + PRINT0(LD_IO, "call kmpc_dispatch_next_8\n"); + return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next( + loc, tid, p_last, p_lb, p_ub, p_st); +} + +EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid, + int32_t *p_last, uint64_t *p_lb, + uint64_t *p_ub, int64_t *p_st) { + PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n"); + return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next( + loc, tid, p_last, p_lb, p_ub, p_st); +} + +// fini +EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n"); + omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini(); +} + +EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n"); + omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini(); +} + +EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n"); + omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini(); +} + +EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n"); + omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini(); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP interface implementation (static loops) +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4\n"); + omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init( + 
global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + checkSPMDMode(loc), checkRuntimeUninitialized(loc)); +} + +EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4u\n"); + omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + checkSPMDMode(loc), checkRuntimeUninitialized(loc)); +} + +EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8\n"); + omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + checkSPMDMode(loc), checkRuntimeUninitialized(loc)); +} + +EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8u\n"); + omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + checkSPMDMode(loc), checkRuntimeUninitialized(loc)); +} + +EXTERN +void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n"); + omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true); +} + +EXTERN +void 
__kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, + int32_t *plastiter, uint32_t *plower, + uint32_t *pupper, int32_t *pstride, + int32_t incr, int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n"); + omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true); +} + +EXTERN +void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n"); + omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true); +} + +EXTERN +void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, + int32_t *plastiter, uint64_t *plower, + uint64_t *pupper, int64_t *pstride, + int64_t incr, int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n"); + omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/true, /*IsRuntimeUninitialized=*/true); +} + +EXTERN +void __kmpc_for_static_init_4_simple_generic( + kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n"); + omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true); +} + +EXTERN +void __kmpc_for_static_init_4u_simple_generic( + 
kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n"); + omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true); +} + +EXTERN +void __kmpc_for_static_init_8_simple_generic( + kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n"); + omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true); +} + +EXTERN +void __kmpc_for_static_init_8u_simple_generic( + kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n"); + omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/false, /*IsRuntimeUninitialized=*/true); +} + +EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_for_static_fini\n"); +} + +namespace { +INLINE void syncWorkersInGenericMode(uint32_t NumThreads) { + int NumWarps = ((NumThreads + WARPSIZE - 1) / WARPSIZE); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + // On Volta and newer architectures we require that all lanes in + // a warp (at least, all present for the kernel launch) participate in the + // barrier. This is enforced when launching the parallel region. 
An + // exception is when there are < WARPSIZE workers. In this case only 1 worker + // is started, so we don't need a barrier. + if (NumThreads > 1) { +#endif + named_sync(L1_BARRIER, WARPSIZE * NumWarps); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + } +#endif +} +}; // namespace + +EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid, + int32_t varNum, void *array) { + PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n"); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), + "Expected non-SPMD mode + initialized runtime."); + + omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc), + checkRuntimeUninitialized(loc)); + uint64_t *Buffer = teamDescr.getLastprivateIterBuffer(); + for (unsigned i = 0; i < varNum; i++) { + // Reset buffer. + if (gtid == 0) + *Buffer = 0; // Reset to minimum loop iteration value. + + // Barrier. + syncWorkersInGenericMode(NumThreads); + + // Atomic max of iterations. + uint64_t *varArray = (uint64_t *)array; + uint64_t elem = varArray[i]; + (void)atomicMax((unsigned long long int *)Buffer, + (unsigned long long int)elem); + + // Barrier. + syncWorkersInGenericMode(NumThreads); + + // Read max value and update thread private array. + varArray[i] = *Buffer; + + // Barrier. + syncWorkersInGenericMode(NumThreads); + } +} diff --git a/final/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/final/libomptarget/deviceRTLs/nvptx/src/omp_data.cu new file mode 100644 index 0000000..0700577 --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -0,0 +1,66 @@ +//===------------ omp_data.cu - NVPTX OpenMP GPU objects --------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. 
//
//===----------------------------------------------------------------------===//
//
// This file contains the data objects used on the GPU device.
//
//===----------------------------------------------------------------------===//

#include "omptarget-nvptx.h"

////////////////////////////////////////////////////////////////////////////////
// global device environment
////////////////////////////////////////////////////////////////////////////////

__device__ omptarget_device_environmentTy omptarget_device_environment;

////////////////////////////////////////////////////////////////////////////////
// global data holding OpenMP state information
////////////////////////////////////////////////////////////////////////////////

// One queue of OpenMP state objects per slot; the kernel init entry points
// pick the slot as smid() % MAX_SM and Dequeue() a state object from it.
__device__
    omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
        omptarget_nvptx_device_State[MAX_SM];

__device__ omptarget_nvptx_SimpleMemoryManager
    omptarget_nvptx_simpleMemoryManager;
__device__ __shared__ uint32_t usedMemIdx;
// Slot index chosen at kernel init; used again at deinit to Enqueue() the
// state object back into the same queue.
__device__ __shared__ uint32_t usedSlotIdx;

__device__ __shared__ uint8_t parallelLevel;

// Pointer to this team's OpenMP state object
__device__ __shared__
    omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;

////////////////////////////////////////////////////////////////////////////////
// The team master sets the outlined parallel function in this variable to
// communicate with the workers.  Since it is in shared memory, there is one
// copy of these variables for each kernel, instance, and team.
////////////////////////////////////////////////////////////////////////////////
volatile __device__ __shared__ omptarget_nvptx_WorkFn omptarget_nvptx_workFn;

////////////////////////////////////////////////////////////////////////////////
// OpenMP kernel execution parameters
////////////////////////////////////////////////////////////////////////////////
__device__ __shared__ uint32_t execution_param;

////////////////////////////////////////////////////////////////////////////////
// Data sharing state
////////////////////////////////////////////////////////////////////////////////
__device__ __shared__ DataSharingStateTy DataSharingState;

////////////////////////////////////////////////////////////////////////////////
// Scratchpad for teams reduction.
////////////////////////////////////////////////////////////////////////////////
__device__ __shared__ void *ReductionScratchpadPtr;

////////////////////////////////////////////////////////////////////////////////
// Data sharing related variables.
////////////////////////////////////////////////////////////////////////////////
__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
new file mode 100644
index 0000000..7034d02
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -0,0 +1,185 @@
//===--- omptarget-nvptx.cu - NVPTX OpenMP GPU initialization ---- CUDA -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
+// +//===----------------------------------------------------------------------===// +// +// This file contains the initialization code for the GPU +// +//===----------------------------------------------------------------------===// + +#include "omptarget-nvptx.h" + +//////////////////////////////////////////////////////////////////////////////// +// global data tables +//////////////////////////////////////////////////////////////////////////////// + +extern __device__ + omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT> + omptarget_nvptx_device_State[MAX_SM]; + +//////////////////////////////////////////////////////////////////////////////// +// init entry points +//////////////////////////////////////////////////////////////////////////////// + +INLINE static unsigned smid() { + unsigned id; + asm("mov.u32 %0, %%smid;" : "=r"(id)); + return id; +} + +EXTERN void __kmpc_kernel_init_params(void *Ptr) { + PRINT(LD_IO, "call to __kmpc_kernel_init_params with version %f\n", + OMPTARGET_NVPTX_VERSION); + + SetTeamsReductionScratchpadPtr(Ptr); +} + +EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { + PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n", + OMPTARGET_NVPTX_VERSION); + ASSERT0(LT_FUSSY, RequiresOMPRuntime, + "Generic always requires initialized runtime."); + setExecutionParameters(Generic, RuntimeInitialized); + + int threadIdInBlock = GetThreadIdInBlock(); + ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(), + "__kmpc_kernel_init() must be called by team master warp only!"); + PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n"); + + // Get a state object from the queue. 
+ int slot = smid() % MAX_SM; + usedSlotIdx = slot; + omptarget_nvptx_threadPrivateContext = + omptarget_nvptx_device_State[slot].Dequeue(); + + // init thread private + int threadId = GetLogicalThreadIdInBlock(/*isSPMDExecutionMode=*/false); + omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId); + + // init team context + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + currTeamDescr.InitTeamDescr(/*isSPMDExecutionMode=*/false); + // this thread will start execution... has to update its task ICV + // to point to the level zero task ICV. That ICV was init in + // InitTeamDescr() + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, currTeamDescr.LevelZeroTaskDescr()); + + // set number of threads and thread limit in team to started value + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + currTaskDescr->NThreads() = GetNumberOfWorkersInTeam(); + currTaskDescr->ThreadLimit() = ThreadLimit; +} + +EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { + PRINT0(LD_IO, "call to __kmpc_kernel_deinit\n"); + ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, + "Generic always requires initialized runtime."); + // Enqueue omp state object for use by another team. + int slot = usedSlotIdx; + omptarget_nvptx_device_State[slot].Enqueue( + omptarget_nvptx_threadPrivateContext); + // Done with work. Kill the workers. + omptarget_nvptx_workFn = 0; +} + +EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, + int16_t RequiresDataSharing) { + PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n"); + + if (!RequiresOMPRuntime) { + // If OMP runtime is not required don't initialize OMP state. 
+ setExecutionParameters(Spmd, RuntimeUninitialized); + if (GetThreadIdInBlock() == 0) { + parallelLevel = 0; + usedSlotIdx = smid() % MAX_SM; + } + __SYNCTHREADS(); + return; + } + setExecutionParameters(Spmd, RuntimeInitialized); + + // + // Team Context Initialization. + // + // In SPMD mode there is no master thread so use any cuda thread for team + // context initialization. + int threadId = GetThreadIdInBlock(); + if (threadId == 0) { + // Get a state object from the queue. + int slot = smid() % MAX_SM; + usedSlotIdx = slot; + omptarget_nvptx_threadPrivateContext = + omptarget_nvptx_device_State[slot].Dequeue(); + + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); + // init team context + currTeamDescr.InitTeamDescr(/*isSPMDExecutionMode=*/true); + } + // FIXME: use __syncthreads instead when the function copy is fixed in LLVM. + __SYNCTHREADS(); + + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); + + // + // Initialize task descr for each thread. + // + omptarget_nvptx_TaskDescr *newTaskDescr = + omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); + ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); + newTaskDescr->InitLevelOneTaskDescr(ThreadLimit, + currTeamDescr.LevelZeroTaskDescr()); + newTaskDescr->ThreadLimit() = ThreadLimit; + // install new top descriptor + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + newTaskDescr); + + // init thread private from init value + PRINT(LD_PAR, + "thread will execute parallel region with id %d in a team of " + "%d threads\n", + (int)newTaskDescr->ThreadId(), (int)newTaskDescr->ThreadsInTeam()); + + if (RequiresDataSharing && threadId % WARPSIZE == 0) { + // Warp master innitializes data sharing environment. 
+ unsigned WID = threadId / WARPSIZE; + __kmpc_data_sharing_slot *RootS = currTeamDescr.RootS( + WID, WID == WARPSIZE - 1); + DataSharingState.SlotPtr[WID] = RootS; + DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; + } +} + +EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit() { + __kmpc_spmd_kernel_deinit_v2(isRuntimeInitialized()); +} + +EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) { + // We're not going to pop the task descr stack of each thread since + // there are no more parallel regions in SPMD mode. + if (!RequiresOMPRuntime) + return; + + // FIXME: use __syncthreads instead when the function copy is fixed in LLVM. + __SYNCTHREADS(); + int threadId = GetThreadIdInBlock(); + if (threadId == 0) { + // Enqueue omp state object for use by another team. + int slot = usedSlotIdx; + omptarget_nvptx_device_State[slot].Enqueue( + omptarget_nvptx_threadPrivateContext); + } +} + +// Return true if the current target region is executed in SPMD mode. +EXTERN int8_t __kmpc_is_spmd_exec_mode() { + PRINT0(LD_IO | LD_PAR, "call to __kmpc_is_spmd_exec_mode\n"); + return isSPMDMode(); +} diff --git a/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h new file mode 100644 index 0000000..d23010e --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -0,0 +1,443 @@ +//===---- omptarget-nvptx.h - NVPTX OpenMP GPU initialization ---- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of all library macros, types, +// and functions. 
//
//===----------------------------------------------------------------------===//

#ifndef __OMPTARGET_NVPTX_H
#define __OMPTARGET_NVPTX_H

// std includes
#include <stdint.h>
#include <stdlib.h>

#include <inttypes.h>

// cuda includes
#include <cuda.h>
#include <math.h>

// local includes
#include "debug.h"     // debug
#include "interface.h" // interfaces with omp, compiler, and user
#include "option.h"    // choices we have
#include "state-queue.h"
#include "support.h"

#define OMPTARGET_NVPTX_VERSION 1.1

// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1

// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2

#define BARRIER_COUNTER 0
#define ORDERED_COUNTER 1

// Macros for Cuda intrinsics
// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask().
// The pre-9.0 fallbacks below accept the 'mask' argument but ignore it.
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane))
#define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
  __shfl_down_sync((mask), (var), (delta), (width))
#define __ACTIVEMASK() __activemask()
#else
#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
#define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
  __shfl_down((var), (delta), (width))
#define __ACTIVEMASK() __ballot(1)
#endif

// Barrier on PTX barrier resource `n` (bar.sync); __SYNCTHREADS() uses
// barrier 0.
// NOTE(review): the expansion already ends in ';', so `__SYNCTHREADS();`
// produces an extra empty statement - avoid using it as the sole body of an
// unbraced if/else.
#define __SYNCTHREADS_N(n) asm volatile("bar.sync %0;" : : "r"(n) : "memory");
#define __SYNCTHREADS() __SYNCTHREADS_N(0)

// arguments needed for L0 parallelism only.
class omptarget_nvptx_SharedArgs {
public:
  // All these methods must be called by the master thread only.

  // Point `args` at the pre-allocated buffer and reset the capacity.
  INLINE void Init() {
    args = buffer;
    nArgs = MAX_SHARED_ARGS;
  }
  // Release a dynamically allocated argument array, if any, and fall back to
  // the pre-allocated buffer.
  INLINE void DeInit() {
    // Free any memory allocated for outlined parallel function with a large
    // number of arguments.
    if (nArgs > MAX_SHARED_ARGS) {
      SafeFree(args, (char *)"new extended args");
      Init();
    }
  }
  // Make sure `args` can hold at least `size` pointers. When it has to grow,
  // a previously allocated array is freed and a new one is allocated; the
  // old contents are not copied over.
  INLINE void EnsureSize(size_t size) {
    if (size > nArgs) {
      if (nArgs > MAX_SHARED_ARGS) {
        SafeFree(args, (char *)"new extended args");
      }
      args = (void **) SafeMalloc(size * sizeof(void *),
                                  (char *)"new extended args");
      nArgs = size;
    }
  }
  // Called by all threads.
  INLINE void **GetArgs() const { return args; };
private:
  // buffer of pre-allocated arguments.
  void *buffer[MAX_SHARED_ARGS];
  // pointer to arguments buffer.
  // starts off as a pointer to 'buffer' but can be dynamically allocated.
  void **args;
  // starts off as MAX_SHARED_ARGS but can increase in size.
  uint32_t nArgs;
};

extern __device__ __shared__ omptarget_nvptx_SharedArgs
    omptarget_nvptx_globalArgs;

// Data sharing related quantities, need to match what is used in the compiler.
enum DATA_SHARING_SIZES {
  // The maximum number of workers in a kernel.
  DS_Max_Worker_Threads = 992,
  // The size reserved for data in a shared memory slot.
  DS_Slot_Size = 256,
  // The slot size that should be reserved for a working warp.
  DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
  // The maximum number of warps in use
  DS_Max_Warp_Number = 32,
  // The size of the preallocated shared memory buffer per team
  DS_Shared_Memory_Size = 128,
};

// Data structure to keep in shared memory that traces the current slot, stack,
// and frame pointer as well as the active threads that didn't exit the current
// environment.
// Every array has one entry per warp (DS_Max_Warp_Number entries).
struct DataSharingStateTy {
  __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
  void *StackPtr[DS_Max_Warp_Number];
  void * volatile FramePtr[DS_Max_Warp_Number];
  int32_t ActiveThreads[DS_Max_Warp_Number];
};
// Additional worker slot type which is initialized with the default worker slot
// size: its data area holds DS_Worker_Warp_Slot_Size bytes.
// Statically sized data-sharing slot for a worker warp. The leading four
// pointer fields are followed by DS_Worker_Warp_Slot_Size bytes of payload.
struct __kmpc_data_sharing_worker_slot_static {
  __kmpc_data_sharing_slot *Next;
  __kmpc_data_sharing_slot *Prev;
  void *PrevSlotStackPtr;
  void *DataEnd; // one past the last usable byte of Data
  char Data[DS_Worker_Warp_Slot_Size];
};
// Additional master slot type which is initialized with the default master slot
// size of DS_Slot_Size bytes.
struct __kmpc_data_sharing_master_slot_static {
  __kmpc_data_sharing_slot *Next;
  __kmpc_data_sharing_slot *Prev;
  void *PrevSlotStackPtr;
  void *DataEnd; // one past the last usable byte of Data
  char Data[DS_Slot_Size];
};
extern __device__ __shared__ DataSharingStateTy DataSharingState;

////////////////////////////////////////////////////////////////////////////////
// task ICV and (implicit & explicit) task state
//
// A task descriptor bundles the per-task ICVs ('items'), a saved copy of the
// loop dispatch state ('loopData') and a link to the previous descriptor
// ('prev'), which forms a stack of descriptors per thread.

class omptarget_nvptx_TaskDescr {
public:
  // methods for flags
  // The schedule is stored in the low three flag bits; see the definitions of
  // Get/SetRuntimeSched for the encoding.
  INLINE omp_sched_t GetRuntimeSched() const;
  INLINE void SetRuntimeSched(omp_sched_t sched);
  // The flag predicates return the masked flag value (zero or the bit itself),
  // not a value normalized to 0/1.
  INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; }
  INLINE int InL2OrHigherParallelRegion() const {
    return items.flags & TaskDescr_InParL2P;
  }
  INLINE int IsParallelConstruct() const {
    return items.flags & TaskDescr_IsParConstr;
  }
  INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
  // methods for other fields
  // These return references so that callers can both read and assign the ICV.
  INLINE uint16_t &NThreads() { return items.nthreads; }
  INLINE uint16_t &ThreadLimit() { return items.threadlimit; }
  INLINE uint16_t &ThreadId() { return items.threadId; }
  INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; }
  INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
  INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
  INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
    prev = taskDescr;
  }
  // init & copy
  INLINE void InitLevelZeroTaskDescr(bool isSPMDExecutionMode);
  INLINE void InitLevelOneTaskDescr(uint16_t tnum,
                                    omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
  INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
  INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr,
                              uint16_t tnum);
  INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
  INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
                                   uint16_t tid, uint16_t tnum);
  // Save / restore the dispatch state held in the thread private context for
  // the thread identified by items.threadId.
  INLINE void SaveLoopData();
  INLINE void RestoreLoopData() const;

private:
  // bits for flags: (6 used, 2 free)
  //   3 bits (SchedMask) for runtime schedule
  //   1 bit (InPar) if this thread has encountered one or more parallel region
  //   1 bit (IsParConstr) if ICV for a parallel region (false = explicit task)
  //   1 bit (InParL2+) if this thread has encountered L2 or higher parallel
  //   region
  // The two free bits are 0x08 and 0x80.
  static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4);
  static const uint8_t TaskDescr_InPar = 0x10;
  static const uint8_t TaskDescr_IsParConstr = 0x20;
  static const uint8_t TaskDescr_InParL2P = 0x40;

  // Snapshot of the per-thread dispatch state (see SaveLoopData).
  struct SavedLoopDescr_items {
    int64_t loopUpperBound;
    int64_t nextLowerBound;
    int64_t chunk;
    int64_t stride;
    kmp_sched_t schedule;
  } loopData;

  struct TaskDescr_items {
    uint8_t flags; // 6 bit used (see flag above)
    uint8_t unused;
    uint16_t nthreads;         // thread num for subsequent parallel regions
    uint16_t threadlimit;      // thread limit ICV
    uint16_t threadId;         // thread id
    uint16_t threadsInTeam;    // threads in current team
    uint64_t runtimeChunkSize; // runtime chunk size
  } items;
  // Previous descriptor on this thread's task stack.
  omptarget_nvptx_TaskDescr *prev;
};

// build on kmp
// Member order is significant: taskDescr must stay first and kmpTaskDescr
// must stay last.
typedef struct omptarget_nvptx_ExplicitTaskDescr {
  omptarget_nvptx_TaskDescr
      taskDescr; // omptarget_nvptx task description (must be first)
  kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last)
} omptarget_nvptx_ExplicitTaskDescr;
////////////////////////////////////////////////////////////////////////////////
// Descriptor of a parallel region (worksharing in general)

// Wraps the task descriptor the master fills in before workers start a
// parallel region (see CopyToWorkDescr / CopyFromWorkDescr).
class omptarget_nvptx_WorkDescr {

public:
  // access to data
  INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; }

private:
  omptarget_nvptx_TaskDescr masterTaskICV;
};

////////////////////////////////////////////////////////////////////////////////

// Per-team state: the level-zero task descriptor, the work descriptor of the
// single active parallel region and the preallocated data-sharing root slots.
class omptarget_nvptx_TeamDescr {
public:
  // access to data
  INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
    return &levelZeroTaskDescr;
  }
  INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
    return workDescrForActiveParallel;
  }
  INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }

  // init
  INLINE void InitTeamDescr(bool isSPMDExecutionMode);

  // Initialize and return the root data-sharing slot for the caller.
  //   wid            - index into worker_rootS (ignored for the master).
  //   IsMasterThread - select the (smaller) master slot instead.
  // Returns a null pointer when the selected slot has already been
  // initialized, i.e. when its DataEnd already points past its data section.
  INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
    // If this is invoked by the master thread of the master warp then
    // initialize it with a smaller slot.
    if (IsMasterThread) {
      // Do not initialize this slot again if it has already been initialized.
      if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size)
        return 0;
      // Initialize the pointer to the end of the slot given the size of the
      // data section. DataEnd is non-inclusive.
      master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size;
      // We currently do not have a next slot.
      master_rootS[0].Next = 0;
      master_rootS[0].Prev = 0;
      master_rootS[0].PrevSlotStackPtr = 0;
      return (__kmpc_data_sharing_slot *)&master_rootS[0];
    }
    // Do not initialize this slot again if it has already been initialized.
    if (worker_rootS[wid].DataEnd ==
        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size)
      return 0;
    // The (re)initialization of a worker slot is shared with
    // GetPreallocatedSlotAddr, which resets the slot unconditionally.
    return GetPreallocatedSlotAddr(wid);
  }

  // Unconditionally reset worker slot 'wid' to an empty, unlinked state and
  // return it.
  INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
    // Initialize the pointer to the end of the slot given the size of the data
    // section. DataEnd is non-inclusive.
    worker_rootS[wid].DataEnd =
        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
    // We currently do not have a next slot.
    worker_rootS[wid].Next = 0;
    worker_rootS[wid].Prev = 0;
    worker_rootS[wid].PrevSlotStackPtr = 0;
    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
  }

private:
  omptarget_nvptx_TaskDescr
      levelZeroTaskDescr; // icv for team master initial thread
  omptarget_nvptx_WorkDescr
      workDescrForActiveParallel; // one, ONLY for the active par
  uint64_t lastprivateIterBuffer;

  __align__(16)
      __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
  __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
};

////////////////////////////////////////////////////////////////////////////////
// thread private data (struct of arrays for better coalescing)
// tid refers here to the global thread id
// do not support multiple concurrent kernels at this time
class omptarget_nvptx_ThreadPrivateContext {
public:
  // task
  INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) {
    return &levelOneTaskDescr[tid];
  }
  INLINE void SetTopLevelTaskDescr(int tid,
                                   omptarget_nvptx_TaskDescr *taskICV) {
    topTaskDescr[tid] = taskICV;
  }
  INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const;
  // parallel
  // NumThreadsForNextParallel and SimdLimitForNextSimd alias the same storage
  // (the 'nextRegion' union); only one of them is meaningful at a time.
  INLINE uint16_t &NumThreadsForNextParallel(int tid) {
    return nextRegion.tnum[tid];
  }
  // simd
  INLINE uint16_t &SimdLimitForNextSimd(int tid) {
    return nextRegion.slim[tid];
  }
  // schedule (for dispatch)
  INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
  INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
  INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
  INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; }
  INLINE int64_t &Stride(int tid) { return stride[tid]; }

  INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }

  INLINE void InitThreadPrivateContext(int tid);
  INLINE uint64_t &Cnt() { return cnt; }

private:
  // team context for this team
  omptarget_nvptx_TeamDescr teamContext;
  // task ICV for implicit threads in the only parallel region
  omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
  // pointer where to find the current task ICV (top of the stack)
  omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
  union {
    // Only one of the two is live at the same time.
    // parallel
    uint16_t tnum[MAX_THREADS_PER_TEAM];
    // simd limit
    uint16_t slim[MAX_THREADS_PER_TEAM];
  } nextRegion;
  // schedule (for dispatch)
  kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
  int64_t chunk[MAX_THREADS_PER_TEAM];
  int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
  // state for dispatch with dyn/guided OR static (never use both at a time)
  int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
  int64_t stride[MAX_THREADS_PER_TEAM];
  uint64_t cnt;
};

/// Device environment data
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

/// Memory manager for statically allocated memory.
// Hands out fixed-size pieces of a caller-provided buffer. The buffer is
// viewed as MAX_SM groups of OMP_STATE_COUNT pieces; 'keys' records which
// pieces of a group are currently taken (0 = free, non-zero = in use).
class omptarget_nvptx_SimpleMemoryManager {
private:
  __align__(128) struct MemDataTy {
    volatile unsigned keys[OMP_STATE_COUNT];
  } MemData[MAX_SM];

  // Map a key onto an index in [0, OMP_STATE_COUNT). Masking is only
  // equivalent to a modulo when OMP_STATE_COUNT is a power of two.
  INLINE static uint32_t hash(unsigned key) {
    return key & (OMP_STATE_COUNT - 1);
  }

public:
  // Mark the piece identified by usedSlotIdx / usedMemIdx as free again.
  INLINE void Release();
  // Claim a free piece in group usedSlotIdx and return its address inside
  // 'buf', where 'size' is the size in bytes of a single piece.
  INLINE const void *Acquire(const void *buf, size_t size);
};

////////////////////////////////////////////////////////////////////////////////
// global device environment
////////////////////////////////////////////////////////////////////////////////

extern __device__ omptarget_device_environmentTy omptarget_device_environment;

////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// global data tables
////////////////////////////////////////////////////////////////////////////////

extern __device__ omptarget_nvptx_SimpleMemoryManager
    omptarget_nvptx_simpleMemoryManager;
// Indices of the piece this team obtained from the memory manager.
extern __device__ __shared__ uint32_t usedMemIdx;
extern __device__ __shared__ uint32_t usedSlotIdx;
extern __device__ __shared__ uint8_t parallelLevel;
extern __device__ __shared__
    omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;

extern __device__ __shared__ uint32_t execution_param;
extern __device__ __shared__ void *ReductionScratchpadPtr;

////////////////////////////////////////////////////////////////////////////////
// work function (outlined parallel/simd functions) and arguments.
// needed for L1 parallelism only.
+//////////////////////////////////////////////////////////////////////////////// + +typedef void *omptarget_nvptx_WorkFn; +extern volatile __device__ __shared__ omptarget_nvptx_WorkFn + omptarget_nvptx_workFn; + +//////////////////////////////////////////////////////////////////////////////// +// get private data structures +//////////////////////////////////////////////////////////////////////////////// + +INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor(); +INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor(); +INLINE omptarget_nvptx_TaskDescr * +getMyTopTaskDescriptor(bool isSPMDExecutionMode); +INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); + +//////////////////////////////////////////////////////////////////////////////// +// inlined implementation +//////////////////////////////////////////////////////////////////////////////// + +#include "omptarget-nvptxi.h" +#include "supporti.h" + +#endif diff --git a/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h new file mode 100644 index 0000000..27cbaad --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h @@ -0,0 +1,234 @@ +//===---- omptarget-nvptxi.h - NVPTX OpenMP GPU initialization --- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of all library macros, types, +// and functions. 
//
//===----------------------------------------------------------------------===//

////////////////////////////////////////////////////////////////////////////////
// Task Descriptor
////////////////////////////////////////////////////////////////////////////////

// Decode the runtime schedule stored in the low three flag bits.
INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const {
  // sched starts from 1..4; encode it as 0..3; so add 1 here
  uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1;
  return (omp_sched_t)rc;
}

// Store the runtime schedule in the low three flag bits, leaving all other
// flag bits untouched.
INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
  // sched starts from 1..4; encode it as 0..3; so sub 1 here
  uint8_t val = ((uint8_t)sched) - 1;
  // clear current sched
  items.flags &= ~TaskDescr_SchedMask;
  // set new sched; mask the value so that an out-of-range schedule can never
  // spill into the InPar / IsParConstr / InParL2P bits.
  items.flags |= (val & TaskDescr_SchedMask);
}

// Initialize the descriptor of the team's initial (level zero) task.
// Only the fields assigned below are set; threadlimit and prev are left for
// the caller to fill in.
INLINE void
omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr(bool isSPMDExecutionMode) {
  // slow method
  // flag:
  //   default sched is static,
  //   dyn is off (unused now anyway, but may need to sample from host ?)
  //   not in parallel

  items.flags = 0;
  // threads: whatever was alloc by kernel
  items.nthreads = GetNumberOfProcsInTeam(isSPMDExecutionMode);
  items.threadId = 0;         // is master
  items.threadsInTeam = 1;    // sequential
  items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
}

// This is called when all threads are started together in SPMD mode.
// OMP directives include target parallel, target distribute parallel for, etc.
//   tnum            - number of threads in the team.
//   parentTaskDescr - descriptor linked in as the previous task.
INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
    uint16_t tnum, omptarget_nvptx_TaskDescr *parentTaskDescr) {
  // slow method
  // flag:
  //   default sched is static,
  //   dyn is off (unused now anyway, but may need to sample from host ?)
  //   in L1 parallel

  items.flags =
      TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
  items.nthreads = 0; // # threads for subsequent parallel region
  items.threadId =
      GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
  items.threadsInTeam = tnum;
  items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
  prev = parentTaskDescr;
}

// Copy the ICVs only; 'prev' and 'loopData' are not touched.
INLINE void omptarget_nvptx_TaskDescr::CopyData(
    omptarget_nvptx_TaskDescr *sourceTaskDescr) {
  items = sourceTaskDescr->items;
}

// Copy the ICVs and inherit the source's 'prev' link (become a sibling).
INLINE void
omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) {
  CopyData(sourceTaskDescr);
  prev = sourceTaskDescr->prev;
}

// Copy the ICVs and link to the source as parent (become a child).
INLINE void omptarget_nvptx_TaskDescr::CopyParent(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  CopyData(parentTaskDescr);
  prev = parentTaskDescr;
}

// Derive the descriptor of an explicit task from its parent: same ICVs, but
// flagged as a task rather than a parallel construct.
INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  CopyParent(parentTaskDescr);
  items.flags = items.flags & ~TaskDescr_IsParConstr;
  ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task");
}

// Fill the work descriptor from the master's descriptor for a parallel region
// of 'tnum' threads.
INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
    omptarget_nvptx_TaskDescr *masterTaskDescr, uint16_t tnum) {
  CopyParent(masterTaskDescr);
  // overwrite specific items;
  items.flags |=
      TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
  items.threadsInTeam = tnum;                  // set number of threads
}

// Initialize a worker's descriptor from the work descriptor and give it the
// worker's own thread id.
INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
    omptarget_nvptx_TaskDescr *workTaskDescr) {
  Copy(workTaskDescr);
  //
  // overwrite specific items;
  //
  // The threadID should be GetThreadIdInBlock() % GetMasterThreadID().
  // This is so that the serial master (first lane in the master warp)
  // gets a threadId of 0.
  // However, we know that this function is always called in a parallel
  // region where only workers are active.  The serial master thread
  // never enters this region.  When a parallel region is executed serially,
  // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions
  // are called, which never activate this region.
  items.threadId =
      GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
}

// Derive the descriptor for a thread joining a convergent (L2 or higher)
// parallel region with id 'tid' in a team of 'tnum' threads.
INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
    omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
  CopyParent(parentTaskDescr);
  items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
  items.threadsInTeam = tnum;        // set number of threads
  items.threadId = tid;
}

// Snapshot the dispatch state of thread items.threadId into 'loopData'.
INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() {
  loopData.loopUpperBound =
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId);
  loopData.nextLowerBound =
      omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId);
  loopData.schedule =
      omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId);
  loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId);
  loopData.stride =
      omptarget_nvptx_threadPrivateContext->Stride(items.threadId);
}

// Write the snapshot taken by SaveLoopData back into the thread private
// context of thread items.threadId.
INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const {
  omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk;
  omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) =
      loopData.loopUpperBound;
  omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) =
      loopData.nextLowerBound;
  omptarget_nvptx_threadPrivateContext->Stride(items.threadId) =
      loopData.stride;
  omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) =
      loopData.schedule;
}

////////////////////////////////////////////////////////////////////////////////
// Thread Private Context
////////////////////////////////////////////////////////////////////////////////

// Return the descriptor on top of the task stack of thread 'tid'.
INLINE omptarget_nvptx_TaskDescr *
omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const {
  // 'tid' is signed, so both ends of the valid index range are checked.
  ASSERT0(
      LT_FUSSY, tid >= 0 && tid < MAX_THREADS_PER_TEAM,
      "Getting top level, tid is outside the allocated data structure size");
  return topTaskDescr[tid];
}

// Reset the per-thread entries that must have a defined value before first
// use.
INLINE void
omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
  // levelOneTaskDescr is init when starting the parallel region
  // top task descr is NULL (team master version will be fixed separately)
  topTaskDescr[tid] = NULL;
  // no num threads value has been pushed
  nextRegion.tnum[tid] = 0;
  // the following don't need to be init here; they are init when using dyn
  // sched
  // current_Event, events_Number, chunk, num_Iterations, schedule
}

////////////////////////////////////////////////////////////////////////////////
// Team Descriptor
////////////////////////////////////////////////////////////////////////////////

INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr(bool isSPMDExecutionMode) {
  levelZeroTaskDescr.InitLevelZeroTaskDescr(isSPMDExecutionMode);
}

////////////////////////////////////////////////////////////////////////////////
// Get private data structure for thread
////////////////////////////////////////////////////////////////////////////////

// Utility routines for CUDA threads
INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() {
  return omptarget_nvptx_threadPrivateContext->TeamContext();
}

INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() {
  omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
  return currTeamDescr.WorkDescr();
}

// Look up the top task descriptor by explicit thread id.
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
  return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
}

// Look up the top task descriptor of the calling thread, using its logical
// thread id for the given execution mode.
INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode) {
  return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock(isSPMDExecutionMode));
}

////////////////////////////////////////////////////////////////////////////////
// Memory management runtime functions.
+//////////////////////////////////////////////////////////////////////////////// + +INLINE void omptarget_nvptx_SimpleMemoryManager::Release() { + ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM, + "SlotIdx is too big or uninitialized."); + ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT, + "MemIdx is too big or uninitialized."); + MemDataTy &MD = MemData[usedSlotIdx]; + atomicExch((unsigned *)&MD.keys[usedMemIdx], 0); +} + +INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf, + size_t size) { + ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM, + "SlotIdx is too big or uninitialized."); + const unsigned sm = usedSlotIdx; + MemDataTy &MD = MemData[sm]; + unsigned i = hash(GetBlockIdInKernel()); + while (atomicCAS((unsigned *)&MD.keys[i], 0, 1) != 0) { + i = hash(i + 1); + } + usedSlotIdx = sm; + usedMemIdx = i; + return static_cast<const char *>(buf) + (sm * OMP_STATE_COUNT + i) * size; +} diff --git a/final/libomptarget/deviceRTLs/nvptx/src/option.h b/final/libomptarget/deviceRTLs/nvptx/src/option.h new file mode 100644 index 0000000..37ab818 --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/src/option.h @@ -0,0 +1,66 @@ +//===------------ option.h - NVPTX OpenMP GPU options ------------ CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. 
+// +//===----------------------------------------------------------------------===// +// +// GPU default options +// +//===----------------------------------------------------------------------===// +#ifndef _OPTION_H_ +#define _OPTION_H_ + +//////////////////////////////////////////////////////////////////////////////// +// Kernel options +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// The following def must match the absolute limit hardwired in the host RTL +// max number of threads per team +#define MAX_THREADS_PER_TEAM 1024 + +#define WARPSIZE 32 + +// The named barrier for active parallel threads of a team in an L1 parallel +// region to synchronize with each other. +#define L1_BARRIER (1) + +// Maximum number of preallocated arguments to an outlined parallel/simd function. +// Anything more requires dynamic memory allocation. +#define MAX_SHARED_ARGS 20 + +// Maximum number of omp state objects per SM allocated statically in global +// memory. 
+#if __CUDA_ARCH__ >= 700 +#define OMP_STATE_COUNT 32 +#define MAX_SM 84 +#elif __CUDA_ARCH__ >= 600 +#define OMP_STATE_COUNT 32 +#define MAX_SM 56 +#else +#define OMP_STATE_COUNT 16 +#define MAX_SM 16 +#endif + +//////////////////////////////////////////////////////////////////////////////// +// algo options +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// misc options (by def everythig here is device) +//////////////////////////////////////////////////////////////////////////////// + +#define EXTERN extern "C" __device__ +#define INLINE __inline__ __device__ +#define NOINLINE __noinline__ __device__ +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +#endif diff --git a/final/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/final/libomptarget/deviceRTLs/nvptx/src/parallel.cu new file mode 100644 index 0000000..8de8f59 --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -0,0 +1,469 @@ +//===---- parallel.cu - NVPTX OpenMP parallel implementation ----- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Parallel implemention in the GPU. 
// Here is the pattern:
//
//    while (not finished) {
//
//    if (master) {
//      sequential code, decide which par loop to do, or if finished
//     __kmpc_kernel_prepare_parallel() // exec by master only
//    }
//    syncthreads // A
//    __kmpc_kernel_parallel() // exec by all
//    if (this thread is included in the parallel) {
//      switch () for all parallel loops
//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
//    }
//
//
//    The reason we don't exec end_parallel for the threads not included
//    in the parallel loop is that for each barrier in the parallel
//    region, these non-included threads will cycle through the
//    syncthread A. Thus they must preserve their current threadId that
//    is larger than thread in team.
//
//    To make a long story short...
//
//===----------------------------------------------------------------------===//

#include "omptarget-nvptx.h"

// Per-thread scratch record for a convergent simd region. It is stored in the
// caller-provided 'buffer' and remembers what must be restored when the
// region ends.
typedef struct ConvergentSimdJob {
  omptarget_nvptx_TaskDescr taskDescr;
  omptarget_nvptx_TaskDescr *convHeadTaskDescr;
  uint16_t slimForNextSimd;
} ConvergentSimdJob;

////////////////////////////////////////////////////////////////////////////////
// support for convergent simd (team of threads in a warp only)
////////////////////////////////////////////////////////////////////////////////
// Enter a convergent simd region on behalf of the next source lane in 'Mask'.
//   buffer     - storage for a ConvergentSimdJob; pass the same pointer to
//                __kmpc_kernel_end_convergent_simd.
//   Mask       - lanes taking part in the call.
//   IsFinal    - out: true when the selected source lane is the last one.
//   LaneSource - in/out: previously served lane; advanced to the next one.
//   LaneId     - out: rank of the calling lane among the lanes in 'Mask'.
//   NumLanes   - out: number of lanes that execute the region.
// Returns true for lanes that participate in the simd region.
EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
                                          bool *IsFinal, int32_t *LaneSource,
                                          int32_t *LaneId, int32_t *NumLanes) {
  PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
  uint32_t ConvergentMask = Mask;
  int32_t ConvergentSize = __popc(ConvergentMask);
  // Lanes above the previously served source lane still have work pending.
  uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
  *LaneSource += __ffs(WorkRemaining);
  *IsFinal = __popc(WorkRemaining) == 1;
  uint32_t lanemask_lt;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
  *LaneId = __popc(ConvergentMask & lanemask_lt);

  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
  int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;

  ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
  int32_t SimdLimit =
      omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
  job->slimForNextSimd = SimdLimit;

  int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource);
  // reset simdlimit to avoid propagating to successive #simd
  if (SimdLimitSource > 0 && threadId == sourceThreadId)
    omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;

  // We cannot have more than the # of convergent threads.
  if (SimdLimitSource > 0)
    *NumLanes = min(ConvergentSize, SimdLimitSource);
  else
    *NumLanes = ConvergentSize;
  ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
         (int)*NumLanes);

  // Set to true for lanes participating in the simd region.
  bool isActive = false;
  // Initialize state for active threads.
  if (*LaneId < *NumLanes) {
    omptarget_nvptx_TaskDescr *currTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
    omptarget_nvptx_TaskDescr *sourceTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
            sourceThreadId);
    job->convHeadTaskDescr = currTaskDescr;
    // install top descriptor from the thread for which the lanes are working.
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               sourceTaskDescr);
    isActive = true;
  }

  // requires a memory fence between threads of a warp
  return isActive;
}

// Leave a convergent simd region: restore the simd limit and the top task
// descriptor saved in 'buffer' by __kmpc_kernel_convergent_simd.
EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_simd\n");
  // pop stack
  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
  ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
  omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) =
      job->slimForNextSimd;
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, job->convHeadTaskDescr);
}

// Per-thread scratch record for a convergent parallel region. 'taskDescr' is
// the storage of the new task descriptor installed for the region.
typedef struct ConvergentParallelJob {
  omptarget_nvptx_TaskDescr taskDescr;
  omptarget_nvptx_TaskDescr *convHeadTaskDescr;
  uint16_t tnumForNextPar;
} ConvergentParallelJob;

////////////////////////////////////////////////////////////////////////////////
// support for convergent parallelism (team of threads in a warp only)
////////////////////////////////////////////////////////////////////////////////
// Enter a convergent parallel region on behalf of the next source lane in
// 'Mask'.
//   buffer     - storage for a ConvergentParallelJob; pass the same pointer
//                to __kmpc_kernel_end_convergent_parallel.
//   Mask       - lanes taking part in the call.
//   IsFinal    - out: true when the selected source lane is the last one.
//   LaneSource - in/out: previously served lane; advanced to the next one.
// Returns true for lanes that participate in the parallel region.
EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
                                              bool *IsFinal,
                                              int32_t *LaneSource) {
  PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
  uint32_t ConvergentMask = Mask;
  int32_t ConvergentSize = __popc(ConvergentMask);
  // Lanes above the previously served source lane still have work pending.
  uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
  *LaneSource += __ffs(WorkRemaining);
  *IsFinal = __popc(WorkRemaining) == 1;
  uint32_t lanemask_lt;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
  uint32_t OmpId = __popc(ConvergentMask & lanemask_lt);

  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
  int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;

  ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
  int32_t NumThreadsClause =
      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
  job->tnumForNextPar = NumThreadsClause;

  int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource);
  // reset numthreads to avoid propagating to successive #parallel
  if (NumThreadsSource > 0 && threadId == sourceThreadId)
    omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
        0;

  // We cannot have more than the # of convergent threads.
  uint16_t NumThreads;
  if (NumThreadsSource > 0)
    NumThreads = min(ConvergentSize, NumThreadsSource);
  else
    NumThreads = ConvergentSize;
  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
         (int)NumThreads);

  // Set to true for workers participating in the parallel region.
  bool isActive = false;
  // Initialize state for active threads.
  if (OmpId < NumThreads) {
    // init L2 task descriptor and storage for the L1 parallel task descriptor.
    omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr;
    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
    omptarget_nvptx_TaskDescr *currTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
    omptarget_nvptx_TaskDescr *sourceTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
            sourceThreadId);
    job->convHeadTaskDescr = currTaskDescr;
    newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads);
    // install new top descriptor
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               newTaskDescr);
    isActive = true;
  }

  // requires a memory fence between threads of a warp
  return isActive;
}

// Leave a convergent parallel region: restore the top task descriptor and the
// num_threads request saved in 'buffer' by __kmpc_kernel_convergent_parallel.
EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
  // pop stack
  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
  ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, job->convHeadTaskDescr);
  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
      job->tnumForNextPar;
}

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////

// Compute the team size of the next L1 parallel region.
//   NumThreadsClause - num_threads request, 0 if none.
//   NThreadsICV      - nthreads ICV, used when there is no clause; 0 = unset.
//   ThreadLimit      - thread limit ICV, 0 = no limit.
// The result never exceeds the number of workers in the team.
INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
                                                uint16_t NThreadsICV,
                                                uint16_t ThreadLimit) {
  uint16_t ThreadsRequested = NThreadsICV;
  if (NumThreadsClause != 0) {
    ThreadsRequested = NumThreadsClause;
  }

  uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam();
  if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) {
    ThreadsAvailable = ThreadLimit;
  }

  uint16_t NumThreads = ThreadsAvailable;
  if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) {
    NumThreads = ThreadsRequested;
  }

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  // On Volta and newer architectures we require that all lanes in
  // a warp participate in the parallel region.  Round down to a
  // multiple of WARPSIZE since it is legal to do so in OpenMP.
  if (NumThreads < WARPSIZE) {
    NumThreads = 1;
  } else {
    NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1));
  }
#endif

  return NumThreads;
}

// This routine is always called by the team master.
// It publishes the outlined work function and prepares the work descriptor
// for the parallel region the workers are about to execute.
//   WorkFn                  - outlined function for the workers.
//   IsOMPRuntimeInitialized - must be non-zero (asserted).
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
                                           int16_t IsOMPRuntimeInitialized) {
  PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
  ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Expected initialized runtime.");

  omptarget_nvptx_workFn = WorkFn;

  // This routine is only called by the team master.  The team master is
  // the first thread of the last warp.  It always has the logical thread
  // id of 0 (since it is a shadow for the first worker thread).
  const int threadId = 0;
  omptarget_nvptx_TaskDescr *currTaskDescr =
      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
  ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
  ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
          "cannot be called in a parallel region.");
  if (currTaskDescr->InParallelRegion()) {
    PRINT0(LD_PAR, "already in parallel: go seq\n");
    return;
  }

  uint16_t &NumThreadsClause =
      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);

  uint16_t NumThreads =
      determineNumberOfThreads(NumThreadsClause, currTaskDescr->NThreads(),
                               currTaskDescr->ThreadLimit());

  if (NumThreadsClause != 0) {
    // Reset request to avoid propagating to successive #parallel
    NumThreadsClause = 0;
  }

  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
         (int)NumThreads);
  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
          "only team master can create parallel");

  // Set number of threads on work descriptor.
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr, NumThreads);
}

// All workers call this function.  Deactivate those not needed.
// WorkFn - out: receives the outlined work function to execute, or a null
//          pointer when the master signalled termination.
// IsOMPRuntimeInitialized - must be non-zero (asserted).
// returns True if this thread is active, else False.
//
// Only the worker threads call this routine.
EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
                                   int16_t IsOMPRuntimeInitialized) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");

  ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Expected initialized runtime.");

  // Work function and arguments for L1 parallel region.
  *WorkFn = omptarget_nvptx_workFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn) {
    PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n");
    return false;
  }

  // Only the worker threads call this routine and the master warp
  // never arrives here.  Therefore, use the nvptx thread id.
  int threadId = GetThreadIdInBlock();
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  // Set to true for workers participating in the parallel region.
  bool isActive = false;
  // Initialize state for active threads.
  if (threadId < workDescr.WorkTaskDescr()->ThreadsInTeam()) {
    // init work descriptor from workdesccr
    omptarget_nvptx_TaskDescr *newTaskDescr =
        omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
    newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr());
    // install new top descriptor
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               newTaskDescr);
    // init private from int value
    // Report the size of the current team (ThreadsInTeam), not the nthreads
    // ICV, which only applies to subsequent parallel regions.
    PRINT(LD_PAR,
          "thread will execute parallel region with id %d in a team of "
          "%d threads\n",
          (int)newTaskDescr->ThreadId(), (int)newTaskDescr->ThreadsInTeam());

    isActive = true;
  }

  return isActive;
}

EXTERN void __kmpc_kernel_end_parallel() {
  // pop stack
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

  // Only the worker threads call this routine and the master warp
  // never arrives here.  Therefore, use the nvptx thread id.
+ int threadId = GetThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, currTaskDescr->GetPrevTaskDescr()); +} + +//////////////////////////////////////////////////////////////////////////////// +// support for parallel that goes sequential +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n"); + + if (checkRuntimeUninitialized(loc)) { + ASSERT0(LT_FUSSY, checkSPMDMode(loc), + "Expected SPMD mode with uninitialized runtime."); + __SYNCTHREADS(); + if (GetThreadIdInBlock() == 0) + ++parallelLevel; + __SYNCTHREADS(); + + return; + } + + // assume this is only called for nested parallel + int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + + // unlike actual parallel, threads in the same team do not share + // the workTaskDescr in this case and num threads is fixed to 1 + + // get current task + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); + currTaskDescr->SaveLoopData(); + + // allocate new task descriptor and copy value from current one, set prev to + // it + omptarget_nvptx_TaskDescr *newTaskDescr = + (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr), + "new seq parallel task"); + newTaskDescr->CopyParent(currTaskDescr); + + // tweak values for serialized parallel case: + // - each thread becomes ID 0 in its serialized parallel, and + // - there is only one thread per team + newTaskDescr->ThreadId() = 0; + newTaskDescr->ThreadsInTeam() = 1; + + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + newTaskDescr); +} + +EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, + uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n"); + + if 
(checkRuntimeUninitialized(loc)) { + ASSERT0(LT_FUSSY, checkSPMDMode(loc), + "Expected SPMD mode with uninitialized runtime."); + __SYNCTHREADS(); + if (GetThreadIdInBlock() == 0) + --parallelLevel; + __SYNCTHREADS(); + return; + } + + // pop stack + int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); + // set new top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, currTaskDescr->GetPrevTaskDescr()); + // free + SafeFree(currTaskDescr, (char *)"new seq parallel task"); + currTaskDescr = getMyTopTaskDescriptor(threadId); + currTaskDescr->RestoreLoopData(); +} + +EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_parallel_level\n"); + + if (checkRuntimeUninitialized(loc)) { + ASSERT0(LT_FUSSY, checkSPMDMode(loc), + "Expected SPMD mode with uninitialized runtime."); + return parallelLevel; + } + + int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + if (currTaskDescr->InL2OrHigherParallelRegion()) + return 2; + else if (currTaskDescr->InParallelRegion()) + return 1; + else + return 0; +} + +// This kmpc call returns the thread id across all teams. It's value is +// cached by the compiler and used when calling the runtime. On nvptx +// it's cheap to recalculate this value so we never use the result +// of this call. 
+EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) { + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + return GetOmpThreadId(tid, checkSPMDMode(loc), + checkRuntimeUninitialized(loc)); +} + +//////////////////////////////////////////////////////////////////////////////// +// push params +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid, + int32_t num_threads) { + PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); + tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) = + num_threads; +} + +EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid, + int32_t simd_limit) { + PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); + tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit; +} + +// Do nothing. The host guarantees we started the requested number of +// teams and we only need inspection of gridDim. 
+ +EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid, + int32_t num_teams, int32_t thread_limit) { + PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams); + ASSERT0(LT_FUSSY, FALSE, + "should never have anything with new teams on device"); +} + +EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid, + int proc_bind) { + PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind); +} diff --git a/final/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/final/libomptarget/deviceRTLs/nvptx/src/reduction.cu new file mode 100644 index 0000000..dbe2d9e --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -0,0 +1,457 @@ +//===---- reduction.cu - NVPTX OpenMP reduction implementation ---- CUDA +//-*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of reduction with KMPC interface. 
+// +//===----------------------------------------------------------------------===// + +#include <complex.h> +#include <stdio.h> + +#include "omptarget-nvptx.h" + +// may eventually remove this +EXTERN +int32_t __gpu_block_reduce() { + bool isSPMDExecutionMode = isSPMDMode(); + int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); + int nt = + GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized()); + if (nt != blockDim.x) + return 0; + unsigned tnum = __ACTIVEMASK(); + if (tnum != (~0x0)) // assume swapSize is 32 + return 0; + return 1; +} + +EXTERN +int32_t __kmpc_reduce_gpu(kmp_Ident *loc, int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, + void *reduce_array_size, kmp_ReductFctPtr *reductFct, + kmp_CriticalName *lck) { + int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); + int numthread; + if (currTaskDescr->IsParallelConstruct()) { + numthread = + GetNumberOfOmpThreads(threadId, checkSPMDMode(loc), + checkRuntimeUninitialized(loc)); + } else { + numthread = GetNumberOfOmpTeams(); + } + + if (numthread == 1) + return 1; + if (!__gpu_block_reduce()) + return 2; + if (threadIdx.x == 0) + return 1; + return 0; +} + +EXTERN +int32_t __kmpc_reduce_combined(kmp_Ident *loc) { + return threadIdx.x == 0 ? 2 : 0; +} + +EXTERN +int32_t __kmpc_reduce_simd(kmp_Ident *loc) { + return (threadIdx.x % 32 == 0) ? 
1 : 0; +} + +EXTERN +void __kmpc_nvptx_end_reduce(int32_t global_tid) {} + +EXTERN +void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {} + +EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) { + return __SHFL_DOWN_SYNC(0xFFFFFFFF, val, delta, size); +} + +EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) { + int lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); + hi = __SHFL_DOWN_SYNC(0xFFFFFFFF, hi, delta, size); + lo = __SHFL_DOWN_SYNC(0xFFFFFFFF, lo, delta, size); + asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); + return val; +} + +INLINE static void gpu_regular_warp_reduce(void *reduce_data, + kmp_ShuffleReductFctPtr shflFct) { + for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) { + shflFct(reduce_data, /*LaneId - not used= */ 0, + /*Offset = */ mask, /*AlgoVersion=*/0); + } +} + +INLINE static void gpu_irregular_warp_reduce(void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + uint32_t size, uint32_t tid) { + uint32_t curr_size; + uint32_t mask; + curr_size = size; + mask = curr_size / 2; + while (mask > 0) { + shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1); + curr_size = (curr_size + 1) / 2; + mask = curr_size / 2; + } +} + +INLINE static uint32_t +gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) { + uint32_t lanemask_lt; + uint32_t lanemask_gt; + uint32_t size, remote_id, physical_lane_id; + physical_lane_id = GetThreadIdInBlock() % WARPSIZE; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt)); + uint32_t Liveness = __ACTIVEMASK(); + uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt)); + do { + Liveness = __ACTIVEMASK(); + remote_id = __ffs(Liveness & lanemask_gt); + size = __popc(Liveness); + logical_lane_id /= 2; + shflFct(reduce_data, /*LaneId =*/logical_lane_id, + /*Offset=*/remote_id - 1 - physical_lane_id, 
/*AlgoVersion=*/2); + } while (logical_lane_id % 2 == 0 && size > 1); + return (logical_lane_id == 0); +} + +EXTERN +int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct) { + uint32_t Liveness = __ACTIVEMASK(); + if (Liveness == 0xffffffff) { + gpu_regular_warp_reduce(reduce_data, shflFct); + return GetThreadIdInBlock() % WARPSIZE == + 0; // Result on lane 0 of the simd warp. + } else { + return gpu_irregular_simd_reduce( + reduce_data, shflFct); // Result on the first active lane. + } +} + +INLINE +static int32_t nvptx_parallel_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + bool isSPMDExecutionMode, bool isRuntimeUninitialized) { + uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); + uint32_t NumThreads = GetNumberOfOmpThreads( + BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); + if (NumThreads == 1) + return 1; + /* + * This reduce function handles reduction within a team. It handles + * parallel regions in both L1 and L2 parallelism levels. It also + * supports Generic, SPMD, and NoOMP modes. + * + * 1. Reduce within a warp. + * 2. Warp master copies value to warp 0 via shared memory. + * 3. Warp 0 reduces to a single value. + * 4. The reduced value is available in the thread that returns 1. + */ + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE; + uint32_t WarpId = BlockThreadId / WARPSIZE; + + // Volta execution model: + // For the Generic execution mode a parallel region either has 1 thread and + // beyond that, always a multiple of 32. For the SPMD execution mode we may + // have any number of threads. 
+ if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1)) + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/NumThreads % WARPSIZE, + /*LaneId=*/GetThreadIdInBlock() % WARPSIZE); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. + if (NumThreads > WARPSIZE) { + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + BlockThreadId); + } + return BlockThreadId == 0; +#else + uint32_t Liveness = __ACTIVEMASK(); + if (Liveness == 0xffffffff) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/__popc(Liveness), + /*LaneId=*/GetThreadIdInBlock() % WARPSIZE); + else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2 + // parallel region may enter here; return + // early. + return gpu_irregular_simd_reduce(reduce_data, shflFct); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. + if (NumThreads > WARPSIZE) { + uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE; + // Gather all the reduced values from each warp + // to the first warp. 
+ cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = BlockThreadId / WARPSIZE; + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + BlockThreadId); + + return BlockThreadId == 0; + } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) { + return BlockThreadId == 0; + } + + // Get the OMP thread Id. This is different from BlockThreadId in the case of + // an L2 parallel region. + return global_tid == 0; +#endif // __CUDA_ARCH__ >= 700 +} + +EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait( + global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, + /*isSPMDExecutionMode=*/isSPMDMode(), + /*isRuntimeUninitialized=*/isRuntimeUninitialized()); +} + +EXTERN +int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, + void *reduce_data, kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait( + global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, + checkSPMDMode(loc), checkRuntimeUninitialized(loc)); +} + +EXTERN +int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, + /*isSPMDExecutionMode=*/true, + /*isRuntimeUninitialized=*/true); +} + +EXTERN +int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { + return 
nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, + /*isSPMDExecutionMode=*/false, + /*isRuntimeUninitialized=*/true); +} + +INLINE +static int32_t nvptx_teams_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct, + bool isSPMDExecutionMode, bool isRuntimeUninitialized) { + uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); + // In non-generic mode all workers participate in the teams reduction. + // In generic mode only the team master participates in the teams + // reduction because the workers are waiting for parallel work. + uint32_t NumThreads = + isSPMDExecutionMode + ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true, + isRuntimeUninitialized) + : /*Master thread only*/ 1; + uint32_t TeamId = GetBlockIdInKernel(); + uint32_t NumTeams = GetNumberOfBlocksInKernel(); + __shared__ volatile bool IsLastTeam; + + // Team masters of all teams write to the scratchpad. + if (ThreadId == 0) { + unsigned int *timestamp = GetTeamsReductionTimestamp(); + char *scratchpad = GetTeamsReductionScratchpad(); + + scratchFct(reduce_data, scratchpad, TeamId, NumTeams); + __threadfence(); + + // atomicInc increments 'timestamp' and has a range [0, NumTeams-1]. + // It resets 'timestamp' back to 0 once the last team increments + // this counter. + unsigned val = atomicInc(timestamp, NumTeams - 1); + IsLastTeam = val == NumTeams - 1; + } + + // We have to wait on L1 barrier because in GENERIC mode the workers + // are waiting on barrier 0 for work. 
+ // + // If we guard this barrier as follows it leads to deadlock, probably + // because of a compiler bug: if (!IsGenericMode()) __syncthreads(); + uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE; + named_sync(L1_BARRIER, SyncWarps * WARPSIZE); + + // If this team is not the last, quit. + if (/* Volatile read by all threads */ !IsLastTeam) + return 0; + + // + // Last team processing. + // + + // Threads in excess of #teams do not participate in reduction of the + // scratchpad values. +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + uint32_t ActiveThreads = NumThreads; + if (NumTeams < NumThreads) { + ActiveThreads = + (NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1); + } + if (ThreadId >= ActiveThreads) + return 0; + + // Load from scratchpad and reduce. + char *scratchpad = GetTeamsReductionScratchpad(); + ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0); + for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads) + ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); + + uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; + uint32_t WarpId = ThreadId / WARPSIZE; + + // Reduce across warps to the warp master. + if ((ActiveThreads % WARPSIZE == 0) || + (WarpId < WarpsNeeded - 1)) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (ActiveThreads > 1) // Partial warp but contiguous lanes + // Only SPMD execution mode comes thru this case. + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/ActiveThreads % WARPSIZE, + /*LaneId=*/ThreadId % WARPSIZE); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + if (ActiveThreads > WARPSIZE) { + // Gather all the reduced values from each warp + // to the first warp. 
+ cpyFct(reduce_data, WarpsNeeded); + + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId); + } +#else + if (ThreadId >= NumTeams) + return 0; + + // Load from scratchpad and reduce. + char *scratchpad = GetTeamsReductionScratchpad(); + ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0); + for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads) + ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); + + // Reduce across warps to the warp master. + uint32_t Liveness = __ACTIVEMASK(); + if (Liveness == 0xffffffff) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else // Partial warp but contiguous lanes + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/__popc(Liveness), + /*LaneId=*/ThreadId % WARPSIZE); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads; + if (ActiveThreads > WARPSIZE) { + uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; + // Gather all the reduced values from each warp + // to the first warp. 
+ cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = ThreadId / WARPSIZE; + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId); + } +#endif // __CUDA_ARCH__ >= 700 + + return ThreadId == 0; +} + +EXTERN +int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, + kmp_LoadReduceFctPtr ldFct) { + return nvptx_teams_reduce_nowait( + global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, + scratchFct, ldFct, /*isSPMDExecutionMode=*/isSPMDMode(), + /*isRuntimeUninitialized=*/isRuntimeUninitialized()); +} + +EXTERN +int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { + return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, scratchFct, + ldFct, + /*isSPMDExecutionMode=*/true, + /*isRuntimeUninitialized=*/true); +} + +EXTERN +int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { + return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, scratchFct, + ldFct, + /*isSPMDExecutionMode=*/false, + /*isRuntimeUninitialized=*/true); +} + +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc, + int32_t global_tid, + kmp_CriticalName *crit) { + if (checkSPMDMode(loc) && GetThreadIdInBlock() != 0) + return 0; + // The master thread of the team actually does the reduction. 
+ while (atomicCAS((uint32_t *)crit, 0, 1)) + ; + return 1; +} + +EXTERN void +__kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid, + kmp_CriticalName *crit) { + __threadfence_system(); + (void)atomicExch((uint32_t *)crit, 0); +} + diff --git a/final/libomptarget/deviceRTLs/nvptx/src/state-queue.h b/final/libomptarget/deviceRTLs/nvptx/src/state-queue.h new file mode 100644 index 0000000..fe28328 --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/src/state-queue.h @@ -0,0 +1,52 @@ +//===--------- statequeue.h - NVPTX OpenMP GPU State Queue ------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a queue to hand out OpenMP state objects to teams of +// one or more kernels. +// +// Reference: +// Thomas R.W. Scogland and Wu-chun Feng. 2015. +// Design and Evaluation of Scalable Concurrent Queues for Many-Core +// Architectures. International Conference on Performance Engineering. 
+// +//===----------------------------------------------------------------------===// + +#ifndef __STATE_QUEUE_H +#define __STATE_QUEUE_H + +#include <stdint.h> + +#include "option.h" // choices we have + +template <typename ElementType, uint32_t SIZE> class omptarget_nvptx_Queue { +private: + ElementType elements[SIZE]; + volatile ElementType *elementQueue[SIZE]; + volatile uint32_t head; + volatile uint32_t ids[SIZE]; + volatile uint32_t tail; + + static const uint32_t MAX_ID = (1u << 31) / SIZE / 2; + INLINE uint32_t ENQUEUE_TICKET(); + INLINE uint32_t DEQUEUE_TICKET(); + INLINE static uint32_t ID(uint32_t ticket); + INLINE bool IsServing(uint32_t slot, uint32_t id); + INLINE void PushElement(uint32_t slot, ElementType *element); + INLINE ElementType *PopElement(uint32_t slot); + INLINE void DoneServing(uint32_t slot, uint32_t id); + +public: + INLINE omptarget_nvptx_Queue() {} + INLINE void Enqueue(ElementType *element); + INLINE ElementType *Dequeue(); +}; + +#include "state-queuei.h" + +#endif diff --git a/final/libomptarget/deviceRTLs/nvptx/src/state-queuei.h b/final/libomptarget/deviceRTLs/nvptx/src/state-queuei.h new file mode 100644 index 0000000..3a1f49f --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/src/state-queuei.h @@ -0,0 +1,90 @@ +//===------- state-queue.cu - NVPTX OpenMP GPU State Queue ------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of a queue to hand out OpenMP state +// objects to teams of one or more kernels. +// +// Reference: +// Thomas R.W. Scogland and Wu-chun Feng. 2015. +// Design and Evaluation of Scalable Concurrent Queues for Many-Core +// Architectures. International Conference on Performance Engineering. 
+// +//===----------------------------------------------------------------------===// + +#include "state-queue.h" + +template <typename ElementType, uint32_t SIZE> +INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ENQUEUE_TICKET() { + return atomicAdd((unsigned int *)&tail, 1); +} + +template <typename ElementType, uint32_t SIZE> +INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::DEQUEUE_TICKET() { + return atomicAdd((unsigned int *)&head, 1); +} + +template <typename ElementType, uint32_t SIZE> +INLINE uint32_t +omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) { + return (ticket / SIZE) * 2; +} + +template <typename ElementType, uint32_t SIZE> +INLINE bool omptarget_nvptx_Queue<ElementType, SIZE>::IsServing(uint32_t slot, + uint32_t id) { + return atomicAdd((unsigned int *)&ids[slot], 0) == id; +} + +template <typename ElementType, uint32_t SIZE> +INLINE void +omptarget_nvptx_Queue<ElementType, SIZE>::PushElement(uint32_t slot, + ElementType *element) { + atomicExch((unsigned long long *)&elementQueue[slot], + (unsigned long long)element); +} + +template <typename ElementType, uint32_t SIZE> +INLINE ElementType * +omptarget_nvptx_Queue<ElementType, SIZE>::PopElement(uint32_t slot) { + return (ElementType *)atomicAdd((unsigned long long *)&elementQueue[slot], + (unsigned long long)0); +} + +template <typename ElementType, uint32_t SIZE> +INLINE void omptarget_nvptx_Queue<ElementType, SIZE>::DoneServing(uint32_t slot, + uint32_t id) { + atomicExch((unsigned int *)&ids[slot], (id + 1) % MAX_ID); +} + +template <typename ElementType, uint32_t SIZE> +INLINE void +omptarget_nvptx_Queue<ElementType, SIZE>::Enqueue(ElementType *element) { + uint32_t ticket = ENQUEUE_TICKET(); + uint32_t slot = ticket % SIZE; + uint32_t id = ID(ticket) + 1; + while (!IsServing(slot, id)) + ; + PushElement(slot, element); + DoneServing(slot, id); +} + +template <typename ElementType, uint32_t SIZE> +INLINE ElementType *omptarget_nvptx_Queue<ElementType, 
SIZE>::Dequeue() { + uint32_t ticket = DEQUEUE_TICKET(); + uint32_t slot = ticket % SIZE; + uint32_t id = ID(ticket); + while (!IsServing(slot, id)) + ; + ElementType *element = PopElement(slot); + // This is to populate the queue because of the lack of GPU constructors. + if (element == 0) + element = &elements[slot]; + DoneServing(slot, id); + return element; +} diff --git a/final/libomptarget/deviceRTLs/nvptx/src/support.h b/final/libomptarget/deviceRTLs/nvptx/src/support.h new file mode 100644 index 0000000..9fe3749 --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/src/support.h @@ -0,0 +1,92 @@ +//===--------- support.h - NVPTX OpenMP support functions -------- CUDA -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Wrapper to some functions natively supported by the GPU. 
+// +//===----------------------------------------------------------------------===// + +//////////////////////////////////////////////////////////////////////////////// +// Execution Parameters +//////////////////////////////////////////////////////////////////////////////// +enum ExecutionMode { + Generic = 0x00u, + Spmd = 0x01u, + ModeMask = 0x01u, +}; + +enum RuntimeMode { + RuntimeInitialized = 0x00u, + RuntimeUninitialized = 0x02u, + RuntimeMask = 0x02u, +}; + +INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode); +INLINE bool isGenericMode(); +INLINE bool isSPMDMode(); +INLINE bool isRuntimeUninitialized(); +INLINE bool isRuntimeInitialized(); + +//////////////////////////////////////////////////////////////////////////////// +// get info from machine +//////////////////////////////////////////////////////////////////////////////// + +// get low level ids of resources +INLINE int GetThreadIdInBlock(); +INLINE int GetBlockIdInKernel(); +INLINE int GetNumberOfBlocksInKernel(); +INLINE int GetNumberOfThreadsInBlock(); + +// get global ids to locate tread/team info (constant regardless of OMP) +INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); +INLINE int GetMasterThreadID(); +INLINE int GetNumberOfWorkersInTeam(); + +// get OpenMP thread and team ids +INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode, + bool isRuntimeUninitialized); // omp_thread_num +INLINE int GetOmpTeamId(); // omp_team_num + +// get OpenMP number of threads and team +INLINE int +GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode, + bool isRuntimeUninitialized); // omp_num_threads +INLINE int GetNumberOfOmpTeams(); // omp_num_teams + +// get OpenMP number of procs +INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); +INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode); + +// masters +INLINE int IsTeamMaster(int ompThreadId); + +//////////////////////////////////////////////////////////////////////////////// +// 
Memory
+////////////////////////////////////////////////////////////////////////////////
+
+// safe alloc and free
+// NOTE(review): the names suggest the result is validated; confirm that the
+// implementation actually checks for allocation failure.
+INLINE void *SafeMalloc(size_t size, const char *msg); // check if success
+INLINE void *SafeFree(void *ptr, const char *msg);
+// compute the padding needed to reach an alignment (power of 2 only)
+INLINE unsigned long PadBytes(unsigned long size, unsigned long alignment);
+// Byte-granular pointer arithmetic: move _addr forward (ADD_BYTES) or
+// backward (SUB_BYTES) by _bytes and yield the result as a void *.
+#define ADD_BYTES(_addr, _bytes)                                               \
+  ((void *)((char *)((void *)(_addr)) + (_bytes)))
+#define SUB_BYTES(_addr, _bytes)                                               \
+  ((void *)((char *)((void *)(_addr)) - (_bytes)))
+
+////////////////////////////////////////////////////////////////////////////////
+// Named Barrier Routines
+////////////////////////////////////////////////////////////////////////////////
+INLINE void named_sync(const int barrier, const int num_threads);
+
+////////////////////////////////////////////////////////////////////////////////
+// Teams Reduction Scratchpad Helpers
+////////////////////////////////////////////////////////////////////////////////
+INLINE unsigned int *GetTeamsReductionTimestamp();
+INLINE char *GetTeamsReductionScratchpad();
+INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/supporti.h b/final/libomptarget/deviceRTLs/nvptx/src/supporti.h
new file mode 100644
index 0000000..b8f661c
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -0,0 +1,277 @@
+//===--------- supporti.h - NVPTX OpenMP support functions ------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Wrapper implementation of some functions natively supported by the GPU. 
+//
+//===----------------------------------------------------------------------===//
+
+////////////////////////////////////////////////////////////////////////////////
+// Execution Parameters
+////////////////////////////////////////////////////////////////////////////////
+
+// Records the execution mode and the runtime state in execution_param by
+// OR-ing the two flag values together.
+INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) {
+  execution_param = EMode;
+  execution_param |= RMode;
+}
+
+INLINE bool isGenericMode() { return (execution_param & ModeMask) == Generic; }
+
+INLINE bool isSPMDMode() { return (execution_param & ModeMask) == Spmd; }
+
+INLINE bool isRuntimeUninitialized() {
+  return (execution_param & RuntimeMask) == RuntimeUninitialized;
+}
+
+INLINE bool isRuntimeInitialized() {
+  return (execution_param & RuntimeMask) == RuntimeInitialized;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Execution Modes based on location parameter fields
+////////////////////////////////////////////////////////////////////////////////
+
+// Decides SPMD vs. Generic from the flags stored in loc->reserved_2, falling
+// back to the global execution_param when loc is null or when the flags are in
+// the UNDEFINED combination (simple-runtime flag set without the SPMD flag).
+INLINE bool checkSPMDMode(kmp_Ident *loc) {
+  if (!loc)
+    return isSPMDMode();
+
+  // If SPMD is true then we are not in the UNDEFINED state so
+  // we can return immediately.
+  if (loc->reserved_2 & KMP_IDENT_SPMD_MODE)
+    return true;
+
+  // Not in SPMD mode and the runtime is required: this is a valid
+  // combination of flags so we can return immediately.
+  if (!(loc->reserved_2 & KMP_IDENT_SIMPLE_RT_MODE))
+    return false;
+
+  // We are in the undefined state; use the global execution parameters.
+  return isSPMDMode();
+}
+
+INLINE bool checkGenericMode(kmp_Ident *loc) {
+  return !checkSPMDMode(loc);
+}
+
+// Decides whether the runtime is uninitialized from the flags stored in
+// loc->reserved_2, falling back to the global execution_param when loc is null
+// or when the flags are in the UNDEFINED combination.
+INLINE bool checkRuntimeUninitialized(kmp_Ident *loc) {
+  if (!loc)
+    return isRuntimeUninitialized();
+
+  // If runtime is required then we know we can't be
+  // in the undefined mode. We can return immediately.
+  if (!(loc->reserved_2 & KMP_IDENT_SIMPLE_RT_MODE))
+    return false;
+
+  // The simple-runtime flag is set, i.e. the full runtime is not required.
+  // If the SPMD flag is set as well, the runtime is reported as
+  // uninitialized. Otherwise we end up in the UNDEFINED state that marks
+  // the orphaned functions.
+  if (loc->reserved_2 & KMP_IDENT_SPMD_MODE)
+    return true;
+
+  // We are in the UNDEFINED state. Undefined is denoted by
+  // non-SPMD + noRuntimeRequired which is a combination that
+  // cannot actually happen. The undefined state is used to mark orphaned
+  // functions, so defer to the global execution parameters.
+  return isRuntimeUninitialized();
+}
+
+INLINE bool checkRuntimeInitialized(kmp_Ident *loc) {
+  return !checkRuntimeUninitialized(loc);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// support: get info from machine
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Calls to the NVPTX layer (assuming 1D layout)
+//
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE int GetThreadIdInBlock() { return threadIdx.x; }
+
+INLINE int GetBlockIdInKernel() { return blockIdx.x; }
+
+INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; }
+
+INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Calls to the Generic Scheme Implementation Layer (assuming 1D layout)
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// The master thread id is the first thread (lane) of the last warp.
+// Thread id is 0 indexed.
+// E.g: If NumThreads is 33, master id is 32.
+//      If NumThreads is 64, master id is 32.
+//      If NumThreads is 97, master id is 96.
+//      If NumThreads is 1024, master id is 992.
+//
+// Computed by rounding the last thread id (blockDim.x - 1) down to a multiple
+// of WARPSIZE; the mask trick assumes WARPSIZE is a power of 2.
+//
+// Called in Generic Execution Mode only.
+INLINE int GetMasterThreadID() { return (blockDim.x - 1) & ~(WARPSIZE - 1); }
+
+// The last warp is reserved for the master; other warps are workers.
+// Called in Generic Execution Mode only. 
+INLINE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }
+
+////////////////////////////////////////////////////////////////////////////////
+// get thread id in team
+
+// This function may be called in a parallel region by the workers
+// or a serial region by the master.  If the master (whose CUDA thread
+// id is GetMasterThreadID()) calls this routine, we return 0 because
+// it is a shadow for the first worker.
+INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) {
+  // Implemented using control flow (predication) instead of with a modulo
+  // operation.
+  int tid = GetThreadIdInBlock();
+  if (!isSPMDExecutionMode && tid >= GetMasterThreadID())
+    return 0;
+  return tid;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// OpenMP Thread Support Layer
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Returns the OpenMP thread number (omp_thread_num) of the calling thread.
+//   threadId               - logical thread id used to look up the top-level
+//                            task descriptor (only read when the runtime is
+//                            initialized).
+//   isSPMDExecutionMode    - expected to be true whenever the runtime is
+//                            uninitialized (checked with ASSERT0).
+//   isRuntimeUninitialized - selects between the CUDA-id based answer and the
+//                            task-descriptor based answer.
+INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode,
+                          bool isRuntimeUninitialized) {
+  // omp_thread_num
+  int rc;
+
+  if (isRuntimeUninitialized) {
+    ASSERT0(LT_FUSSY, isSPMDExecutionMode,
+            "Uninitialized runtime with non-SPMD mode.");
+    // For level 2 parallelism all parallel regions are executed sequentially.
+    if (parallelLevel > 0)
+      rc = 0;
+    else
+      rc = GetThreadIdInBlock();
+  } else {
+    omptarget_nvptx_TaskDescr *currTaskDescr =
+        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+    // Same sanity check as in GetNumberOfOmpThreads: the descriptor is
+    // dereferenced right below.
+    ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
+    rc = currTaskDescr->ThreadId();
+  }
+  return rc;
+}
+
+// Returns the number of OpenMP threads (omp_num_threads) in the current team.
+// Parameters have the same meaning as for GetOmpThreadId.
+INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode,
+                                 bool isRuntimeUninitialized) {
+  // omp_num_threads
+  int rc;
+
+  if (isRuntimeUninitialized) {
+    ASSERT0(LT_FUSSY, isSPMDExecutionMode,
+            "Uninitialized runtime with non-SPMD mode.");
+    // For level 2 parallelism all parallel regions are executed sequentially.
+    if (parallelLevel > 0)
+      rc = 1;
+    else
+      rc = GetNumberOfThreadsInBlock();
+  } else {
+    omptarget_nvptx_TaskDescr *currTaskDescr =
+        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+    ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
+    rc = currTaskDescr->ThreadsInTeam();
+  }
+
+  return rc;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Team id linked to OpenMP
+
+INLINE int GetOmpTeamId() {
+  // omp_team_num
+  return GetBlockIdInKernel(); // assume 1 block per team
+}
+
+INLINE int GetNumberOfOmpTeams() {
+  // omp_num_teams
+  return GetNumberOfBlocksInKernel(); // assume 1 block per team
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Masters
+
+// The team master is the thread whose OpenMP thread id is 0.
+INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
+
+////////////////////////////////////////////////////////////////////////////////
+// get OpenMP number of procs
+
+// Get the number of processors in the device. 
+INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) {
+  // SPMD mode reports every thread of the block; otherwise only the workers
+  // are reported.
+  return isSPMDExecutionMode ? GetNumberOfThreadsInBlock()
+                             : GetNumberOfWorkersInTeam();
+}
+
+// A team sees the same processor count as the device.
+INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) {
+  return GetNumberOfProcsInDevice(isSPMDExecutionMode);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Memory
+////////////////////////////////////////////////////////////////////////////////
+
+// Returns how many bytes must be added to `size` to reach the next multiple of
+// `alignment` (0 if `size` is already aligned).
+INLINE unsigned long PadBytes(unsigned long size,
+                              unsigned long alignment) // must be a power of 2
+{
+  const unsigned long lowBits = alignment - 1;
+  ASSERT(LT_FUSSY, (alignment & lowBits) == 0,
+         "alignment %lu is not a power of 2\n", alignment);
+  // Two's complement negation of size, reduced modulo the alignment.
+  return (~(unsigned long)size + 1) & lowBits;
+}
+
+// Allocates `size` bytes with malloc and logs the request; `msg` labels the
+// allocation in the log. The malloc result is returned unchanged.
+INLINE void *SafeMalloc(size_t size, const char *msg) // check if success
+{
+  void *block = malloc(size);
+  PRINT(LD_MEM, "malloc data of size %llu for %s: 0x%llx\n",
+        (unsigned long long)size, msg, (unsigned long long)block);
+  return block;
+}
+
+// Logs and frees `ptr`; always returns NULL.
+INLINE void *SafeFree(void *ptr, const char *msg) {
+  PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", (unsigned long long)ptr, msg);
+  free(ptr);
+  return NULL;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Named Barrier Routines
+////////////////////////////////////////////////////////////////////////////////
+
+// Issues a PTX bar.sync on barrier `barrier` for `num_threads` threads.
+INLINE void named_sync(const int barrier, const int num_threads) {
+  asm volatile("bar.sync %0, %1;"
+               :
+               : "r"(barrier), "r"(num_threads)
+               : "memory");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Teams Reduction Scratchpad Helpers
+////////////////////////////////////////////////////////////////////////////////
+
+// The timestamp sits at the very start of the scratchpad buffer.
+INLINE unsigned int *GetTeamsReductionTimestamp() {
+  return static_cast<unsigned int *>(ReductionScratchpadPtr);
+}
+
+// The scratchpad payload starts 256 bytes into the buffer.
+INLINE char *GetTeamsReductionScratchpad() {
+  char *bufferStart = static_cast<char *>(ReductionScratchpadPtr);
+  return bufferStart + 256;
+}
+
+INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr) {
+  ReductionScratchpadPtr = ScratchpadPtr;
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/sync.cu b/final/libomptarget/deviceRTLs/nvptx/src/sync.cu
new file mode 100644
index 0000000..c89dee2
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -0,0 +1,146 @@
+//===----------- sync.cu - NVPTX OpenMP synchronizations --------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Include all synchronization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP Ordered calls
+////////////////////////////////////////////////////////////////////////////////
+
+// Entry point of an ordered region; only emits a debug trace.
+EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_ordered\n");
+}
+
+// Exit point of an ordered region; only emits a debug trace.
+EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_end_ordered\n");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP Barriers
+////////////////////////////////////////////////////////////////////////////////
+
+// a team is a block: we can use CUDA native synchronization mechanism
+// FIXME: what if not all threads (warps) participate to the barrier? 
+// We may need to implement it differently
+
+// Cancellation is not acted upon here: this is a plain barrier that always
+// returns 0.
+EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc_ref, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_cancel_barrier\n");
+  __kmpc_barrier(loc_ref, tid);
+  PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n");
+  return 0;
+}
+
+// Team-wide barrier.
+//   loc_ref - source location; its flags select SPMD vs. Generic handling.
+//   tid     - only forwarded on the uninitialized-runtime path; otherwise it
+//             is recomputed from the CUDA thread id.
+EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
+  if (checkRuntimeUninitialized(loc_ref)) {
+    ASSERT0(LT_FUSSY, checkSPMDMode(loc_ref),
+            "Expected SPMD mode with uninitialized runtime.");
+    __kmpc_barrier_simple_spmd(loc_ref, tid);
+  } else {
+    // The mode cannot change during this call, so query it once.
+    const bool isSPMD = checkSPMDMode(loc_ref);
+    tid = GetLogicalThreadIdInBlock(isSPMD);
+    int numberOfActiveOMPThreads =
+        GetNumberOfOmpThreads(tid, isSPMD, /*isRuntimeUninitialized=*/false);
+    // A single active thread has nobody to wait for.
+    if (numberOfActiveOMPThreads > 1) {
+      if (isSPMD) {
+        __kmpc_barrier_simple_spmd(loc_ref, tid);
+      } else {
+        // The #threads parameter must be rounded up to the WARPSIZE.
+        int threads =
+            WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
+
+        PRINT(LD_SYNC,
+              "call kmpc_barrier with %d omp threads, sync parameter %d\n",
+              (int)numberOfActiveOMPThreads, (int)threads);
+        // Barrier #1 is for synchronization among active threads.
+        named_sync(L1_BARRIER, threads);
+      }
+    } // numberOfActiveOMPThreads > 1
+    PRINT0(LD_SYNC, "completed kmpc_barrier\n");
+  }
+}
+
+// Emit a simple barrier call in SPMD mode.  Assumes the caller is in an L0
+// parallel region and that all worker threads participate.
+EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) {
+  PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n");
+  // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
+  __SYNCTHREADS();
+  PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n");
+}
+
+// Emit a simple barrier call in Generic mode.  Assumes the caller is in an L0
+// parallel region and that all worker threads participate. 
+EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) {
+  // Every thread of the block except one warp takes part in the barrier.
+  int workerCount = GetNumberOfThreadsInBlock() - WARPSIZE;
+  // The #threads parameter must be rounded up to the WARPSIZE.
+  int warpCount = (workerCount + WARPSIZE - 1) / WARPSIZE;
+  int syncCount = WARPSIZE * warpCount;
+
+  PRINT(LD_SYNC,
+        "call kmpc_barrier_simple_generic with %d omp threads, sync parameter "
+        "%d\n",
+        (int)workerCount, (int)syncCount);
+  // Barrier #1 is for synchronization among active threads.
+  named_sync(L1_BARRIER, syncCount);
+  PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP MASTER
+////////////////////////////////////////////////////////////////////////////////
+
+// Returns non-zero only for the team master, which then runs the region.
+EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid) {
+  PRINT0(LD_IO, "call kmpc_master\n");
+  return IsTeamMaster(global_tid);
+}
+
+EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) {
+  PRINT0(LD_IO, "call kmpc_end_master\n");
+  ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP SINGLE
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid) {
+  PRINT0(LD_IO, "call kmpc_single\n");
+  // single is implemented with master: the master gets the single region
+  return IsTeamMaster(global_tid);
+}
+
+EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) {
+  PRINT0(LD_IO, "call kmpc_end_single\n");
+  // single is implemented with master: the master gets the single region
+  ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
+  // the sync barrier is called explicitly, so nothing more is needed here
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Flush
+////////////////////////////////////////////////////////////////////////////////
+
+// Implemented with a system-wide thread fence.
+EXTERN void __kmpc_flush(kmp_Ident *loc) {
+  PRINT0(LD_IO, "call kmpc_flush\n");
+  __threadfence_system();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Vote
+////////////////////////////////////////////////////////////////////////////////
+
+// Returns whatever __ACTIVEMASK() reports for the calling warp.
+EXTERN int32_t __kmpc_warp_active_thread_mask() {
+  PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");
+  return __ACTIVEMASK();
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/task.cu b/final/libomptarget/deviceRTLs/nvptx/src/task.cu
new file mode 100644
index 0000000..a6eb9ab
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/task.cu
@@ -0,0 +1,217 @@
+//===------------ task.cu - NVPTX OpenMP tasks support ----------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Task implementation support.
+//
+//  explicit task structure uses
+//  omptarget_nvptx task
+//  kmp_task
+//
+//  where kmp_task is
+//    - klegacy_TaskDescr    <- task pointer
+//        shared -> X
+//        routine
+//        part_id
+//        descr
+//    -  private (of size given by task_alloc call). Accessed by
+//       task+sizeof(klegacy_TaskDescr)
+//        * private data *
+//    - shared: X. 
Accessed by shared ptr in klegacy_TaskDescr
+//        * pointer table to shared variables *
+//    - end
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+// Allocates one buffer laid out as
+//   [omptarget_nvptx_TaskDescr][kmp task + private data (padded)][shared table]
+// and returns a pointer to the kmp task part. The buffer is released by
+// __kmpc_omp_task_with_deps or __kmpc_omp_task_complete_if0.
+// NOTE(review): the SafeMalloc result is used without a NULL check.
+EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(
+    kmp_Ident *loc,      // unused
+    uint32_t global_tid, // unused
+    int32_t flag, // unused (in our impl, all tasks are executed immediately)
+    size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable,
+    kmp_TaskFctPtr taskSub) {
+  PRINT(LD_IO,
+        "call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, "
+        "fct 0x%llx)\n",
+        (long long)sizeOfTaskInclPrivate, (long long)sizeOfSharedTable,
+        (unsigned long long)taskSub);
+  // want task+priv to be a multiple of the pointer size
+  size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void *));
+  sizeOfTaskInclPrivate += padForTaskInclPriv;
+  size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable;
+  ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0,
+         "need task descr of size %d to be a multiple of %d\n",
+         (int)sizeof(omptarget_nvptx_TaskDescr), (int)sizeof(void *));
+  size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize;
+  omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
+      (omptarget_nvptx_ExplicitTaskDescr *)SafeMalloc(
+          totSize, "explicit task descriptor");
+  kmp_TaskDescr *newKmpTaskDescr = &newExplicitTaskDescr->kmpTaskDescr;
+  // The kmp part must directly follow the omptarget_nvptx part: the other
+  // entry points recover the enclosing descriptor with SUB_BYTES.
+  ASSERT0(LT_FUSSY,
+          (uint64_t)newKmpTaskDescr ==
+              (uint64_t)ADD_BYTES(newExplicitTaskDescr,
+                                  sizeof(omptarget_nvptx_TaskDescr)),
+          "bad size assumptions");
+  // init kmp_TaskDescr
+  newKmpTaskDescr->sharedPointerTable =
+      (void *)((char *)newKmpTaskDescr + sizeOfTaskInclPrivate);
+  newKmpTaskDescr->sub = taskSub;
+  newKmpTaskDescr->destructors = NULL;
+  PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n",
+        (unsigned long long)newKmpTaskDescr,
+        (unsigned long long)newExplicitTaskDescr);
+
+  return newKmpTaskDescr;
+}
+
+// A task without dependences is a task with an empty dependence list.
+EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid,
+                               kmp_TaskDescr *newKmpTaskDescr) {
+  return __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0,
+                                   0);
+}
+
+// Runs the task immediately on the calling thread and then frees its
+// descriptor. The dependence arguments are not consulted.
+EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
+                                         kmp_TaskDescr *newKmpTaskDescr,
+                                         int32_t depNum, void *depList,
+                                         int32_t noAliasDepNum,
+                                         void *noAliasDepList) {
+  PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n",
+        P64(newKmpTaskDescr));
+  ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
+          "Runtime must be initialized.");
+  // 1. get explicit task descr from kmp task descr
+  omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
+      (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
+          newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
+  ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
+          "bad assumptions");
+  omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
+  ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
+          "bad assumptions");
+
+  // 2. push new context: update new task descriptor
+  int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
+  omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
+  newTaskDescr->CopyForExplicitTask(parentTaskDescr);
+  // set new task descriptor as top
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr);
+
+  // 3. call sub
+  PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n",
+        (unsigned long long)newKmpTaskDescr->sub,
+        (unsigned long long)newKmpTaskDescr);
+  newKmpTaskDescr->sub(0, newKmpTaskDescr);
+  PRINT(LD_TASK, "return from call task sub 0x%llx()\n",
+        (unsigned long long)newKmpTaskDescr->sub);
+
+  // 4. pop context
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
+                                                             parentTaskDescr);
+  // 5. free
+  SafeFree(newExplicitTaskDescr, "explicit task descriptor");
+  return 0;
+}
+
+// First half of an inlined (if(0)) task: pushes the task context. The matching
+// __kmpc_omp_task_complete_if0 pops it and frees the descriptor.
+EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
+                                      kmp_TaskDescr *newKmpTaskDescr) {
+  PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n",
+        (unsigned long long)newKmpTaskDescr);
+  ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
+          "Runtime must be initialized.");
+  // 1. get explicit task descr from kmp task descr
+  omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
+      (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
+          newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
+  ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
+          "bad assumptions");
+  omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
+  ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
+          "bad assumptions");
+
+  // 2. push new context: update new task descriptor
+  int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
+  omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
+  newTaskDescr->CopyForExplicitTask(parentTaskDescr);
+  // set new task descriptor as top
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr);
+  // 3... nothing to call... the task body is inlined by the caller
+  // 4 & 5 ... done in complete
+}
+
+// Second half of an inlined (if(0)) task: restores the parent task context and
+// frees the descriptor.
+EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
+                                         kmp_TaskDescr *newKmpTaskDescr) {
+  PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n",
+        (unsigned long long)newKmpTaskDescr);
+  ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
+          "Runtime must be initialized.");
+  // 1. get explicit task descr from kmp task descr
+  omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
+      (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
+          newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
+  ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
+          "bad assumptions");
+  omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
+  ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
+          "bad assumptions");
+  // 2. get parent
+  omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr();
+  // 3... nothing to call... the task body is inlined by the caller
+  // 4. pop context
+  int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
+                                                             parentTaskDescr);
+  // 5. free
+  SafeFree(newExplicitTaskDescr, "explicit task descriptor");
+}
+
+EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid,
+                                 int32_t depNum, void *depList,
+                                 int32_t noAliasDepNum, void *noAliasDepList) {
+  PRINT0(LD_IO, "call to __kmpc_omp_wait_deps(..)\n");
+  // nothing to do as all our tasks are executed as final
+}
+
+EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid) {
+  PRINT0(LD_IO, "call to __kmpc_taskgroup(..)\n");
+  // nothing to do as all our tasks are executed as final
+}
+
+EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid) {
+  PRINT0(LD_IO, "call to __kmpc_end_taskgroup(..)\n");
+  // nothing to do as all our tasks are executed as final
+}
+
+EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid,
+                                    int end_part) {
+  PRINT0(LD_IO, "call to __kmpc_taskyield()\n");
+  // do nothing: tasks are executed immediately, no yielding allowed
+  return 0;
+}
+
+EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid) {
+  PRINT0(LD_IO, "call to __kmpc_taskwait()\n");
+  // nothing to do as all our tasks are executed as final
+  return 0;
+}
+
+// Executes a whole taskloop as one immediately-run task. Only lb and ub are
+// inspected; st, nogroup, sched, grainsize, if_val and task_dup are not used.
+EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid,
+                            kmp_TaskDescr *newKmpTaskDescr, int if_val,
+                            uint64_t *lb, uint64_t *ub, int64_t st, int nogroup,
+                            int32_t sched, uint64_t grainsize, void *task_dup) {
+
+  // skip task entirely if empty iteration space
+  if (*lb > *ub) {
+    // The descriptor is normally released at the end of
+    // __kmpc_omp_task_with_deps. That call is skipped here, so release the
+    // descriptor now instead of leaking it.
+    // Assumes newKmpTaskDescr came from __kmpc_omp_task_alloc, as the
+    // non-empty path below already does.
+    SafeFree(SUB_BYTES(newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)),
+             "explicit task descriptor");
+    return;
+  }
+
+  // the compiler has already stored lb and ub in the kmp_TaskDescr structure
+  // as we are using a single task to execute the entire loop, we can leave
+  // the initial task_t untouched
+
+  __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0);
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt b/final/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt
new file mode 100644
index 0000000..33945d1
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt
@@ -0,0 +1,26 @@
+if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang")
+  # Silently return, no need to annoy the user.
+  return()
+endif()
+
+set(deps omptarget-nvptx omptarget omp)
+if(LIBOMPTARGET_NVPTX_ENABLE_BCLIB)
+  set(deps ${deps} omptarget-nvptx-bc)
+endif()
+
+# Don't run by default.
+set(EXCLUDE_FROM_ALL True)
+# Run with only one thread to only launch one application to the GPU at a time. 
+add_openmp_testsuite(check-libomptarget-nvptx
+  "Running libomptarget-nvptx tests" ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS ${deps} ARGS -j1)
+
+# User-overridable cache entries holding the flags passed to the test compiler.
+set(LIBOMPTARGET_NVPTX_TEST_FLAGS "" CACHE STRING
+  "Extra compiler flags to send to the test compiler.")
+set(LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS
+  "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda" CACHE STRING
+  "OpenMP compiler flags to use for testing libomptarget-nvptx.")
+
+# Generate lit.site.cfg from lit.site.cfg.in; with @ONLY, only @VAR@
+# references are substituted.
+set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget-nvptx configuration.\n# Do not edit!")
+configure_file(lit.site.cfg.in lit.site.cfg @ONLY)
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/api/ignored.c b/final/libomptarget/deviceRTLs/nvptx/test/api/ignored.c
new file mode 100644
index 0000000..1fa9ae0
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/api/ignored.c
@@ -0,0 +1,38 @@
+// RUN: %compile-run-and-check
+
+#include <omp.h>
+#include <stdio.h>
+
+// NOTE(review): MaxThreads is not referenced anywhere in this test.
+const int MaxThreads = 1024;
+
+int main(int argc, char *argv[]) {
+  // Initialized to -1 so that a value left unwritten by the target region
+  // cannot match any of the expected outputs below.
+  int cancellation = -1, dynamic = -1, nested = -1, maxActiveLevels = -1;
+
+  #pragma omp target map(cancellation, dynamic, nested, maxActiveLevels)
+  {
+    // libomptarget-nvptx doesn't support cancellation.
+    cancellation = omp_get_cancellation();
+
+    // No support for dynamic adjustment of the number of threads.
+    omp_set_dynamic(1);
+    dynamic = omp_get_dynamic();
+
+    // libomptarget-nvptx doesn't support nested parallelism.
+    omp_set_nested(1);
+    nested = omp_get_nested();
+
+    // The requested value is expected to be ignored; the getter should still
+    // report 1 (see the expected output below).
+    omp_set_max_active_levels(42);
+    maxActiveLevels = omp_get_max_active_levels();
+  }
+
+  // CHECK: cancellation = 0
+  printf("cancellation = %d\n", cancellation);
+  // CHECK: dynamic = 0
+  printf("dynamic = %d\n", dynamic);
+  // CHECK: nested = 0
+  printf("nested = %d\n", nested);
+  // CHECK: maxActiveLevels = 1
+  printf("maxActiveLevels = %d\n", maxActiveLevels);
+
+  return 0;
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c b/final/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c
new file mode 100644
index 0000000..dd17ae7
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c
@@ -0,0 +1,55 @@
+// RUN: %compile-run-and-check
+
+#include <omp.h>
+#include <stdio.h>
+
+#pragma omp declare target
+// Stores `value` through `ptr` from inside a parallel region, so the pointee
+// has to be reachable by the threads executing that region.
+static void putValueInParallel(int *ptr, int value) {
+  #pragma omp parallel
+  {
+    *ptr = value;
+  }
+}
+
+// Returns the caller's OpenMP thread number after passing it through a local
+// variable that is written by putValueInParallel.
+static int getId() {
+  int id;
+  putValueInParallel(&id, omp_get_thread_num());
+  return id;
+}
+#pragma omp end declare target
+
+const int MaxThreads = 1024;
+const int Threads = 64;
+
+int main(int argc, char *argv[]) {
+  int master;
+  int check[MaxThreads];
+  for (int i = 0; i < MaxThreads; i++) {
+    check[i] = 0;
+  }
+
+  #pragma omp target map(master, check[:])
+  {
+    master = getId();
+
+    #pragma omp parallel num_threads(Threads)
+    {
+      check[omp_get_thread_num()] = getId();
+    }
+  }
+
+  // CHECK: master = 0.
+  printf("master = %d.\n", master);
+  // CHECK-NOT: invalid
+  // The first `Threads` slots must hold their own index; the rest stay 0.
+  for (int i = 0; i < MaxThreads; i++) {
+    if (i < Threads) {
+      if (check[i] != i) {
+        printf("invalid: check[%d] should be %d, is %d\n", i, i, check[i]);
+      }
+    } else if (check[i] != 0) {
+      printf("invalid: check[%d] should be 0, is %d\n", i, check[i]);
+    }
+  }
+
+  return 0;
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/lit.cfg b/final/libomptarget/deviceRTLs/nvptx/test/lit.cfg
new file mode 100644
index 0000000..0774c25
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/lit.cfg
@@ -0,0 +1,69 @@
+# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79:
+# Configuration file for the 'lit' test runner.
+
+import os
+import lit.formats
+
+# Tell pylint that we know config and lit_config exist somewhere.
+if 'PYLINT_IMPORT' in os.environ:
+    config = object()
+    lit_config = object()
+
+# Put `value` in front of the environment variable `name`, joined with `sep`;
+# if the variable is not set yet, it is created holding just `value`.
+def prepend_library_path(name, value, sep):
+    if name in config.environment:
+        config.environment[name] = value + sep + config.environment[name]
+    else:
+        config.environment[name] = value
+
+# name: The name of this test suite.
+config.name = 'libomptarget-nvptx'
+
+# suffixes: A list of file extensions to treat as test files.
+config.suffixes = ['.c', '.cpp', '.cc']
+
+# test_source_root: The root path where tests are located.
+config.test_source_root = os.path.dirname(__file__)
+
+# test_exec_root: The root object directory where output is placed
+config.test_exec_root = config.binary_dir
+
+# test format
+config.test_format = lit.formats.ShTest()
+
+# compiler flags
+# NOTE(review): the trailing ';' on the last continuation line is redundant in
+# Python.
+config.test_flags = " -I " + config.omp_header_directory + \
+    " -L " + config.library_dir + \
+    " --libomptarget-nvptx-path=" + config.library_dir;
+
+# The host runtime directory is optional; only add it when it is non-empty.
+if config.omp_host_rtl_directory:
+    config.test_flags = config.test_flags + \
+        " -L " + config.omp_host_rtl_directory
+
+config.test_flags = config.test_flags + " " + config.test_extra_flags
+
+# Setup environment to find dynamic library at runtime. 
+prepend_library_path('LD_LIBRARY_PATH', config.library_dir, ":")
+# The host runtime directory is optional (the compiler flags in this file
+# guard it the same way). Prepending an empty string would leave an empty
+# entry in LD_LIBRARY_PATH, which the dynamic loader treats as the current
+# working directory.
+if config.omp_host_rtl_directory:
+    prepend_library_path('LD_LIBRARY_PATH', config.omp_host_rtl_directory, ":")
+
+# Forbid fallback to host.
+config.environment["OMP_TARGET_OFFLOAD"] = "MANDATORY"
+
+# substitutions
+# Composite substitutions are registered before the shorter ones they expand
+# to, so that e.g. "%compile-run-and-check" is not rewritten as "%compile"
+# followed by literal text.
+config.substitutions.append(("%compilexx-run-and-check",
+    "%compilexx-and-run | " + config.libomptarget_filecheck + " %s"))
+config.substitutions.append(("%compile-run-and-check",
+    "%compile-and-run | " + config.libomptarget_filecheck + " %s"))
+config.substitutions.append(("%compilexx-and-run", "%compilexx && %run"))
+config.substitutions.append(("%compile-and-run", "%compile && %run"))
+
+config.substitutions.append(("%compilexx",
+    "%clangxx %openmp_flags %flags %s -o %t"))
+config.substitutions.append(("%compile",
+    "%clang %openmp_flags %flags %s -o %t"))
+
+config.substitutions.append(("%clangxx", config.test_cxx_compiler))
+config.substitutions.append(("%clang", config.test_c_compiler))
+config.substitutions.append(("%openmp_flags", config.test_openmp_flags))
+config.substitutions.append(("%flags", config.test_flags))
+
+config.substitutions.append(("%run", "%t"))
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in b/final/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in
new file mode 100644
index 0000000..d9c14cb
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in
@@ -0,0 +1,14 @@
+@AUTO_GEN_COMMENT@
+
+config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@"
+config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@"
+config.test_openmp_flags = "@LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS@"
+config.test_extra_flags = "@LIBOMPTARGET_NVPTX_TEST_FLAGS@"
+config.binary_dir = "@CMAKE_CURRENT_BINARY_DIR@"
+config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@"
+config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@"
+config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@"
+config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@"
+
+# Let the main config do 
the real work. +lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg") diff --git a/final/libomptarget/deviceRTLs/nvptx/test/parallel/level.c b/final/libomptarget/deviceRTLs/nvptx/test/parallel/level.c new file mode 100644 index 0000000..edb00e0 --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/test/parallel/level.c @@ -0,0 +1,139 @@ +// RUN: %compile-run-and-check + +#include <omp.h> +#include <stdio.h> + +const int MaxThreads = 1024; +const int NumThreads = 64; + +int main(int argc, char *argv[]) { + int level = -1, activeLevel = -1; + // The expected value is -1, initialize to different value. + int ancestorTNumNeg = 1, teamSizeNeg = 1; + int ancestorTNum0 = -1, teamSize0 = -1; + // The expected value is -1, initialize to different value. + int ancestorTNum1 = 1, teamSize1 = 1; + int check1[MaxThreads]; + int check2[MaxThreads]; + int check3[MaxThreads]; + int check4[MaxThreads]; + for (int i = 0; i < MaxThreads; i++) { + check1[i] = check2[i] = check3[i] = check4[i] = 0; + } + + #pragma omp target map(level, activeLevel, ancestorTNumNeg, teamSizeNeg) \ + map(ancestorTNum0, teamSize0, ancestorTNum1, teamSize1) \ + map(check1[:], check2[:], check3[:], check4[:]) + { + level = omp_get_level(); + activeLevel = omp_get_active_level(); + + // Expected to return -1. + ancestorTNumNeg = omp_get_ancestor_thread_num(-1); + teamSizeNeg = omp_get_team_size(-1); + + // Expected to return 0 and 1. + ancestorTNum0 = omp_get_ancestor_thread_num(0); + teamSize0 = omp_get_team_size(0); + + // Expected to return -1 because the requested level is larger than + // the nest level. + ancestorTNum1 = omp_get_ancestor_thread_num(1); + teamSize1 = omp_get_team_size(1); + + // Expecting active parallel region. + #pragma omp parallel num_threads(NumThreads) + { + int id = omp_get_thread_num(); + // Multiply return value of omp_get_level by 5 to avoid that this test + // passes if both API calls return wrong values. 
+ check1[id] += omp_get_level() * 5 + omp_get_active_level(); + + // Expected to return 0 and 1. + check2[id] += omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0); + // Expected to return the current thread num. + check2[id] += (omp_get_ancestor_thread_num(1) - id); + // Expected to return the current number of threads. + check2[id] += 3 * omp_get_team_size(1); + // Expected to return -1, see above. + check2[id] += omp_get_ancestor_thread_num(2) + omp_get_team_size(2); + + // Expecting serialized parallel region. + #pragma omp parallel + { + #pragma omp atomic + check3[id] += omp_get_level() * 5 + omp_get_active_level(); + + // Expected to return 0 and 1. + int check4Inc = omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0); + // Expected to return the parent thread num. + check4Inc += (omp_get_ancestor_thread_num(1) - id); + // Expected to return the number of threads in the active parallel region. + check4Inc += 3 * omp_get_team_size(1); + // Expected to return 0 and 1. + check4Inc += omp_get_ancestor_thread_num(2) + 3 * omp_get_team_size(2); + // Expected to return -1, see above. 
+ check4Inc += omp_get_ancestor_thread_num(3) + omp_get_team_size(3); + + #pragma omp atomic + check4[id] += check4Inc; + } + } + } + + // CHECK: target: level = 0, activeLevel = 0 + printf("target: level = %d, activeLevel = %d\n", level, activeLevel); + // CHECK: level = -1: ancestorTNum = -1, teamSize = -1 + printf("level = -1: ancestorTNum = %d, teamSize = %d\n", ancestorTNumNeg, teamSizeNeg); + // CHECK: level = 0: ancestorTNum = 0, teamSize = 1 + printf("level = 0: ancestorTNum = %d, teamSize = %d\n", ancestorTNum0, teamSize0); + // CHECK: level = 1: ancestorTNum = -1, teamSize = -1 + printf("level = 1: ancestorTNum = %d, teamSize = %d\n", ancestorTNum1, teamSize1); + + // CHECK-NOT: invalid + for (int i = 0; i < MaxThreads; i++) { + // Check active parallel region: + // omp_get_level() = 1, omp_get_active_level() = 1 + const int Expected1 = 6; + if (i < NumThreads) { + if (check1[i] != Expected1) { + printf("invalid: check1[%d] should be %d, is %d\n", i, Expected1, check1[i]); + } + } else if (check1[i] != 0) { + printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); + } + + // 5 * 1 + 3 * 64 - 1 - 1 (see above) + const int Expected2 = 195; + if (i < NumThreads) { + if (check2[i] != Expected2) { + printf("invalid: check2[%d] should be %d, is %d\n", i, Expected2, check2[i]); + } + } else if (check2[i] != 0) { + printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); + } + + // Check serialized parallel region: + // omp_get_level() = 2, omp_get_active_level() = 1 + const int Expected3 = 11; + if (i < NumThreads) { + if (check3[i] != Expected3) { + printf("invalid: check3[%d] should be %d, is %d\n", i, Expected3, check3[i]); + } + } else if (check3[i] != 0) { + printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]); + } + + // 5 * 1 + 3 * 64 + 3 * 1 - 1 - 1 (see above) + const int Expected4 = 198; + if (i < NumThreads) { + if (check4[i] != Expected4) { + printf("invalid: check4[%d] should be %d, is %d\n", i, Expected4, check4[i]); 
+ } + } else if (check4[i] != 0) { + printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]); + } + } + + return 0; +} diff --git a/final/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c b/final/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c new file mode 100644 index 0000000..8fd7ada --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c @@ -0,0 +1,72 @@ +// RUN: %compile-run-and-check + +#include <omp.h> +#include <stdio.h> + +const int MaxThreads = 1024; +const int NumThreads = 64; + +int main(int argc, char *argv[]) { + int inParallel = -1, numThreads = -1, threadNum = -1; + int check1[MaxThreads]; + int check2[MaxThreads]; + for (int i = 0; i < MaxThreads; i++) { + check1[i] = check2[i] = 0; + } + + #pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:]) + { + inParallel = omp_in_parallel(); + numThreads = omp_get_num_threads(); + threadNum = omp_get_thread_num(); + + // Expecting active parallel region. + #pragma omp parallel num_threads(NumThreads) + { + int id = omp_get_thread_num(); + check1[id] += omp_get_num_threads() + omp_in_parallel(); + + // Expecting serialized parallel region. + #pragma omp parallel + { + // Expected to be 1. + int nestedInParallel = omp_in_parallel(); + // Expected to be 1. + int nestedNumThreads = omp_get_num_threads(); + // Expected to be 0. + int nestedThreadNum = omp_get_thread_num(); + #pragma omp atomic + check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum; + } + } + } + + // CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0 + printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n", + inParallel, numThreads, threadNum); + + // CHECK-NOT: invalid + for (int i = 0; i < MaxThreads; i++) { + // Check that all threads reported + // omp_get_num_threads() = 64, omp_in_parallel() = 1. 
+ int Expected = NumThreads + 1; + if (i < NumThreads) { + if (check1[i] != Expected) { + printf("invalid: check1[%d] should be %d, is %d\n", i, Expected, check1[i]); + } + } else if (check1[i] != 0) { + printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); + } + + // Check serialized parallel region. + if (i < NumThreads) { + if (check2[i] != 2) { + printf("invalid: check2[%d] should be 2, is %d\n", i, check2[i]); + } + } else if (check2[i] != 0) { + printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); + } + } + + return 0; +} diff --git a/final/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c b/final/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c new file mode 100644 index 0000000..4a2f73f --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c @@ -0,0 +1,102 @@ +// RUN: %compile-run-and-check + +#include <stdio.h> +#include <omp.h> + +const int WarpSize = 32; +const int NumThreads1 = 1 * WarpSize; +const int NumThreads2 = 2 * WarpSize; +const int NumThreads3 = 3 * WarpSize; +const int MaxThreads = 1024; + +int main(int argc, char *argv[]) { + int check1[MaxThreads]; + int check2[MaxThreads]; + int check3[MaxThreads]; + int check4[MaxThreads]; + for (int i = 0; i < MaxThreads; i++) { + check1[i] = check2[i] = check3[i] = check4[i] = 0; + } + + int maxThreads1 = -1; + int maxThreads2 = -1; + int maxThreads3 = -1; + + #pragma omp target map(check1[:], check2[:], check3[:], check4[:]) \ + map(maxThreads1, maxThreads2, maxThreads3) + { + #pragma omp parallel num_threads(NumThreads1) + { + check1[omp_get_thread_num()] += omp_get_num_threads(); + } + + // API method to set number of threads in parallel regions without + // num_threads() clause. 
+ omp_set_num_threads(NumThreads2); + maxThreads1 = omp_get_max_threads(); + #pragma omp parallel + { + check2[omp_get_thread_num()] += omp_get_num_threads(); + } + + maxThreads2 = omp_get_max_threads(); + + // num_threads() clause should override nthreads-var ICV. + #pragma omp parallel num_threads(NumThreads3) + { + check3[omp_get_thread_num()] += omp_get_num_threads(); + } + + maxThreads3 = omp_get_max_threads(); + + // Effect from omp_set_num_threads() should still be visible. + #pragma omp parallel + { + check4[omp_get_thread_num()] += omp_get_num_threads(); + } + } + + // CHECK: maxThreads1 = 64 + printf("maxThreads1 = %d\n", maxThreads1); + // CHECK: maxThreads2 = 64 + printf("maxThreads2 = %d\n", maxThreads2); + // CHECK: maxThreads3 = 64 + printf("maxThreads3 = %d\n", maxThreads3); + + // CHECK-NOT: invalid + for (int i = 0; i < MaxThreads; i++) { + if (i < NumThreads1) { + if (check1[i] != NumThreads1) { + printf("invalid: check1[%d] should be %d, is %d\n", i, NumThreads1, check1[i]); + } + } else if (check1[i] != 0) { + printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); + } + + if (i < NumThreads2) { + if (check2[i] != NumThreads2) { + printf("invalid: check2[%d] should be %d, is %d\n", i, NumThreads2, check2[i]); + } + } else if (check2[i] != 0) { + printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); + } + + if (i < NumThreads3) { + if (check3[i] != NumThreads3) { + printf("invalid: check3[%d] should be %d, is %d\n", i, NumThreads3, check3[i]); + } + } else if (check3[i] != 0) { + printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]); + } + + if (i < NumThreads2) { + if (check4[i] != NumThreads2) { + printf("invalid: check4[%d] should be %d, is %d\n", i, NumThreads2, check4[i]); + } + } else if (check4[i] != 0) { + printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]); + } + } + + return 0; +} diff --git a/final/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c 
b/final/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c new file mode 100644 index 0000000..5e40bb5 --- /dev/null +++ b/final/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c @@ -0,0 +1,77 @@ +// RUN: %compile-run-and-check + +#include <stdio.h> +#include <omp.h> + +const int WarpSize = 32; +const int ThreadLimit = 1 * WarpSize; +const int NumThreads2 = 2 * WarpSize; +const int NumThreads3 = 3 * WarpSize; +const int MaxThreads = 1024; + +int main(int argc, char *argv[]) { + int check1[MaxThreads]; + int check2[MaxThreads]; + int check3[MaxThreads]; + for (int i = 0; i < MaxThreads; i++) { + check1[i] = check2[i] = check3[i] = 0; + } + + int threadLimit = -1; + + #pragma omp target teams num_teams(1) thread_limit(ThreadLimit) \ + map(check1[:], check2[:], check3[:], threadLimit) + { + threadLimit = omp_get_thread_limit(); + + // All parallel regions should get as many threads as specified by the + // thread_limit() clause. + #pragma omp parallel + { + check1[omp_get_thread_num()] += omp_get_num_threads(); + } + + omp_set_num_threads(NumThreads2); + #pragma omp parallel + { + check2[omp_get_thread_num()] += omp_get_num_threads(); + } + + #pragma omp parallel num_threads(NumThreads3) + { + check3[omp_get_thread_num()] += omp_get_num_threads(); + } + } + + // CHECK: threadLimit = 32 + printf("threadLimit = %d\n", threadLimit); + + // CHECK-NOT: invalid + for (int i = 0; i < MaxThreads; i++) { + if (i < ThreadLimit) { + if (check1[i] != ThreadLimit) { + printf("invalid: check1[%d] should be %d, is %d\n", i, ThreadLimit, check1[i]); + } + } else if (check1[i] != 0) { + printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); + } + + if (i < ThreadLimit) { + if (check2[i] != ThreadLimit) { + printf("invalid: check2[%d] should be %d, is %d\n", i, ThreadLimit, check2[i]); + } + } else if (check2[i] != 0) { + printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); + } + + if (i < ThreadLimit) { + if (check3[i] != ThreadLimit) { + 
printf("invalid: check3[%d] should be %d, is %d\n", i, ThreadLimit, check3[i]); + } + } else if (check3[i] != 0) { + printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]); + } + } + + return 0; +} diff --git a/final/libomptarget/include/omptarget.h b/final/libomptarget/include/omptarget.h new file mode 100644 index 0000000..e92a94b --- /dev/null +++ b/final/libomptarget/include/omptarget.h @@ -0,0 +1,233 @@ +//===-------- omptarget.h - Target independent OpenMP target RTL -- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Interface to be used by Clang during the codegen of a +// target region. +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_H_ +#define _OMPTARGET_H_ + +#include <stdint.h> +#include <stddef.h> + +#define OFFLOAD_SUCCESS (0) +#define OFFLOAD_FAIL (~0) + +#define OFFLOAD_DEVICE_DEFAULT -1 +#define HOST_DEVICE -10 + +/// Data attributes for each data reference used in an OpenMP target region. 
+enum tgt_map_type { + // No flags + OMP_TGT_MAPTYPE_NONE = 0x000, + // copy data from host to device + OMP_TGT_MAPTYPE_TO = 0x001, + // copy data from device to host + OMP_TGT_MAPTYPE_FROM = 0x002, + // copy regardless of the reference count + OMP_TGT_MAPTYPE_ALWAYS = 0x004, + // force unmapping of data + OMP_TGT_MAPTYPE_DELETE = 0x008, + // map the pointer as well as the pointee + OMP_TGT_MAPTYPE_PTR_AND_OBJ = 0x010, + // pass device base address to kernel + OMP_TGT_MAPTYPE_TARGET_PARAM = 0x020, + // return base device address of mapped data + OMP_TGT_MAPTYPE_RETURN_PARAM = 0x040, + // private variable - not mapped + OMP_TGT_MAPTYPE_PRIVATE = 0x080, + // copy by value - not mapped + OMP_TGT_MAPTYPE_LITERAL = 0x100, + // mapping is implicit + OMP_TGT_MAPTYPE_IMPLICIT = 0x200, + // member of struct, member given by [16 MSBs] - 1 + OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000 +}; + +enum OpenMPOffloadingDeclareTargetFlags { + /// Mark the entry as having a 'link' attribute. + OMP_DECLARE_TARGET_LINK = 0x01, + /// Mark the entry as being a global constructor. + OMP_DECLARE_TARGET_CTOR = 0x02, + /// Mark the entry as being a global destructor. + OMP_DECLARE_TARGET_DTOR = 0x04 +}; + +/// This struct is a record of an entry point or global. For a function +/// entry point the size is expected to be zero +struct __tgt_offload_entry { + void *addr; // Pointer to the offload entry info (function or global) + char *name; // Name of the function or global + size_t size; // Size of the entry info (0 if it is a function) + int32_t flags; // Flags associated with the entry, e.g. 'link'. + int32_t reserved; // Reserved, to be used by the runtime library. 
+}; + +/// This struct is a record of the device image information +struct __tgt_device_image { + void *ImageStart; // Pointer to the target code start + void *ImageEnd; // Pointer to the target code end + __tgt_offload_entry *EntriesBegin; // Begin of table with all target entries + __tgt_offload_entry *EntriesEnd; // End of table (non inclusive) +}; + +/// This struct is a record of all the host code that may be offloaded to a +/// target. +struct __tgt_bin_desc { + int32_t NumDeviceImages; // Number of device types supported + __tgt_device_image *DeviceImages; // Array of device images (1 per dev. type) + __tgt_offload_entry *HostEntriesBegin; // Begin of table with all host entries + __tgt_offload_entry *HostEntriesEnd; // End of table (non inclusive) +}; + +/// This struct contains the offload entries identified by the target runtime +struct __tgt_target_table { + __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries + __tgt_offload_entry + *EntriesEnd; // End of the table with all the entries (non inclusive) +}; + +#ifdef __cplusplus +extern "C" { +#endif + +int omp_get_num_devices(void); +int omp_get_initial_device(void); +void *omp_target_alloc(size_t size, int device_num); +void omp_target_free(void *device_ptr, int device_num); +int omp_target_is_present(void *ptr, int device_num); +int omp_target_memcpy(void *dst, void *src, size_t length, size_t dst_offset, + size_t src_offset, int dst_device, int src_device); +int omp_target_memcpy_rect(void *dst, void *src, size_t element_size, + int num_dims, const size_t *volume, const size_t *dst_offsets, + const size_t *src_offsets, const size_t *dst_dimensions, + const size_t *src_dimensions, int dst_device, int src_device); +int omp_target_associate_ptr(void *host_ptr, void *device_ptr, size_t size, + size_t device_offset, int device_num); +int omp_target_disassociate_ptr(void *host_ptr, int device_num); + +/// adds a target shared library to the target execution image +void 
__tgt_register_lib(__tgt_bin_desc *desc); + +/// removes a target shared library from the target execution image +void __tgt_unregister_lib(__tgt_bin_desc *desc); + +// creates the host to target data mapping, stores it in the +// libomptarget.so internal structure (an entry in a stack of data maps) and +// passes the data to the device; +void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types); +void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList); + +// passes data from the target, release target memory and destroys the +// host-target mapping (top entry from the stack of data maps) created by +// the last __tgt_target_data_begin +void __tgt_target_data_end(int64_t device_id, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types); +void __tgt_target_data_end_nowait(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); + +/// passes data to/from the target +void __tgt_target_data_update(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types); +void __tgt_target_data_update_nowait(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList); + +// Performs the same actions as data_begin in case arg_num is non-zero +// and initiates run of offloaded region on target platform; if arg_num +// is non-zero after the region execution is done it also performs the +// same action as data_end above. 
The following types are used; this +// function returns 0 if it was able to transfer the execution to a +// target and an int different from zero otherwise. +int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types); +int __tgt_target_nowait(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); + +int __tgt_target_teams(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, int32_t num_teams, + int32_t thread_limit); +int __tgt_target_teams_nowait(int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t num_teams, int32_t thread_limit, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); +void __kmpc_push_target_tripcount(int64_t device_id, uint64_t loop_tripcount); + +#ifdef __cplusplus +} +#endif + +#ifdef OMPTARGET_DEBUG +#include <stdio.h> +#define DEBUGP(prefix, ...) \ + { \ + fprintf(stderr, "%s --> ", prefix); \ + fprintf(stderr, __VA_ARGS__); \ + } + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include <inttypes.h> +#define DPxMOD "0x%0*" PRIxPTR +#define DPxPTR(ptr) ((int)(2*sizeof(uintptr_t))), ((uintptr_t) (ptr)) + +/* + * To printf a pointer in hex with a fixed width of 16 digits and a leading 0x, + * use printf("ptr=" DPxMOD "...\n", DPxPTR(ptr)); + * + * DPxMOD expands to: + * "0x%0*" PRIxPTR + * where PRIxPTR expands to an appropriate modifier for the type uintptr_t on a + * specific platform, e.g. 
"lx" if uintptr_t is typedef'd as unsigned long: + * "0x%0*lx" + * + * Ultimately, the whole statement expands to: + * printf("ptr=0x%0*lx...\n", // the 0* modifier expects an extra argument + * // specifying the width of the output + * (int)(2*sizeof(uintptr_t)), // the extra argument specifying the width + * // 8 digits for 32bit systems + * // 16 digits for 64bit + * (uintptr_t) ptr); + */ +#else +#define DEBUGP(prefix, ...) \ + {} +#endif + +#ifdef __cplusplus +#define EXTERN extern "C" +#else +#define EXTERN extern +#endif + +#endif // _OMPTARGET_H_ diff --git a/final/libomptarget/include/omptargetplugin.h b/final/libomptarget/include/omptargetplugin.h new file mode 100644 index 0000000..35fa059 --- /dev/null +++ b/final/libomptarget/include/omptargetplugin.h @@ -0,0 +1,92 @@ +//===-- omptargetplugin.h - Target dependent OpenMP Plugin API --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines an interface between target independent OpenMP offload +// runtime library libomptarget and target dependent plugin. +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGETPLUGIN_H_ +#define _OMPTARGETPLUGIN_H_ + +#include <omptarget.h> + +#ifdef __cplusplus +extern "C" { +#endif + +// Return the number of available devices of the type supported by the +// target RTL. +int32_t __tgt_rtl_number_of_devices(void); + +// Return an integer different from zero if the provided device image can be +// supported by the runtime. The functionality is similar to comparing the +// result of __tgt_rtl_load_binary to NULL. However, this is meant to be a +// lightweight query to determine if the RTL is suitable for an image without +// having to load the library, which can be expensive. 
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image); + +// Initialize the specified device. In case of success return 0; otherwise +// return an error code. +int32_t __tgt_rtl_init_device(int32_t ID); + +// Pass an executable image section described by image to the specified +// device and prepare an address table of target entities. In case of error, +// return NULL. Otherwise, return a pointer to the built address table. +// Individual entries in the table may also be NULL, when the corresponding +// offload region is not supported on the target device. +__tgt_target_table *__tgt_rtl_load_binary(int32_t ID, + __tgt_device_image *Image); + +// Allocate data on the particular target device, of the specified size. +// HostPtr is the address of the host data the allocated target data +// will be associated with (HostPtr may be NULL if it is not known at +// allocation time, like for example it would be for target data that +// is allocated by omp_target_alloc() API). Return address of the +// allocated data on the target that will be used by libomptarget.so to +// initialize the target data mapping structures. These addresses are +// used to generate a table of target variables to pass to +// __tgt_rtl_run_target_region(). __tgt_rtl_data_alloc() returns NULL in +// case an error occurred on the target device. +void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr); + +// Pass the data content to the target device using the target address. +// In case of success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr, + int64_t Size); + +// Retrieve the data content from the target device using its address. +// In case of success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr, + int64_t Size); + +// De-allocate the data referenced by target ptr on the device. In case of +// success, return zero. 
Otherwise, return an error code. +int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr); + +// Transfer control to the offloaded entry Entry on the target device. +// Args and Offsets are arrays of NumArgs size of target addresses and +// offsets. An offset should be added to the target address before passing it +// to the outlined function on device side. In case of success, return zero. +// Otherwise, return an error code. +int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args, + ptrdiff_t *Offsets, int32_t NumArgs); + +// Similar to __tgt_rtl_run_target_region, but additionally specify the +// number of teams to be created and a number of threads in each team. +int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args, + ptrdiff_t *Offsets, int32_t NumArgs, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t loop_tripcount); + +#ifdef __cplusplus +} +#endif + +#endif // _OMPTARGETPLUGIN_H_ diff --git a/final/libomptarget/plugins/CMakeLists.txt b/final/libomptarget/plugins/CMakeLists.txt new file mode 100644 index 0000000..8c3d571 --- /dev/null +++ b/final/libomptarget/plugins/CMakeLists.txt @@ -0,0 +1,74 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build plugins for the user system if available. +# +##===----------------------------------------------------------------------===## + +# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string tmachine_triple, string elf_machine_id); +# - build a plugin for an ELF based generic 64-bit target based on libffi. +# - tmachine: name of the machine processor as used in the cmake build system. 
+# - tmachine_name: name of the machine to be printed with the debug messages. +# - tmachine_libname: machine name to be appended to the plugin library name. +# - tmachine_triple: target triple appended to LIBOMPTARGET_SYSTEM_TARGETS when the plugin is built. +# - elf_machine_id: value used to define TARGET_ELF_ID when compiling the plugin. +macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") + if(LIBOMPTARGET_DEP_LIBELF_FOUND) + if(LIBOMPTARGET_DEP_LIBFFI_FOUND) + + libomptarget_say("Building ${tmachine_name} offloading plugin.") + + include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) + include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) + + # Define macro to be used as prefix of the runtime messages for this target. + add_definitions("-DTARGET_NAME=${tmachine_name}") + + # Define macro with the ELF ID for this target. + add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") + + add_library("omptarget.rtl.${tmachine_libname}" SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp) + + # Install plugin under the lib destination folder. + install(TARGETS "omptarget.rtl.${tmachine_libname}" + LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") + + target_link_libraries( + "omptarget.rtl.${tmachine_libname}" + ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} + ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} + ${CMAKE_DL_LIBS} + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports") + + # Report to the parent scope that we are building a plugin. 
+ set(LIBOMPTARGET_SYSTEM_TARGETS + "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE) + + else() + libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.") + endif() + else() + libomptarget_say("Not building ${tmachine_name} offloading plugin: libelf dependency not found.") + endif() +else() + libomptarget_say("Not building ${tmachine_name} offloading plugin: machine not found in the system.") +endif() +endmacro() + +add_subdirectory(aarch64) +add_subdirectory(cuda) +add_subdirectory(ppc64) +add_subdirectory(ppc64le) +add_subdirectory(x86_64) + +# Make sure the parent scope can see the plugins that will be created. +set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) + diff --git a/final/libomptarget/plugins/aarch64/CMakeLists.txt b/final/libomptarget/plugins/aarch64/CMakeLists.txt new file mode 100644 index 0000000..e3a76b9 --- /dev/null +++ b/final/libomptarget/plugins/aarch64/CMakeLists.txt @@ -0,0 +1,18 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for an aarch64 machine if available. 
+# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183") +else() + libomptarget_say("Not building aarch64 offloading plugin: machine not found in the system.") +endif() diff --git a/final/libomptarget/plugins/common/elf_common.c b/final/libomptarget/plugins/common/elf_common.c new file mode 100644 index 0000000..dd85575 --- /dev/null +++ b/final/libomptarget/plugins/common/elf_common.c @@ -0,0 +1,73 @@ +//===-- elf_common.c - Common ELF functionality -------------------*- C -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Common ELF functionality for target plugins. +// Must be included in the plugin source file AFTER omptarget.h has been +// included and macro DP(...) has been defined. +// . +// +//===----------------------------------------------------------------------===// + +#if !(defined(_OMPTARGET_H_) && defined(DP)) +#error Include elf_common.c in the plugin source AFTER omptarget.h has been\ + included and macro DP(...) has been defined. +#endif + +#include <elf.h> +#include <libelf.h> + +// Check whether an image is valid for execution on target_id +static inline int32_t elf_check_machine(__tgt_device_image *image, + uint16_t target_id) { + + // Is the library version incompatible with the header file? 
+ if (elf_version(EV_CURRENT) == EV_NONE) { + DP("Incompatible ELF library!\n"); + return 0; + } + + char *img_begin = (char *)image->ImageStart; + char *img_end = (char *)image->ImageEnd; + size_t img_size = img_end - img_begin; + + // Obtain elf handler + Elf *e = elf_memory(img_begin, img_size); + if (!e) { + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + return 0; + } + + // Check if ELF is the right kind. + if (elf_kind(e) != ELF_K_ELF) { + DP("Unexpected ELF type!\n"); + return 0; + } + Elf64_Ehdr *eh64 = elf64_getehdr(e); + Elf32_Ehdr *eh32 = elf32_getehdr(e); + + if (!eh64 && !eh32) { + DP("Unable to get machine ID from ELF file!\n"); + elf_end(e); + return 0; + } + + uint16_t MachineID; + if (eh64 && !eh32) + MachineID = eh64->e_machine; + else if (eh32 && !eh64) + MachineID = eh32->e_machine; + else { + DP("Ambiguous ELF header!\n"); + elf_end(e); + return 0; + } + + elf_end(e); + return MachineID == target_id; +} diff --git a/final/libomptarget/plugins/cuda/CMakeLists.txt b/final/libomptarget/plugins/cuda/CMakeLists.txt new file mode 100644 index 0000000..7210eec --- /dev/null +++ b/final/libomptarget/plugins/cuda/CMakeLists.txt @@ -0,0 +1,50 @@ +##===----------------------------------------------------------------------===## +# +# The LLVM Compiler Infrastructure +# +# This file is dual licensed under the MIT and the University of Illinois Open +# Source Licenses. See LICENSE.txt for details. +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a CUDA machine if available. 
+#
+##===----------------------------------------------------------------------===##
+# The processor pattern is anchored on both ends so that only the exact
+# values "x86_64" and "ppc64le" are accepted.
+if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|ppc64le)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
+  libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64 or ppc64le hosts.")
+  return()
+elseif (NOT LIBOMPTARGET_DEP_LIBELF_FOUND)
+  libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.")
+  return()
+elseif(NOT LIBOMPTARGET_DEP_CUDA_FOUND)
+  libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.")
+  return()
+elseif(NOT LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND)
+  libomptarget_say("Not building CUDA offloading plugin: CUDA Driver API not found in system.")
+  return()
+endif()
+
+libomptarget_say("Building CUDA offloading plugin.")
+
+# Define the suffix for the runtime messaging dumps.
+add_definitions(-DTARGET_NAME=CUDA)
+
+# Enable printing of CUDA error strings in debug builds only.
+if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug)
+  add_definitions(-DCUDA_ERROR_REPORT)
+endif()
+
+include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
+include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS})
+
+add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
+
+# Install plugin under the lib destination folder.
+install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+
+# Link privately: these libraries and the version script are used only to
+# build the plugin itself.
+target_link_libraries(omptarget.rtl.cuda PRIVATE
+  ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES}
+  ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
+  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
+
+# Report to the parent scope that we are building a plugin for CUDA.
+set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda" PARENT_SCOPE)
diff --git a/final/libomptarget/plugins/cuda/src/rtl.cpp b/final/libomptarget/plugins/cuda/src/rtl.cpp
new file mode 100644
index 0000000..d265a87
--- /dev/null
+++ b/final/libomptarget/plugins/cuda/src/rtl.cpp
@@ -0,0 +1,763 @@
+//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for CUDA machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstddef>
+#include <cstdlib> // getenv
+#include <cuda.h>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "omptargetplugin.h"
+
+#ifndef TARGET_NAME
+#define TARGET_NAME CUDA
+#endif
+
+// DP(...) prints a debug message through DEBUGP when the build defines
+// OMPTARGET_DEBUG and DebugLevel is positive; otherwise it expands to an
+// empty statement. Both forms are wrapped in do { } while (false) so that
+// `DP(...);` behaves as exactly one statement in every context.
+#ifdef OMPTARGET_DEBUG
+static int DebugLevel = 0;
+
+#define GETNAME2(name) #name
+#define GETNAME(name) GETNAME2(name)
+#define DP(...) \
+  do { \
+    if (DebugLevel > 0) { \
+      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
+    } \
+  } while (false)
+#else // OMPTARGET_DEBUG
+#define DP(...) \
+  do { \
+  } while (false)
+#endif // OMPTARGET_DEBUG
+
+#include "../../common/elf_common.c"
+
+// Utility for retrieving and printing CUDA error string.
+// errStr is given a fallback value so that a failed lookup never passes an
+// unset or NULL pointer to the %s conversion.
+#ifdef CUDA_ERROR_REPORT
+#define CUDA_ERR_STRING(err) \
+  do { \
+    const char *errStr = NULL; \
+    if (cuGetErrorString(err, &errStr) != CUDA_SUCCESS || !errStr) \
+      errStr = "<unknown>"; \
+    DP("CUDA error is: %s\n", errStr); \
+  } while (0)
+#else
+#define CUDA_ERR_STRING(err) \
+  do { \
+  } while (0)
+#endif
+
+/// Keep entries table per device.
+struct FuncOrGblEntryTy {
+  // Table handed back to the caller; its begin/end pointers are filled in
+  // from Entries when the table is requested.
+  __tgt_target_table Table;
+  // Backing storage for the offload entries recorded for one loaded image.
+  std::vector<__tgt_offload_entry> Entries;
+};
+
+enum ExecutionModeType {
+  SPMD, // constructors, destructors,
+        // combined constructs (`teams distribute parallel for [simd]`)
+  GENERIC, // everything else
+  NONE
+};
+
+/// Use a single entity to encode a kernel and a set of flags
+struct KernelTy {
+  CUfunction Func;
+
+  // execution mode of kernel
+  // 0 - SPMD mode (without master warp)
+  // 1 - Generic mode (with master warp)
+  int8_t ExecutionMode;
+
+  // Parameter names avoid the leading-underscore-plus-capital form, which is
+  // reserved for the implementation.
+  KernelTy(CUfunction Function, int8_t Mode)
+      : Func(Function), ExecutionMode(Mode) {}
+};
+
+/// Device environment data
+/// Manually sync with the deviceRTL side for now, move to a dedicated header file later.
+struct omptarget_device_environmentTy {
+  int32_t debug_level;
+};
+
+/// List that contains all the kernels.
+/// FIXME: we may need this to be per device and per library.
+std::list<KernelTy> KernelsList;
+
+/// Class containing all the device information.
+class RTLDeviceInfoTy {
+  // One list of entry tables per device; the last element of a device's list
+  // is the table currently being filled or queried.
+  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
+
+public:
+  int NumberOfDevices;
+  std::vector<CUmodule> Modules;
+  std::vector<CUcontext> Contexts;
+
+  // Device properties
+  std::vector<int> ThreadsPerBlock;
+  std::vector<int> BlocksPerGrid;
+  std::vector<int> WarpSize;
+
+  // OpenMP properties
+  std::vector<int> NumTeams;
+  std::vector<int> NumThreads;
+
+  // OpenMP Environment properties (-1 when the variable is not set)
+  int EnvNumTeams;
+  int EnvTeamLimit;
+
+  //static int EnvNumThreads;
+  static const int HardTeamLimit = 1<<16; // 64k
+  static const int HardThreadLimit = 1024;
+  static const int DefaultNumTeams = 128;
+  static const int DefaultNumThreads = 128;
+
+  // Record entry point associated with device
+  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    E.Entries.push_back(entry);
+  }
+
+  // Return true if the entry is associated with device
+  bool findOffloadEntry(int32_t device_id, void *addr) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    for (auto &it : E.Entries) {
+      if (it.addr == addr)
+        return true;
+    }
+
+    return false;
+  }
+
+  // Return the pointer to the target entries table
+  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    int32_t size = E.Entries.size();
+
+    // Table is empty
+    if (!size)
+      return 0;
+
+    __tgt_offload_entry *begin = &E.Entries[0];
+    __tgt_offload_entry *end = &E.Entries[size - 1];
+
+    // Update table info according to the entries and return the pointer
+    E.Table.EntriesBegin = begin;
+    E.Table.EntriesEnd = ++end;
+
+    return &E.Table;
+  }
+
+  // Clear entries table for a device
+  void clearOffloadEntriesTable(int32_t device_id) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    // A fresh table is appended; earlier tables stay in the list.
+    FuncGblEntries[device_id].emplace_back();
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+    E.Entries.clear();
+    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
+  }
+
+  // Initialize the CUDA driver, count the devices, size the per-device
+  // vectors, and read OMP_TEAM_LIMIT / OMP_NUM_TEAMS from the environment.
+  // The scalar members get their defaults in the initializer list so that
+  // the early returns below never leave them unassigned.
+  RTLDeviceInfoTy() : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1) {
+#ifdef OMPTARGET_DEBUG
+    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
+      DebugLevel = std::stoi(envStr);
+    }
+#endif // OMPTARGET_DEBUG
+
+    DP("Start initializing CUDA\n");
+
+    CUresult err = cuInit(0);
+    if (err != CUDA_SUCCESS) {
+      DP("Error when initializing CUDA\n");
+      CUDA_ERR_STRING(err);
+      return;
+    }
+
+    err = cuDeviceGetCount(&NumberOfDevices);
+    if (err != CUDA_SUCCESS) {
+      DP("Error when getting CUDA device count\n");
+      CUDA_ERR_STRING(err);
+      return;
+    }
+
+    if (NumberOfDevices == 0) {
+      DP("There are no devices supporting CUDA.\n");
+      return;
+    }
+
+    FuncGblEntries.resize(NumberOfDevices);
+    Contexts.resize(NumberOfDevices);
+    ThreadsPerBlock.resize(NumberOfDevices);
+    BlocksPerGrid.resize(NumberOfDevices);
+    WarpSize.resize(NumberOfDevices);
+    NumTeams.resize(NumberOfDevices);
+    NumThreads.resize(NumberOfDevices);
+
+    // Get environment variables regarding teams
+    char *envStr = getenv("OMP_TEAM_LIMIT");
+    if (envStr) {
+      // OMP_TEAM_LIMIT has been set
+      EnvTeamLimit = std::stoi(envStr);
+      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
+    }
+    envStr = getenv("OMP_NUM_TEAMS");
+    if (envStr) {
+      // OMP_NUM_TEAMS has been set
+      EnvNumTeams = std::stoi(envStr);
+      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
+    }
+  }
+
+  ~RTLDeviceInfoTy() {
+    // Close modules
+    for (auto &module : Modules)
+      if (module) {
+        CUresult err = cuModuleUnload(module);
+        if (err != CUDA_SUCCESS) {
+          DP("Error when unloading CUDA module\n");
+          CUDA_ERR_STRING(err);
+        }
+      }
+
+    // Destroy contexts
+    for (auto &ctx : Contexts)
+      if (ctx) {
+        CUresult err = cuCtxDestroy(ctx);
+        if (err != CUDA_SUCCESS) {
+          DP("Error when destroying CUDA context\n");
+          CUDA_ERR_STRING(err);
+        }
+      }
+  }
+};
+
+static RTLDeviceInfoTy DeviceInfo;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Return non-zero if the image is an ELF object for the CUDA machine ID.
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+  return elf_check_machine(image, 190); // EM_CUDA = 190.
+}
+
+// Return the device count recorded when the plugin was constructed.
+int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }
+
+// Create a context for device_id and record its limits (blocks per grid,
+// threads per block, warp size) together with the default team and thread
+// counts. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
+int32_t __tgt_rtl_init_device(int32_t device_id) {
+
+  CUdevice cuDevice;
+  DP("Getting device %d\n", device_id);
+  CUresult err = cuDeviceGet(&cuDevice, device_id);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when getting CUDA device with id = %d\n", device_id);
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  // Create the context and save it to use whenever this device is selected.
+  err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
+                    cuDevice);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when creating a CUDA context\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  // Query attributes to determine number of threads/block and blocks/grid.
+  int maxGridDimX;
+  err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
+                             cuDevice);
+  if (err != CUDA_SUCCESS) {
+    DP("Error getting max grid dimension, use default\n");
+    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+  } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) {
+    DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX;
+    DP("Using %d CUDA blocks per grid\n", maxGridDimX);
+  } else {
+    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
+    DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
+       "at the hard limit\n",
+       maxGridDimX, RTLDeviceInfoTy::HardTeamLimit);
+  }
+
+  // We are only exploiting threads along the x axis.
+  int maxBlockDimX;
+  err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
+                             cuDevice);
+  if (err != CUDA_SUCCESS) {
+    DP("Error getting max block dimension, use default\n");
+    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
+  } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) {
+    DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX;
+    DP("Using %d CUDA threads per block\n", maxBlockDimX);
+  } else {
+    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
+    DP("Max CUDA threads per block %d exceeds the hard thread limit %d, capping "
+       "at the hard limit\n",
+       maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit);
+  }
+
+  int warpSize;
+  err =
+      cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
+  if (err != CUDA_SUCCESS) {
+    DP("Error getting warp size, assume default\n");
+    DeviceInfo.WarpSize[device_id] = 32;
+  } else {
+    DeviceInfo.WarpSize[device_id] = warpSize;
+  }
+
+  // Adjust teams to the env variables
+  if (DeviceInfo.EnvTeamLimit > 0 &&
+      DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
+    DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
+    DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
+       DeviceInfo.EnvTeamLimit);
+  }
+
+  DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
+     DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
+     DeviceInfo.WarpSize[device_id]);
+
+  // Set default number of teams
+  if (DeviceInfo.EnvNumTeams > 0) {
+    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
+    DP("Default number of teams set according to environment %d\n",
+       DeviceInfo.EnvNumTeams);
+  } else {
+    DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+    DP("Default number of teams set according to library's default %d\n",
+       RTLDeviceInfoTy::DefaultNumTeams);
+  }
+  if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
+    DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
+    DP("Default number of teams exceeds device limit, capping at %d\n",
+       DeviceInfo.BlocksPerGrid[device_id]);
+  }
+
+  // Set default number of threads
+  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
+  DP("Default number of threads set according to library's default %d\n",
+     RTLDeviceInfoTy::DefaultNumThreads);
+  if (DeviceInfo.NumThreads[device_id] >
+      DeviceInfo.ThreadsPerBlock[device_id]) {
+    // Cap the thread default itself; this branch used to overwrite NumTeams.
+    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
+    DP("Default number of threads exceeds device limit, capping at %d\n",
+       DeviceInfo.ThreadsPerBlock[device_id]);
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
+// Load the image as a CUDA module on device_id, resolve every host entry to
+// a device global or kernel, and push the device environment if the module
+// exposes one. Returns the resulting entries table, or NULL on failure.
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+                                          __tgt_device_image *image) {
+
+  // Set the context we are using.
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting a CUDA context for device %d\n", device_id);
+    CUDA_ERR_STRING(err);
+    return NULL;
+  }
+
+  // Clear the offload table as we are going to create a new one.
+  DeviceInfo.clearOffloadEntriesTable(device_id);
+
+  // Create the module and extract the function pointers.
+
+  CUmodule cumod;
+  DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
+  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when loading CUDA module\n");
+    CUDA_ERR_STRING(err);
+    return NULL;
+  }
+
+  DP("CUDA module successfully loaded!\n");
+  DeviceInfo.Modules.push_back(cumod);
+
+  // Find the symbols in the module by name.
+  __tgt_offload_entry *HostBegin = image->EntriesBegin;
+  __tgt_offload_entry *HostEnd = image->EntriesEnd;
+
+  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
+
+    if (!e->addr) {
+      // We return NULL when something like this happens, the host should have
+      // always something in the address to uniquely identify the target region.
+      DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);
+
+      return NULL;
+    }
+
+    // A non-zero size marks a global variable rather than a kernel.
+    if (e->size) {
+      __tgt_offload_entry entry = *e;
+
+      CUdeviceptr cuptr;
+      size_t cusize;
+      err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);
+
+      if (err != CUDA_SUCCESS) {
+        DP("Loading global '%s' (Failed)\n", e->name);
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      if (cusize != e->size) {
+        DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
+           cusize, e->size);
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
+         DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
+      entry.addr = (void *)cuptr;
+
+      DeviceInfo.addOffloadEntry(device_id, entry);
+
+      continue;
+    }
+
+    CUfunction fun;
+    err = cuModuleGetFunction(&fun, cumod, e->name);
+
+    if (err != CUDA_SUCCESS) {
+      DP("Loading '%s' (Failed)\n", e->name);
+      CUDA_ERR_STRING(err);
+      return NULL;
+    }
+
+    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
+       DPxPTR(e - HostBegin), e->name, DPxPTR(fun));
+
+    // default value GENERIC (in case symbol is missing from cubin file)
+    int8_t ExecModeVal = ExecutionModeType::GENERIC;
+    std::string ExecModeNameStr (e->name);
+    ExecModeNameStr += "_exec_mode";
+    const char *ExecModeName = ExecModeNameStr.c_str();
+
+    CUdeviceptr ExecModePtr;
+    size_t cusize;
+    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
+    if (err == CUDA_SUCCESS) {
+      if ((size_t)cusize != sizeof(int8_t)) {
+        DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
+           ExecModeName, cusize, sizeof(int8_t));
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
+      if (err != CUDA_SUCCESS) {
+        DP("Error when copying data from device to host. Pointers: "
+           "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
+           DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      if (ExecModeVal < 0 || ExecModeVal > 1) {
+        DP("Error wrong exec_mode value specified in cubin file: %d\n",
+           ExecModeVal);
+        return NULL;
+      }
+    } else {
+      DP("Loading global exec_mode '%s' - symbol missing, using default value "
+         "GENERIC (1)\n", ExecModeName);
+      CUDA_ERR_STRING(err);
+    }
+
+    KernelsList.push_back(KernelTy(fun, ExecModeVal));
+
+    // The entry address handed back to the host is the KernelTy record.
+    __tgt_offload_entry entry = *e;
+    entry.addr = (void *)&KernelsList.back();
+    DeviceInfo.addOffloadEntry(device_id, entry);
+  }
+
+  // send device environment data to the device
+  {
+    omptarget_device_environmentTy device_env;
+
+    device_env.debug_level = 0;
+
+#ifdef OMPTARGET_DEBUG
+    if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
+      device_env.debug_level = std::stoi(envStr);
+    }
+#endif
+
+    const char * device_env_Name="omptarget_device_environment";
+    CUdeviceptr device_env_Ptr;
+    size_t cusize;
+
+    err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name);
+
+    if (err == CUDA_SUCCESS) {
+      if ((size_t)cusize != sizeof(device_env)) {
+        // Report the size that was actually compared against.
+        DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
+           device_env_Name, cusize, sizeof(device_env));
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize);
+      if (err != CUDA_SUCCESS) {
+        DP("Error when copying data from host to device. Pointers: "
+           "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
+           DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize);
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      DP("Sending global device environment data %zu bytes\n", (size_t)cusize);
+    } else {
+      DP("Finding global device environment '%s' - symbol missing.\n", device_env_Name);
+      DP("Continue, considering this is a device RTL which does not accept envrionment setting.\n");
+    }
+  }
+
+  return DeviceInfo.getOffloadEntriesTable(device_id);
+}
+
+// Allocate size bytes on device_id. Returns NULL for a zero-size request or
+// on any CUDA failure. hst_ptr is not used by this plugin.
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
+  if (size == 0) {
+    return NULL;
+  }
+
+  // Set the context we are using.
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error while trying to set CUDA current context\n");
+    CUDA_ERR_STRING(err);
+    return NULL;
+  }
+
+  CUdeviceptr ptr;
+  err = cuMemAlloc(&ptr, size);
+  if (err != CUDA_SUCCESS) {
+    DP("Error while trying to allocate %d\n", err);
+    CUDA_ERR_STRING(err);
+    return NULL;
+  }
+
+  void *vptr = (void *)ptr;
+  return vptr;
+}
+
+// Copy size bytes from host memory hst_ptr to device memory tgt_ptr.
+int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
+                              int64_t size) {
+  // Set the context we are using.
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when copying data from host to device. Pointers: host = " DPxMOD
+       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
+       DPxPTR(tgt_ptr), size);
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+// Copy size bytes from device memory tgt_ptr to host memory hst_ptr.
+int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
+                                int64_t size) {
+  // Set the context we are using.
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when copying data from device to host. Pointers: host = " DPxMOD
+       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
+       DPxPTR(tgt_ptr), size);
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+// Free device memory previously returned by __tgt_rtl_data_alloc.
+int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
+  // Set the context we are using.
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  err = cuMemFree((CUdeviceptr)tgt_ptr);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when freeing CUDA memory\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+// Launch the kernel behind tgt_entry_ptr (a KernelTy record) on device_id and
+// wait for it to finish. team_num <= 0 and thread_limit <= 0 select defaults;
+// loop_tripcount, when positive and OMP_NUM_TEAMS is unset, drives the team
+// count. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
+    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
+    int32_t thread_limit, uint64_t loop_tripcount) {
+  // Set the context we are using.
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  // All args are references.
+  std::vector<void *> args(arg_num);
+  std::vector<void *> ptrs(arg_num);
+
+  for (int32_t i = 0; i < arg_num; ++i) {
+    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
+    args[i] = &ptrs[i];
+  }
+
+  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
+
+  int cudaThreadsPerBlock;
+
+  if (thread_limit > 0) {
+    cudaThreadsPerBlock = thread_limit;
+    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
+    // Add master warp if necessary
+    if (KernelInfo->ExecutionMode == GENERIC) {
+      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
+      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
+    }
+  } else {
+    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
+    DP("Setting CUDA threads per block to default %d\n",
+       DeviceInfo.NumThreads[device_id]);
+  }
+
+  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
+    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
+    DP("Threads per block capped at device limit %d\n",
+       DeviceInfo.ThreadsPerBlock[device_id]);
+  }
+
+  int kernel_limit;
+  err = cuFuncGetAttribute(&kernel_limit,
+      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
+  if (err == CUDA_SUCCESS) {
+    if (kernel_limit < cudaThreadsPerBlock) {
+      cudaThreadsPerBlock = kernel_limit;
+      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
+    }
+  }
+
+  int cudaBlocksPerGrid;
+  if (team_num <= 0) {
+    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
+      if (KernelInfo->ExecutionMode == SPMD) {
+        // We have a combined construct, i.e. `target teams distribute parallel
+        // for [simd]`. We launch so many teams so that each thread will
+        // execute one iteration of the loop.
+        // round up to the nearest integer
+        cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
+      } else {
+        // If we reach this point, then we have a non-combined construct, i.e.
+        // `teams distribute` with a nested `parallel for` and each team is
+        // assigned one iteration of the `distribute` loop. E.g.:
+        //
+        // #pragma omp target teams distribute
+        // for(...loop_tripcount...) {
+        //   #pragma omp parallel for
+        //   for(...) {}
+        // }
+        //
+        // Threads within a team will execute the iterations of the `parallel`
+        // loop.
+        cudaBlocksPerGrid = loop_tripcount;
+      }
+      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
+         "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
+         cudaThreadsPerBlock);
+    } else {
+      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
+      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
+    }
+  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
+    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
+    DP("Capping number of teams to team limit %d\n",
+       DeviceInfo.BlocksPerGrid[device_id]);
+  } else {
+    cudaBlocksPerGrid = team_num;
+    DP("Using requested number of teams %d\n", team_num);
+  }
+
+  // Run on the device.
+  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
+     cudaThreadsPerBlock);
+
+  // args.data() stays well-defined when arg_num is 0, unlike &args[0].
+  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
+      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, args.data(),
+      0);
+  if (err != CUDA_SUCCESS) {
+    DP("Device kernel launch failed!\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  DP("Launch of entry point at " DPxMOD " successful!\n",
+     DPxPTR(tgt_entry_ptr));
+
+  CUresult sync_err = cuCtxSynchronize();
+  if (sync_err != CUDA_SUCCESS) {
+    DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
+    CUDA_ERR_STRING(sync_err);
+    return OFFLOAD_FAIL;
+  } else {
+    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
+// Run a target region that has no teams construct.
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
+  // use one team and the default number of threads.
+  const int32_t team_num = 1;
+  const int32_t thread_limit = 0;
+  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+      tgt_offsets, arg_num, team_num, thread_limit, 0);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/final/libomptarget/plugins/exports b/final/libomptarget/plugins/exports
new file mode 100644
index 0000000..3f9f7d4
--- /dev/null
+++ b/final/libomptarget/plugins/exports
@@ -0,0 +1,15 @@
+VERS1.0 {
+  global:
+    __tgt_rtl_is_valid_binary;
+    __tgt_rtl_number_of_devices;
+    __tgt_rtl_init_device;
+    __tgt_rtl_load_binary;
+    __tgt_rtl_data_alloc;
+    __tgt_rtl_data_submit;
+    __tgt_rtl_data_retrieve;
+    __tgt_rtl_data_delete;
+    __tgt_rtl_run_target_team_region;
+    __tgt_rtl_run_target_region;
+  local:
+    *;
+};
diff --git a/final/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/final/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
new file mode 100644
index 0000000..951710a
--- /dev/null
+++ b/final/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
@@ -0,0 +1,340 @@
+//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for generic 64-bit machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <dlfcn.h>
+#include <ffi.h>
+#include <gelf.h>
+#include <link.h>
+#include <list>
+#include <string>
+#include <unistd.h>
+#include <vector>
+
+#include "omptargetplugin.h"
+
+#ifndef TARGET_NAME
+#define TARGET_NAME Generic ELF - 64bit
+#endif
+
+#ifndef TARGET_ELF_ID
+#define TARGET_ELF_ID 0
+#endif
+
+// DP(...) prints a debug message through DEBUGP when the build defines
+// OMPTARGET_DEBUG and DebugLevel is positive; otherwise it expands to an
+// empty statement. Both forms are wrapped in do { } while (false) so that
+// `DP(...);` behaves as exactly one statement in every context.
+#ifdef OMPTARGET_DEBUG
+static int DebugLevel = 0;
+
+#define GETNAME2(name) #name
+#define GETNAME(name) GETNAME2(name)
+#define DP(...) \
+  do { \
+    if (DebugLevel > 0) { \
+      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
+    } \
+  } while (false)
+#else // OMPTARGET_DEBUG
+#define DP(...) \
+  do { \
+  } while (false)
+#endif // OMPTARGET_DEBUG
+
+#include "../../common/elf_common.c"
+
+#define NUMBER_OF_DEVICES 4
+#define OFFLOADSECTIONNAME ".omp_offloading.entries"
+
+/// Array of Dynamic libraries loaded for this target.
+struct DynLibTy {
+  // Owned copy of the temporary file path. The path is built in a stack
+  // buffer inside __tgt_rtl_load_binary, so keeping only a pointer to it
+  // would dangle by the time the destructor removes the file.
+  std::string FileName;
+  void *Handle;
+};
+
+/// Keep entries table per device.
+struct FuncOrGblEntryTy {
+  __tgt_target_table Table;
+};
+
+/// Class containing all the device information.
+class RTLDeviceInfoTy {
+  // One list of entry tables per device; the last element is the current one.
+  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
+
+public:
+  std::list<DynLibTy> DynLibs;
+
+  // Record entry point associated with device.
+  void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin,
+                          __tgt_offload_entry *end) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncGblEntries[device_id].emplace_back();
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    E.Table.EntriesBegin = begin;
+    E.Table.EntriesEnd = end;
+  }
+
+  // Return true if the entry is associated with device.
+  bool findOffloadEntry(int32_t device_id, void *addr) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    for (__tgt_offload_entry *i = E.Table.EntriesBegin, *e = E.Table.EntriesEnd;
+         i < e; ++i) {
+      if (i->addr == addr)
+        return true;
+    }
+
+    return false;
+  }
+
+  // Return the pointer to the target entries table.
+  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    return &E.Table;
+  }
+
+  RTLDeviceInfoTy(int32_t num_devices) {
+#ifdef OMPTARGET_DEBUG
+    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
+      DebugLevel = std::stoi(envStr);
+    }
+#endif // OMPTARGET_DEBUG
+
+    FuncGblEntries.resize(num_devices);
+  }
+
+  ~RTLDeviceInfoTy() {
+    // Close dynamic libraries
+    for (auto &lib : DynLibs) {
+      if (lib.Handle) {
+        dlclose(lib.Handle);
+        remove(lib.FileName.c_str());
+      }
+    }
+  }
+};
+
+static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Return non-zero if the image is an ELF object for TARGET_ELF_ID.
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+// If we don't have a valid ELF ID we can just fail.
+#if TARGET_ELF_ID < 1
+  return 0;
+#else
+  return elf_check_machine(image, TARGET_ELF_ID);
+#endif
+}
+
+int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; }
+
+int32_t __tgt_rtl_init_device(int32_t device_id) { return OFFLOAD_SUCCESS; }
+
+// Locate the offload entries section in the image, write the image to a
+// temporary file, dlopen() it, and build the entries table from the loaded
+// library. Returns the table, or NULL on failure.
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+                                          __tgt_device_image *image) {
+
+  DP("Dev %d: load binary from " DPxMOD " image\n", device_id,
+     DPxPTR(image->ImageStart));
+
+  assert(device_id >= 0 && device_id < NUMBER_OF_DEVICES && "bad dev id");
+
+  size_t ImageSize = (size_t)image->ImageEnd - (size_t)image->ImageStart;
+  size_t NumEntries = (size_t)(image->EntriesEnd - image->EntriesBegin);
+  DP("Expecting to have %zd entries defined.\n", NumEntries);
+
+  // Is the library version incompatible with the header file?
+  if (elf_version(EV_CURRENT) == EV_NONE) {
+    DP("Incompatible ELF library!\n");
+    return NULL;
+  }
+
+  // Obtain elf handler
+  Elf *e = elf_memory((char *)image->ImageStart, ImageSize);
+  if (!e) {
+    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
+    return NULL;
+  }
+
+  if (elf_kind(e) != ELF_K_ELF) {
+    DP("Invalid Elf kind!\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  // Find the entries section offset
+  Elf_Scn *section = 0;
+  Elf64_Off entries_offset = 0;
+
+  size_t shstrndx;
+
+  if (elf_getshdrstrndx(e, &shstrndx)) {
+    DP("Unable to get ELF strings index!\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  while ((section = elf_nextscn(e, section))) {
+    GElf_Shdr hdr;
+    // Skip sections whose header or name cannot be read instead of using
+    // an unset header or passing NULL to strcmp.
+    if (!gelf_getshdr(section, &hdr))
+      continue;
+
+    const char *section_name = elf_strptr(e, shstrndx, hdr.sh_name);
+    if (section_name && !strcmp(section_name, OFFLOADSECTIONNAME)) {
+      entries_offset = hdr.sh_addr;
+      break;
+    }
+  }
+
+  if (!entries_offset) {
+    DP("Entries Section Offset Not Found\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  DP("Offset of entries section is (" DPxMOD ").\n", DPxPTR(entries_offset));
+
+  // load dynamic library and get the entry points. We use the dl library
+  // to do the loading of the library, but we could do it directly to avoid the
+  // dump to the temporary file.
+  //
+  // 1) Create tmp file with the library contents.
+  // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
+  char tmp_name[] = "/tmp/tmpfile_XXXXXX";
+  int tmp_fd = mkstemp(tmp_name);
+
+  if (tmp_fd == -1) {
+    elf_end(e);
+    return NULL;
+  }
+
+  FILE *ftmp = fdopen(tmp_fd, "wb");
+
+  if (!ftmp) {
+    // No stream took ownership of the descriptor: close it and drop the file.
+    close(tmp_fd);
+    remove(tmp_name);
+    elf_end(e);
+    return NULL;
+  }
+
+  fwrite(image->ImageStart, ImageSize, 1, ftmp);
+  fclose(ftmp);
+
+  DynLibTy Lib = {tmp_name, dlopen(tmp_name, RTLD_LAZY)};
+
+  if (!Lib.Handle) {
+    DP("Target library loading error: %s\n", dlerror());
+    // The library is not tracked in DynLibs, so remove its temp file here.
+    remove(tmp_name);
+    elf_end(e);
+    return NULL;
+  }
+
+  DeviceInfo.DynLibs.push_back(Lib);
+
+  struct link_map *libInfo = (struct link_map *)Lib.Handle;
+
+  // The place where the entries info is loaded is the library base address
+  // plus the offset determined from the ELF file.
+  Elf64_Addr entries_addr = libInfo->l_addr + entries_offset;
+
+  DP("Pointer to first entry to be loaded is (" DPxMOD ").\n",
+     DPxPTR(entries_addr));
+
+  // Table of pointers to all the entries in the target.
+  __tgt_offload_entry *entries_table = (__tgt_offload_entry *)entries_addr;
+
+  __tgt_offload_entry *entries_begin = &entries_table[0];
+  __tgt_offload_entry *entries_end = entries_begin + NumEntries;
+
+  if (!entries_begin) {
+    DP("Can't obtain entries begin\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  DP("Entries table range is (" DPxMOD ")->(" DPxMOD ")\n",
+     DPxPTR(entries_begin), DPxPTR(entries_end));
+  DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end);
+
+  elf_end(e);
+
+  return DeviceInfo.getOffloadEntriesTable(device_id);
+}
+
+// The "device" is the host itself, so device memory is plain heap memory.
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
+  void *ptr = malloc(size);
+  return ptr;
+}
+
+int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
+                              int64_t size) {
+  memcpy(tgt_ptr, hst_ptr, size);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
+                                int64_t size) {
+  memcpy(hst_ptr, tgt_ptr, size);
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
+  free(tgt_ptr);
+  return OFFLOAD_SUCCESS;
+}
+
+// Call the entry point through libffi, passing every argument as a pointer
+// to (tgt_args[i] + tgt_offsets[i]). Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
+    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
+    int32_t thread_limit, uint64_t loop_tripcount /*not used*/) {
+  // ignore team num and thread limit.
+
+  // Use libffi to launch execution.
+  ffi_cif cif;
+
+  // All args are references.
+  std::vector<ffi_type *> args_types(arg_num, &ffi_type_pointer);
+  std::vector<void *> args(arg_num);
+  std::vector<void *> ptrs(arg_num);
+
+  for (int32_t i = 0; i < arg_num; ++i) {
+    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
+    args[i] = &ptrs[i];
+  }
+
+  // data() stays well-defined when arg_num is 0, unlike &vec[0].
+  ffi_status status = ffi_prep_cif(&cif, FFI_DEFAULT_ABI, arg_num,
+                                   &ffi_type_void, args_types.data());
+
+  assert(status == FFI_OK && "Unable to prepare target launch!");
+
+  if (status != FFI_OK)
+    return OFFLOAD_FAIL;
+
+  DP("Running entry point at " DPxMOD "...\n", DPxPTR(tgt_entry_ptr));
+
+  void (*entry)(void);
+  *((void**) &entry) = tgt_entry_ptr;
+  ffi_call(&cif, entry, NULL, args.data());
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
+  // use one team and one thread.
+  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+      tgt_offsets, arg_num, 1, 1, 0);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/final/libomptarget/plugins/ppc64/CMakeLists.txt b/final/libomptarget/plugins/ppc64/CMakeLists.txt
new file mode 100644
index 0000000..6849a03
--- /dev/null
+++ b/final/libomptarget/plugins/ppc64/CMakeLists.txt
@@ -0,0 +1,18 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a ppc64 machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
+else()
+  libomptarget_say("Not building ppc64 offloading plugin: machine not found in the system.")
+endif()
diff --git a/final/libomptarget/plugins/ppc64le/CMakeLists.txt b/final/libomptarget/plugins/ppc64le/CMakeLists.txt
new file mode 100644
index 0000000..87cefdf
--- /dev/null
+++ b/final/libomptarget/plugins/ppc64le/CMakeLists.txt
@@ -0,0 +1,21 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a ppc64le machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+# build_generic_elf64() is defined outside this file. NOTE(review): the last
+# argument is presumably the ELF e_machine id checked by the plugin (21 would
+# be EM_PPC64) -- confirm against the macro definition.
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21")
+else()
+  libomptarget_say("Not building ppc64le offloading plugin: machine not found in the system.")
+endif()
diff --git a/final/libomptarget/plugins/x86_64/CMakeLists.txt b/final/libomptarget/plugins/x86_64/CMakeLists.txt
new file mode 100644
index 0000000..bdd5bba
--- /dev/null
+++ b/final/libomptarget/plugins/x86_64/CMakeLists.txt
@@ -0,0 +1,21 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a x86_64 machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+# build_generic_elf64() is defined outside this file. NOTE(review): the last
+# argument is presumably the ELF e_machine id checked by the plugin (62 would
+# be EM_X86_64) -- confirm against the macro definition.
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
+else()
+  libomptarget_say("Not building x86_64 offloading plugin: machine not found in the system.")
+endif()
diff --git a/final/libomptarget/src/CMakeLists.txt b/final/libomptarget/src/CMakeLists.txt
new file mode 100644
index 0000000..8e552d4
--- /dev/null
+++ b/final/libomptarget/src/CMakeLists.txt
@@ -0,0 +1,35 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build offloading library libomptarget.so.
+#
+##===----------------------------------------------------------------------===##
+
+libomptarget_say("Building offloading runtime library libomptarget.")
+
+set(src_files
+  api.cpp
+  device.cpp
+  interface.cpp
+  rtl.cpp
+  omptarget.cpp
+)
+
+# Build libomptarget library with libdl dependency.
+# NOTE(review): target_link_libraries() is called without a PRIVATE/PUBLIC
+# keyword; switching to the keyword form requires that every other call on
+# this target uses it too -- confirm before changing.
+add_library(omptarget SHARED ${src_files})
+target_link_libraries(omptarget
+  ${CMAKE_DL_LIBS}
+  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports")
+
+# Install libomptarget under the lib destination folder.
+install(TARGETS omptarget LIBRARY COMPONENT omptarget
+  DESTINATION "${OPENMP_INSTALL_LIBDIR}")
diff --git a/final/libomptarget/src/api.cpp b/final/libomptarget/src/api.cpp
new file mode 100644
index 0000000..15c1d2c
--- /dev/null
+++ b/final/libomptarget/src/api.cpp
@@ -0,0 +1,283 @@
+//===----------- api.cpp - Target independent OpenMP target RTL -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of OpenMP API interface functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include <omptarget.h>
+
+#include "device.h"
+#include "private.h"
+#include "rtl.h"
+
+#include <climits>
+#include <cstring>
+#include <cstdlib>
+
+// Returns the current number of entries in Devices, read under RTLsMtx.
+EXTERN int omp_get_num_devices(void) {
+  RTLsMtx.lock();
+  size_t Devices_size = Devices.size();
+  RTLsMtx.unlock();
+
+  DP("Call to omp_get_num_devices returning %zd\n", Devices_size);
+
+  return Devices_size;
+}
+
+// Returns the id used for the host device (HOST_DEVICE).
+EXTERN int omp_get_initial_device(void) {
+  DP("Call to omp_get_initial_device returning %d\n", HOST_DEVICE);
+  return HOST_DEVICE;
+}
+
+// Allocates `size` bytes on device `device_num`: malloc() for the host
+// device, the RTL's data_alloc otherwise. Returns NULL for a zero size or
+// when the device is not ready.
+EXTERN void *omp_target_alloc(size_t size, int device_num) {
+  DP("Call to omp_target_alloc for device %d requesting %zu bytes\n",
+      device_num, size);
+
+  // size is unsigned, so this only rejects size == 0.
+  if (size <= 0) {
+    DP("Call to omp_target_alloc with non-positive length\n");
+    return NULL;
+  }
+
+  void *rc = NULL;
+
+  if (device_num == omp_get_initial_device()) {
+    rc = malloc(size);
+    DP("omp_target_alloc returns host ptr " DPxMOD "\n", DPxPTR(rc));
+    return rc;
+  }
+
+  if (!device_is_ready(device_num)) {
+    DP("omp_target_alloc returns NULL ptr\n");
+    return NULL;
+  }
+
+  DeviceTy &Device = Devices[device_num];
+  rc = Device.RTL->data_alloc(Device.RTLDeviceID, size, NULL);
+  DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc));
+  return rc;
+}
+
+// Releases memory obtained from omp_target_alloc on the same device.
+// A NULL pointer or a device that is not ready is silently ignored.
+EXTERN void omp_target_free(void *device_ptr, int device_num) {
+  DP("Call to omp_target_free for device %d and address " DPxMOD "\n",
+      device_num, DPxPTR(device_ptr));
+
+  if (!device_ptr) {
+    DP("Call to omp_target_free with NULL ptr\n");
+    return;
+  }
+
+  if (device_num == omp_get_initial_device()) {
+    free(device_ptr);
+    DP("omp_target_free deallocated host ptr\n");
+    return;
+  }
+
+  if (!device_is_ready(device_num)) {
+    DP("omp_target_free returns, nothing to do\n");
+    return;
+  }
+
+  DeviceTy &Device = Devices[device_num];
+  Device.RTL->data_delete(Device.RTLDeviceID, (void *)device_ptr);
+  DP("omp_target_free deallocated device ptr\n");
+}
+
+// Returns nonzero if `ptr` has a mapping on device `device_num`. The host
+// device always reports true for a non-NULL pointer; an out-of-range device
+// id reports false.
+EXTERN int omp_target_is_present(void *ptr, int device_num) {
+  DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n",
+      device_num, DPxPTR(ptr));
+
+  if (!ptr) {
+    DP("Call to omp_target_is_present with NULL ptr, returning false\n");
+    return false;
+  }
+
+  if (device_num == omp_get_initial_device()) {
+    DP("Call to omp_target_is_present on host, returning true\n");
+    return true;
+  }
+
+  RTLsMtx.lock();
+  size_t Devices_size = Devices.size();
+  RTLsMtx.unlock();
+  // The cast makes a negative id a huge value, so it is rejected here too.
+  if (Devices_size <= (size_t)device_num) {
+    DP("Call to omp_target_is_present with invalid device ID, returning "
+        "false\n");
+    return false;
+  }
+
+  DeviceTy& Device = Devices[device_num];
+  bool IsLast; // not used
+  int rc = (Device.getTgtPtrBegin(ptr, 0, IsLast, false) != NULL);
+  DP("Call to omp_target_is_present returns %d\n", rc);
+  return rc;
+}
+
+// Copies `length` bytes from src + src_offset on src_device to
+// dst + dst_offset on dst_device. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
+// Device-to-device copies are staged through a temporary host buffer.
+EXTERN int omp_target_memcpy(void *dst, void *src, size_t length,
+    size_t dst_offset, size_t src_offset, int dst_device, int src_device) {
+  DP("Call to omp_target_memcpy, dst device %d, src device %d, "
+      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
+      "src offset %zu, length %zu\n", dst_device, src_device, DPxPTR(dst),
+      DPxPTR(src), dst_offset, src_offset, length);
+
+  if (!dst || !src || length <= 0) {
+    DP("Call to omp_target_memcpy with invalid arguments\n");
+    return OFFLOAD_FAIL;
+  }
+
+  if (src_device != omp_get_initial_device() && !device_is_ready(src_device)) {
+    DP("omp_target_memcpy returns OFFLOAD_FAIL\n");
+    return OFFLOAD_FAIL;
+  }
+
+  if (dst_device != omp_get_initial_device() && !device_is_ready(dst_device)) {
+    DP("omp_target_memcpy returns OFFLOAD_FAIL\n");
+    return OFFLOAD_FAIL;
+  }
+
+  int rc = OFFLOAD_SUCCESS;
+  void *srcAddr = (char *)src + src_offset;
+  void *dstAddr = (char *)dst + dst_offset;
+
+  if (src_device == omp_get_initial_device() &&
+      dst_device == omp_get_initial_device()) {
+    DP("copy from host to host\n");
+    const void *p = memcpy(dstAddr, srcAddr, length);
+    if (p == NULL)
+      rc = OFFLOAD_FAIL;
+  } else if (src_device == omp_get_initial_device()) {
+    DP("copy from host to device\n");
+    DeviceTy& DstDev = Devices[dst_device];
+    rc = DstDev.data_submit(dstAddr, srcAddr, length);
+  } else if (dst_device == omp_get_initial_device()) {
+    DP("copy from device to host\n");
+    DeviceTy& SrcDev = Devices[src_device];
+    rc = SrcDev.data_retrieve(dstAddr, srcAddr, length);
+  } else {
+    DP("copy from device to device\n");
+    void *buffer = malloc(length);
+    if (!buffer) {
+      // Without a staging buffer the copy cannot be performed.
+      DP("omp_target_memcpy failed to allocate a staging buffer\n");
+      rc = OFFLOAD_FAIL;
+    } else {
+      DeviceTy& SrcDev = Devices[src_device];
+      DeviceTy& DstDev = Devices[dst_device];
+      rc = SrcDev.data_retrieve(buffer, srcAddr, length);
+      if (rc == OFFLOAD_SUCCESS)
+        rc = DstDev.data_submit(dstAddr, buffer, length);
+      // The staging buffer is only needed for the duration of the copy.
+      free(buffer);
+    }
+  }
+
+  DP("omp_target_memcpy returns %d\n", rc);
+  return rc;
+}
+
+// Copies a rectangular sub-volume of a num_dims-dimensional array by
+// recursing over the outermost dimension until a 1-D omp_target_memcpy
+// remains. When both dst and src are NULL, returns INT_MAX, reported as the
+// maximum supported number of dimensions. Otherwise returns OFFLOAD_SUCCESS,
+// OFFLOAD_FAIL, or the first failing result of a nested copy.
+EXTERN int omp_target_memcpy_rect(void *dst, void *src, size_t element_size,
+    int num_dims, const size_t *volume, const size_t *dst_offsets,
+    const size_t *src_offsets, const size_t *dst_dimensions,
+    const size_t *src_dimensions, int dst_device, int src_device) {
+  DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, "
+      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
+      "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
+      "volume " DPxMOD ", element size %zu, num_dims %d\n", dst_device,
+      src_device, DPxPTR(dst), DPxPTR(src), DPxPTR(dst_offsets),
+      DPxPTR(src_offsets), DPxPTR(dst_dimensions), DPxPTR(src_dimensions),
+      DPxPTR(volume), element_size, num_dims);
+
+  if (!(dst || src)) {
+    DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n",
+        INT_MAX);
+    return INT_MAX;
+  }
+
+  if (!dst || !src || element_size < 1 || num_dims < 1 || !volume ||
+      !dst_offsets || !src_offsets || !dst_dimensions || !src_dimensions) {
+    DP("Call to omp_target_memcpy_rect with invalid arguments\n");
+    return OFFLOAD_FAIL;
+  }
+
+  // Initialized so that an empty outer dimension (volume[0] == 0), which
+  // performs no nested copy, reports success instead of an indeterminate
+  // value.
+  int rc = OFFLOAD_SUCCESS;
+  if (num_dims == 1) {
+    rc = omp_target_memcpy(dst, src, element_size * volume[0],
+        element_size * dst_offsets[0], element_size * src_offsets[0],
+        dst_device, src_device);
+  } else {
+    // Size in bytes of one slice along the outermost dimension.
+    size_t dst_slice_size = element_size;
+    size_t src_slice_size = element_size;
+    for (int i=1; i<num_dims; ++i) {
+      dst_slice_size *= dst_dimensions[i];
+      src_slice_size *= src_dimensions[i];
+    }
+
+    size_t dst_off = dst_offsets[0] * dst_slice_size;
+    size_t src_off = src_offsets[0] * src_slice_size;
+    for (size_t i=0; i<volume[0]; ++i) {
+      rc = omp_target_memcpy_rect((char *) dst + dst_off + dst_slice_size * i,
+          (char *) src + src_off + src_slice_size * i, element_size,
+          num_dims - 1, volume + 1, dst_offsets + 1, src_offsets + 1,
+          dst_dimensions + 1, src_dimensions + 1, dst_device, src_device);
+
+      if (rc) {
+        DP("Recursive call to omp_target_memcpy_rect returns unsuccessfully\n");
+        return rc;
+      }
+    }
+  }
+
+  DP("omp_target_memcpy_rect returns %d\n", rc);
+  return rc;
+}
+
+// Associates host_ptr with device_ptr + device_offset on device_num through
+// DeviceTy::associatePtr. Fails for NULL pointers, a zero size, the host
+// device, or a device that is not ready.
+EXTERN int omp_target_associate_ptr(void *host_ptr, void *device_ptr,
+    size_t size, size_t device_offset, int device_num) {
+  DP("Call to omp_target_associate_ptr with host_ptr " DPxMOD ", "
+      "device_ptr " DPxMOD ", size %zu, device_offset %zu, device_num %d\n",
+      DPxPTR(host_ptr), DPxPTR(device_ptr), size, device_offset, device_num);
+
+  if (!host_ptr || !device_ptr || size <= 0) {
+    DP("Call to omp_target_associate_ptr with invalid arguments\n");
+    return OFFLOAD_FAIL;
+  }
+
+  if (device_num == omp_get_initial_device()) {
+    DP("omp_target_associate_ptr: no association possible on the host\n");
+    return OFFLOAD_FAIL;
+  }
+
+  if (!device_is_ready(device_num)) {
+    DP("omp_target_associate_ptr returns OFFLOAD_FAIL\n");
+    return OFFLOAD_FAIL;
+  }
+
+  DeviceTy& Device = Devices[device_num];
+  void *device_addr = (void *)((uint64_t)device_ptr + (uint64_t)device_offset);
+  int rc = Device.associatePtr(host_ptr, device_addr, size);
+  DP("omp_target_associate_ptr returns %d\n", rc);
+  return rc;
+}
+
+// Removes the association of host_ptr on device_num through
+// DeviceTy::disassociatePtr. Fails for a NULL host_ptr, the host device, or a
+// device that is not ready.
+EXTERN int omp_target_disassociate_ptr(void *host_ptr, int device_num) {
+  DP("Call to omp_target_disassociate_ptr with host_ptr " DPxMOD ", "
+      "device_num %d\n", DPxPTR(host_ptr), device_num);
+
+  if (!host_ptr) {
+    DP("Call to omp_target_disassociate_ptr with invalid host_ptr\n");
+    return OFFLOAD_FAIL;
+  }
+
+  if (device_num == omp_get_initial_device()) {
+    DP("omp_target_disassociate_ptr: no association possible on the host\n");
+    return OFFLOAD_FAIL;
+  }
+
+  if (!device_is_ready(device_num)) {
+    DP("omp_target_disassociate_ptr returns OFFLOAD_FAIL\n");
+    return OFFLOAD_FAIL;
+  }
+
+  DeviceTy& Device = Devices[device_num];
+  int rc = Device.disassociatePtr(host_ptr);
+  DP("omp_target_disassociate_ptr returns %d\n", rc);
+  return rc;
+}
diff --git a/final/libomptarget/src/device.cpp b/final/libomptarget/src/device.cpp
new file mode 100644
index 0000000..bac6127
--- /dev/null
+++ b/final/libomptarget/src/device.cpp
@@ -0,0 +1,365 @@
+//===--------- device.cpp - Target independent OpenMP target RTL ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Functionality for managing devices that are handled by RTL plugins.
+//
+//===----------------------------------------------------------------------===//
+
+#include "device.h"
+#include "private.h"
+#include "rtl.h"
+
+#include <cassert>
+#include <climits>
+#include <string>
+
+/// Map between Device ID (i.e. openmp device id) and its DeviceTy.
+DevicesTy Devices;
+
+// Record an association between a host range and a device address.
+// Re-associating the identical triple (begin, end, device address) succeeds
+// as a no-op; any other association for the same host begin address fails.
+// New entries get an "infinite" reference count (INF_REF_CNT).
+int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) {
+  const uintptr_t HostBegin = (uintptr_t)HstPtrBegin;
+  const uintptr_t HostEnd = HostBegin + Size;
+  const uintptr_t TargetBegin = (uintptr_t)TgtPtrBegin;
+
+  DataMapMtx.lock();
+
+  // The first entry starting at the same host address decides the outcome.
+  for (auto &Known : HostDataToTargetMap) {
+    if (Known.HstPtrBegin != HostBegin)
+      continue;
+
+    const bool SameAssociation =
+        Known.HstPtrEnd == HostEnd && Known.TgtPtrBegin == TargetBegin;
+    DataMapMtx.unlock();
+
+    if (!SameAssociation) {
+      DP("Not allowed to re-associate a different device ptr+offset with the "
+          "same host ptr\n");
+      return OFFLOAD_FAIL;
+    }
+
+    DP("Attempt to re-associate the same device ptr+offset with the same "
+        "host ptr, nothing to do\n");
+    return OFFLOAD_SUCCESS;
+  }
+
+  // No entry yet: build one whose base equals its begin address and whose
+  // reference count is infinite.
+  HostDataToTargetTy Fresh(HostBegin, HostBegin, HostEnd, TargetBegin,
+      INF_REF_CNT);
+
+  DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", HstEnd="
+      DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(Fresh.HstPtrBase),
+      DPxPTR(Fresh.HstPtrBegin), DPxPTR(Fresh.HstPtrEnd),
+      DPxPTR(Fresh.TgtPtrBegin));
+  HostDataToTargetMap.push_front(Fresh);
+
+  DataMapMtx.unlock();
+
+  return OFFLOAD_SUCCESS;
+}
+
+// Remove the association that starts at HstPtrBegin. Only entries whose
+// reference count is considered infinite are removed; anything else, or no
+// matching entry at all, yields OFFLOAD_FAIL.
+int DeviceTy::disassociatePtr(void *HstPtrBegin) {
+  const uintptr_t HostBegin = (uintptr_t)HstPtrBegin;
+
+  DataMapMtx.lock();
+
+  // Locate the first entry that starts at the requested host address.
+  HostDataToTargetListTy::iterator Pos = HostDataToTargetMap.begin();
+  while (Pos != HostDataToTargetMap.end() && Pos->HstPtrBegin != HostBegin)
+    ++Pos;
+
+  if (Pos != HostDataToTargetMap.end()) {
+    if (CONSIDERED_INF(Pos->RefCount)) {
+      DP("Association found, removing it\n");
+      HostDataToTargetMap.erase(Pos);
+      DataMapMtx.unlock();
+      return OFFLOAD_SUCCESS;
+    }
+
+    DP("Trying to disassociate a pointer which was not mapped via "
+        "omp_target_associate_ptr\n");
+  }
+
+  // Either nothing matched or the match was not removable.
+  DataMapMtx.unlock();
+  DP("Association not found\n");
+  return OFFLOAD_FAIL;
+}
+
+// Reference count of the first map entry whose host range contains
+// HstPtrBegin, or -1 when there is none.
+long DeviceTy::getMapEntryRefCnt(void *HstPtrBegin) {
+  const uintptr_t HostAddr = (uintptr_t)HstPtrBegin;
+  long Count = -1;
+
+  DataMapMtx.lock();
+  for (auto &Known : HostDataToTargetMap) {
+    const bool Inside =
+        HostAddr >= Known.HstPtrBegin && HostAddr < Known.HstPtrEnd;
+    if (!Inside)
+      continue;
+
+    DP("DeviceTy::getMapEntry: requested entry found\n");
+    Count = Known.RefCount;
+    break;
+  }
+  DataMapMtx.unlock();
+
+  if (Count < 0) {
+    DP("DeviceTy::getMapEntry: requested entry not found\n");
+  }
+
+  return Count;
+}
+
+// Find the first map entry that the host range [HstPtrBegin,
+// HstPtrBegin + Size) is contained in or partially overlaps. The flags in the
+// result describe the relation; Entry points at the matching element. This
+// function takes no lock itself.
+LookupResult DeviceTy::lookupMapping(void *HstPtrBegin, int64_t Size) {
+  uintptr_t hp = (uintptr_t)HstPtrBegin;
+  const uintptr_t he = hp + Size;
+  LookupResult Result;
+
+  DP("Looking up mapping(HstPtrBegin=" DPxMOD ", Size=%ld)...\n", DPxPTR(hp),
+      Size);
+  Result.Entry = HostDataToTargetMap.begin();
+  while (Result.Entry != HostDataToTargetMap.end()) {
+    auto &Known = *Result.Entry;
+    // Fully inside the mapped range?
+    Result.Flags.IsContained =
+        hp >= Known.HstPtrBegin && hp < Known.HstPtrEnd &&
+        he <= Known.HstPtrEnd;
+    // Starts before the mapped range and reaches into it?
+    Result.Flags.ExtendsBefore = hp < Known.HstPtrBegin &&
+        he > Known.HstPtrBegin;
+    // Starts before the mapped end and runs past it?
+    Result.Flags.ExtendsAfter = hp < Known.HstPtrEnd && he > Known.HstPtrEnd;
+
+    const bool Overlaps = Result.Flags.IsContained ||
+        Result.Flags.ExtendsBefore || Result.Flags.ExtendsAfter;
+    if (Overlaps)
+      break;
+
+    ++Result.Entry;
+  }
+
+  if (Result.Flags.ExtendsBefore) {
+    DP("WARNING: Pointer is not mapped but section extends into already "
+        "mapped data\n");
+  }
+  if (Result.Flags.ExtendsAfter) {
+    DP("WARNING: Pointer is already mapped but section extends beyond mapped "
+        "region\n");
+  }
+
+  return Result;
+}
+
+// Used by target_data_begin
+// Return the target pointer begin (where the data will be moved).
+// Allocate memory if this is the first occurrence of this mapping.
+// Increment the reference counter.
+// If NULL is returned, then either data allocation failed or the user tried
+// to do an illegal mapping.
+//
+// IsNew is set to false when an existing mapping is reused and to true when a
+// new device allocation is attempted; it is left untouched on the other
+// paths (rejected explicit extension, or Size == 0 with no mapping).
+void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase,
+    int64_t Size, bool &IsNew, bool IsImplicit, bool UpdateRefCount) {
+  void *rc = NULL;
+  DataMapMtx.lock();
+  LookupResult lr = lookupMapping(HstPtrBegin, Size);
+
+  // Check if the pointer is contained.
+  if (lr.Flags.IsContained ||
+      ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && IsImplicit)) {
+    auto &HT = *lr.Entry;
+    IsNew = false;
+
+    if (UpdateRefCount)
+      ++HT.RefCount;
+
+    // Same offset into the device buffer as into the host range.
+    uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin);
+    DP("Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", "
+        "Size=%ld,%s RefCount=%s\n", (IsImplicit ? " (implicit)" : ""),
+        DPxPTR(HstPtrBegin), DPxPTR(tp), Size,
+        (UpdateRefCount ? " updated" : ""),
+        (CONSIDERED_INF(HT.RefCount)) ? "INF" :
+            std::to_string(HT.RefCount).c_str());
+    rc = (void *)tp;
+  } else if ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && !IsImplicit) {
+    // Explicit extension of mapped data - not allowed.
+    DP("Explicit extension of mapping is not allowed.\n");
+  } else if (Size) {
+    // If it is not contained and Size > 0 we should create a new entry for it.
+    IsNew = true;
+    uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin);
+    if (tp) {
+      DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", "
+          "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase),
+          DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size),
+          DPxPTR(tp));
+      HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase,
+          (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp));
+    } else {
+      // Do not record a mapping to device address 0: later lookups would
+      // hand out offsets from NULL as if they were valid device pointers.
+      DP("Device allocation of %ld bytes failed, no map entry created\n",
+          Size);
+    }
+    rc = (void *)tp;
+  }
+
+  DataMapMtx.unlock();
+  return rc;
+}
+
+// Used by target_data_begin, target_data_end, target_data_update and target.
+// Return the target pointer begin (where the data will be moved).
+// Decrement the reference counter if called from target_data_end.
+void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
+    bool UpdateRefCount) {
+  DataMapMtx.lock();
+  LookupResult Found = lookupMapping(HstPtrBegin, Size);
+
+  const bool Mapped = Found.Flags.IsContained || Found.Flags.ExtendsBefore ||
+      Found.Flags.ExtendsAfter;
+  if (!Mapped) {
+    IsLast = false;
+    DataMapMtx.unlock();
+    return NULL;
+  }
+
+  auto &Known = *Found.Entry;
+  // "Last" means no reference beyond the current one remains.
+  IsLast = Known.RefCount <= 1;
+
+  // The count is never taken below 1 here.
+  if (!IsLast && UpdateRefCount)
+    --Known.RefCount;
+
+  uintptr_t DevAddr =
+      Known.TgtPtrBegin + ((uintptr_t)HstPtrBegin - Known.HstPtrBegin);
+  DP("Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", "
+      "Size=%ld,%s RefCount=%s\n", DPxPTR(HstPtrBegin), DPxPTR(DevAddr), Size,
+      (UpdateRefCount ? " updated" : ""),
+      (CONSIDERED_INF(Known.RefCount)) ? "INF" :
+          std::to_string(Known.RefCount).c_str());
+
+  DataMapMtx.unlock();
+  return (void *)DevAddr;
+}
+
+// Return the target pointer begin (where the data will be moved).
+// Variant that does not take DataMapMtx; per the original note it is called
+// when loading global symbols from the fat binary.
+void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size) {
+  LookupResult Found = lookupMapping(HstPtrBegin, Size);
+  const bool Mapped = Found.Flags.IsContained || Found.Flags.ExtendsBefore ||
+      Found.Flags.ExtendsAfter;
+  if (!Mapped)
+    return NULL;
+
+  auto &Known = *Found.Entry;
+  const uintptr_t HostAddr = (uintptr_t)HstPtrBegin;
+  return (void *)(Known.TgtPtrBegin + (HostAddr - Known.HstPtrBegin));
+}
+
+// Drop one reference from the mapping that covers HstPtrBegin, or force the
+// count down to zero when ForceDelete is set. When the count reaches zero the
+// device memory is released through the RTL and the map entry is erased.
+// Returns OFFLOAD_FAIL when no mapping covers the address.
+int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete) {
+  DataMapMtx.lock();
+  LookupResult Found = lookupMapping(HstPtrBegin, Size);
+  const bool Mapped = Found.Flags.IsContained || Found.Flags.ExtendsBefore ||
+      Found.Flags.ExtendsAfter;
+
+  if (!Mapped) {
+    DP("Section to delete (hst addr " DPxMOD ") does not exist in the allocated"
+        " memory\n", DPxPTR(HstPtrBegin));
+    DataMapMtx.unlock();
+    return OFFLOAD_FAIL;
+  }
+
+  auto &Known = *Found.Entry;
+  // Forcing sets the count to 1 so the decrement below lands on zero.
+  if (ForceDelete)
+    Known.RefCount = 1;
+
+  --Known.RefCount;
+  if (Known.RefCount <= 0) {
+    assert(Known.RefCount == 0 && "did not expect a negative ref count");
+    DP("Deleting tgt data " DPxMOD " of size %ld\n",
+        DPxPTR(Known.TgtPtrBegin), Size);
+    RTL->data_delete(RTLDeviceID, (void *)Known.TgtPtrBegin);
+    DP("Removing%s mapping with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD
+        ", Size=%ld\n", (ForceDelete ? " (forced)" : ""),
+        DPxPTR(Known.HstPtrBegin), DPxPTR(Known.TgtPtrBegin), Size);
+    HostDataToTargetMap.erase(Found.Entry);
+  }
+
+  DataMapMtx.unlock();
+  return OFFLOAD_SUCCESS;
+}
+
+/// Init device, should not be called directly.
+void DeviceTy::init() {
+  // IsInit is only ever raised here, never cleared.
+  if (RTL->init_device(RTLDeviceID) == OFFLOAD_SUCCESS)
+    IsInit = true;
+}
+
+/// Thread-safe method to initialize the device only once.
+int32_t DeviceTy::initOnce() {
+  std::call_once(InitFlag, &DeviceTy::init, this);
+
+  // Once call_once() has returned, init() has run exactly once, either on
+  // this thread or on an earlier one. IsInit is true only if that single
+  // attempt succeeded, so it alone decides the result.
+  return IsInit ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
+}
+
+// Load binary to device.
+// The RTL's load_binary is invoked while holding the RTL's mutex.
+__tgt_target_table *DeviceTy::load_binary(void *Img) {
+  RTL->Mtx.lock();
+  __tgt_target_table *Table = RTL->load_binary(RTLDeviceID, Img);
+  RTL->Mtx.unlock();
+  return Table;
+}
+
+// Submit data to device.
+int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin,
+    int64_t Size) {
+  return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
+}
+
+// Retrieve data from device.
+int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin,
+    int64_t Size) {
+  return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
+}
+
+// Run region on device
+int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr,
+    ptrdiff_t *TgtOffsets, int32_t TgtVarsSize) {
+  return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
+      TgtVarsSize);
+}
+
+// Run team region on device.
+int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
+    ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams,
+    int32_t ThreadLimit, uint64_t LoopTripCount) {
+  return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
+      TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount);
+}
+
+/// Check whether a device has an associated RTL and initialize it if it's not
+/// already initialized.
+/// Returns false for an out-of-range device id or when initialization fails.
+bool device_is_ready(int device_num) {
+  DP("Checking whether device %d is ready.\n", device_num);
+  // Devices.size() can only change while registering a new
+  // library, so try to acquire the lock of RTLs' mutex.
+  RTLsMtx.lock();
+  size_t Devices_size = Devices.size();
+  RTLsMtx.unlock();
+  // The cast turns a negative id into a huge value, so it is rejected too.
+  if (Devices_size <= (size_t)device_num) {
+    DP("Device ID %d does not have a matching RTL\n", device_num);
+    return false;
+  }
+
+  // Get device info
+  DeviceTy &Device = Devices[device_num];
+
+  DP("Is the device %d (local ID %d) initialized? %d\n", device_num,
+      Device.RTLDeviceID, Device.IsInit);
+
+  // Init the device if not done before
+  if (!Device.IsInit && Device.initOnce() != OFFLOAD_SUCCESS) {
+    DP("Failed to init device %d\n", device_num);
+    return false;
+  }
+
+  DP("Device %d is ready to use.\n", device_num);
+
+  return true;
+}
diff --git a/final/libomptarget/src/device.h b/final/libomptarget/src/device.h
new file mode 100644
index 0000000..3c205d6
--- /dev/null
+++ b/final/libomptarget/src/device.h
@@ -0,0 +1,167 @@
+//===----------- device.h - Target independent OpenMP target RTL ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declarations for managing devices that are handled by RTL plugins.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_DEVICE_H
+#define _OMPTARGET_DEVICE_H
+
+#include <cstddef>
+#include <cstdint>
+#include <climits>
+#include <list>
+#include <map>
+#include <mutex>
+#include <vector>
+
+// Forward declarations.
+struct RTLInfoTy;
+struct __tgt_bin_desc;
+struct __tgt_target_table;
+
+#define INF_REF_CNT (LONG_MAX>>1) // leave room for additions/subtractions
+// The argument is parenthesized so that any expression can be passed safely.
+#define CONSIDERED_INF(x) ((x) > (INF_REF_CNT>>1))
+
+/// Map between host data and target data.
+struct HostDataToTargetTy {
+  uintptr_t HstPtrBase; // host info.
+  uintptr_t HstPtrBegin;
+  uintptr_t HstPtrEnd; // non-inclusive.
+
+  uintptr_t TgtPtrBegin; // target info.
+
+  long RefCount;
+
+  /// Empty entry: all addresses zero, reference count zero.
+  HostDataToTargetTy()
+      : HstPtrBase(0), HstPtrBegin(0), HstPtrEnd(0),
+        TgtPtrBegin(0), RefCount(0) {}
+  /// Entry with an initial reference count of one.
+  HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB)
+      : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E),
+        TgtPtrBegin(TB), RefCount(1) {}
+  /// Entry with an explicit reference count RF.
+  HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB,
+      long RF)
+      : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E),
+        TgtPtrBegin(TB), RefCount(RF) {}
+};
+
+typedef std::list<HostDataToTargetTy> HostDataToTargetListTy;
+
+/// Result of DeviceTy::lookupMapping: how the queried host range relates to
+/// the entry referenced by Entry.
+struct LookupResult {
+  struct {
+    unsigned IsContained : 1;
+    unsigned ExtendsBefore : 1;
+    unsigned ExtendsAfter : 1;
+  } Flags;
+
+  HostDataToTargetListTy::iterator Entry;
+
+  LookupResult() : Flags({0,0,0}), Entry() {}
+};
+
+/// Map for shadow pointers
+struct ShadowPtrValTy {
+  void *HstPtrVal;
+  void *TgtPtrAddr;
+  void *TgtPtrVal;
+};
+typedef std::map<void *, ShadowPtrValTy> ShadowPtrListTy;
+
+/// Lists of pending constructors and destructors; the typedef below keeps one
+/// such pair per __tgt_bin_desc.
+struct PendingCtorDtorListsTy {
+  std::list<void *> PendingCtors;
+  std::list<void *> PendingDtors;
+};
+typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy>
+    PendingCtorsDtorsPerLibrary;
+
+/// State of one device: the RTL that drives it, its host-to-target data map
+/// and the mutexes protecting that state.
+struct DeviceTy {
+  int32_t DeviceID;
+  RTLInfoTy *RTL;
+  int32_t RTLDeviceID;
+
+  bool IsInit;
+  std::once_flag InitFlag;
+  bool HasPendingGlobals;
+
+  HostDataToTargetListTy HostDataToTargetMap;
+  PendingCtorsDtorsPerLibrary PendingCtorsDtors;
+
+  ShadowPtrListTy ShadowPtrMap;
+
+  std::mutex DataMapMtx, PendingGlobalsMtx, ShadowMtx;
+
+  uint64_t loopTripCnt;
+
+  DeviceTy(RTLInfoTy *RTL)
+      : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(),
+        HasPendingGlobals(false), HostDataToTargetMap(),
+        PendingCtorsDtors(), ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(),
+        ShadowMtx(), loopTripCnt(0) {}
+
+  // The existence of mutexes makes DeviceTy non-copyable. We need to
+  // provide a copy constructor and an assignment operator explicitly.
+  // The copy receives fresh mutexes and a fresh once_flag; everything else is
+  // copied from d.
+  DeviceTy(const DeviceTy &d)
+      : DeviceID(d.DeviceID), RTL(d.RTL), RTLDeviceID(d.RTLDeviceID),
+        IsInit(d.IsInit), InitFlag(), HasPendingGlobals(d.HasPendingGlobals),
+        HostDataToTargetMap(d.HostDataToTargetMap),
+        PendingCtorsDtors(d.PendingCtorsDtors), ShadowPtrMap(d.ShadowPtrMap),
+        DataMapMtx(), PendingGlobalsMtx(),
+        ShadowMtx(), loopTripCnt(d.loopTripCnt) {}
+
+  // Assignment copies the data members and leaves this object's mutexes and
+  // once_flag as they are.
+  DeviceTy& operator=(const DeviceTy &d) {
+    DeviceID = d.DeviceID;
+    RTL = d.RTL;
+    RTLDeviceID = d.RTLDeviceID;
+    IsInit = d.IsInit;
+    HasPendingGlobals = d.HasPendingGlobals;
+    HostDataToTargetMap = d.HostDataToTargetMap;
+    PendingCtorsDtors = d.PendingCtorsDtors;
+    ShadowPtrMap = d.ShadowPtrMap;
+    loopTripCnt = d.loopTripCnt;
+
+    return *this;
+  }
+
+  long getMapEntryRefCnt(void *HstPtrBegin);
+  LookupResult lookupMapping(void *HstPtrBegin, int64_t Size);
+  void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size,
+      bool &IsNew, bool IsImplicit, bool UpdateRefCount = true);
+  void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size);
+  void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
+      bool UpdateRefCount);
+  // The pointer is a host address; it is named as in the definition in
+  // device.cpp.
+  int deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete);
+  int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
+  int disassociatePtr(void *HstPtrBegin);
+
+  // calls to RTL
+  int32_t initOnce();
+  __tgt_target_table *load_binary(void *Img);
+
+  int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size);
+  int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
+
+  int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr,
+      ptrdiff_t *TgtOffsets, int32_t TgtVarsSize);
+  int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
+      ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams,
+      int32_t ThreadLimit, uint64_t LoopTripCount);
+
+private:
+  // Call to RTL
+  void init(); // To be called only via DeviceTy::initOnce()
+};
+
+/// Map between Device ID (i.e. openmp device id) and its DeviceTy.
+typedef std::vector<DeviceTy> DevicesTy; +extern DevicesTy Devices; + +extern bool device_is_ready(int device_num); + +#endif diff --git a/final/libomptarget/src/exports b/final/libomptarget/src/exports new file mode 100644 index 0000000..8114751 --- /dev/null +++ b/final/libomptarget/src/exports @@ -0,0 +1,28 @@ +VERS1.0 { + global: + __tgt_register_lib; + __tgt_unregister_lib; + __tgt_target_data_begin; + __tgt_target_data_end; + __tgt_target_data_update; + __tgt_target; + __tgt_target_teams; + __tgt_target_data_begin_nowait; + __tgt_target_data_end_nowait; + __tgt_target_data_update_nowait; + __tgt_target_nowait; + __tgt_target_teams_nowait; + omp_get_num_devices; + omp_get_initial_device; + omp_target_alloc; + omp_target_free; + omp_target_is_present; + omp_target_memcpy; + omp_target_memcpy_rect; + omp_target_associate_ptr; + omp_target_disassociate_ptr; + __kmpc_push_target_tripcount; + local: + *; +}; + diff --git a/final/libomptarget/src/interface.cpp b/final/libomptarget/src/interface.cpp new file mode 100644 index 0000000..266e085 --- /dev/null +++ b/final/libomptarget/src/interface.cpp @@ -0,0 +1,319 @@ +//===-------- interface.cpp - Target independent OpenMP target RTL --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of the interface to be used by Clang during the codegen of a +// target region. 
+// +//===----------------------------------------------------------------------===// + +#include <omptarget.h> + +#include "device.h" +#include "private.h" +#include "rtl.h" + +#include <cassert> +#include <cstdlib> +#include <mutex> + +// Store target policy (disabled, mandatory, default) +kmp_target_offload_kind_t TargetOffloadPolicy = tgt_default; +std::mutex TargetOffloadMtx; + +//////////////////////////////////////////////////////////////////////////////// +/// manage the success or failure of a target constuct + +static void HandleDefaultTargetOffload() { + TargetOffloadMtx.lock(); + if (TargetOffloadPolicy == tgt_default) { + if (omp_get_num_devices() > 0) { + DP("Default TARGET OFFLOAD policy is now mandatory " + "(devicew were found)\n"); + TargetOffloadPolicy = tgt_mandatory; + } else { + DP("Default TARGET OFFLOAD policy is now disabled " + "(devices were not found)\n"); + TargetOffloadPolicy = tgt_disabled; + } + } + TargetOffloadMtx.unlock(); +} + +static int IsOffloadDisabled() { + if (TargetOffloadPolicy == tgt_default) HandleDefaultTargetOffload(); + return TargetOffloadPolicy == tgt_disabled; +} + +static void HandleTargetOutcome(bool success) { + switch (TargetOffloadPolicy) { + case tgt_disabled: + if (success) { + FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled"); + } + break; + case tgt_default: + FATAL_MESSAGE0(1, "default offloading policy must switched to " + "mandatory or disabled"); + break; + case tgt_mandatory: + if (!success) { + FATAL_MESSAGE0(1, "failure of target construct while offloading is mandatory"); + } + break; + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// adds a target shared library to the target execution image +EXTERN void __tgt_register_lib(__tgt_bin_desc *desc) { + RTLs.RegisterLib(desc); +} + +//////////////////////////////////////////////////////////////////////////////// +/// unloads a target shared library +EXTERN void 
__tgt_unregister_lib(__tgt_bin_desc *desc) { + RTLs.UnregisterLib(desc); +} + +/// creates host-to-target data mapping, stores it in the +/// libomptarget.so internal structure (an entry in a stack of data maps) +/// and passes the data to the device. +EXTERN void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + if (IsOffloadDisabled()) return; + + DP("Entering data begin region for device %" PRId64 " with %d mappings\n", + device_id, arg_num); + + // No devices available? + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + DP("Use default device id %" PRId64 "\n", device_id); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return; + } + + DeviceTy& Device = Devices[device_id]; + +#ifdef OMPTARGET_DEBUG + for (int i=0; i<arg_num; ++i) { + DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 + ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]), + arg_sizes[i], arg_types[i]); + } +#endif + + int rc = target_data_begin(Device, arg_num, args_base, + args, arg_sizes, arg_types); + HandleTargetOutcome(rc == OFFLOAD_SUCCESS); +} + +EXTERN void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, int32_t noAliasDepNum, + void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, 0); + + __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes, + arg_types); +} + +/// passes data from the target, releases target memory and destroys +/// the host-target mapping (top entry from the stack of data maps) +/// created by the last __tgt_target_data_begin. 
+EXTERN void __tgt_target_data_end(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + if (IsOffloadDisabled()) return; + DP("Entering data end region with %d mappings\n", arg_num); + + // No devices available? + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + RTLsMtx.lock(); + size_t Devices_size = Devices.size(); + RTLsMtx.unlock(); + if (Devices_size <= (size_t)device_id) { + DP("Device ID %" PRId64 " does not have a matching RTL.\n", device_id); + HandleTargetOutcome(false); + return; + } + + DeviceTy &Device = Devices[device_id]; + if (!Device.IsInit) { + DP("Uninit device: ignore"); + HandleTargetOutcome(false); + return; + } + +#ifdef OMPTARGET_DEBUG + for (int i=0; i<arg_num; ++i) { + DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 + ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]), + arg_sizes[i], arg_types[i]); + } +#endif + + int rc = target_data_end(Device, arg_num, args_base, + args, arg_sizes, arg_types); + HandleTargetOutcome(rc == OFFLOAD_SUCCESS); +} + +EXTERN void __tgt_target_data_end_nowait(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, int32_t noAliasDepNum, + void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, 0); + + __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes, + arg_types); +} + +EXTERN void __tgt_target_data_update(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + if (IsOffloadDisabled()) return; + DP("Entering data update with %d mappings\n", arg_num); + + // No devices available? 
+ if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return; + } + + DeviceTy& Device = Devices[device_id]; + int rc = target_data_update(Device, arg_num, args_base, + args, arg_sizes, arg_types); + HandleTargetOutcome(rc == OFFLOAD_SUCCESS); +} + +EXTERN void __tgt_target_data_update_nowait( + int64_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, 0); + + __tgt_target_data_update(device_id, arg_num, args_base, args, arg_sizes, + arg_types); +} + +EXTERN int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + if (IsOffloadDisabled()) return OFFLOAD_FAIL; + DP("Entering target region with entry point " DPxMOD " and device Id %" + PRId64 "\n", DPxPTR(host_ptr), device_id); + + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return OFFLOAD_FAIL; + } + +#ifdef OMPTARGET_DEBUG + for (int i=0; i<arg_num; ++i) { + DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 + ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]), + arg_sizes[i], arg_types[i]); + } +#endif + + int rc = target(device_id, host_ptr, arg_num, args_base, args, arg_sizes, + arg_types, 0, 0, false /*team*/); + HandleTargetOutcome(rc == OFFLOAD_SUCCESS); + return rc; +} + +EXTERN int __tgt_target_nowait(int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, + int64_t 
*arg_types, int32_t depNum, void *depList, int32_t noAliasDepNum, + void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, 0); + + return __tgt_target(device_id, host_ptr, arg_num, args_base, args, arg_sizes, + arg_types); +} + +EXTERN int __tgt_target_teams(int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, int32_t team_num, int32_t thread_limit) { + if (IsOffloadDisabled()) return OFFLOAD_FAIL; + DP("Entering target region with entry point " DPxMOD " and device Id %" + PRId64 "\n", DPxPTR(host_ptr), device_id); + + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return OFFLOAD_FAIL; + } + +#ifdef OMPTARGET_DEBUG + for (int i=0; i<arg_num; ++i) { + DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 + ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]), + arg_sizes[i], arg_types[i]); + } +#endif + + int rc = target(device_id, host_ptr, arg_num, args_base, args, arg_sizes, + arg_types, team_num, thread_limit, true /*team*/); + HandleTargetOutcome(rc == OFFLOAD_SUCCESS); + + return rc; +} + +EXTERN int __tgt_target_teams_nowait(int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, int32_t team_num, int32_t thread_limit, int32_t depNum, + void *depList, int32_t noAliasDepNum, void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, 0); + + return __tgt_target_teams(device_id, host_ptr, arg_num, args_base, args, + arg_sizes, arg_types, team_num, thread_limit); +} + + +// The trip count mechanism will be revised - this scheme is not thread-safe. 
+EXTERN void __kmpc_push_target_tripcount(int64_t device_id, + uint64_t loop_tripcount) { + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return; + } + + DP("__kmpc_push_target_tripcount(%" PRId64 ", %" PRIu64 ")\n", device_id, + loop_tripcount); + Devices[device_id].loopTripCnt = loop_tripcount; +} diff --git a/final/libomptarget/src/omptarget.cpp b/final/libomptarget/src/omptarget.cpp new file mode 100644 index 0000000..a23d82b --- /dev/null +++ b/final/libomptarget/src/omptarget.cpp @@ -0,0 +1,771 @@ +//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of the interface to be used by Clang during the codegen of a +// target region. +// +//===----------------------------------------------------------------------===// + +#include <omptarget.h> + +#include "device.h" +#include "private.h" +#include "rtl.h" + +#include <cassert> +#include <vector> + +#ifdef OMPTARGET_DEBUG +int DebugLevel = 0; +#endif // OMPTARGET_DEBUG + + + +/* All begin addresses for partially mapped structs must be 8-aligned in order + * to ensure proper alignment of members. E.g. + * + * struct S { + * int a; // 4-aligned + * int b; // 4-aligned + * int *p; // 8-aligned + * } s1; + * ... + * #pragma omp target map(tofrom: s1.b, s1.p[0:N]) + * { + * s1.b = 5; + * for (int i...) s1.p[i] = ...; + * } + * + * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and + * BeginAddress=&s1.b. 
Let's assume that the struct begins at address 0x100, + * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment + * requirements for its type. Now, when we allocate memory on the device, in + * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned. + * This means that the chunk of the struct on the device will start at a + * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and + * address of p will be a misaligned 0x204 (on the host there was no need to add + * padding between b and p, so p comes exactly 4 bytes after b). If the device + * kernel tries to access s1.p, a misaligned address error occurs (as reported + * by the CUDA plugin). By padding the begin address down to a multiple of 8 and + * extending the size of the allocated chuck accordingly, the chuck on the + * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and + * &s1.p=0x208, as they should be to satisfy the alignment requirements. + */ +static const int64_t alignment = 8; + +/// Map global data and execute pending ctors +static int InitLibrary(DeviceTy& Device) { + /* + * Map global data + */ + int32_t device_id = Device.DeviceID; + int rc = OFFLOAD_SUCCESS; + + Device.PendingGlobalsMtx.lock(); + TrlTblMtx.lock(); + for (HostEntriesBeginToTransTableTy::iterator + ii = HostEntriesBeginToTransTable.begin(); + ii != HostEntriesBeginToTransTable.end(); ++ii) { + TranslationTable *TransTable = &ii->second; + if (TransTable->TargetsTable[device_id] != 0) { + // Library entries have already been processed + continue; + } + + // 1) get image. + assert(TransTable->TargetsImages.size() > (size_t)device_id && + "Not expecting a device ID outside the table's bounds!"); + __tgt_device_image *img = TransTable->TargetsImages[device_id]; + if (!img) { + DP("No image loaded for device id %d.\n", device_id); + rc = OFFLOAD_FAIL; + break; + } + // 2) load image into the target table. 
+ __tgt_target_table *TargetTable = + TransTable->TargetsTable[device_id] = Device.load_binary(img); + // Unable to get table for this image: invalidate image and fail. + if (!TargetTable) { + DP("Unable to generate entries table for device id %d.\n", device_id); + TransTable->TargetsImages[device_id] = 0; + rc = OFFLOAD_FAIL; + break; + } + + // Verify whether the two table sizes match. + size_t hsize = + TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin; + size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin; + + // Invalid image for these host entries! + if (hsize != tsize) { + DP("Host and Target tables mismatch for device id %d [%zx != %zx].\n", + device_id, hsize, tsize); + TransTable->TargetsImages[device_id] = 0; + TransTable->TargetsTable[device_id] = 0; + rc = OFFLOAD_FAIL; + break; + } + + // process global data that needs to be mapped. + Device.DataMapMtx.lock(); + __tgt_target_table *HostTable = &TransTable->HostTable; + for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin, + *CurrHostEntry = HostTable->EntriesBegin, + *EntryDeviceEnd = TargetTable->EntriesEnd; + CurrDeviceEntry != EntryDeviceEnd; + CurrDeviceEntry++, CurrHostEntry++) { + if (CurrDeviceEntry->size != 0) { + // has data. + assert(CurrDeviceEntry->size == CurrHostEntry->size && + "data size mismatch"); + + // Fortran may use multiple weak declarations for the same symbol, + // therefore we must allow for multiple weak symbols to be loaded from + // the fat binary. Treat these mappings as any other "regular" mapping. + // Add entry to map. 
+ if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size)) + continue; + DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu" + "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), + CurrDeviceEntry->size); + Device.HostDataToTargetMap.push_front(HostDataToTargetTy( + (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, + (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, + (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/, + (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, + INF_REF_CNT /*RefCount*/)); + } + } + Device.DataMapMtx.unlock(); + } + TrlTblMtx.unlock(); + + if (rc != OFFLOAD_SUCCESS) { + Device.PendingGlobalsMtx.unlock(); + return rc; + } + + /* + * Run ctors for static objects + */ + if (!Device.PendingCtorsDtors.empty()) { + // Call all ctors for all libraries registered so far + for (auto &lib : Device.PendingCtorsDtors) { + if (!lib.second.PendingCtors.empty()) { + DP("Has pending ctors... call now\n"); + for (auto &entry : lib.second.PendingCtors) { + void *ctor = entry; + int rc = target(device_id, ctor, 0, NULL, NULL, NULL, + NULL, 1, 1, true /*team*/); + if (rc != OFFLOAD_SUCCESS) { + DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); + Device.PendingGlobalsMtx.unlock(); + return OFFLOAD_FAIL; + } + } + // Clear the list to indicate that this device has been used + lib.second.PendingCtors.clear(); + DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first)); + } + } + } + Device.HasPendingGlobals = false; + Device.PendingGlobalsMtx.unlock(); + + return OFFLOAD_SUCCESS; +} + +// Check whether a device has been initialized, global ctors have been +// executed and global data has been mapped; do so if not already done. +int CheckDeviceAndCtors(int64_t device_id) { + // Is device ready? + if (!device_is_ready(device_id)) { + DP("Device %" PRId64 " is not ready.\n", device_id); + return OFFLOAD_FAIL; + } + + // Get device info. 
+ DeviceTy &Device = Devices[device_id]; + + // Check whether global data has been mapped for this device + Device.PendingGlobalsMtx.lock(); + bool hasPendingGlobals = Device.HasPendingGlobals; + Device.PendingGlobalsMtx.unlock(); + if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) { + DP("Failed to init globals on device %" PRId64 "\n", device_id); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} + +static int32_t member_of(int64_t type) { + return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; +} + +/// Internal function to do the mapping and transfer the data to the device +int target_data_begin(DeviceTy &Device, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + // process each input. + for (int32_t i = 0; i < arg_num; ++i) { + // Ignore private variables and arrays - there is no mapping for them. + if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || + (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) + continue; + + void *HstPtrBegin = args[i]; + void *HstPtrBase = args_base[i]; + int64_t data_size = arg_sizes[i]; + + // Adjust for proper alignment if this is a combined entry (for structs). + // Look at the next argument - if that is MEMBER_OF this one, then this one + // is a combined entry. + int64_t padding = 0; + const int next_i = i+1; + if (member_of(arg_types[i]) < 0 && next_i < arg_num && + member_of(arg_types[next_i]) == i) { + padding = (int64_t)HstPtrBegin % alignment; + if (padding) { + DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD + "\n", padding, DPxPTR(HstPtrBegin)); + HstPtrBegin = (char *) HstPtrBegin - padding; + data_size += padding; + } + } + + // Address of pointer on the host and device, respectively. 
+ void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin; + bool IsNew, Pointer_IsNew; + bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT; + // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we + // have reached this point via __tgt_target_data_begin and not __tgt_target + // then no argument is marked as TARGET_PARAM ("omp target data map" is not + // associated with a target region, so there are no target parameters). This + // may be considered a hack, we could revise the scheme in the future. + bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF); + if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { + DP("Has a pointer entry: \n"); + // base is address of pointer. + Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase, + sizeof(void *), Pointer_IsNew, IsImplicit, UpdateRef); + if (!Pointer_TgtPtrBegin) { + DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " + "illegal mapping).\n"); + return OFFLOAD_FAIL; + } + DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" + "\n", sizeof(void *), DPxPTR(Pointer_TgtPtrBegin), + (Pointer_IsNew ? "" : " not")); + Pointer_HstPtrBegin = HstPtrBase; + // modify current entry. + HstPtrBase = *(void **)HstPtrBase; + UpdateRef = true; // subsequently update ref count of pointee + } + + void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase, + data_size, IsNew, IsImplicit, UpdateRef); + if (!TgtPtrBegin && data_size) { + // If data_size==0, then the argument could be a zero-length pointer to + // NULL, so getOrAlloc() returning NULL is not an error. + DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " + "illegal mapping).\n"); + } + DP("There are %" PRId64 " bytes allocated at target address " DPxMOD + " - is%s new\n", data_size, DPxPTR(TgtPtrBegin), + (IsNew ? 
"" : " not")); + + if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) { + uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase; + void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta); + DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase)); + args_base[i] = TgtPtrBase; + } + + if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { + bool copy = false; + if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) { + copy = true; + } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) { + // Copy data only if the "parent" struct has RefCount==1. + int32_t parent_idx = member_of(arg_types[i]); + long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); + assert(parent_rc > 0 && "parent struct not found"); + if (parent_rc == 1) { + copy = true; + } + } + + if (copy) { + DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", + data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed.\n"); + return OFFLOAD_FAIL; + } + } + } + + if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { + DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", + DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); + uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; + void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); + int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase, + sizeof(void *)); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed.\n"); + return OFFLOAD_FAIL; + } + // create shadow pointers for this entry + Device.ShadowMtx.lock(); + Device.ShadowPtrMap[Pointer_HstPtrBegin] = {HstPtrBase, + Pointer_TgtPtrBegin, TgtPtrBase}; + Device.ShadowMtx.unlock(); + } + } + + return OFFLOAD_SUCCESS; +} + +/// Internal function to undo the mapping and retrieve the data from the device. 
+int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types) { + // process each input. + for (int32_t i = arg_num - 1; i >= 0; --i) { + // Ignore private variables and arrays - there is no mapping for them. + // Also, ignore the use_device_ptr directive, it has no effect here. + if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || + (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) + continue; + + void *HstPtrBegin = args[i]; + int64_t data_size = arg_sizes[i]; + // Adjust for proper alignment if this is a combined entry (for structs). + // Look at the next argument - if that is MEMBER_OF this one, then this one + // is a combined entry. + int64_t padding = 0; + const int next_i = i+1; + if (member_of(arg_types[i]) < 0 && next_i < arg_num && + member_of(arg_types[next_i]) == i) { + padding = (int64_t)HstPtrBegin % alignment; + if (padding) { + DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD + "\n", padding, DPxPTR(HstPtrBegin)); + HstPtrBegin = (char *) HstPtrBegin - padding; + data_size += padding; + } + } + + bool IsLast; + bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) || + (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ); + bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE; + + // If PTR_AND_OBJ, HstPtrBegin is address of pointee + void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast, + UpdateRef); + DP("There are %" PRId64 " bytes allocated at target address " DPxMOD + " - is%s last\n", data_size, DPxPTR(TgtPtrBegin), + (IsLast ? 
"" : " not")); + + bool DelEntry = IsLast || ForceDelete; + + if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && + !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { + DelEntry = false; // protect parent struct from being deallocated + } + + if ((arg_types[i] & OMP_TGT_MAPTYPE_FROM) || DelEntry) { + // Move data back to the host + if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { + bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS; + bool CopyMember = false; + if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && + !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { + // Copy data only if the "parent" struct has RefCount==1. + int32_t parent_idx = member_of(arg_types[i]); + long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); + assert(parent_rc > 0 && "parent struct not found"); + if (parent_rc == 1) { + CopyMember = true; + } + } + + if (DelEntry || Always || CopyMember) { + DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", + data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data from device failed.\n"); + return OFFLOAD_FAIL; + } + } + } + + // If we copied back to the host a struct/array containing pointers, we + // need to restore the original host pointer values from their shadow + // copies. If the struct is going to be deallocated, remove any remaining + // shadow pointer entries for this struct. + uintptr_t lb = (uintptr_t) HstPtrBegin; + uintptr_t ub = (uintptr_t) HstPtrBegin + data_size; + Device.ShadowMtx.lock(); + for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); + it != Device.ShadowPtrMap.end();) { + void **ShadowHstPtrAddr = (void**) it->first; + + // An STL map is sorted on its keys; use this property + // to quickly determine when to break out of the loop. 
+ if ((uintptr_t) ShadowHstPtrAddr < lb) { + ++it; + continue; + } + if ((uintptr_t) ShadowHstPtrAddr >= ub) + break; + + // If we copied the struct to the host, we need to restore the pointer. + if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { + DP("Restoring original host pointer value " DPxMOD " for host " + "pointer " DPxMOD "\n", DPxPTR(it->second.HstPtrVal), + DPxPTR(ShadowHstPtrAddr)); + *ShadowHstPtrAddr = it->second.HstPtrVal; + } + // If the struct is to be deallocated, remove the shadow entry. + if (DelEntry) { + DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr)); + it = Device.ShadowPtrMap.erase(it); + } else { + ++it; + } + } + Device.ShadowMtx.unlock(); + + // Deallocate map + if (DelEntry) { + int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete); + if (rt != OFFLOAD_SUCCESS) { + DP("Deallocating data from device failed.\n"); + return OFFLOAD_FAIL; + } + } + } + } + + return OFFLOAD_SUCCESS; +} + +/// Internal function to pass data to/from the target. +int target_data_update(DeviceTy &Device, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + // process each input. 
+ for (int32_t i = 0; i < arg_num; ++i) { + if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || + (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) + continue; + + void *HstPtrBegin = args[i]; + int64_t MapSize = arg_sizes[i]; + bool IsLast; + void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast, + false); + if (!TgtPtrBegin) { + DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)); + continue; + } + + if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { + DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", + arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data from device failed.\n"); + return OFFLOAD_FAIL; + } + + uintptr_t lb = (uintptr_t) HstPtrBegin; + uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; + Device.ShadowMtx.lock(); + for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); + it != Device.ShadowPtrMap.end(); ++it) { + void **ShadowHstPtrAddr = (void**) it->first; + if ((uintptr_t) ShadowHstPtrAddr < lb) + continue; + if ((uintptr_t) ShadowHstPtrAddr >= ub) + break; + DP("Restoring original host pointer value " DPxMOD " for host pointer " + DPxMOD "\n", DPxPTR(it->second.HstPtrVal), + DPxPTR(ShadowHstPtrAddr)); + *ShadowHstPtrAddr = it->second.HstPtrVal; + } + Device.ShadowMtx.unlock(); + } + + if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { + DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", + arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed.\n"); + return OFFLOAD_FAIL; + } + uintptr_t lb = (uintptr_t) HstPtrBegin; + uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; + Device.ShadowMtx.lock(); + for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); + it != Device.ShadowPtrMap.end(); ++it) { + void **ShadowHstPtrAddr = 
(void**) it->first; + if ((uintptr_t) ShadowHstPtrAddr < lb) + continue; + if ((uintptr_t) ShadowHstPtrAddr >= ub) + break; + DP("Restoring original target pointer value " DPxMOD " for target " + "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal), + DPxPTR(it->second.TgtPtrAddr)); + rt = Device.data_submit(it->second.TgtPtrAddr, + &it->second.TgtPtrVal, sizeof(void *)); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed.\n"); + Device.ShadowMtx.unlock(); + return OFFLOAD_FAIL; + } + } + Device.ShadowMtx.unlock(); + } + } + return OFFLOAD_SUCCESS; +} + +static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ | + OMP_TGT_MAPTYPE_LITERAL | + OMP_TGT_MAPTYPE_IMPLICIT; +static bool isLambdaMapping(int64_t Mapping) { + return (Mapping & LambdaMapping) == LambdaMapping; +} + +/// performs the same actions as data_begin in case arg_num is +/// non-zero and initiates run of the offloaded region on the target platform; +/// if arg_num is non-zero after the region execution is done it also +/// performs the same action as data_update and data_end above. This function +/// returns 0 if it was able to transfer the execution to a target and an +/// integer different from zero otherwise. +int target(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, + int32_t team_num, int32_t thread_limit, int IsTeamConstruct) { + DeviceTy &Device = Devices[device_id]; + + // Find the table information in the map or look it up in the translation + // tables. + TableMap *TM = 0; + TblMapMtx.lock(); + HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap.find(host_ptr); + if (TableMapIt == HostPtrToTableMap.end()) { + // We don't have a map. So search all the registered libraries. 
+ TrlTblMtx.lock(); + for (HostEntriesBeginToTransTableTy::iterator + ii = HostEntriesBeginToTransTable.begin(), + ie = HostEntriesBeginToTransTable.end(); + !TM && ii != ie; ++ii) { + // get the translation table (which contains all the good info). + TranslationTable *TransTable = &ii->second; + // iterate over all the host table entries to see if we can locate the + // host_ptr. + __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin; + __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd; + __tgt_offload_entry *cur = begin; + for (uint32_t i = 0; cur < end; ++cur, ++i) { + if (cur->addr != host_ptr) + continue; + // we got a match, now fill the HostPtrToTableMap so that we + // may avoid this search next time. + TM = &HostPtrToTableMap[host_ptr]; + TM->Table = TransTable; + TM->Index = i; + break; + } + } + TrlTblMtx.unlock(); + } else { + TM = &TableMapIt->second; + } + TblMapMtx.unlock(); + + // No map for this host pointer found! + if (!TM) { + DP("Host ptr " DPxMOD " does not have a matching target pointer.\n", + DPxPTR(host_ptr)); + return OFFLOAD_FAIL; + } + + // get target table. + TrlTblMtx.lock(); + assert(TM->Table->TargetsTable.size() > (size_t)device_id && + "Not expecting a device ID outside the table's bounds!"); + __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id]; + TrlTblMtx.unlock(); + assert(TargetTable && "Global data has not been mapped\n"); + + // Move data to device. 
+ int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes, + arg_types); + if (rc != OFFLOAD_SUCCESS) { + DP("Call to target_data_begin failed, abort target.\n"); + return OFFLOAD_FAIL; + } + + std::vector<void *> tgt_args; + std::vector<ptrdiff_t> tgt_offsets; + + // List of (first-)private arrays allocated for this target region + std::vector<void *> fpArrays; + std::vector<int> tgtArgsPositions(arg_num, -1); + + for (int32_t i = 0; i < arg_num; ++i) { + if (!(arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM)) { + // This is not a target parameter, do not push it into tgt_args. + // Check for lambda mapping. + if (isLambdaMapping(arg_types[i])) { + assert((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && + "PTR_AND_OBJ must be also MEMBER_OF."); + unsigned idx = member_of(arg_types[i]); + int tgtIdx = tgtArgsPositions[idx]; + assert(tgtIdx != -1 && "Base address must be translated already."); + // The parent lambda must be processed already and it must be the last + // in tgt_args and tgt_offsets arrays. + void *HstPtrVal = args[i]; + void *HstPtrBegin = args_base[i]; + void *HstPtrBase = args[idx]; + bool IsLast; // unused. 
+ void *TgtPtrBase = + (void *)((intptr_t)tgt_args[tgtIdx] + tgt_offsets[tgtIdx]); + DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase)); + uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; + void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta); + void *Pointer_TgtPtrBegin = + Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false); + if (!Pointer_TgtPtrBegin) { + DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n", + DPxPTR(HstPtrVal)); + continue; + } + DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", + DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); + int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin, + sizeof(void *)); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed.\n"); + return OFFLOAD_FAIL; + } + } + continue; + } + void *HstPtrBegin = args[i]; + void *HstPtrBase = args_base[i]; + void *TgtPtrBegin; + ptrdiff_t TgtBaseOffset; + bool IsLast; // unused. + if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) { + DP("Forwarding first-private value " DPxMOD " to the target construct\n", + DPxPTR(HstPtrBase)); + TgtPtrBegin = HstPtrBase; + TgtBaseOffset = 0; + } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) { + // Allocate memory for (first-)private array + TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID, + arg_sizes[i], HstPtrBegin); + if (!TgtPtrBegin) { + DP ("Data allocation for %sprivate array " DPxMOD " failed, " + "abort target.\n", + (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), + DPxPTR(HstPtrBegin)); + return OFFLOAD_FAIL; + } + fpArrays.push_back(TgtPtrBegin); + TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; +#ifdef OMPTARGET_DEBUG + void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); + DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for " + "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n", + arg_sizes[i], DPxPTR(TgtPtrBegin), + (arg_types[i] & OMP_TGT_MAPTYPE_TO ? 
"first-" : ""), + DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase)); +#endif + // If first-private, copy data from host + if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]); + if (rt != OFFLOAD_SUCCESS) { + DP ("Copying data to device failed, failed.\n"); + return OFFLOAD_FAIL; + } + } + } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { + TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast, + false); + TgtBaseOffset = 0; // no offset for ptrs. + DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to " + "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase), + DPxPTR(HstPtrBase)); + } else { + TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast, + false); + TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; +#ifdef OMPTARGET_DEBUG + void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); + DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n", + DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); +#endif + } + tgtArgsPositions[i] = tgt_args.size(); + tgt_args.push_back(TgtPtrBegin); + tgt_offsets.push_back(TgtBaseOffset); + } + + assert(tgt_args.size() == tgt_offsets.size() && + "Size mismatch in arguments and offsets"); + + // Pop loop trip count + uint64_t ltc = Device.loopTripCnt; + Device.loopTripCnt = 0; + + // Launch device execution. 
+ DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", + TargetTable->EntriesBegin[TM->Index].name, + DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index); + if (IsTeamConstruct) { + rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr, + &tgt_args[0], &tgt_offsets[0], tgt_args.size(), team_num, + thread_limit, ltc); + } else { + rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr, + &tgt_args[0], &tgt_offsets[0], tgt_args.size()); + } + if (rc != OFFLOAD_SUCCESS) { + DP ("Executing target region abort target.\n"); + return OFFLOAD_FAIL; + } + + // Deallocate (first-)private arrays + for (auto it : fpArrays) { + int rt = Device.RTL->data_delete(Device.RTLDeviceID, it); + if (rt != OFFLOAD_SUCCESS) { + DP("Deallocation of (first-)private arrays failed.\n"); + return OFFLOAD_FAIL; + } + } + + // Move data from device. + int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes, + arg_types); + if (rt != OFFLOAD_SUCCESS) { + DP("Call to target_data_end failed, abort targe.\n"); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} diff --git a/final/libomptarget/src/private.h b/final/libomptarget/src/private.h new file mode 100644 index 0000000..3b61295 --- /dev/null +++ b/final/libomptarget/src/private.h @@ -0,0 +1,87 @@ +//===---------- private.h - Target independent OpenMP target RTL ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Private function declarations and helper macros for debugging output. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_PRIVATE_H +#define _OMPTARGET_PRIVATE_H + +#include <omptarget.h> + +#include <cstdint> + +extern int target_data_begin(DeviceTy &Device, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); + +extern int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types); + +extern int target_data_update(DeviceTy &Device, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); + +extern int target(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, + int32_t team_num, int32_t thread_limit, int IsTeamConstruct); + +extern int CheckDeviceAndCtors(int64_t device_id); + +// enum for OMP_TARGET_OFFLOAD; keep in sync with kmp.h definition +enum kmp_target_offload_kind { + tgt_disabled = 0, + tgt_default = 1, + tgt_mandatory = 2 +}; +typedef enum kmp_target_offload_kind kmp_target_offload_kind_t; +extern kmp_target_offload_kind_t TargetOffloadPolicy; + +//////////////////////////////////////////////////////////////////////////////// +// implemtation for fatal messages +//////////////////////////////////////////////////////////////////////////////// + +#define FATAL_MESSAGE0(_num, _str) \ + do { \ + fprintf(stderr, "Libomptarget fatal error %d: %s\n", _num, _str); \ + exit(1); \ + } while (0) + +#define FATAL_MESSAGE(_num, _str, ...) \ + do { \ + fprintf(stderr, "Libomptarget fatal error %d:" _str "\n", _num, \ + __VA_ARGS__); \ + exit(1); \ + } while (0) + +// Implemented in libomp, they are called from within __tgt_* functions. 
+#ifdef __cplusplus +extern "C" { +#endif +// functions that extract info from libomp; keep in sync +int omp_get_default_device(void) __attribute__((weak)); +int32_t __kmpc_omp_taskwait(void *loc_ref, int32_t gtid) __attribute__((weak)); +int __kmpc_get_target_offload(void) __attribute__((weak)); +#ifdef __cplusplus +} +#endif + +#ifdef OMPTARGET_DEBUG +extern int DebugLevel; + +#define DP(...) \ + do { \ + if (DebugLevel > 0) { \ + DEBUGP("Libomptarget", __VA_ARGS__); \ + } \ + } while (false) +#else // OMPTARGET_DEBUG +#define DP(...) {} +#endif // OMPTARGET_DEBUG + +#endif diff --git a/final/libomptarget/src/rtl.cpp b/final/libomptarget/src/rtl.cpp new file mode 100644 index 0000000..fd46477 --- /dev/null +++ b/final/libomptarget/src/rtl.cpp @@ -0,0 +1,363 @@ +//===----------- rtl.cpp - Target independent OpenMP target RTL -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Functionality for handling RTL plugins. +// +//===----------------------------------------------------------------------===// + +#include "device.h" +#include "private.h" +#include "rtl.h" + +#include <cassert> +#include <cstdlib> +#include <cstring> +#include <dlfcn.h> +#include <mutex> +#include <string> + +// List of all plugins that can support offloading. 
+static const char *RTLNames[] = { + /* PowerPC target */ "libomptarget.rtl.ppc64.so", + /* x86_64 target */ "libomptarget.rtl.x86_64.so", + /* CUDA target */ "libomptarget.rtl.cuda.so", + /* AArch64 target */ "libomptarget.rtl.aarch64.so"}; + +RTLsTy RTLs; +std::mutex RTLsMtx; + +HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable; +std::mutex TrlTblMtx; + +HostPtrToTableMapTy HostPtrToTableMap; +std::mutex TblMapMtx; + +void RTLsTy::LoadRTLs() { +#ifdef OMPTARGET_DEBUG + if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { + DebugLevel = std::stoi(envStr); + } +#endif // OMPTARGET_DEBUG + + // Parse environment variable OMP_TARGET_OFFLOAD (if set) + TargetOffloadPolicy = (kmp_target_offload_kind_t) __kmpc_get_target_offload(); + if (TargetOffloadPolicy == tgt_disabled) { + return; + } + + DP("Loading RTLs...\n"); + + // Attempt to open all the plugins and, if they exist, check if the interface + // is correct and if they are supporting any devices. + for (auto *Name : RTLNames) { + DP("Loading library '%s'...\n", Name); + void *dynlib_handle = dlopen(Name, RTLD_NOW); + + if (!dynlib_handle) { + // Library does not exist or cannot be found. + DP("Unable to load library '%s': %s!\n", Name, dlerror()); + continue; + } + + DP("Successfully loaded library '%s'!\n", Name); + + // Retrieve the RTL information from the runtime library. 
+ RTLInfoTy R; + + R.LibraryHandler = dynlib_handle; + R.isUsed = false; + +#ifdef OMPTARGET_DEBUG + R.RTLName = Name; +#endif + + if (!(*((void**) &R.is_valid_binary) = dlsym( + dynlib_handle, "__tgt_rtl_is_valid_binary"))) + continue; + if (!(*((void**) &R.number_of_devices) = dlsym( + dynlib_handle, "__tgt_rtl_number_of_devices"))) + continue; + if (!(*((void**) &R.init_device) = dlsym( + dynlib_handle, "__tgt_rtl_init_device"))) + continue; + if (!(*((void**) &R.load_binary) = dlsym( + dynlib_handle, "__tgt_rtl_load_binary"))) + continue; + if (!(*((void**) &R.data_alloc) = dlsym( + dynlib_handle, "__tgt_rtl_data_alloc"))) + continue; + if (!(*((void**) &R.data_submit) = dlsym( + dynlib_handle, "__tgt_rtl_data_submit"))) + continue; + if (!(*((void**) &R.data_retrieve) = dlsym( + dynlib_handle, "__tgt_rtl_data_retrieve"))) + continue; + if (!(*((void**) &R.data_delete) = dlsym( + dynlib_handle, "__tgt_rtl_data_delete"))) + continue; + if (!(*((void**) &R.run_region) = dlsym( + dynlib_handle, "__tgt_rtl_run_target_region"))) + continue; + if (!(*((void**) &R.run_team_region) = dlsym( + dynlib_handle, "__tgt_rtl_run_target_team_region"))) + continue; + + // No devices are supported by this RTL? + if (!(R.NumberOfDevices = R.number_of_devices())) { + DP("No devices supported in this RTL\n"); + continue; + } + + DP("Registering RTL %s supporting %d devices!\n", + R.RTLName.c_str(), R.NumberOfDevices); + + // The RTL is valid! Will save the information in the RTLs list. + AllRTLs.push_back(R); + } + + DP("RTLs loaded!\n"); + + return; +} + +//////////////////////////////////////////////////////////////////////////////// +// Functionality for registering libs + +static void RegisterImageIntoTranslationTable(TranslationTable &TT, + RTLInfoTy &RTL, __tgt_device_image *image) { + + // same size, as when we increase one, we also increase the other. 
+ assert(TT.TargetsTable.size() == TT.TargetsImages.size() && + "We should have as many images as we have tables!"); + + // Resize the Targets Table and Images to accommodate the new targets if + // required + unsigned TargetsTableMinimumSize = RTL.Idx + RTL.NumberOfDevices; + + if (TT.TargetsTable.size() < TargetsTableMinimumSize) { + TT.TargetsImages.resize(TargetsTableMinimumSize, 0); + TT.TargetsTable.resize(TargetsTableMinimumSize, 0); + } + + // Register the image in all devices for this target type. + for (int32_t i = 0; i < RTL.NumberOfDevices; ++i) { + // If we are changing the image we are also invalidating the target table. + if (TT.TargetsImages[RTL.Idx + i] != image) { + TT.TargetsImages[RTL.Idx + i] = image; + TT.TargetsTable[RTL.Idx + i] = 0; // lazy initialization of target table. + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Functionality for registering Ctors/Dtors + +static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc, + __tgt_device_image *img, RTLInfoTy *RTL) { + + for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) { + DeviceTy &Device = Devices[RTL->Idx + i]; + Device.PendingGlobalsMtx.lock(); + Device.HasPendingGlobals = true; + for (__tgt_offload_entry *entry = img->EntriesBegin; + entry != img->EntriesEnd; ++entry) { + if (entry->flags & OMP_DECLARE_TARGET_CTOR) { + DP("Adding ctor " DPxMOD " to the pending list.\n", + DPxPTR(entry->addr)); + Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr); + } else if (entry->flags & OMP_DECLARE_TARGET_DTOR) { + // Dtors are pushed in reverse order so they are executed from end + // to beginning when unregistering the library! 
+ DP("Adding dtor " DPxMOD " to the pending list.\n", + DPxPTR(entry->addr)); + Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr); + } + + if (entry->flags & OMP_DECLARE_TARGET_LINK) { + DP("The \"link\" attribute is not yet supported!\n"); + } + } + Device.PendingGlobalsMtx.unlock(); + } +} + +void RTLsTy::RegisterLib(__tgt_bin_desc *desc) { + // Attempt to load all plugins available in the system. + std::call_once(initFlag, &RTLsTy::LoadRTLs, this); + + RTLsMtx.lock(); + // Register the images with the RTLs that understand them, if any. + for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { + // Obtain the image. + __tgt_device_image *img = &desc->DeviceImages[i]; + + RTLInfoTy *FoundRTL = NULL; + + // Scan the RTLs that have associated images until we find one that supports + // the current image. + for (auto &R : RTLs.AllRTLs) { + if (!R.is_valid_binary(img)) { + DP("Image " DPxMOD " is NOT compatible with RTL %s!\n", + DPxPTR(img->ImageStart), R.RTLName.c_str()); + continue; + } + + DP("Image " DPxMOD " is compatible with RTL %s!\n", + DPxPTR(img->ImageStart), R.RTLName.c_str()); + + // If this RTL is not already in use, initialize it. + if (!R.isUsed) { + // Initialize the device information for the RTL we are about to use. + DeviceTy device(&R); + size_t start = Devices.size(); + Devices.resize(start + R.NumberOfDevices, device); + for (int32_t device_id = 0; device_id < R.NumberOfDevices; + device_id++) { + // global device ID + Devices[start + device_id].DeviceID = start + device_id; + // RTL local device ID + Devices[start + device_id].RTLDeviceID = device_id; + } + + // Initialize the index of this RTL and save it in the used RTLs. + R.Idx = (RTLs.UsedRTLs.empty()) + ? 
0 + : RTLs.UsedRTLs.back()->Idx + + RTLs.UsedRTLs.back()->NumberOfDevices; + assert((size_t) R.Idx == start && + "RTL index should equal the number of devices used so far."); + R.isUsed = true; + RTLs.UsedRTLs.push_back(&R); + + DP("RTL " DPxMOD " has index %d!\n", DPxPTR(R.LibraryHandler), R.Idx); + } + + // Initialize (if necessary) translation table for this library. + TrlTblMtx.lock(); + if(!HostEntriesBeginToTransTable.count(desc->HostEntriesBegin)){ + TranslationTable &tt = + HostEntriesBeginToTransTable[desc->HostEntriesBegin]; + tt.HostTable.EntriesBegin = desc->HostEntriesBegin; + tt.HostTable.EntriesEnd = desc->HostEntriesEnd; + } + + // Retrieve translation table for this library. + TranslationTable &TransTable = + HostEntriesBeginToTransTable[desc->HostEntriesBegin]; + + DP("Registering image " DPxMOD " with RTL %s!\n", + DPxPTR(img->ImageStart), R.RTLName.c_str()); + RegisterImageIntoTranslationTable(TransTable, R, img); + TrlTblMtx.unlock(); + FoundRTL = &R; + + // Load ctors/dtors for static objects + RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL); + + // if an RTL was found we are done - proceed to register the next image + break; + } + + if (!FoundRTL) { + DP("No RTL found for image " DPxMOD "!\n", DPxPTR(img->ImageStart)); + } + } + RTLsMtx.unlock(); + + + DP("Done registering entries!\n"); +} + +void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) { + DP("Unloading target library!\n"); + + RTLsMtx.lock(); + // Find which RTL understands each image, if any. + for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { + // Obtain the image. + __tgt_device_image *img = &desc->DeviceImages[i]; + + RTLInfoTy *FoundRTL = NULL; + + // Scan the RTLs that have associated images until we find one that supports + // the current image. We only need to scan RTLs that are already being used. 
+ for (auto *R : RTLs.UsedRTLs) { + + assert(R->isUsed && "Expecting used RTLs."); + + if (!R->is_valid_binary(img)) { + DP("Image " DPxMOD " is NOT compatible with RTL " DPxMOD "!\n", + DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); + continue; + } + + DP("Image " DPxMOD " is compatible with RTL " DPxMOD "!\n", + DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); + + FoundRTL = R; + + // Execute dtors for static objects if the device has been used, i.e. + // if its PendingCtors list has been emptied. + for (int32_t i = 0; i < FoundRTL->NumberOfDevices; ++i) { + DeviceTy &Device = Devices[FoundRTL->Idx + i]; + Device.PendingGlobalsMtx.lock(); + if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) { + for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) { + int rc = target(Device.DeviceID, dtor, 0, NULL, NULL, NULL, NULL, 1, + 1, true /*team*/); + if (rc != OFFLOAD_SUCCESS) { + DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor)); + } + } + // Remove this library's entry from PendingCtorsDtors + Device.PendingCtorsDtors.erase(desc); + } + Device.PendingGlobalsMtx.unlock(); + } + + DP("Unregistered image " DPxMOD " from RTL " DPxMOD "!\n", + DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); + + break; + } + + // if no RTL was found proceed to unregister the next image + if (!FoundRTL){ + DP("No RTLs in use support the image " DPxMOD "!\n", + DPxPTR(img->ImageStart)); + } + } + RTLsMtx.unlock(); + DP("Done unregistering images!\n"); + + // Remove entries from HostPtrToTableMap + TblMapMtx.lock(); + for (__tgt_offload_entry *cur = desc->HostEntriesBegin; + cur < desc->HostEntriesEnd; ++cur) { + HostPtrToTableMap.erase(cur->addr); + } + + // Remove translation table for this descriptor. 
+ auto tt = HostEntriesBeginToTransTable.find(desc->HostEntriesBegin); + if (tt != HostEntriesBeginToTransTable.end()) { + DP("Removing translation table for descriptor " DPxMOD "\n", + DPxPTR(desc->HostEntriesBegin)); + HostEntriesBeginToTransTable.erase(tt); + } else { + DP("Translation table for descriptor " DPxMOD " cannot be found, probably " + "it has been already removed.\n", DPxPTR(desc->HostEntriesBegin)); + } + + TblMapMtx.unlock(); + + // TODO: Remove RTL and the devices it manages if it's not used anymore? + // TODO: Write some RTL->unload_image(...) function? + + DP("Done unregistering library!\n"); +} diff --git a/final/libomptarget/src/rtl.h b/final/libomptarget/src/rtl.h new file mode 100644 index 0000000..2533e2c --- /dev/null +++ b/final/libomptarget/src/rtl.h @@ -0,0 +1,164 @@ +//===------------ rtl.h - Target independent OpenMP target RTL ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// Declarations for handling RTL plugins. +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_RTL_H +#define _OMPTARGET_RTL_H + +#include <list> +#include <map> +#include <mutex> +#include <string> +#include <vector> + +// Forward declarations. 
+struct DeviceTy; +struct __tgt_bin_desc; + +struct RTLInfoTy { + typedef int32_t(is_valid_binary_ty)(void *); + typedef int32_t(number_of_devices_ty)(); + typedef int32_t(init_device_ty)(int32_t); + typedef __tgt_target_table *(load_binary_ty)(int32_t, void *); + typedef void *(data_alloc_ty)(int32_t, int64_t, void *); + typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t); + typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t); + typedef int32_t(data_delete_ty)(int32_t, void *); + typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, + int32_t); + typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *, + int32_t, int32_t, int32_t, uint64_t); + + int32_t Idx; // RTL index, index is the number of devices + // of other RTLs that were registered before, + // i.e. the OpenMP index of the first device + // to be registered with this RTL. + int32_t NumberOfDevices; // Number of devices this RTL deals with. + + void *LibraryHandler; + +#ifdef OMPTARGET_DEBUG + std::string RTLName; +#endif + + // Functions implemented in the RTL. + is_valid_binary_ty *is_valid_binary; + number_of_devices_ty *number_of_devices; + init_device_ty *init_device; + load_binary_ty *load_binary; + data_alloc_ty *data_alloc; + data_submit_ty *data_submit; + data_retrieve_ty *data_retrieve; + data_delete_ty *data_delete; + run_region_ty *run_region; + run_team_region_ty *run_team_region; + + // Are there images associated with this RTL. + bool isUsed; + + // Mutex for thread-safety when calling RTL interface functions. + // It is easier to enforce thread-safety at the libomptarget level, + // so that developers of new RTLs do not have to worry about it. + std::mutex Mtx; + + // The existence of the mutex above makes RTLInfoTy non-copyable. + // We need to provide a copy constructor explicitly. 
+ RTLInfoTy() + : Idx(-1), NumberOfDevices(-1), LibraryHandler(0), +#ifdef OMPTARGET_DEBUG + RTLName(), +#endif + is_valid_binary(0), number_of_devices(0), init_device(0), + load_binary(0), data_alloc(0), data_submit(0), data_retrieve(0), + data_delete(0), run_region(0), run_team_region(0), isUsed(false), + Mtx() {} + + RTLInfoTy(const RTLInfoTy &r) : Mtx() { + Idx = r.Idx; + NumberOfDevices = r.NumberOfDevices; + LibraryHandler = r.LibraryHandler; +#ifdef OMPTARGET_DEBUG + RTLName = r.RTLName; +#endif + is_valid_binary = r.is_valid_binary; + number_of_devices = r.number_of_devices; + init_device = r.init_device; + load_binary = r.load_binary; + data_alloc = r.data_alloc; + data_submit = r.data_submit; + data_retrieve = r.data_retrieve; + data_delete = r.data_delete; + run_region = r.run_region; + run_team_region = r.run_team_region; + isUsed = r.isUsed; + } +}; + +/// RTLs identified in the system. +class RTLsTy { +private: + // Mutex-like object to guarantee thread-safety and unique initialization + // (i.e. the library attempts to load the RTLs (plugins) only once). + std::once_flag initFlag; + void LoadRTLs(); // not thread-safe + +public: + // List of the detected runtime libraries. + std::list<RTLInfoTy> AllRTLs; + + // Array of pointers to the detected runtime libraries that have compatible + // binaries. + std::vector<RTLInfoTy *> UsedRTLs; + + explicit RTLsTy() {} + + // Register a shared library with all (compatible) RTLs. + void RegisterLib(__tgt_bin_desc *desc); + + // Unregister a shared library from all RTLs. + void UnregisterLib(__tgt_bin_desc *desc); +}; +extern RTLsTy RTLs; +extern std::mutex RTLsMtx; + + +/// Map between the host entry begin and the translation table. Each +/// registered library gets one TranslationTable. Use the map from +/// __tgt_offload_entry so that we may quickly determine whether we +/// are trying to (re)register an existing lib or really have a new one. 
+struct TranslationTable { + __tgt_target_table HostTable; + + // Image assigned to a given device. + std::vector<__tgt_device_image *> TargetsImages; // One image per device ID. + + // Table of entry points or NULL if it was not already computed. + std::vector<__tgt_target_table *> TargetsTable; // One table per device ID. +}; +typedef std::map<__tgt_offload_entry *, TranslationTable> + HostEntriesBeginToTransTableTy; +extern HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable; +extern std::mutex TrlTblMtx; + +/// Map between the host ptr and a table index +struct TableMap { + TranslationTable *Table; // table associated with the host ptr. + uint32_t Index; // index in which the host ptr translated entry is found. + TableMap() : Table(0), Index(0) {} + TableMap(TranslationTable *table, uint32_t index) + : Table(table), Index(index) {} +}; +typedef std::map<void *, TableMap> HostPtrToTableMapTy; +extern HostPtrToTableMapTy HostPtrToTableMap; +extern std::mutex TblMapMtx; + +#endif diff --git a/final/libomptarget/test/CMakeLists.txt b/final/libomptarget/test/CMakeLists.txt new file mode 100644 index 0000000..5950c77 --- /dev/null +++ b/final/libomptarget/test/CMakeLists.txt @@ -0,0 +1,19 @@ +# CMakeLists.txt file for unit testing OpenMP offloading runtime library. 
# The tests rely on Clang's offloading support; skip the whole test suite for
# any other compiler or for a Clang that is too old. Operands are quoted so an
# unset variable expands to an empty string instead of being compared by name.
if(NOT "${OPENMP_TEST_COMPILER_ID}" STREQUAL "Clang" OR
   "${OPENMP_TEST_COMPILER_VERSION}" VERSION_LESS 6.0.0)
  libomptarget_say("Can only test with Clang compiler in version 6.0.0 or later.")
  libomptarget_warning_say("The check-libomptarget target will not be available!")
  return()
endif()

# Consumed by lit.site.cfg.in.
if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug)
  set(LIBOMPTARGET_DEBUG True)
else()
  set(LIBOMPTARGET_DEBUG False)
endif()

add_openmp_testsuite(check-libomptarget
  "Running libomptarget tests"
  ${CMAKE_CURRENT_BINARY_DIR}
  DEPENDS omptarget omp)

# Configure the lit.site.cfg.in file
set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget configuration.\n# Do not edit!")
configure_file(
  "${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in"
  "${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg"
  @ONLY)
diff --git a/final/libomptarget/test/api/omp_get_num_devices.c b/final/libomptarget/test/api/omp_get_num_devices.c
new file mode 100644
index 0000000..d0e84db
--- /dev/null
+++ b/final/libomptarget/test/api/omp_get_num_devices.c
@@ -0,0 +1,36 @@
// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu

#include <stdio.h>
#include <omp.h>

int test_omp_get_num_devices()
{
  /* checks that omp_get_num_devices() > 0 */
  int num_devices = omp_get_num_devices();
  printf("num_devices = %d\n", num_devices);

  #pragma omp target
  {}

  return (num_devices > 0);
}

int main()
{
  int failed=0;

  if (!test_omp_get_num_devices()) {
    failed++;
  }
  if (failed)
    printf("FAIL\n");
  else
    printf("PASS\n");
  return failed;
}

// CHECK: PASS
diff --git a/final/libomptarget/test/env/omp_target_debug.c b/final/libomptarget/test/env/omp_target_debug.c
new file mode 100644
index 0000000..ce84c98
--- /dev/null
+++ b/final/libomptarget/test/env/omp_target_debug.c
@@ -0,0 +1,20 @@
// RUN:
%libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=NDEBUG +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=NDEBUG +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=NDEBUG +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=NDEBUG +// REQUIRES: libomptarget-debug + +int main(void) { +#pragma omp target + {} + return 0; +} + +// DEBUG: Libomptarget +// NDEBUG-NOT: Libomptarget +// NDEBUG-NOT: Target + diff --git a/final/libomptarget/test/lit.cfg b/final/libomptarget/test/lit.cfg new file mode 100644 index 0000000..4311605 --- /dev/null +++ b/final/libomptarget/test/lit.cfg @@ -0,0 +1,142 @@ +# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab 
tw=79:
+# Configuration file for the 'lit' test runner.
+
+import os
+import lit.formats
+
+# Tell pylint that we know config and lit_config exist somewhere.
+if 'PYLINT_IMPORT' in os.environ:
+    config = object()
+    lit_config = object()
+
+# Prepend `value` to environment variable `name`, joined to any old value by `sep`.
+def append_dynamic_library_path(name, value, sep):
+    if name in config.environment:
+        config.environment[name] = value + sep + config.environment[name]
+    else:
+        config.environment[name] = value
+
+# name: The name of this test suite.
+config.name = 'libomptarget'
+
+# suffixes: A list of file extensions to treat as test files.
+config.suffixes = ['.c', '.cpp', '.cc']
+
+# test_source_root: The root path where tests are located.
+config.test_source_root = os.path.dirname(__file__)
+
+# test_exec_root: The root object directory where output is placed
+config.test_exec_root = config.libomptarget_obj_root
+
+# test format
+config.test_format = lit.formats.ShTest()
+
+# compiler flags
+config.test_flags = " -I " + config.test_source_root + \
+    " -I " + config.omp_header_directory + \
+    " -L " + config.library_dir
+
+if config.omp_host_rtl_directory:
+    config.test_flags += " -L " + config.omp_host_rtl_directory
+
+config.test_flags = config.test_flags + " " + config.test_extra_flags
+
+# Allow REQUIRES / UNSUPPORTED / XFAIL to work
+config.target_triple = [ ]
+for feature in config.test_compiler_features:
+    config.available_features.add(feature)
+
+if config.libomptarget_debug:
+    config.available_features.add('libomptarget-debug')
+
+# Setup environment to find dynamic library at runtime
+if config.operating_system == 'Windows':
+    append_dynamic_library_path('PATH', config.library_dir, ";")
+    append_dynamic_library_path('PATH', config.omp_host_rtl_directory, ";")
+elif config.operating_system == 'Darwin':
+    append_dynamic_library_path('DYLD_LIBRARY_PATH', config.library_dir, ":")
+    append_dynamic_library_path('DYLD_LIBRARY_PATH', \
+        config.omp_host_rtl_directory, ":")
+    config.test_flags += " -Wl,-rpath," + config.library_dir
+    config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory
+else: # Unices
+    append_dynamic_library_path('LD_LIBRARY_PATH', config.library_dir, ":")
+    append_dynamic_library_path('LD_LIBRARY_PATH', \
+        config.omp_host_rtl_directory, ":")
+
+# substitutions
+# - for targets that exist in the system create the actual command.
+# - for valid targets that do not exist in the system, substitute a harmless
+#   echo, so that the same test can be used for different targets.
+
+# Scan all the valid targets.
+for libomptarget_target in config.libomptarget_all_targets:
+    # Is this target in the current system? If so create a compile, run and test
+    # command. Otherwise create a no-op echo command (which succeeds).
+    if libomptarget_target in config.libomptarget_system_targets:
+        config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \
+            libomptarget_target, \
+            "%libomptarget-compilexx-and-run-" + libomptarget_target + \
+            " | " + config.libomptarget_filecheck + " %s"))
+        config.substitutions.append(("%libomptarget-compile-run-and-check-" + \
+            libomptarget_target, \
+            "%libomptarget-compile-and-run-" + libomptarget_target + \
+            " | " + config.libomptarget_filecheck + " %s"))
+        config.substitutions.append(("%libomptarget-compilexx-and-run-" + \
+            libomptarget_target, \
+            "%libomptarget-compilexx-" + libomptarget_target + " && " + \
+            "%libomptarget-run-" + libomptarget_target))
+        config.substitutions.append(("%libomptarget-compile-and-run-" + \
+            libomptarget_target, \
+            "%libomptarget-compile-" + libomptarget_target + " && " + \
+            "%libomptarget-run-" + libomptarget_target))
+        config.substitutions.append(("%libomptarget-compilexx-" + \
+            libomptarget_target, \
+            "%clangxx-" + libomptarget_target + " %s -o %t-" + \
+            libomptarget_target))
+        config.substitutions.append(("%libomptarget-compile-" + \
+            libomptarget_target, \
+            "%clang-" + libomptarget_target + " %s -o %t-" + \
+            libomptarget_target))
+        config.substitutions.append(("%libomptarget-run-" + \
+            libomptarget_target, \
+            "%t-" + libomptarget_target))
+        config.substitutions.append(("%clangxx-" + libomptarget_target, \
+            "%clangxx %openmp_flags %flags -fopenmp-targets=" + libomptarget_target))
+        config.substitutions.append(("%clang-" + libomptarget_target, \
+            "%clang %openmp_flags %flags -fopenmp-targets=" + libomptarget_target))
+        config.substitutions.append(("%fcheck-" + libomptarget_target, \
+            config.libomptarget_filecheck + " %s"))
+    else:
+        config.substitutions.append(("%libomptarget-compile-run-and-check-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%libomptarget-compile-and-run-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%libomptarget-compilexx-and-run-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%libomptarget-compilexx-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%libomptarget-compile-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%libomptarget-run-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%clang-" + libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%clangxx-" + libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%fcheck-" + libomptarget_target, \
+            "echo ignored-command"))
+
+config.substitutions.append(("%clangxx", config.test_cxx_compiler))
+config.substitutions.append(("%clang", config.test_c_compiler))
+config.substitutions.append(("%openmp_flags", config.test_openmp_flags))
+config.substitutions.append(("%flags", config.test_flags))
diff --git a/final/libomptarget/test/lit.site.cfg.in b/final/libomptarget/test/lit.site.cfg.in
new file mode 100644
index 0000000..26ef492
--- /dev/null
+++
b/final/libomptarget/test/lit.site.cfg.in
@@ -0,0 +1,19 @@
+@AUTO_GEN_COMMENT@
+
+config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@"
+config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@"
+config.test_compiler_features = @OPENMP_TEST_COMPILER_FEATURES@  # unquoted: must expand to a Python expression
+config.test_openmp_flags = "@OPENMP_TEST_OPENMP_FLAGS@"
+config.test_extra_flags = "@OPENMP_TEST_FLAGS@"
+config.libomptarget_obj_root = "@CMAKE_CURRENT_BINARY_DIR@"
+config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@"
+config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@"
+config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@"
+config.operating_system = "@CMAKE_SYSTEM_NAME@"
+config.libomptarget_all_targets = "@LIBOMPTARGET_ALL_TARGETS@".split()  # whitespace-separated triples -> list
+config.libomptarget_system_targets = "@LIBOMPTARGET_SYSTEM_TARGETS@".split()  # whitespace-separated triples -> list
+config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@"
+config.libomptarget_debug = @LIBOMPTARGET_DEBUG@  # unquoted: must expand to a Python expression
+
+# Let the main config do the real work.
+lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg")
diff --git a/final/libomptarget/test/mapping/pr38704.c b/final/libomptarget/test/mapping/pr38704.c
new file mode 100644
index 0000000..3e7135e
--- /dev/null
+++ b/final/libomptarget/test/mapping/pr38704.c
@@ -0,0 +1,47 @@
+// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
+
+// Clang 6.0 doesn't use the new map interface, undefined behavior when
+// the compiler emits "old" interface code for structures.
+// UNSUPPORTED: clang-6
+
+#include <stdio.h>
+#include <stdlib.h>
+
+typedef struct {
+  int *ptr1;
+  int *ptr2;
+} StructWithPtrs;
+
+int main(int argc, char *argv[]) {
+  StructWithPtrs s, s2;
+  s.ptr1 = malloc(sizeof(int));
+  s.ptr2 = malloc(2 * sizeof(int));
+  s2.ptr1 = malloc(sizeof(int));
+  s2.ptr2 = malloc(2 * sizeof(int));
+
+#pragma omp target enter data map(to: s2.ptr2[0:1])
+#pragma omp target map(s.ptr1[0:1], s.ptr2[0:2])
+  {
+    s.ptr1[0] = 1;
+    s.ptr2[0] = 2;
+    s.ptr2[1] = 3;
+  }
+#pragma omp target exit data map(from: s2.ptr1[0:1], s2.ptr2[0:1])
+
+  // CHECK: s.ptr1[0] = 1
+  // CHECK: s.ptr2[0] = 2
+  // CHECK: s.ptr2[1] = 3
+  printf("s.ptr1[0] = %d\n", s.ptr1[0]);
+  printf("s.ptr2[0] = %d\n", s.ptr2[0]);
+  printf("s.ptr2[1] = %d\n", s.ptr2[1]);
+
+  free(s.ptr1);
+  free(s.ptr2);
+  free(s2.ptr1);
+  free(s2.ptr2);
+
+  return 0;
+}
diff --git a/final/libomptarget/test/offloading/offloading_success.c b/final/libomptarget/test/offloading/offloading_success.c
new file mode 100644
index 0000000..12e78fa
--- /dev/null
+++ b/final/libomptarget/test/offloading/offloading_success.c
@@ -0,0 +1,23 @@
+// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
+
+#include <stdio.h>
+#include <omp.h>
+
+int main(void) {
+  int isHost = -1;
+
+#pragma omp target map(from: isHost)
+  { isHost = omp_is_initial_device(); }
+
+  if (isHost < 0) {
+    printf("Runtime error, isHost=%d\n", isHost);
+  }
+
+  // CHECK: Target region executed on the device
+  printf("Target region executed on the %s\n", isHost ? "host" : "device");
+
+  return isHost;
+}
diff --git a/final/libomptarget/test/offloading/offloading_success.cpp b/final/libomptarget/test/offloading/offloading_success.cpp
new file mode 100644
index 0000000..eecd97a
--- /dev/null
+++ b/final/libomptarget/test/offloading/offloading_success.cpp
@@ -0,0 +1,23 @@
+// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
+
+#include <stdio.h>
+#include <omp.h>
+
+int main(void) {
+  int isHost = -1;  // sentinel: still negative if the region never writes it back
+
+#pragma omp target map(from: isHost)
+  { isHost = omp_is_initial_device(); }
+
+  if (isHost < 0) {
+    printf("Runtime error, isHost=%d\n", isHost);
+  }
+
+  // CHECK: Target region executed on the device
+  printf("Target region executed on the %s\n", isHost ? "host" : "device");
+
+  return isHost;
+}
|