aboutsummaryrefslogtreecommitdiff
path: root/final/libomptarget/plugins
diff options
context:
space:
mode:
Diffstat (limited to 'final/libomptarget/plugins')
-rw-r--r--final/libomptarget/plugins/CMakeLists.txt72
-rw-r--r--final/libomptarget/plugins/aarch64/CMakeLists.txt18
-rw-r--r--final/libomptarget/plugins/common/elf_common.c73
-rw-r--r--final/libomptarget/plugins/cuda/CMakeLists.txt50
-rw-r--r--final/libomptarget/plugins/cuda/src/rtl.cpp763
-rw-r--r--final/libomptarget/plugins/exports15
-rw-r--r--final/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp340
-rw-r--r--final/libomptarget/plugins/ppc64/CMakeLists.txt18
-rw-r--r--final/libomptarget/plugins/ppc64le/CMakeLists.txt18
-rw-r--r--final/libomptarget/plugins/x86_64/CMakeLists.txt18
10 files changed, 1385 insertions, 0 deletions
diff --git a/final/libomptarget/plugins/CMakeLists.txt b/final/libomptarget/plugins/CMakeLists.txt
new file mode 100644
index 0000000..8c3d571
--- /dev/null
+++ b/final/libomptarget/plugins/CMakeLists.txt
@@ -0,0 +1,72 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build plugins for the user system if available.
+#
+##===----------------------------------------------------------------------===##
+
+# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string tmachine_triple, string elf_machine_id);
+# - build a plugin for an ELF based generic 64-bit target based on libffi.
+# - tmachine: name of the machine processor as used in the cmake build system.
+# - tmachine_name: name of the machine to be printed with the debug messages.
+# - tmachine_libname: machine name to be appended to the plugin library name; tmachine_triple: target triple reported to the parent scope.
+macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id)
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
+ if(LIBOMPTARGET_DEP_LIBELF_FOUND)
+ if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+
+ libomptarget_say("Building ${tmachine_name} offloading plugin.")
+
+ include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR})
+ include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR})
+
+ # Define macro to be used as prefix of the runtime messages for this target.
+ add_definitions("-DTARGET_NAME=${tmachine_name}")
+
+ # Define macro with the ELF ID for this target.
+ add_definitions("-DTARGET_ELF_ID=${elf_machine_id}")
+
+ add_library("omptarget.rtl.${tmachine_libname}" SHARED
+ ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp)
+
+ # Install plugin under the lib destination folder.
+ install(TARGETS "omptarget.rtl.${tmachine_libname}"
+ LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+
+ target_link_libraries(
+ "omptarget.rtl.${tmachine_libname}"
+ ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES}
+ ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
+ dl
+ "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
+
+ # Report to the parent scope that we are building a plugin.
+ set(LIBOMPTARGET_SYSTEM_TARGETS
+ "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE)
+
+ else(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+ libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.")
+ endif(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+ else(LIBOMPTARGET_DEP_LIBELF_FOUND)
+ libomptarget_say("Not building ${tmachine_name} offloading plugin: libelf dependency not found.")
+ endif(LIBOMPTARGET_DEP_LIBELF_FOUND)
+else()
+ libomptarget_say("Not building ${tmachine_name} offloading plugin: machine not found in the system.")
+endif()
+endmacro()
+
+add_subdirectory(aarch64)
+add_subdirectory(cuda)
+add_subdirectory(ppc64)
+add_subdirectory(ppc64le)
+add_subdirectory(x86_64)
+
+# Make sure the parent scope can see the plugins that will be created.
+set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
+
diff --git a/final/libomptarget/plugins/aarch64/CMakeLists.txt b/final/libomptarget/plugins/aarch64/CMakeLists.txt
new file mode 100644
index 0000000..e3a76b9
--- /dev/null
+++ b/final/libomptarget/plugins/aarch64/CMakeLists.txt
@@ -0,0 +1,18 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for an aarch64 machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+ build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183")
+else()
+ libomptarget_say("Not building aarch64 offloading plugin: machine not found in the system.")
+endif()
diff --git a/final/libomptarget/plugins/common/elf_common.c b/final/libomptarget/plugins/common/elf_common.c
new file mode 100644
index 0000000..dd85575
--- /dev/null
+++ b/final/libomptarget/plugins/common/elf_common.c
@@ -0,0 +1,73 @@
+//===-- elf_common.c - Common ELF functionality -------------------*- C -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Common ELF functionality for target plugins.
+// Must be included in the plugin source file AFTER omptarget.h has been
+// included and macro DP(...) has been defined.
+//
+//
+//===----------------------------------------------------------------------===//
+
+#if !(defined(_OMPTARGET_H_) && defined(DP))
+#error Include elf_common.c in the plugin source AFTER omptarget.h has been\
+ included and macro DP(...) has been defined.
+#endif
+
+#include <elf.h>
+#include <libelf.h>
+
+// Check whether an image is valid for execution on target_id
+static inline int32_t elf_check_machine(__tgt_device_image *image,
+ uint16_t target_id) {
+
+ // Is the library version incompatible with the header file?
+ if (elf_version(EV_CURRENT) == EV_NONE) {
+ DP("Incompatible ELF library!\n");
+ return 0;
+ }
+
+ char *img_begin = (char *)image->ImageStart;
+ char *img_end = (char *)image->ImageEnd;
+ size_t img_size = img_end - img_begin;
+
+ // Obtain elf handler
+ Elf *e = elf_memory(img_begin, img_size);
+ if (!e) {
+ DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
+ return 0;
+ }
+
+ // Check if ELF is the right kind.
+ if (elf_kind(e) != ELF_K_ELF) {
+ DP("Unexpected ELF type!\n");
+ return 0;
+ }
+ Elf64_Ehdr *eh64 = elf64_getehdr(e);
+ Elf32_Ehdr *eh32 = elf32_getehdr(e);
+
+ if (!eh64 && !eh32) {
+ DP("Unable to get machine ID from ELF file!\n");
+ elf_end(e);
+ return 0;
+ }
+
+ uint16_t MachineID;
+ if (eh64 && !eh32)
+ MachineID = eh64->e_machine;
+ else if (eh32 && !eh64)
+ MachineID = eh32->e_machine;
+ else {
+ DP("Ambiguous ELF header!\n");
+ elf_end(e);
+ return 0;
+ }
+
+ elf_end(e);
+ return MachineID == target_id;
+}
diff --git a/final/libomptarget/plugins/cuda/CMakeLists.txt b/final/libomptarget/plugins/cuda/CMakeLists.txt
new file mode 100644
index 0000000..7210eec
--- /dev/null
+++ b/final/libomptarget/plugins/cuda/CMakeLists.txt
@@ -0,0 +1,50 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a CUDA machine if available.
+#
+##===----------------------------------------------------------------------===##
+if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
+ libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64 or ppc64le hosts.")
+ return()
+elseif (NOT LIBOMPTARGET_DEP_LIBELF_FOUND)
+ libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.")
+ return()
+elseif(NOT LIBOMPTARGET_DEP_CUDA_FOUND)
+ libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.")
+ return()
+elseif(NOT LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND)
+ libomptarget_say("Not building CUDA offloading plugin: CUDA Driver API not found in system.")
+ return()
+endif()
+
+libomptarget_say("Building CUDA offloading plugin.")
+
+# Define the suffix for the runtime messaging dumps.
+add_definitions(-DTARGET_NAME=CUDA)
+
+if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug)
+ add_definitions(-DCUDA_ERROR_REPORT)
+endif()
+
+include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
+include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS})
+
+add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
+
+# Install plugin under the lib destination folder.
+install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+
+target_link_libraries(omptarget.rtl.cuda
+ ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES}
+ ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
+ "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
+
+# Report to the parent scope that we are building a plugin for CUDA.
+set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda" PARENT_SCOPE)
diff --git a/final/libomptarget/plugins/cuda/src/rtl.cpp b/final/libomptarget/plugins/cuda/src/rtl.cpp
new file mode 100644
index 0000000..d265a87
--- /dev/null
+++ b/final/libomptarget/plugins/cuda/src/rtl.cpp
@@ -0,0 +1,763 @@
+//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for CUDA machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstddef>
+#include <cuda.h>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "omptargetplugin.h"
+
+#ifndef TARGET_NAME
+#define TARGET_NAME CUDA
+#endif
+
+#ifdef OMPTARGET_DEBUG
+static int DebugLevel = 0;
+
+#define GETNAME2(name) #name
+#define GETNAME(name) GETNAME2(name)
+#define DP(...) \
+ do { \
+ if (DebugLevel > 0) { \
+ DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
+ } \
+ } while (false)
+#else // OMPTARGET_DEBUG
+#define DP(...) {}
+#endif // OMPTARGET_DEBUG
+
+#include "../../common/elf_common.c"
+
+// Utility for retrieving and printing CUDA error string.
+#ifdef CUDA_ERROR_REPORT
+#define CUDA_ERR_STRING(err) \
+ do { \
+ const char *errStr; \
+ cuGetErrorString(err, &errStr); \
+ DP("CUDA error is: %s\n", errStr); \
+ } while (0)
+#else
+#define CUDA_ERR_STRING(err) \
+ {}
+#endif
+
+/// Keep entries table per device.
+struct FuncOrGblEntryTy {
+ __tgt_target_table Table;
+ std::vector<__tgt_offload_entry> Entries;
+};
+
+enum ExecutionModeType {
+ SPMD, // constructors, destructors,
+ // combined constructs (`teams distribute parallel for [simd]`)
+ GENERIC, // everything else
+ NONE
+};
+
+/// Use a single entity to encode a kernel and a set of flags
+struct KernelTy {
+ CUfunction Func;
+
+ // execution mode of kernel
+ // 0 - SPMD mode (without master warp)
+ // 1 - Generic mode (with master warp)
+ int8_t ExecutionMode;
+
+ KernelTy(CUfunction _Func, int8_t _ExecutionMode)
+ : Func(_Func), ExecutionMode(_ExecutionMode) {}
+};
+
+/// Device environment data
+/// Manually sync with the deviceRTL side for now, move to a dedicated header file later.
+struct omptarget_device_environmentTy {
+ int32_t debug_level;
+};
+
+/// List that contains all the kernels.
+/// FIXME: we may need this to be per device and per library.
+std::list<KernelTy> KernelsList;
+
+/// Class containing all the device information.
+class RTLDeviceInfoTy {
+ std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
+
+public:
+ int NumberOfDevices;
+ std::vector<CUmodule> Modules;
+ std::vector<CUcontext> Contexts;
+
+ // Device properties
+ std::vector<int> ThreadsPerBlock;
+ std::vector<int> BlocksPerGrid;
+ std::vector<int> WarpSize;
+
+ // OpenMP properties
+ std::vector<int> NumTeams;
+ std::vector<int> NumThreads;
+
+ // OpenMP Environment properties
+ int EnvNumTeams;
+ int EnvTeamLimit;
+
+ //static int EnvNumThreads;
+ static const int HardTeamLimit = 1<<16; // 64k
+ static const int HardThreadLimit = 1024;
+ static const int DefaultNumTeams = 128;
+ static const int DefaultNumThreads = 128;
+
+ // Record entry point associated with device
+ void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+ E.Entries.push_back(entry);
+ }
+
+ // Return true if the entry is associated with device
+ bool findOffloadEntry(int32_t device_id, void *addr) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+ for (auto &it : E.Entries) {
+ if (it.addr == addr)
+ return true;
+ }
+
+ return false;
+ }
+
+ // Return the pointer to the target entries table
+ __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+ int32_t size = E.Entries.size();
+
+ // Table is empty
+ if (!size)
+ return 0;
+
+ __tgt_offload_entry *begin = &E.Entries[0];
+ __tgt_offload_entry *end = &E.Entries[size - 1];
+
+ // Update table info according to the entries and return the pointer
+ E.Table.EntriesBegin = begin;
+ E.Table.EntriesEnd = ++end;
+
+ return &E.Table;
+ }
+
+ // Clear entries table for a device
+ void clearOffloadEntriesTable(int32_t device_id) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncGblEntries[device_id].emplace_back();
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+ E.Entries.clear();
+ E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
+ }
+
+ RTLDeviceInfoTy() {
+#ifdef OMPTARGET_DEBUG
+ if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
+ DebugLevel = std::stoi(envStr);
+ }
+#endif // OMPTARGET_DEBUG
+
+ DP("Start initializing CUDA\n");
+
+ CUresult err = cuInit(0);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when initializing CUDA\n");
+ CUDA_ERR_STRING(err);
+ return;
+ }
+
+ NumberOfDevices = 0;
+
+ err = cuDeviceGetCount(&NumberOfDevices);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when getting CUDA device count\n");
+ CUDA_ERR_STRING(err);
+ return;
+ }
+
+ if (NumberOfDevices == 0) {
+ DP("There are no devices supporting CUDA.\n");
+ return;
+ }
+
+ FuncGblEntries.resize(NumberOfDevices);
+ Contexts.resize(NumberOfDevices);
+ ThreadsPerBlock.resize(NumberOfDevices);
+ BlocksPerGrid.resize(NumberOfDevices);
+ WarpSize.resize(NumberOfDevices);
+ NumTeams.resize(NumberOfDevices);
+ NumThreads.resize(NumberOfDevices);
+
+ // Get environment variables regarding teams
+ char *envStr = getenv("OMP_TEAM_LIMIT");
+ if (envStr) {
+ // OMP_TEAM_LIMIT has been set
+ EnvTeamLimit = std::stoi(envStr);
+ DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
+ } else {
+ EnvTeamLimit = -1;
+ }
+ envStr = getenv("OMP_NUM_TEAMS");
+ if (envStr) {
+ // OMP_NUM_TEAMS has been set
+ EnvNumTeams = std::stoi(envStr);
+ DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
+ } else {
+ EnvNumTeams = -1;
+ }
+ }
+
+ ~RTLDeviceInfoTy() {
+ // Close modules
+ for (auto &module : Modules)
+ if (module) {
+ CUresult err = cuModuleUnload(module);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when unloading CUDA module\n");
+ CUDA_ERR_STRING(err);
+ }
+ }
+
+ // Destroy contexts
+ for (auto &ctx : Contexts)
+ if (ctx) {
+ CUresult err = cuCtxDestroy(ctx);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when destroying CUDA context\n");
+ CUDA_ERR_STRING(err);
+ }
+ }
+ }
+};
+
+static RTLDeviceInfoTy DeviceInfo;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+ return elf_check_machine(image, 190); // EM_CUDA = 190.
+}
+
+int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }
+
+int32_t __tgt_rtl_init_device(int32_t device_id) {
+
+ CUdevice cuDevice;
+ DP("Getting device %d\n", device_id);
+ CUresult err = cuDeviceGet(&cuDevice, device_id);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when getting CUDA device with id = %d\n", device_id);
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ // Create the context and save it to use whenever this device is selected.
+ err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
+ cuDevice);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when creating a CUDA context\n");
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ // Query attributes to determine number of threads/block and blocks/grid.
+ int maxGridDimX;
+ err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
+ cuDevice);
+ if (err != CUDA_SUCCESS) {
+ DP("Error getting max grid dimension, use default\n");
+ DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+ } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) {
+ DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX;
+ DP("Using %d CUDA blocks per grid\n", maxGridDimX);
+ } else {
+ DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
+ DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
+ "at the hard limit\n",
+ maxGridDimX, RTLDeviceInfoTy::HardTeamLimit);
+ }
+
+ // We are only exploiting threads along the x axis.
+ int maxBlockDimX;
+ err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
+ cuDevice);
+ if (err != CUDA_SUCCESS) {
+ DP("Error getting max block dimension, use default\n");
+ DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
+ } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) {
+ DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX;
+ DP("Using %d CUDA threads per block\n", maxBlockDimX);
+ } else {
+ DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
+ DP("Max CUDA threads per block %d exceeds the hard thread limit %d, capping"
+ "at the hard limit\n",
+ maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit);
+ }
+
+ int warpSize;
+ err =
+ cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
+ if (err != CUDA_SUCCESS) {
+ DP("Error getting warp size, assume default\n");
+ DeviceInfo.WarpSize[device_id] = 32;
+ } else {
+ DeviceInfo.WarpSize[device_id] = warpSize;
+ }
+
+ // Adjust teams to the env variables
+ if (DeviceInfo.EnvTeamLimit > 0 &&
+ DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
+ DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
+ DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
+ DeviceInfo.EnvTeamLimit);
+ }
+
+ DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
+ DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
+ DeviceInfo.WarpSize[device_id]);
+
+ // Set default number of teams
+ if (DeviceInfo.EnvNumTeams > 0) {
+ DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
+ DP("Default number of teams set according to environment %d\n",
+ DeviceInfo.EnvNumTeams);
+ } else {
+ DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+ DP("Default number of teams set according to library's default %d\n",
+ RTLDeviceInfoTy::DefaultNumTeams);
+ }
+ if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
+ DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
+ DP("Default number of teams exceeds device limit, capping at %d\n",
+ DeviceInfo.BlocksPerGrid[device_id]);
+ }
+
+ // Set default number of threads
+ DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
+ DP("Default number of threads set according to library's default %d\n",
+ RTLDeviceInfoTy::DefaultNumThreads);
+ if (DeviceInfo.NumThreads[device_id] >
+ DeviceInfo.ThreadsPerBlock[device_id]) {
+ DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
+ DP("Default number of threads exceeds device limit, capping at %d\n",
+ DeviceInfo.ThreadsPerBlock[device_id]);
+ }
+
+ return OFFLOAD_SUCCESS;
+}
+
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+ __tgt_device_image *image) {
+
+ // Set the context we are using.
+ CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when setting a CUDA context for device %d\n", device_id);
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ // Clear the offload table as we are going to create a new one.
+ DeviceInfo.clearOffloadEntriesTable(device_id);
+
+ // Create the module and extract the function pointers.
+
+ CUmodule cumod;
+ DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
+ err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when loading CUDA module\n");
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ DP("CUDA module successfully loaded!\n");
+ DeviceInfo.Modules.push_back(cumod);
+
+ // Find the symbols in the module by name.
+ __tgt_offload_entry *HostBegin = image->EntriesBegin;
+ __tgt_offload_entry *HostEnd = image->EntriesEnd;
+
+ for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
+
+ if (!e->addr) {
+ // We return NULL when something like this happens, the host should have
+ // always something in the address to uniquely identify the target region.
+ DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);
+
+ return NULL;
+ }
+
+ if (e->size) {
+ __tgt_offload_entry entry = *e;
+
+ CUdeviceptr cuptr;
+ size_t cusize;
+ err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);
+
+ if (err != CUDA_SUCCESS) {
+ DP("Loading global '%s' (Failed)\n", e->name);
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ if (cusize != e->size) {
+ DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
+ cusize, e->size);
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
+ DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
+ entry.addr = (void *)cuptr;
+
+ DeviceInfo.addOffloadEntry(device_id, entry);
+
+ continue;
+ }
+
+ CUfunction fun;
+ err = cuModuleGetFunction(&fun, cumod, e->name);
+
+ if (err != CUDA_SUCCESS) {
+ DP("Loading '%s' (Failed)\n", e->name);
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
+ DPxPTR(e - HostBegin), e->name, DPxPTR(fun));
+
+ // default value GENERIC (in case symbol is missing from cubin file)
+ int8_t ExecModeVal = ExecutionModeType::GENERIC;
+ std::string ExecModeNameStr (e->name);
+ ExecModeNameStr += "_exec_mode";
+ const char *ExecModeName = ExecModeNameStr.c_str();
+
+ CUdeviceptr ExecModePtr;
+ size_t cusize;
+ err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
+ if (err == CUDA_SUCCESS) {
+ if ((size_t)cusize != sizeof(int8_t)) {
+ DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
+ ExecModeName, cusize, sizeof(int8_t));
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when copying data from device to host. Pointers: "
+ "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
+ DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ if (ExecModeVal < 0 || ExecModeVal > 1) {
+ DP("Error wrong exec_mode value specified in cubin file: %d\n",
+ ExecModeVal);
+ return NULL;
+ }
+ } else {
+ DP("Loading global exec_mode '%s' - symbol missing, using default value "
+ "GENERIC (1)\n", ExecModeName);
+ CUDA_ERR_STRING(err);
+ }
+
+ KernelsList.push_back(KernelTy(fun, ExecModeVal));
+
+ __tgt_offload_entry entry = *e;
+ entry.addr = (void *)&KernelsList.back();
+ DeviceInfo.addOffloadEntry(device_id, entry);
+ }
+
+ // send device environment data to the device
+ {
+ omptarget_device_environmentTy device_env;
+
+ device_env.debug_level = 0;
+
+#ifdef OMPTARGET_DEBUG
+ if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
+ device_env.debug_level = std::stoi(envStr);
+ }
+#endif
+
+ const char * device_env_Name="omptarget_device_environment";
+ CUdeviceptr device_env_Ptr;
+ size_t cusize;
+
+ err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name);
+
+ if (err == CUDA_SUCCESS) {
+ if ((size_t)cusize != sizeof(device_env)) {
+ DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
+ device_env_Name, cusize, sizeof(int32_t));
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when copying data from host to device. Pointers: "
+ "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
+ DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize);
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ DP("Sending global device environment data %zu bytes\n", (size_t)cusize);
+ } else {
+ DP("Finding global device environment '%s' - symbol missing.\n", device_env_Name);
+ DP("Continue, considering this is a device RTL which does not accept envrionment setting.\n");
+ }
+ }
+
+ return DeviceInfo.getOffloadEntriesTable(device_id);
+}
+
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
+ if (size == 0) {
+ return NULL;
+ }
+
+ // Set the context we are using.
+ CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+ if (err != CUDA_SUCCESS) {
+ DP("Error while trying to set CUDA current context\n");
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ CUdeviceptr ptr;
+ err = cuMemAlloc(&ptr, size);
+ if (err != CUDA_SUCCESS) {
+ DP("Error while trying to allocate %d\n", err);
+ CUDA_ERR_STRING(err);
+ return NULL;
+ }
+
+ void *vptr = (void *)ptr;
+ return vptr;
+}
+
+int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
+ int64_t size) {
+ // Set the context we are using.
+ CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when setting CUDA context\n");
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when copying data from host to device. Pointers: host = " DPxMOD
+ ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
+ DPxPTR(tgt_ptr), size);
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
+ int64_t size) {
+ // Set the context we are using.
+ CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when setting CUDA context\n");
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when copying data from device to host. Pointers: host = " DPxMOD
+ ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
+ DPxPTR(tgt_ptr), size);
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
+ // Set the context we are using.
+ CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when setting CUDA context\n");
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ err = cuMemFree((CUdeviceptr)tgt_ptr);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when freeing CUDA memory\n");
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
+ void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
+ int32_t thread_limit, uint64_t loop_tripcount) {
+ // Set the context we are using.
+ CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when setting CUDA context\n");
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ // All args are references.
+ std::vector<void *> args(arg_num);
+ std::vector<void *> ptrs(arg_num);
+
+ for (int32_t i = 0; i < arg_num; ++i) {
+ ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
+ args[i] = &ptrs[i];
+ }
+
+ KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
+
+ int cudaThreadsPerBlock;
+
+ if (thread_limit > 0) {
+ cudaThreadsPerBlock = thread_limit;
+ DP("Setting CUDA threads per block to requested %d\n", thread_limit);
+ // Add master warp if necessary
+ if (KernelInfo->ExecutionMode == GENERIC) {
+ cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
+ DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
+ }
+ } else {
+ cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
+ DP("Setting CUDA threads per block to default %d\n",
+ DeviceInfo.NumThreads[device_id]);
+ }
+
+ if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
+ cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
+ DP("Threads per block capped at device limit %d\n",
+ DeviceInfo.ThreadsPerBlock[device_id]);
+ }
+
+ int kernel_limit;
+ err = cuFuncGetAttribute(&kernel_limit,
+ CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
+ if (err == CUDA_SUCCESS) {
+ if (kernel_limit < cudaThreadsPerBlock) {
+ cudaThreadsPerBlock = kernel_limit;
+ DP("Threads per block capped at kernel limit %d\n", kernel_limit);
+ }
+ }
+
+ int cudaBlocksPerGrid;
+ if (team_num <= 0) {
+ if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
+ if (KernelInfo->ExecutionMode == SPMD) {
+ // We have a combined construct, i.e. `target teams distribute parallel
+ // for [simd]`. We launch so many teams so that each thread will
+ // execute one iteration of the loop.
+ // round up to the nearest integer
+ cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
+ } else {
+ // If we reach this point, then we have a non-combined construct, i.e.
+ // `teams distribute` with a nested `parallel for` and each team is
+ // assigned one iteration of the `distribute` loop. E.g.:
+ //
+ // #pragma omp target teams distribute
+ // for(...loop_tripcount...) {
+ // #pragma omp parallel for
+ // for(...) {}
+ // }
+ //
+ // Threads within a team will execute the iterations of the `parallel`
+ // loop.
+ cudaBlocksPerGrid = loop_tripcount;
+ }
+ DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
+ "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
+ cudaThreadsPerBlock);
+ } else {
+ cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
+ DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
+ }
+ } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
+ cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
+ DP("Capping number of teams to team limit %d\n",
+ DeviceInfo.BlocksPerGrid[device_id]);
+ } else {
+ cudaBlocksPerGrid = team_num;
+ DP("Using requested number of teams %d\n", team_num);
+ }
+
+ // Run on the device.
+ DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
+ cudaThreadsPerBlock);
+
+ err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
+ cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0);
+ if (err != CUDA_SUCCESS) {
+ DP("Device kernel launch failed!\n");
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ DP("Launch of entry point at " DPxMOD " successful!\n",
+ DPxPTR(tgt_entry_ptr));
+
+ CUresult sync_err = cuCtxSynchronize();
+ if (sync_err != CUDA_SUCCESS) {
+ DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
+ CUDA_ERR_STRING(sync_err);
+ return OFFLOAD_FAIL;
+ } else {
+ DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
+ }
+
+ return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+ void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
+ // use one team and the default number of threads.
+ const int32_t team_num = 1;
+ const int32_t thread_limit = 0;
+ return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+ tgt_offsets, arg_num, team_num, thread_limit, 0);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/final/libomptarget/plugins/exports b/final/libomptarget/plugins/exports
new file mode 100644
index 0000000..3f9f7d4
--- /dev/null
+++ b/final/libomptarget/plugins/exports
@@ -0,0 +1,15 @@
+/* Linker version script shared by the libomptarget plugins: export only the
+ * __tgt_rtl_* entry points that libomptarget resolves with dlsym, and force
+ * every other symbol local so plugin internals cannot clash between RTLs. */
+VERS1.0 {
+  global:
+    __tgt_rtl_is_valid_binary;
+    __tgt_rtl_number_of_devices;
+    __tgt_rtl_init_device;
+    __tgt_rtl_load_binary;
+    __tgt_rtl_data_alloc;
+    __tgt_rtl_data_submit;
+    __tgt_rtl_data_retrieve;
+    __tgt_rtl_data_delete;
+    __tgt_rtl_run_target_team_region;
+    __tgt_rtl_run_target_region;
+  local:
+    *;
+};
diff --git a/final/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/final/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
new file mode 100644
index 0000000..951710a
--- /dev/null
+++ b/final/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
@@ -0,0 +1,340 @@
+//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for generic 64-bit machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <dlfcn.h>
+#include <ffi.h>
+#include <gelf.h>
+#include <link.h>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "omptargetplugin.h"
+
+// Human-readable target name used in debug messages; the per-target build
+// overrides this on the compiler command line.
+#ifndef TARGET_NAME
+#define TARGET_NAME Generic ELF - 64bit
+#endif
+
+// ELF e_machine value accepted by __tgt_rtl_is_valid_binary; with the
+// default of 0 every image is rejected (see the TARGET_ELF_ID < 1 check).
+#ifndef TARGET_ELF_ID
+#define TARGET_ELF_ID 0
+#endif
+
+#ifdef OMPTARGET_DEBUG
+// Debug verbosity, read from the LIBOMPTARGET_DEBUG environment variable in
+// the RTLDeviceInfoTy constructor; DP() prints only when this is positive.
+static int DebugLevel = 0;
+
+// Two-level stringification so TARGET_NAME is macro-expanded before being
+// turned into a string literal.
+#define GETNAME2(name) #name
+#define GETNAME(name) GETNAME2(name)
+#define DP(...) \
+  do { \
+    if (DebugLevel > 0) { \
+      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
+    } \
+  } while (false)
+#else // OMPTARGET_DEBUG
+// Debug printing compiles away entirely in non-debug builds.
+#define DP(...) {}
+#endif // OMPTARGET_DEBUG
+
+// Shared ELF helpers (elf_check_machine); included as a source file rather
+// than linked separately.
+#include "../../common/elf_common.c"
+
+// Number of host "devices" this plugin exposes.
+#define NUMBER_OF_DEVICES 4
+// Name of the ELF section holding the __tgt_offload_entry descriptors.
+#define OFFLOADSECTIONNAME ".omp_offloading.entries"
+
+/// Record of one dynamic library loaded for this target: the device image is
+/// dumped to a temporary file and opened with dlopen.
+struct DynLibTy {
+  char *FileName; // Path of the temporary file backing the library.
+  void *Handle;   // Handle returned by dlopen(); may be null on failure.
+};
+
+/// Keep entries table per device: one __tgt_target_table describing the
+/// offload entries of a single loaded image.
+struct FuncOrGblEntryTy {
+  __tgt_target_table Table; // Spans [EntriesBegin, EntriesEnd) in the image.
+};
+
+/// Class containing all the device information: per-device lists of offload
+/// entry tables plus the dynamic libraries opened for loaded images.
+class RTLDeviceInfoTy {
+  // One list of entry tables per device; every successful load_binary call
+  // appends one FuncOrGblEntryTy to the corresponding device's list.
+  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
+
+public:
+  // Handles (and temp-file names) of all images dlopen'ed so far; closed and
+  // cleaned up in the destructor.
+  std::list<DynLibTy> DynLibs;
+
+  // Record entry point associated with device: append a new table for
+  // device_id covering the descriptors in [begin, end).
+  void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin,
+      __tgt_offload_entry *end) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncGblEntries[device_id].emplace_back();
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    E.Table.EntriesBegin = begin;
+    E.Table.EntriesEnd = end;
+  }
+
+  // Return true if the entry is associated with device.
+  // Note: only the most recently created table (back()) is searched.
+  bool findOffloadEntry(int32_t device_id, void *addr) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    // Linear scan comparing target addresses.
+    for (__tgt_offload_entry *i = E.Table.EntriesBegin, *e = E.Table.EntriesEnd;
+         i < e; ++i) {
+      if (i->addr == addr)
+        return true;
+    }
+
+    return false;
+  }
+
+  // Return the pointer to the target entries table.
+  // As above, this is the table of the most recently loaded image.
+  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    return &E.Table;
+  }
+
+  // Size the per-device structures and, in debug builds, read the
+  // LIBOMPTARGET_DEBUG environment variable to set the DP() verbosity.
+  RTLDeviceInfoTy(int32_t num_devices) {
+#ifdef OMPTARGET_DEBUG
+    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
+      DebugLevel = std::stoi(envStr);
+    }
+#endif // OMPTARGET_DEBUG
+
+    FuncGblEntries.resize(num_devices);
+  }
+
+  ~RTLDeviceInfoTy() {
+    // Close dynamic libraries and delete their backing temporary files.
+    // NOTE(review): remove(lib.FileName) requires FileName to still be valid
+    // at destruction time -- verify the load path stores a stable copy of
+    // the temp-file name rather than a pointer into a stack buffer.
+    for (auto &lib : DynLibs) {
+      if (lib.Handle) {
+        dlclose(lib.Handle);
+        remove(lib.FileName);
+      }
+    }
+  }
+};
+
+// Single global instance holding the state of all emulated devices;
+// constructed at static-initialization time, destroyed (closing the loaded
+// libraries) at program exit.
+static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Report whether the device image is an ELF binary matching this plugin's
+// configured machine id. Returns nonzero for a valid image.
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+// If we don't have a valid ELF ID we can just fail.
+#if TARGET_ELF_ID < 1
+  return 0;
+#else
+  // Compare the image's e_machine field against TARGET_ELF_ID.
+  return elf_check_machine(image, TARGET_ELF_ID);
+#endif
+}
+
+// This plugin exposes a fixed number of host "devices".
+int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; }
+
+// No per-device initialization is needed for host execution.
+int32_t __tgt_rtl_init_device(int32_t device_id) { return OFFLOAD_SUCCESS; }
+
+// Load a device image for the given (emulated) device: locate the
+// .omp_offloading.entries section in the ELF image, dump the image to a
+// temporary file, dlopen it, and publish the relocated entries table.
+// Returns the table on success, NULL on any failure.
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+    __tgt_device_image *image) {
+
+  DP("Dev %d: load binary from " DPxMOD " image\n", device_id,
+     DPxPTR(image->ImageStart));
+
+  assert(device_id >= 0 && device_id < NUMBER_OF_DEVICES && "bad dev id");
+
+  size_t ImageSize = (size_t)image->ImageEnd - (size_t)image->ImageStart;
+  size_t NumEntries = (size_t)(image->EntriesEnd - image->EntriesBegin);
+  DP("Expecting to have %zd entries defined.\n", NumEntries);
+
+  // Is the library version incompatible with the header file?
+  if (elf_version(EV_CURRENT) == EV_NONE) {
+    DP("Incompatible ELF library!\n");
+    return NULL;
+  }
+
+  // Obtain elf handler for the in-memory image.
+  Elf *e = elf_memory((char *)image->ImageStart, ImageSize);
+  if (!e) {
+    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
+    return NULL;
+  }
+
+  if (elf_kind(e) != ELF_K_ELF) {
+    DP("Invalid Elf kind!\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  // Find the entries section offset.
+  Elf_Scn *section = 0;
+  Elf64_Off entries_offset = 0;
+
+  size_t shstrndx;
+
+  if (elf_getshdrstrndx(e, &shstrndx)) {
+    DP("Unable to get ELF strings index!\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  // Scan the section headers for the offload-entries section by name.
+  while ((section = elf_nextscn(e, section))) {
+    GElf_Shdr hdr;
+    gelf_getshdr(section, &hdr);
+
+    if (!strcmp(elf_strptr(e, shstrndx, hdr.sh_name), OFFLOADSECTIONNAME)) {
+      entries_offset = hdr.sh_addr;
+      break;
+    }
+  }
+
+  if (!entries_offset) {
+    DP("Entries Section Offset Not Found\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  DP("Offset of entries section is (" DPxMOD ").\n", DPxPTR(entries_offset));
+
+  // load dynamic library and get the entry points. We use the dl library
+  // to do the loading of the library, but we could do it directly to avoid the
+  // dump to the temporary file.
+  //
+  // 1) Create tmp file with the library contents.
+  // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
+  char tmp_name[] = "/tmp/tmpfile_XXXXXX";
+  int tmp_fd = mkstemp(tmp_name);
+
+  if (tmp_fd == -1) {
+    elf_end(e);
+    return NULL;
+  }
+
+  FILE *ftmp = fdopen(tmp_fd, "wb");
+
+  if (!ftmp) {
+    elf_end(e);
+    return NULL;
+  }
+
+  // FIX: verify the image was fully written; a short write (e.g. /tmp full)
+  // previously went undetected and surfaced only as a confusing dlopen error.
+  if (fwrite(image->ImageStart, ImageSize, 1, ftmp) != 1) {
+    DP("Failed to write image to temporary file %s\n", tmp_name);
+    fclose(ftmp);
+    elf_end(e);
+    return NULL;
+  }
+  fclose(ftmp);
+
+  // FIX: duplicate the file name onto the heap. tmp_name is a stack array
+  // that goes out of scope when this function returns, but the global
+  // RTLDeviceInfoTy destructor later reads FileName to remove() the file;
+  // storing tmp_name itself left a dangling pointer. The copy intentionally
+  // lives until process exit, matching DeviceInfo's lifetime.
+  DynLibTy Lib = {strdup(tmp_name), dlopen(tmp_name, RTLD_LAZY)};
+
+  if (!Lib.Handle) {
+    DP("Target library loading error: %s\n", dlerror());
+    free(Lib.FileName);
+    elf_end(e);
+    return NULL;
+  }
+
+  DeviceInfo.DynLibs.push_back(Lib);
+
+  struct link_map *libInfo = (struct link_map *)Lib.Handle;
+
+  // The place where the entries info is loaded is the library base address
+  // plus the offset determined from the ELF file.
+  Elf64_Addr entries_addr = libInfo->l_addr + entries_offset;
+
+  DP("Pointer to first entry to be loaded is (" DPxMOD ").\n",
+     DPxPTR(entries_addr));
+
+  // Table of pointers to all the entries in the target.
+  __tgt_offload_entry *entries_table = (__tgt_offload_entry *)entries_addr;
+
+  __tgt_offload_entry *entries_begin = &entries_table[0];
+  __tgt_offload_entry *entries_end = entries_begin + NumEntries;
+
+  if (!entries_begin) {
+    DP("Can't obtain entries begin\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  DP("Entries table range is (" DPxMOD ")->(" DPxMOD ")\n",
+     DPxPTR(entries_begin), DPxPTR(entries_end));
+  DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end);
+
+  elf_end(e);
+
+  return DeviceInfo.getOffloadEntriesTable(device_id);
+}
+
+// Allocate |size| bytes of "device" memory. For this host plugin, device
+// memory is ordinary heap memory; the associated host pointer is not needed
+// for the allocation and is ignored.
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
+  return malloc(size);
+}
+
+// Copy size bytes from the host buffer into the "device" buffer. Both live
+// in the host address space here, so this is a plain memcpy.
+int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
+    int64_t size) {
+  memcpy(tgt_ptr, hst_ptr, size);
+  return OFFLOAD_SUCCESS;
+}
+
+// Copy size bytes from the "device" buffer back to the host buffer; the
+// inverse of __tgt_rtl_data_submit, again a plain memcpy.
+int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
+    int64_t size) {
+  memcpy(hst_ptr, tgt_ptr, size);
+  return OFFLOAD_SUCCESS;
+}
+
+// Release a buffer previously returned by __tgt_rtl_data_alloc.
+int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
+  free(tgt_ptr);
+  return OFFLOAD_SUCCESS;
+}
+
+// Execute the target region on the "device" by calling the outlined entry
+// point directly on the host through libffi: a single synchronous call, so
+// the teams/threads parameters have no effect here.
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
+    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
+    int32_t thread_limit, uint64_t loop_tripcount /*not used*/) {
+  // ignore team num and thread limit.
+
+  // Use libffi to launch execution.
+  ffi_cif cif;
+
+  // All args are references.
+  std::vector<ffi_type *> args_types(arg_num, &ffi_type_pointer);
+  std::vector<void *> args(arg_num);
+  std::vector<void *> ptrs(arg_num);
+
+  // Apply the per-argument offsets and build the pointer-to-argument array
+  // that ffi_call expects (each outlined argument is a pointer).
+  for (int32_t i = 0; i < arg_num; ++i) {
+    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
+    args[i] = &ptrs[i];
+  }
+
+  // Describe the call: arg_num pointer arguments, void return, default ABI.
+  ffi_status status = ffi_prep_cif(&cif, FFI_DEFAULT_ABI, arg_num,
+      &ffi_type_void, &args_types[0]);
+
+  // Hard stop in debug builds; graceful failure when asserts are disabled.
+  assert(status == FFI_OK && "Unable to prepare target launch!");
+
+  if (status != FFI_OK)
+    return OFFLOAD_FAIL;
+
+  DP("Running entry point at " DPxMOD "...\n", DPxPTR(tgt_entry_ptr));
+
+  // Convert the code address into a function pointer via pointer punning
+  // (a direct cast between object and function pointers is not portable).
+  void (*entry)(void);
+  *((void**) &entry) = tgt_entry_ptr;
+  ffi_call(&cif, entry, NULL, &args[0]);
+  return OFFLOAD_SUCCESS;
+}
+
+// Non-teams entry point: a plain target region is executed exactly like a
+// teams region with one team of one thread and no trip-count hint.
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
+  const int32_t single_team = 1;
+  const int32_t single_thread = 1;
+  const uint64_t no_tripcount = 0;
+  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+      tgt_offsets, arg_num, single_team, single_thread, no_tripcount);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/final/libomptarget/plugins/ppc64/CMakeLists.txt b/final/libomptarget/plugins/ppc64/CMakeLists.txt
new file mode 100644
index 0000000..6849a03
--- /dev/null
+++ b/final/libomptarget/plugins/ppc64/CMakeLists.txt
@@ -0,0 +1,18 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a ppc64 machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+# Build the ppc64 plugin only on Linux hosts; build_generic_elf64 (defined in
+# the parent CMakeLists) produces an ELF/libffi based plugin.
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  # Args: cmake machine name, debug name, library name suffix, target triple,
+  # ELF e_machine id (EM_PPC64 = 21).
+  build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
+else()
+  libomptarget_say("Not building ppc64 offloading plugin: machine not found in the system.")
+endif() \ No newline at end of file
diff --git a/final/libomptarget/plugins/ppc64le/CMakeLists.txt b/final/libomptarget/plugins/ppc64le/CMakeLists.txt
new file mode 100644
index 0000000..87cefdf
--- /dev/null
+++ b/final/libomptarget/plugins/ppc64le/CMakeLists.txt
@@ -0,0 +1,18 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a ppc64le machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+# Build the ppc64le plugin only on Linux hosts; build_generic_elf64 (defined
+# in the parent CMakeLists) produces an ELF/libffi based plugin.
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  # Args: cmake machine name, debug name, library name suffix, target triple,
+  # ELF e_machine id (EM_PPC64 = 21, shared by big- and little-endian ppc64).
+  build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21")
+else()
+  libomptarget_say("Not building ppc64le offloading plugin: machine not found in the system.")
+endif() \ No newline at end of file
diff --git a/final/libomptarget/plugins/x86_64/CMakeLists.txt b/final/libomptarget/plugins/x86_64/CMakeLists.txt
new file mode 100644
index 0000000..bdd5bba
--- /dev/null
+++ b/final/libomptarget/plugins/x86_64/CMakeLists.txt
@@ -0,0 +1,18 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a x86_64 machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+# Build the x86_64 plugin only on Linux hosts; build_generic_elf64 (defined
+# in the parent CMakeLists) produces an ELF/libffi based plugin.
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  # Args: cmake machine name, debug name, library name suffix, target triple,
+  # ELF e_machine id (EM_X86_64 = 62).
+  build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
+else()
+  libomptarget_say("Not building x86_64 offloading plugin: machine not found in the system.")
+endif() \ No newline at end of file