aboutsummaryrefslogtreecommitdiff
path: root/final/libomptarget/src/omptarget.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'final/libomptarget/src/omptarget.cpp')
-rw-r--r--final/libomptarget/src/omptarget.cpp714
1 files changed, 714 insertions, 0 deletions
diff --git a/final/libomptarget/src/omptarget.cpp b/final/libomptarget/src/omptarget.cpp
new file mode 100644
index 0000000..3cc09b4
--- /dev/null
+++ b/final/libomptarget/src/omptarget.cpp
@@ -0,0 +1,714 @@
+//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the interface to be used by Clang during the codegen of a
+// target region.
+//
+//===----------------------------------------------------------------------===//
+
+#include <omptarget.h>
+
+#include "device.h"
+#include "private.h"
+#include "rtl.h"
+
+#include <cassert>
+#include <vector>
+
+#ifdef OMPTARGET_DEBUG
+int DebugLevel = 0;
+#endif // OMPTARGET_DEBUG
+
+/* All begin addresses for partially mapped structs must be 8-aligned in order
+ * to ensure proper alignment of members. E.g.
+ *
+ * struct S {
+ * int a; // 4-aligned
+ * int b; // 4-aligned
+ * int *p; // 8-aligned
+ * } s1;
+ * ...
+ * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
+ * {
+ * s1.b = 5;
+ * for (int i...) s1.p[i] = ...;
+ * }
+ *
+ * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
+ * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
+ * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
+ * requirements for its type. Now, when we allocate memory on the device, in
+ * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
+ * This means that the chunk of the struct on the device will start at a
+ * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
+ * address of p will be a misaligned 0x204 (on the host there was no need to add
+ * padding between b and p, so p comes exactly 4 bytes after b). If the device
+ * kernel tries to access s1.p, a misaligned address error occurs (as reported
+ * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
+ * extending the size of the allocated chuck accordingly, the chuck on the
+ * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
+ * &s1.p=0x208, as they should be to satisfy the alignment requirements.
+ */
+static const int64_t alignment = 8;
+
+/// Map global data and execute pending ctors
+static int InitLibrary(DeviceTy& Device) {
+ /*
+ * Map global data
+ */
+ int32_t device_id = Device.DeviceID;
+ int rc = OFFLOAD_SUCCESS;
+
+ Device.PendingGlobalsMtx.lock();
+ TrlTblMtx.lock();
+ for (HostEntriesBeginToTransTableTy::iterator
+ ii = HostEntriesBeginToTransTable.begin();
+ ii != HostEntriesBeginToTransTable.end(); ++ii) {
+ TranslationTable *TransTable = &ii->second;
+ if (TransTable->TargetsTable[device_id] != 0) {
+ // Library entries have already been processed
+ continue;
+ }
+
+ // 1) get image.
+ assert(TransTable->TargetsImages.size() > (size_t)device_id &&
+ "Not expecting a device ID outside the table's bounds!");
+ __tgt_device_image *img = TransTable->TargetsImages[device_id];
+ if (!img) {
+ DP("No image loaded for device id %d.\n", device_id);
+ rc = OFFLOAD_FAIL;
+ break;
+ }
+ // 2) load image into the target table.
+ __tgt_target_table *TargetTable =
+ TransTable->TargetsTable[device_id] = Device.load_binary(img);
+ // Unable to get table for this image: invalidate image and fail.
+ if (!TargetTable) {
+ DP("Unable to generate entries table for device id %d.\n", device_id);
+ TransTable->TargetsImages[device_id] = 0;
+ rc = OFFLOAD_FAIL;
+ break;
+ }
+
+ // Verify whether the two table sizes match.
+ size_t hsize =
+ TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
+ size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;
+
+ // Invalid image for these host entries!
+ if (hsize != tsize) {
+ DP("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
+ device_id, hsize, tsize);
+ TransTable->TargetsImages[device_id] = 0;
+ TransTable->TargetsTable[device_id] = 0;
+ rc = OFFLOAD_FAIL;
+ break;
+ }
+
+ // process global data that needs to be mapped.
+ Device.DataMapMtx.lock();
+ __tgt_target_table *HostTable = &TransTable->HostTable;
+ for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
+ *CurrHostEntry = HostTable->EntriesBegin,
+ *EntryDeviceEnd = TargetTable->EntriesEnd;
+ CurrDeviceEntry != EntryDeviceEnd;
+ CurrDeviceEntry++, CurrHostEntry++) {
+ if (CurrDeviceEntry->size != 0) {
+ // has data.
+ assert(CurrDeviceEntry->size == CurrHostEntry->size &&
+ "data size mismatch");
+
+ // Fortran may use multiple weak declarations for the same symbol,
+ // therefore we must allow for multiple weak symbols to be loaded from
+ // the fat binary. Treat these mappings as any other "regular" mapping.
+ // Add entry to map.
+ if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
+ continue;
+ DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
+ "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
+ CurrDeviceEntry->size);
+ Device.HostDataToTargetMap.push_front(HostDataToTargetTy(
+ (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
+ (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
+ (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
+ (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
+ INF_REF_CNT /*RefCount*/));
+ }
+ }
+ Device.DataMapMtx.unlock();
+ }
+ TrlTblMtx.unlock();
+
+ if (rc != OFFLOAD_SUCCESS) {
+ Device.PendingGlobalsMtx.unlock();
+ return rc;
+ }
+
+ /*
+ * Run ctors for static objects
+ */
+ if (!Device.PendingCtorsDtors.empty()) {
+ // Call all ctors for all libraries registered so far
+ for (auto &lib : Device.PendingCtorsDtors) {
+ if (!lib.second.PendingCtors.empty()) {
+ DP("Has pending ctors... call now\n");
+ for (auto &entry : lib.second.PendingCtors) {
+ void *ctor = entry;
+ int rc = target(device_id, ctor, 0, NULL, NULL, NULL,
+ NULL, 1, 1, true /*team*/);
+ if (rc != OFFLOAD_SUCCESS) {
+ DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
+ Device.PendingGlobalsMtx.unlock();
+ return OFFLOAD_FAIL;
+ }
+ }
+ // Clear the list to indicate that this device has been used
+ lib.second.PendingCtors.clear();
+ DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
+ }
+ }
+ }
+ Device.HasPendingGlobals = false;
+ Device.PendingGlobalsMtx.unlock();
+
+ return OFFLOAD_SUCCESS;
+}
+
+// Check whether a device has been initialized, global ctors have been
+// executed and global data has been mapped; do so if not already done.
+int CheckDeviceAndCtors(int64_t device_id) {
+ // Is device ready?
+ if (!device_is_ready(device_id)) {
+ DP("Device %" PRId64 " is not ready.\n", device_id);
+ return OFFLOAD_FAIL;
+ }
+
+ // Get device info.
+ DeviceTy &Device = Devices[device_id];
+
+ // Check whether global data has been mapped for this device
+ Device.PendingGlobalsMtx.lock();
+ bool hasPendingGlobals = Device.HasPendingGlobals;
+ Device.PendingGlobalsMtx.unlock();
+ if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
+ DP("Failed to init globals on device %" PRId64 "\n", device_id);
+ return OFFLOAD_FAIL;
+ }
+
+ return OFFLOAD_SUCCESS;
+}
+
+static int32_t member_of(int64_t type) {
+ return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
+}
+
+/// Internal function to do the mapping and transfer the data to the device
+int target_data_begin(DeviceTy &Device, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
+ // process each input.
+ int rc = OFFLOAD_SUCCESS;
+ for (int32_t i = 0; i < arg_num; ++i) {
+ // Ignore private variables and arrays - there is no mapping for them.
+ if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
+ (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
+ continue;
+
+ void *HstPtrBegin = args[i];
+ void *HstPtrBase = args_base[i];
+ int64_t data_size = arg_sizes[i];
+
+ // Adjust for proper alignment if this is a combined entry (for structs).
+ // Look at the next argument - if that is MEMBER_OF this one, then this one
+ // is a combined entry.
+ int64_t padding = 0;
+ const int next_i = i+1;
+ if (member_of(arg_types[i]) < 0 && next_i < arg_num &&
+ member_of(arg_types[next_i]) == i) {
+ padding = (int64_t)HstPtrBegin % alignment;
+ if (padding) {
+ DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
+ "\n", padding, DPxPTR(HstPtrBegin));
+ HstPtrBegin = (char *) HstPtrBegin - padding;
+ data_size += padding;
+ }
+ }
+
+ // Address of pointer on the host and device, respectively.
+ void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin;
+ bool IsNew, Pointer_IsNew;
+ bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
+ // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
+ // have reached this point via __tgt_target_data_begin and not __tgt_target
+ // then no argument is marked as TARGET_PARAM ("omp target data map" is not
+ // associated with a target region, so there are no target parameters). This
+ // may be considered a hack, we could revise the scheme in the future.
+ bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF);
+ if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
+ DP("Has a pointer entry: \n");
+ // base is address of pointer.
+ Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase,
+ sizeof(void *), Pointer_IsNew, IsImplicit, UpdateRef);
+ if (!Pointer_TgtPtrBegin) {
+ DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
+ "illegal mapping).\n");
+ }
+ DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
+ "\n", sizeof(void *), DPxPTR(Pointer_TgtPtrBegin),
+ (Pointer_IsNew ? "" : " not"));
+ Pointer_HstPtrBegin = HstPtrBase;
+ // modify current entry.
+ HstPtrBase = *(void **)HstPtrBase;
+ UpdateRef = true; // subsequently update ref count of pointee
+ }
+
+ void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase,
+ data_size, IsNew, IsImplicit, UpdateRef);
+ if (!TgtPtrBegin && data_size) {
+ // If data_size==0, then the argument could be a zero-length pointer to
+ // NULL, so getOrAlloc() returning NULL is not an error.
+ DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
+ "illegal mapping).\n");
+ }
+ DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
+ " - is%s new\n", data_size, DPxPTR(TgtPtrBegin),
+ (IsNew ? "" : " not"));
+
+ if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
+ uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
+ void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
+ DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
+ args_base[i] = TgtPtrBase;
+ }
+
+ if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
+ bool copy = false;
+ if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
+ copy = true;
+ } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) {
+ // Copy data only if the "parent" struct has RefCount==1.
+ int32_t parent_idx = member_of(arg_types[i]);
+ long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
+ assert(parent_rc > 0 && "parent struct not found");
+ if (parent_rc == 1) {
+ copy = true;
+ }
+ }
+
+ if (copy) {
+ DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
+ data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
+ int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size);
+ if (rt != OFFLOAD_SUCCESS) {
+ DP("Copying data to device failed.\n");
+ rc = OFFLOAD_FAIL;
+ }
+ }
+ }
+
+ if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
+ DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
+ DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
+ uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
+ void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
+ int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase,
+ sizeof(void *));
+ if (rt != OFFLOAD_SUCCESS) {
+ DP("Copying data to device failed.\n");
+ rc = OFFLOAD_FAIL;
+ }
+ // create shadow pointers for this entry
+ Device.ShadowMtx.lock();
+ Device.ShadowPtrMap[Pointer_HstPtrBegin] = {HstPtrBase,
+ Pointer_TgtPtrBegin, TgtPtrBase};
+ Device.ShadowMtx.unlock();
+ }
+ }
+
+ return rc;
+}
+
+/// Internal function to undo the mapping and retrieve the data from the device.
+int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
+ void **args, int64_t *arg_sizes, int64_t *arg_types) {
+ int rc = OFFLOAD_SUCCESS;
+ // process each input.
+ for (int32_t i = arg_num - 1; i >= 0; --i) {
+ // Ignore private variables and arrays - there is no mapping for them.
+ // Also, ignore the use_device_ptr directive, it has no effect here.
+ if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
+ (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
+ continue;
+
+ void *HstPtrBegin = args[i];
+ int64_t data_size = arg_sizes[i];
+ // Adjust for proper alignment if this is a combined entry (for structs).
+ // Look at the next argument - if that is MEMBER_OF this one, then this one
+ // is a combined entry.
+ int64_t padding = 0;
+ const int next_i = i+1;
+ if (member_of(arg_types[i]) < 0 && next_i < arg_num &&
+ member_of(arg_types[next_i]) == i) {
+ padding = (int64_t)HstPtrBegin % alignment;
+ if (padding) {
+ DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
+ "\n", padding, DPxPTR(HstPtrBegin));
+ HstPtrBegin = (char *) HstPtrBegin - padding;
+ data_size += padding;
+ }
+ }
+
+ bool IsLast;
+ bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
+ (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
+ bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE;
+
+ // If PTR_AND_OBJ, HstPtrBegin is address of pointee
+ void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast,
+ UpdateRef);
+ DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
+ " - is%s last\n", data_size, DPxPTR(TgtPtrBegin),
+ (IsLast ? "" : " not"));
+
+ bool DelEntry = IsLast || ForceDelete;
+
+ if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
+ !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
+ DelEntry = false; // protect parent struct from being deallocated
+ }
+
+ if ((arg_types[i] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
+ // Move data back to the host
+ if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
+ bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
+ bool CopyMember = false;
+ if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
+ !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
+ // Copy data only if the "parent" struct has RefCount==1.
+ int32_t parent_idx = member_of(arg_types[i]);
+ long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
+ assert(parent_rc > 0 && "parent struct not found");
+ if (parent_rc == 1) {
+ CopyMember = true;
+ }
+ }
+
+ if (DelEntry || Always || CopyMember) {
+ DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
+ data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
+ int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size);
+ if (rt != OFFLOAD_SUCCESS) {
+ DP("Copying data from device failed.\n");
+ rc = OFFLOAD_FAIL;
+ }
+ }
+ }
+
+ // If we copied back to the host a struct/array containing pointers, we
+ // need to restore the original host pointer values from their shadow
+ // copies. If the struct is going to be deallocated, remove any remaining
+ // shadow pointer entries for this struct.
+ uintptr_t lb = (uintptr_t) HstPtrBegin;
+ uintptr_t ub = (uintptr_t) HstPtrBegin + data_size;
+ Device.ShadowMtx.lock();
+ for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
+ it != Device.ShadowPtrMap.end(); ++it) {
+ void **ShadowHstPtrAddr = (void**) it->first;
+
+ // An STL map is sorted on its keys; use this property
+ // to quickly determine when to break out of the loop.
+ if ((uintptr_t) ShadowHstPtrAddr < lb)
+ continue;
+ if ((uintptr_t) ShadowHstPtrAddr >= ub)
+ break;
+
+ // If we copied the struct to the host, we need to restore the pointer.
+ if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
+ DP("Restoring original host pointer value " DPxMOD " for host "
+ "pointer " DPxMOD "\n", DPxPTR(it->second.HstPtrVal),
+ DPxPTR(ShadowHstPtrAddr));
+ *ShadowHstPtrAddr = it->second.HstPtrVal;
+ }
+ // If the struct is to be deallocated, remove the shadow entry.
+ if (DelEntry) {
+ DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
+ Device.ShadowPtrMap.erase(it);
+ }
+ }
+ Device.ShadowMtx.unlock();
+
+ // Deallocate map
+ if (DelEntry) {
+ int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete);
+ if (rt != OFFLOAD_SUCCESS) {
+ DP("Deallocating data from device failed.\n");
+ rc = OFFLOAD_FAIL;
+ }
+ }
+ }
+ }
+
+ return rc;
+}
+
+/// Internal function to pass data to/from the target.
+void target_data_update(DeviceTy &Device, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
+ // process each input.
+ for (int32_t i = 0; i < arg_num; ++i) {
+ if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
+ (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
+ continue;
+
+ void *HstPtrBegin = args[i];
+ int64_t MapSize = arg_sizes[i];
+ bool IsLast;
+ void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast,
+ false);
+
+ if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
+ DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
+ arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
+ Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize);
+
+ uintptr_t lb = (uintptr_t) HstPtrBegin;
+ uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
+ Device.ShadowMtx.lock();
+ for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
+ it != Device.ShadowPtrMap.end(); ++it) {
+ void **ShadowHstPtrAddr = (void**) it->first;
+ if ((uintptr_t) ShadowHstPtrAddr < lb)
+ continue;
+ if ((uintptr_t) ShadowHstPtrAddr >= ub)
+ break;
+ DP("Restoring original host pointer value " DPxMOD " for host pointer "
+ DPxMOD "\n", DPxPTR(it->second.HstPtrVal),
+ DPxPTR(ShadowHstPtrAddr));
+ *ShadowHstPtrAddr = it->second.HstPtrVal;
+ }
+ Device.ShadowMtx.unlock();
+ }
+
+ if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
+ DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
+ arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
+ Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize);
+
+ uintptr_t lb = (uintptr_t) HstPtrBegin;
+ uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
+ Device.ShadowMtx.lock();
+ for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
+ it != Device.ShadowPtrMap.end(); ++it) {
+ void **ShadowHstPtrAddr = (void**) it->first;
+ if ((uintptr_t) ShadowHstPtrAddr < lb)
+ continue;
+ if ((uintptr_t) ShadowHstPtrAddr >= ub)
+ break;
+ DP("Restoring original target pointer value " DPxMOD " for target "
+ "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal),
+ DPxPTR(it->second.TgtPtrAddr));
+ Device.data_submit(it->second.TgtPtrAddr,
+ &it->second.TgtPtrVal, sizeof(void *));
+ }
+ Device.ShadowMtx.unlock();
+ }
+ }
+}
+
+/// performs the same actions as data_begin in case arg_num is
+/// non-zero and initiates run of the offloaded region on the target platform;
+/// if arg_num is non-zero after the region execution is done it also
+/// performs the same action as data_update and data_end above. This function
+/// returns 0 if it was able to transfer the execution to a target and an
+/// integer different from zero otherwise.
+int target(int64_t device_id, void *host_ptr, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
+ int32_t team_num, int32_t thread_limit, int IsTeamConstruct) {
+ DeviceTy &Device = Devices[device_id];
+
+ // Find the table information in the map or look it up in the translation
+ // tables.
+ TableMap *TM = 0;
+ TblMapMtx.lock();
+ HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap.find(host_ptr);
+ if (TableMapIt == HostPtrToTableMap.end()) {
+ // We don't have a map. So search all the registered libraries.
+ TrlTblMtx.lock();
+ for (HostEntriesBeginToTransTableTy::iterator
+ ii = HostEntriesBeginToTransTable.begin(),
+ ie = HostEntriesBeginToTransTable.end();
+ !TM && ii != ie; ++ii) {
+ // get the translation table (which contains all the good info).
+ TranslationTable *TransTable = &ii->second;
+ // iterate over all the host table entries to see if we can locate the
+ // host_ptr.
+ __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin;
+ __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd;
+ __tgt_offload_entry *cur = begin;
+ for (uint32_t i = 0; cur < end; ++cur, ++i) {
+ if (cur->addr != host_ptr)
+ continue;
+ // we got a match, now fill the HostPtrToTableMap so that we
+ // may avoid this search next time.
+ TM = &HostPtrToTableMap[host_ptr];
+ TM->Table = TransTable;
+ TM->Index = i;
+ break;
+ }
+ }
+ TrlTblMtx.unlock();
+ } else {
+ TM = &TableMapIt->second;
+ }
+ TblMapMtx.unlock();
+
+ // No map for this host pointer found!
+ if (!TM) {
+ DP("Host ptr " DPxMOD " does not have a matching target pointer.\n",
+ DPxPTR(host_ptr));
+ return OFFLOAD_FAIL;
+ }
+
+ // get target table.
+ TrlTblMtx.lock();
+ assert(TM->Table->TargetsTable.size() > (size_t)device_id &&
+ "Not expecting a device ID outside the table's bounds!");
+ __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id];
+ TrlTblMtx.unlock();
+ assert(TargetTable && "Global data has not been mapped\n");
+
+ // Move data to device.
+ int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes,
+ arg_types);
+
+ if (rc != OFFLOAD_SUCCESS) {
+ DP("Call to target_data_begin failed, skipping target execution.\n");
+ // Call target_data_end to dealloc whatever target_data_begin allocated
+ // and return OFFLOAD_FAIL.
+ target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types);
+ return OFFLOAD_FAIL;
+ }
+
+ std::vector<void *> tgt_args;
+ std::vector<ptrdiff_t> tgt_offsets;
+
+ // List of (first-)private arrays allocated for this target region
+ std::vector<void *> fpArrays;
+
+ for (int32_t i = 0; i < arg_num; ++i) {
+ if (!(arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
+ // This is not a target parameter, do not push it into tgt_args.
+ continue;
+ }
+ void *HstPtrBegin = args[i];
+ void *HstPtrBase = args_base[i];
+ void *TgtPtrBegin;
+ ptrdiff_t TgtBaseOffset;
+ bool IsLast; // unused.
+ if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) {
+ DP("Forwarding first-private value " DPxMOD " to the target construct\n",
+ DPxPTR(HstPtrBase));
+ TgtPtrBegin = HstPtrBase;
+ TgtBaseOffset = 0;
+ } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) {
+ // Allocate memory for (first-)private array
+ TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID,
+ arg_sizes[i], HstPtrBegin);
+ if (!TgtPtrBegin) {
+ DP ("Data allocation for %sprivate array " DPxMOD " failed\n",
+ (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
+ DPxPTR(HstPtrBegin));
+ rc = OFFLOAD_FAIL;
+ break;
+ } else {
+ fpArrays.push_back(TgtPtrBegin);
+ TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
+#ifdef OMPTARGET_DEBUG
+ void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
+ DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for "
+ "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n",
+ arg_sizes[i], DPxPTR(TgtPtrBegin),
+ (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
+ DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase));
+#endif
+ // If first-private, copy data from host
+ if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
+ int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]);
+ if (rt != OFFLOAD_SUCCESS) {
+ DP ("Copying data to device failed.\n");
+ rc = OFFLOAD_FAIL;
+ break;
+ }
+ }
+ }
+ } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
+ TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast,
+ false);
+ TgtBaseOffset = 0; // no offset for ptrs.
+ DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to "
+ "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase),
+ DPxPTR(HstPtrBase));
+ } else {
+ TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast,
+ false);
+ TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
+#ifdef OMPTARGET_DEBUG
+ void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
+ DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
+ DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
+#endif
+ }
+ tgt_args.push_back(TgtPtrBegin);
+ tgt_offsets.push_back(TgtBaseOffset);
+ }
+
+ assert(tgt_args.size() == tgt_offsets.size() &&
+ "Size mismatch in arguments and offsets");
+
+ // Pop loop trip count
+ uint64_t ltc = Device.loopTripCnt;
+ Device.loopTripCnt = 0;
+
+ // Launch device execution.
+ if (rc == OFFLOAD_SUCCESS) {
+ DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
+ TargetTable->EntriesBegin[TM->Index].name,
+ DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index);
+ if (IsTeamConstruct) {
+ rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr,
+ &tgt_args[0], &tgt_offsets[0], tgt_args.size(), team_num,
+ thread_limit, ltc);
+ } else {
+ rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr,
+ &tgt_args[0], &tgt_offsets[0], tgt_args.size());
+ }
+ } else {
+ DP("Errors occurred while obtaining target arguments, skipping kernel "
+ "execution\n");
+ }
+
+ // Deallocate (first-)private arrays
+ for (auto it : fpArrays) {
+ int rt = Device.RTL->data_delete(Device.RTLDeviceID, it);
+ if (rt != OFFLOAD_SUCCESS) {
+ DP("Deallocation of (first-)private arrays failed.\n");
+ rc = OFFLOAD_FAIL;
+ }
+ }
+
+ // Move data from device.
+ int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes,
+ arg_types);
+
+ if (rt != OFFLOAD_SUCCESS) {
+ DP("Call to target_data_end failed.\n");
+ rc = OFFLOAD_FAIL;
+ }
+
+ return rc;
+}