aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHans Wennborg <hans@hanshq.net>2018-09-17 11:37:57 +0000
committerHans Wennborg <hans@hanshq.net>2018-09-17 11:37:57 +0000
commitb6d3a994aea4dc7ca139635624e8c80617436f9a (patch)
tree07778fe8f0038268045ce4b1050cd53bbedbc838
parent9ade647b2f773ffe2793fce58f480b41a4283e6f (diff)
downloadopenmp-svn-tags/RELEASE_700.tar.gz
Creating release candidate final from release_700 branchsvn-tags/RELEASE_700
git-svn-id: https://llvm.org/svn/llvm-project/openmp/tags/RELEASE_700@342381 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--final/.arcconfig4
-rw-r--r--final/.gitignore42
-rw-r--r--final/CMakeLists.txt83
-rw-r--r--final/CREDITS.txt61
-rw-r--r--final/LICENSE.txt174
-rw-r--r--final/README.rst343
-rw-r--r--final/cmake/DetectTestCompiler/CMakeLists.txt39
-rw-r--r--final/cmake/HandleOpenMPOptions.cmake16
-rw-r--r--final/cmake/OpenMPTesting.cmake181
-rw-r--r--final/cmake/config-ix.cmake6
-rw-r--r--final/libomptarget/CMakeLists.txt73
-rw-r--r--final/libomptarget/README.txt73
-rw-r--r--final/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake160
-rw-r--r--final/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake112
-rw-r--r--final/libomptarget/cmake/Modules/LibomptargetUtils.cmake28
-rw-r--r--final/libomptarget/deviceRTLs/CMakeLists.txt14
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/CMakeLists.txt181
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt523
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/cancel.cu28
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/counter_group.h51
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/counter_groupi.h82
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/critical.cu32
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu513
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/debug.h276
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/interface.h523
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/libcall.cu462
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/loop.cu769
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/omp_data.cu59
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu194
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h441
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h218
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/option.h70
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/parallel.cu479
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/reduction.cu429
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/state-queue.h52
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/state-queuei.h89
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/support.h92
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/supporti.h215
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/sync.cu153
-rw-r--r--final/libomptarget/deviceRTLs/nvptx/src/task.cu208
-rw-r--r--final/libomptarget/include/omptarget.h233
-rw-r--r--final/libomptarget/include/omptargetplugin.h92
-rw-r--r--final/libomptarget/plugins/CMakeLists.txt72
-rw-r--r--final/libomptarget/plugins/aarch64/CMakeLists.txt18
-rw-r--r--final/libomptarget/plugins/common/elf_common.c73
-rw-r--r--final/libomptarget/plugins/cuda/CMakeLists.txt50
-rw-r--r--final/libomptarget/plugins/cuda/src/rtl.cpp758
-rw-r--r--final/libomptarget/plugins/exports15
-rw-r--r--final/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp340
-rw-r--r--final/libomptarget/plugins/ppc64/CMakeLists.txt18
-rw-r--r--final/libomptarget/plugins/ppc64le/CMakeLists.txt18
-rw-r--r--final/libomptarget/plugins/x86_64/CMakeLists.txt18
-rw-r--r--final/libomptarget/src/CMakeLists.txt31
-rw-r--r--final/libomptarget/src/api.cpp283
-rw-r--r--final/libomptarget/src/device.cpp365
-rw-r--r--final/libomptarget/src/device.h167
-rw-r--r--final/libomptarget/src/exports28
-rw-r--r--final/libomptarget/src/interface.cpp251
-rw-r--r--final/libomptarget/src/omptarget.cpp714
-rw-r--r--final/libomptarget/src/private.h59
-rw-r--r--final/libomptarget/src/rtl.cpp368
-rw-r--r--final/libomptarget/src/rtl.h166
-rw-r--r--final/libomptarget/test/CMakeLists.txt28
-rw-r--r--final/libomptarget/test/env/omp_target_debug.c20
-rw-r--r--final/libomptarget/test/lit.cfg137
-rw-r--r--final/libomptarget/test/lit.site.cfg.in18
-rw-r--r--final/libomptarget/test/offloading/offloading_success.c23
-rw-r--r--final/libomptarget/test/offloading/offloading_success.cpp23
-rw-r--r--final/runtime/.clang-format5
-rw-r--r--final/runtime/CMakeLists.txt397
-rw-r--r--final/runtime/README.txt116
-rw-r--r--final/runtime/cmake/LibompCheckFortranFlag.cmake73
-rw-r--r--final/runtime/cmake/LibompCheckLinkerFlag.cmake68
-rw-r--r--final/runtime/cmake/LibompDefinitions.cmake32
-rw-r--r--final/runtime/cmake/LibompExports.cmake99
-rw-r--r--final/runtime/cmake/LibompGetArchitecture.cmake70
-rw-r--r--final/runtime/cmake/LibompHandleFlags.cmake208
-rw-r--r--final/runtime/cmake/LibompMicroTests.cmake228
-rw-r--r--final/runtime/cmake/LibompUtils.cmake195
-rw-r--r--final/runtime/cmake/config-ix.cmake281
-rw-r--r--final/runtime/doc/Reference.pdf13882
-rw-r--r--final/runtime/doc/doxygen/config1822
-rw-r--r--final/runtime/doc/doxygen/header.tex77
-rw-r--r--final/runtime/doc/doxygen/libomp_interface.h332
-rw-r--r--final/runtime/src/CMakeLists.txt332
-rw-r--r--final/runtime/src/dllexports1188
-rw-r--r--final/runtime/src/exports_so.txt125
-rw-r--r--final/runtime/src/extractExternal.cpp484
-rw-r--r--final/runtime/src/i18n/en_US.txt491
-rw-r--r--final/runtime/src/include/30/omp.h.var165
-rw-r--r--final/runtime/src/include/30/omp_lib.f.var644
-rw-r--r--final/runtime/src/include/30/omp_lib.f90.var365
-rw-r--r--final/runtime/src/include/30/omp_lib.h.var649
-rw-r--r--final/runtime/src/include/40/omp.h.var161
-rw-r--r--final/runtime/src/include/40/omp_lib.f.var774
-rw-r--r--final/runtime/src/include/40/omp_lib.f90.var455
-rw-r--r--final/runtime/src/include/40/omp_lib.h.var567
-rw-r--r--final/runtime/src/include/45/omp.h.var198
-rw-r--r--final/runtime/src/include/45/omp_lib.f.var855
-rw-r--r--final/runtime/src/include/45/omp_lib.f90.var524
-rw-r--r--final/runtime/src/include/45/omp_lib.h.var644
-rw-r--r--final/runtime/src/include/50/omp.h.var215
-rw-r--r--final/runtime/src/include/50/omp_lib.f.var868
-rw-r--r--final/runtime/src/include/50/omp_lib.f90.var543
-rw-r--r--final/runtime/src/include/50/omp_lib.h.var663
-rw-r--r--final/runtime/src/include/50/ompt.h.var697
-rw-r--r--final/runtime/src/kmp.h3890
-rw-r--r--final/runtime/src/kmp_affinity.cpp5327
-rw-r--r--final/runtime/src/kmp_affinity.h829
-rw-r--r--final/runtime/src/kmp_alloc.cpp1874
-rw-r--r--final/runtime/src/kmp_atomic.cpp3632
-rw-r--r--final/runtime/src/kmp_atomic.h1776
-rw-r--r--final/runtime/src/kmp_barrier.cpp2045
-rw-r--r--final/runtime/src/kmp_cancel.cpp336
-rw-r--r--final/runtime/src/kmp_config.h.cmake114
-rw-r--r--final/runtime/src/kmp_csupport.cpp4095
-rw-r--r--final/runtime/src/kmp_debug.cpp132
-rw-r--r--final/runtime/src/kmp_debug.h171
-rw-r--r--final/runtime/src/kmp_debugger.cpp291
-rw-r--r--final/runtime/src/kmp_debugger.h49
-rw-r--r--final/runtime/src/kmp_dispatch.cpp2604
-rw-r--r--final/runtime/src/kmp_dispatch.h522
-rw-r--r--final/runtime/src/kmp_dispatch_hier.h1090
-rw-r--r--final/runtime/src/kmp_environment.cpp501
-rw-r--r--final/runtime/src/kmp_environment.h78
-rw-r--r--final/runtime/src/kmp_error.cpp462
-rw-r--r--final/runtime/src/kmp_error.h61
-rw-r--r--final/runtime/src/kmp_ftn_cdecl.cpp35
-rw-r--r--final/runtime/src/kmp_ftn_entry.h1315
-rw-r--r--final/runtime/src/kmp_ftn_extra.cpp33
-rw-r--r--final/runtime/src/kmp_ftn_os.h640
-rw-r--r--final/runtime/src/kmp_ftn_stdcall.cpp33
-rw-r--r--final/runtime/src/kmp_global.cpp517
-rw-r--r--final/runtime/src/kmp_gsupport.cpp2002
-rw-r--r--final/runtime/src/kmp_i18n.cpp872
-rw-r--r--final/runtime/src/kmp_i18n.h179
-rw-r--r--final/runtime/src/kmp_import.cpp34
-rw-r--r--final/runtime/src/kmp_io.cpp226
-rw-r--r--final/runtime/src/kmp_io.h38
-rw-r--r--final/runtime/src/kmp_itt.cpp161
-rw-r--r--final/runtime/src/kmp_itt.h333
-rw-r--r--final/runtime/src/kmp_itt.inl1043
-rw-r--r--final/runtime/src/kmp_lock.cpp3893
-rw-r--r--final/runtime/src/kmp_lock.h1296
-rw-r--r--final/runtime/src/kmp_omp.h242
-rw-r--r--final/runtime/src/kmp_os.h953
-rw-r--r--final/runtime/src/kmp_platform.h188
-rw-r--r--final/runtime/src/kmp_runtime.cpp7757
-rw-r--r--final/runtime/src/kmp_safe_c_api.h60
-rw-r--r--final/runtime/src/kmp_sched.cpp1001
-rw-r--r--final/runtime/src/kmp_settings.cpp5620
-rw-r--r--final/runtime/src/kmp_settings.h65
-rw-r--r--final/runtime/src/kmp_stats.cpp922
-rw-r--r--final/runtime/src/kmp_stats.h1002
-rw-r--r--final/runtime/src/kmp_stats_timing.cpp131
-rw-r--r--final/runtime/src/kmp_stats_timing.h116
-rw-r--r--final/runtime/src/kmp_str.cpp731
-rw-r--r--final/runtime/src/kmp_str.h125
-rw-r--r--final/runtime/src/kmp_stub.cpp339
-rw-r--r--final/runtime/src/kmp_stub.h59
-rw-r--r--final/runtime/src/kmp_taskdeps.cpp673
-rw-r--r--final/runtime/src/kmp_tasking.cpp4210
-rw-r--r--final/runtime/src/kmp_taskq.cpp2029
-rw-r--r--final/runtime/src/kmp_threadprivate.cpp800
-rw-r--r--final/runtime/src/kmp_utility.cpp406
-rw-r--r--final/runtime/src/kmp_version.cpp208
-rw-r--r--final/runtime/src/kmp_version.h67
-rw-r--r--final/runtime/src/kmp_wait_release.cpp26
-rw-r--r--final/runtime/src/kmp_wait_release.h917
-rw-r--r--final/runtime/src/kmp_wrapper_getpid.h68
-rw-r--r--final/runtime/src/kmp_wrapper_malloc.h195
-rw-r--r--final/runtime/src/libomp.rc.var70
-rw-r--r--final/runtime/src/ompt-event-specific.h110
-rw-r--r--final/runtime/src/ompt-general.cpp706
-rw-r--r--final/runtime/src/ompt-internal.h128
-rw-r--r--final/runtime/src/ompt-specific.cpp451
-rw-r--r--final/runtime/src/ompt-specific.h107
-rw-r--r--final/runtime/src/test-touch.c31
-rw-r--r--final/runtime/src/thirdparty/ittnotify/disable_warnings.h29
-rw-r--r--final/runtime/src/thirdparty/ittnotify/ittnotify.h3804
-rw-r--r--final/runtime/src/thirdparty/ittnotify/ittnotify_config.h490
-rw-r--r--final/runtime/src/thirdparty/ittnotify/ittnotify_static.c1057
-rw-r--r--final/runtime/src/thirdparty/ittnotify/ittnotify_static.h316
-rw-r--r--final/runtime/src/thirdparty/ittnotify/ittnotify_types.h67
-rw-r--r--final/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h972
-rw-r--r--final/runtime/src/tsan_annotations.cpp108
-rw-r--r--final/runtime/src/tsan_annotations.h170
-rw-r--r--final/runtime/src/z_Linux_asm.S1730
-rw-r--r--final/runtime/src/z_Linux_util.cpp2378
-rw-r--r--final/runtime/src/z_Windows_NT-586_asm.asm1299
-rw-r--r--final/runtime/src/z_Windows_NT-586_util.cpp136
-rw-r--r--final/runtime/src/z_Windows_NT_util.cpp1568
-rw-r--r--final/runtime/test/CMakeLists.txt37
-rw-r--r--final/runtime/test/api/has_openmp.c23
-rw-r--r--final/runtime/test/api/kmp_aligned_malloc.c62
-rw-r--r--final/runtime/test/api/kmp_set_defaults_lock_bug.c53
-rw-r--r--final/runtime/test/api/omp_get_num_threads.c39
-rw-r--r--final/runtime/test/api/omp_get_wtick.c24
-rw-r--r--final/runtime/test/api/omp_get_wtime.c33
-rw-r--r--final/runtime/test/api/omp_in_parallel.c39
-rw-r--r--final/runtime/test/atomic/omp_atomic.c366
-rw-r--r--final/runtime/test/barrier/omp_barrier.c44
-rw-r--r--final/runtime/test/critical/omp_critical.c37
-rw-r--r--final/runtime/test/env/kmp_aff_disable_hwloc.c21
-rw-r--r--final/runtime/test/env/kmp_set_dispatch_buf.c76
-rw-r--r--final/runtime/test/env/omp_thread_limit.c82
-rw-r--r--final/runtime/test/env/omp_wait_policy.c40
-rw-r--r--final/runtime/test/flush/omp_flush.c45
-rw-r--r--final/runtime/test/lit.cfg130
-rw-r--r--final/runtime/test/lit.site.cfg.in20
-rw-r--r--final/runtime/test/lock/omp_init_lock.c42
-rw-r--r--final/runtime/test/lock/omp_lock.c47
-rw-r--r--final/runtime/test/lock/omp_nest_lock.c45
-rw-r--r--final/runtime/test/lock/omp_test_lock.c47
-rw-r--r--final/runtime/test/lock/omp_test_nest_lock.c47
-rw-r--r--final/runtime/test/master/omp_master.c38
-rw-r--r--final/runtime/test/master/omp_master_3.c44
-rw-r--r--final/runtime/test/misc_bugs/cancellation_for_sections.c64
-rw-r--r--final/runtime/test/misc_bugs/many-microtask-args.c39
-rw-r--r--final/runtime/test/misc_bugs/omp_foreign_thread_team_reuse.c81
-rw-r--r--final/runtime/test/misc_bugs/teams-no-par.c64
-rw-r--r--final/runtime/test/misc_bugs/teams-reduction.c68
-rw-r--r--final/runtime/test/omp_my_sleep.h33
-rw-r--r--final/runtime/test/omp_testsuite.h79
-rwxr-xr-xfinal/runtime/test/ompt/callback.h764
-rw-r--r--final/runtime/test/ompt/cancel/cancel_parallel.c40
-rw-r--r--final/runtime/test/ompt/cancel/cancel_taskgroup.c89
-rw-r--r--final/runtime/test/ompt/cancel/cancel_worksharing.c67
-rw-r--r--final/runtime/test/ompt/loadtool/tool_available/tool_available.c74
-rw-r--r--final/runtime/test/ompt/loadtool/tool_available_search/tool_available_search.c104
-rw-r--r--final/runtime/test/ompt/loadtool/tool_not_available/tool_not_available.c69
-rw-r--r--final/runtime/test/ompt/misc/api_calls_from_other_thread.cpp92
-rw-r--r--final/runtime/test/ompt/misc/api_calls_misc.c72
-rw-r--r--final/runtime/test/ompt/misc/api_calls_places.c88
-rw-r--r--final/runtime/test/ompt/misc/control_tool.c29
-rw-r--r--final/runtime/test/ompt/misc/control_tool_no_ompt_support.c12
-rw-r--r--final/runtime/test/ompt/misc/idle.c32
-rw-r--r--final/runtime/test/ompt/misc/interoperability.cpp115
-rw-r--r--final/runtime/test/ompt/misc/threads.c34
-rw-r--r--final/runtime/test/ompt/misc/threads_nested.c40
-rw-r--r--final/runtime/test/ompt/misc/unset_callback.c29
-rw-r--r--final/runtime/test/ompt/ompt-signal.h31
-rw-r--r--final/runtime/test/ompt/parallel/dynamic_enough_threads.c43
-rw-r--r--final/runtime/test/ompt/parallel/dynamic_not_enough_threads.c43
-rw-r--r--final/runtime/test/ompt/parallel/max_active_levels_serialized.c73
-rw-r--r--final/runtime/test/ompt/parallel/nested.c298
-rw-r--r--final/runtime/test/ompt/parallel/nested_lwt.c334
-rw-r--r--final/runtime/test/ompt/parallel/nested_serialized.c128
-rw-r--r--final/runtime/test/ompt/parallel/nested_thread_num.c357
-rw-r--r--final/runtime/test/ompt/parallel/no_thread_num_clause.c95
-rw-r--r--final/runtime/test/ompt/parallel/normal.c132
-rw-r--r--final/runtime/test/ompt/parallel/not_enough_threads.c90
-rw-r--r--final/runtime/test/ompt/parallel/parallel_if0.c76
-rw-r--r--final/runtime/test/ompt/parallel/serialized.c77
-rw-r--r--final/runtime/test/ompt/synchronization/barrier/explicit.c58
-rw-r--r--final/runtime/test/ompt/synchronization/barrier/for_loop.c56
-rw-r--r--final/runtime/test/ompt/synchronization/barrier/for_simd.c33
-rw-r--r--final/runtime/test/ompt/synchronization/barrier/implicit_task_data.c150
-rw-r--r--final/runtime/test/ompt/synchronization/barrier/parallel_region.c40
-rw-r--r--final/runtime/test/ompt/synchronization/barrier/sections.c63
-rw-r--r--final/runtime/test/ompt/synchronization/barrier/single.c61
-rw-r--r--final/runtime/test/ompt/synchronization/critical.c32
-rw-r--r--final/runtime/test/ompt/synchronization/flush.c30
-rw-r--r--final/runtime/test/ompt/synchronization/lock.c44
-rw-r--r--final/runtime/test/ompt/synchronization/master.c38
-rw-r--r--final/runtime/test/ompt/synchronization/nest_lock.c52
-rw-r--r--final/runtime/test/ompt/synchronization/ordered.c32
-rw-r--r--final/runtime/test/ompt/synchronization/taskgroup.c49
-rw-r--r--final/runtime/test/ompt/synchronization/taskwait.c36
-rw-r--r--final/runtime/test/ompt/synchronization/test_lock.c54
-rw-r--r--final/runtime/test/ompt/synchronization/test_nest_lock.c42
-rw-r--r--final/runtime/test/ompt/synchronization/test_nest_lock_parallel.c60
-rw-r--r--final/runtime/test/ompt/tasks/dependences.c61
-rw-r--r--final/runtime/test/ompt/tasks/explicit_task.c102
-rw-r--r--final/runtime/test/ompt/tasks/serialized.c154
-rw-r--r--final/runtime/test/ompt/tasks/task_in_joinbarrier.c91
-rw-r--r--final/runtime/test/ompt/tasks/task_types.c222
-rw-r--r--final/runtime/test/ompt/tasks/task_types_serialized.c113
-rw-r--r--final/runtime/test/ompt/tasks/taskloop.c81
-rw-r--r--final/runtime/test/ompt/tasks/taskyield.c62
-rw-r--r--final/runtime/test/ompt/tasks/untied_task.c108
-rw-r--r--final/runtime/test/ompt/worksharing/for/auto.c7
-rw-r--r--final/runtime/test/ompt/worksharing/for/auto_serialized.c7
-rw-r--r--final/runtime/test/ompt/worksharing/for/auto_split.c8
-rw-r--r--final/runtime/test/ompt/worksharing/for/base.h43
-rw-r--r--final/runtime/test/ompt/worksharing/for/base_serialized.h28
-rw-r--r--final/runtime/test/ompt/worksharing/for/base_split.h66
-rw-r--r--final/runtime/test/ompt/worksharing/for/dynamic.c5
-rw-r--r--final/runtime/test/ompt/worksharing/for/dynamic_serialized.c5
-rw-r--r--final/runtime/test/ompt/worksharing/for/dynamic_split.c7
-rw-r--r--final/runtime/test/ompt/worksharing/for/guided.c5
-rw-r--r--final/runtime/test/ompt/worksharing/for/guided_serialized.c5
-rw-r--r--final/runtime/test/ompt/worksharing/for/guided_split.c7
-rw-r--r--final/runtime/test/ompt/worksharing/for/runtime.c5
-rw-r--r--final/runtime/test/ompt/worksharing/for/runtime_serialized.c5
-rw-r--r--final/runtime/test/ompt/worksharing/for/runtime_split.c7
-rw-r--r--final/runtime/test/ompt/worksharing/for/static.c7
-rw-r--r--final/runtime/test/ompt/worksharing/for/static_serialized.c7
-rw-r--r--final/runtime/test/ompt/worksharing/for/static_split.c8
-rw-r--r--final/runtime/test/ompt/worksharing/sections.c36
-rw-r--r--final/runtime/test/ompt/worksharing/single.c36
-rw-r--r--final/runtime/test/parallel/omp_nested.c46
-rw-r--r--final/runtime/test/parallel/omp_parallel_copyin.c47
-rw-r--r--final/runtime/test/parallel/omp_parallel_default.c43
-rw-r--r--final/runtime/test/parallel/omp_parallel_firstprivate.c46
-rw-r--r--final/runtime/test/parallel/omp_parallel_if.c40
-rw-r--r--final/runtime/test/parallel/omp_parallel_num_threads.c46
-rw-r--r--final/runtime/test/parallel/omp_parallel_private.c46
-rw-r--r--final/runtime/test/parallel/omp_parallel_reduction.c254
-rw-r--r--final/runtime/test/parallel/omp_parallel_shared.c46
-rw-r--r--final/runtime/test/tasking/bug_36720.c36
-rw-r--r--final/runtime/test/tasking/bug_nested_proxy_task.c131
-rw-r--r--final/runtime/test/tasking/bug_proxy_task_dep_waiting.c134
-rw-r--r--final/runtime/test/tasking/bug_serial_taskgroup.c16
-rw-r--r--final/runtime/test/tasking/kmp_task_reduction_nest.cpp376
-rw-r--r--final/runtime/test/tasking/kmp_taskloop.c159
-rw-r--r--final/runtime/test/tasking/nested_parallel_tasking.c32
-rw-r--r--final/runtime/test/tasking/nested_task_creation.c35
-rw-r--r--final/runtime/test/tasking/omp_task.c52
-rw-r--r--final/runtime/test/tasking/omp_task_final.c65
-rw-r--r--final/runtime/test/tasking/omp_task_firstprivate.c51
-rw-r--r--final/runtime/test/tasking/omp_task_if.c43
-rw-r--r--final/runtime/test/tasking/omp_task_imp_firstprivate.c47
-rw-r--r--final/runtime/test/tasking/omp_task_priority.c22
-rw-r--r--final/runtime/test/tasking/omp_task_private.c53
-rw-r--r--final/runtime/test/tasking/omp_task_shared.c41
-rw-r--r--final/runtime/test/tasking/omp_taskloop_grainsize.c113
-rw-r--r--final/runtime/test/tasking/omp_taskloop_num_tasks.c71
-rw-r--r--final/runtime/test/tasking/omp_taskwait.c74
-rw-r--r--final/runtime/test/tasking/omp_taskyield.c58
-rw-r--r--final/runtime/test/threadprivate/omp_threadprivate.c102
-rw-r--r--final/runtime/test/threadprivate/omp_threadprivate_for.c48
-rw-r--r--final/runtime/test/worksharing/for/bug_set_schedule_0.c40
-rw-r--r--final/runtime/test/worksharing/for/kmp_doacross_check.c62
-rw-r--r--final/runtime/test/worksharing/for/kmp_sch_simd_guided.c410
-rw-r--r--final/runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c221
-rw-r--r--final/runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c196
-rw-r--r--final/runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c201
-rw-r--r--final/runtime/test/worksharing/for/kmp_set_dispatch_buf.c91
-rw-r--r--final/runtime/test/worksharing/for/omp_doacross.c60
-rw-r--r--final/runtime/test/worksharing/for/omp_for_bigbounds.c70
-rw-r--r--final/runtime/test/worksharing/for/omp_for_collapse.c51
-rw-r--r--final/runtime/test/worksharing/for/omp_for_firstprivate.c55
-rw-r--r--final/runtime/test/worksharing/for/omp_for_lastprivate.c52
-rw-r--r--final/runtime/test/worksharing/for/omp_for_nowait.c77
-rw-r--r--final/runtime/test/worksharing/for/omp_for_ordered.c60
-rw-r--r--final/runtime/test/worksharing/for/omp_for_private.c63
-rw-r--r--final/runtime/test/worksharing/for/omp_for_reduction.c339
-rw-r--r--final/runtime/test/worksharing/for/omp_for_schedule_auto.c69
-rw-r--r--final/runtime/test/worksharing/for/omp_for_schedule_dynamic.c89
-rw-r--r--final/runtime/test/worksharing/for/omp_for_schedule_guided.c217
-rw-r--r--final/runtime/test/worksharing/for/omp_for_schedule_runtime.c82
-rw-r--r--final/runtime/test/worksharing/for/omp_for_schedule_static.c154
-rw-r--r--final/runtime/test/worksharing/for/omp_for_schedule_static_3.c202
-rw-r--r--final/runtime/test/worksharing/for/omp_parallel_for_firstprivate.c35
-rw-r--r--final/runtime/test/worksharing/for/omp_parallel_for_if.c42
-rw-r--r--final/runtime/test/worksharing/for/omp_parallel_for_lastprivate.c37
-rw-r--r--final/runtime/test/worksharing/for/omp_parallel_for_ordered.c64
-rw-r--r--final/runtime/test/worksharing/for/omp_parallel_for_private.c50
-rw-r--r--final/runtime/test/worksharing/for/omp_parallel_for_reduction.c266
-rw-r--r--final/runtime/test/worksharing/sections/omp_parallel_sections_firstprivate.c54
-rw-r--r--final/runtime/test/worksharing/sections/omp_parallel_sections_lastprivate.c71
-rw-r--r--final/runtime/test/worksharing/sections/omp_parallel_sections_private.c64
-rw-r--r--final/runtime/test/worksharing/sections/omp_parallel_sections_reduction.c508
-rw-r--r--final/runtime/test/worksharing/sections/omp_section_firstprivate.c55
-rw-r--r--final/runtime/test/worksharing/sections/omp_section_lastprivate.c76
-rw-r--r--final/runtime/test/worksharing/sections/omp_section_private.c66
-rw-r--r--final/runtime/test/worksharing/sections/omp_sections_nowait.c104
-rw-r--r--final/runtime/test/worksharing/sections/omp_sections_reduction.c543
-rw-r--r--final/runtime/test/worksharing/single/omp_single.c44
-rw-r--r--final/runtime/test/worksharing/single/omp_single_copyprivate.c60
-rw-r--r--final/runtime/test/worksharing/single/omp_single_nowait.c73
-rw-r--r--final/runtime/test/worksharing/single/omp_single_private.c57
-rwxr-xr-xfinal/runtime/tools/check-depends.pl506
-rwxr-xr-xfinal/runtime/tools/check-execstack.pl146
-rwxr-xr-xfinal/runtime/tools/check-instruction-set.pl321
-rwxr-xr-xfinal/runtime/tools/generate-def.pl321
-rw-r--r--final/runtime/tools/lib/Build.pm264
-rw-r--r--final/runtime/tools/lib/LibOMP.pm85
-rw-r--r--final/runtime/tools/lib/Platform.pm484
-rw-r--r--final/runtime/tools/lib/Uname.pm639
-rw-r--r--final/runtime/tools/lib/tools.pm1981
-rwxr-xr-xfinal/runtime/tools/message-converter.pl775
-rw-r--r--final/runtime/tools/summarizeStats.py323
-rw-r--r--final/www/README.txt116
-rw-r--r--final/www/Reference.pdf13882
-rw-r--r--final/www/content.css27
-rw-r--r--final/www/index.html235
-rw-r--r--final/www/menu.css39
389 files changed, 163903 insertions, 0 deletions
diff --git a/final/.arcconfig b/final/.arcconfig
new file mode 100644
index 0000000..bd06ac8
--- /dev/null
+++ b/final/.arcconfig
@@ -0,0 +1,4 @@
+{
+ "repository.callsign" : "OMP",
+ "conduit_uri" : "https://reviews.llvm.org/"
+}
diff --git a/final/.gitignore b/final/.gitignore
new file mode 100644
index 0000000..d4bec15
--- /dev/null
+++ b/final/.gitignore
@@ -0,0 +1,42 @@
+#==============================================================================#
+# This file specifies intentionally untracked files that git should ignore.
+# See: http://www.kernel.org/pub/software/scm/git/docs/gitignore.html
+#
+# This file is intentionally different from the output of `git svn show-ignore`,
+# as most of those are useless.
+#==============================================================================#
+
+#==============================================================================#
+# File extensions to be ignored anywhere in the tree.
+#==============================================================================#
+# Temp files created by most text editors.
+*~
+# Merge files created by git.
+*.orig
+# Byte compiled python modules.
+*.pyc
+# vim swap files
+.*.sw?
+.sw?
+#OS X specific files.
+.DS_store
+
+#==============================================================================#
+# Explicit files to ignore (only matches one).
+#==============================================================================#
+# Various tag programs
+tags
+/TAGS
+/GPATH
+/GRTAGS
+/GSYMS
+/GTAGS
+.gitusers
+
+#==============================================================================#
+# Directories to ignore (do not add trailing '/'s, they skip symlinks).
+#==============================================================================#
+runtime/exports
+
+# Nested build directory
+/build
diff --git a/final/CMakeLists.txt b/final/CMakeLists.txt
new file mode 100644
index 0000000..597eedc
--- /dev/null
+++ b/final/CMakeLists.txt
@@ -0,0 +1,83 @@
+cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
+
+# Add cmake directory to search for custom cmake functions.
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
+
+# llvm/runtimes/ will set OPENMP_STANDALONE_BUILD.
+if (OPENMP_STANDALONE_BUILD OR "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
+ set(OPENMP_STANDALONE_BUILD TRUE)
+ project(openmp C CXX)
+
+ # CMAKE_BUILD_TYPE was not set, default to Release.
+ if (NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE Release)
+ endif()
+
+ # Group common settings.
+ set(OPENMP_ENABLE_WERROR FALSE CACHE BOOL
+ "Enable -Werror flags to turn warnings into errors for supporting compilers.")
+ set(OPENMP_LIBDIR_SUFFIX "" CACHE STRING
+ "Suffix of lib installation directory, e.g. 64 => lib64")
+ # Do not use OPENMP_LIBDIR_SUFFIX directly, use OPENMP_INSTALL_LIBDIR.
+ set(OPENMP_INSTALL_LIBDIR "lib${OPENMP_LIBDIR_SUFFIX}")
+
+ # Group test settings.
+ set(OPENMP_TEST_C_COMPILER ${CMAKE_C_COMPILER} CACHE STRING
+ "C compiler to use for testing OpenMP runtime libraries.")
+ set(OPENMP_TEST_CXX_COMPILER ${CMAKE_CXX_COMPILER} CACHE STRING
+ "C++ compiler to use for testing OpenMP runtime libraries.")
+ set(OPENMP_LLVM_TOOLS_DIR "" CACHE PATH "Path to LLVM tools for testing.")
+else()
+ set(OPENMP_ENABLE_WERROR ${LLVM_ENABLE_WERROR})
+ # If building in tree, we honor the same install suffix LLVM uses.
+ set(OPENMP_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}")
+
+ if (NOT MSVC)
+ set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang)
+ set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++)
+ else()
+ set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang.exe)
+ set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++.exe)
+ endif()
+endif()
+
+# Check and set up common compiler flags.
+include(config-ix)
+include(HandleOpenMPOptions)
+
+# Set up testing infrastructure.
+include(OpenMPTesting)
+
+set(OPENMP_TEST_FLAGS "" CACHE STRING
+ "Extra compiler flags to send to the test compiler.")
+set(OPENMP_TEST_OPENMP_FLAGS ${OPENMP_TEST_COMPILER_OPENMP_FLAGS} CACHE STRING
+ "OpenMP compiler flag to use for testing OpenMP runtime libraries.")
+
+
+# Build host runtime library.
+add_subdirectory(runtime)
+
+
+set(ENABLE_LIBOMPTARGET ON)
+# Currently libomptarget cannot be compiled on Windows or MacOS X.
+# Since the device plugins are only supported on Linux anyway,
+# there is no point in trying to compile libomptarget on other OSes.
+if (APPLE OR WIN32 OR NOT OPENMP_HAVE_STD_CPP11_FLAG)
+ set(ENABLE_LIBOMPTARGET OFF)
+endif()
+
+option(OPENMP_ENABLE_LIBOMPTARGET "Enable building libomptarget for offloading."
+ ${ENABLE_LIBOMPTARGET})
+if (OPENMP_ENABLE_LIBOMPTARGET)
+  # Check that the library can actually be built.
+ if (APPLE OR WIN32)
+ message(FATAL_ERROR "libomptarget cannot be built on Windows and MacOS X!")
+ elseif (NOT OPENMP_HAVE_STD_CPP11_FLAG)
+ message(FATAL_ERROR "Host compiler must support C++11 to build libomptarget!")
+ endif()
+
+ add_subdirectory(libomptarget)
+endif()
+
+# Now that we have seen all testsuites, create the check-openmp target.
+construct_check_openmp_target()
diff --git a/final/CREDITS.txt b/final/CREDITS.txt
new file mode 100644
index 0000000..b14bb9a
--- /dev/null
+++ b/final/CREDITS.txt
@@ -0,0 +1,61 @@
+This file is a partial list of people who have contributed to the LLVM/openmp
+project. If you have contributed a patch or made some other contribution to
+LLVM/openmp, please submit a patch to this file to add yourself, and it will be
+done!
+
+The list is sorted by surname and formatted to allow easy grepping and
+beautification by scripts. The fields are: name (N), email (E), web-address
+(W), PGP key ID and fingerprint (P), description (D), and snail-mail address
+(S).
+
+N: Adam Azarchs
+W: 10xgenomics.com
+D: Bug fix for lock code
+
+N: Carlo Bertolli
+W: http://ibm.com
+D: IBM contributor to PowerPC support in CMake files and elsewhere.
+
+N: Diego Caballero
+E: diego.l.caballero@gmail.com
+D: Fork performance improvements
+
+N: Sunita Chandrasekaran
+D: Contributor to testsuite from OpenUH
+
+N: Barbara Chapman
+D: Contributor to testsuite from OpenUH
+
+N: University of Houston
+W: http://web.cs.uh.edu/~openuh/download/
+D: OpenUH test suite
+
+N: Intel Corporation OpenMP runtime team
+W: http://openmprtl.org
+D: Created the runtime.
+
+N: John Mellor-Crummey and other members of the OpenMP Tools Working Group
+E: johnmc@rice.edu
+D: OpenMP Tools Interface (OMPT)
+
+N: Matthias Muller
+D: Contributor to testsuite from OpenUH
+
+N: Tal Nevo
+E: tal@scalemp.com
+D: ScaleMP contributor to improve runtime performance there.
+W: http://scalemp.com
+
+N: Pavel Neytchev
+D: Contributor to testsuite from OpenUH
+
+N: Steven Noonan
+E: steven@uplinklabs.net
+D: Patches for the ARM architecture and removal of several inconsistencies.
+
+N: Alp Toker
+E: alp@nuanti.com
+D: Making build work for FreeBSD.
+
+N: Cheng Wang
+D: Contributor to testsuite from OpenUH
diff --git a/final/LICENSE.txt b/final/LICENSE.txt
new file mode 100644
index 0000000..d858552
--- /dev/null
+++ b/final/LICENSE.txt
@@ -0,0 +1,174 @@
+==============================================================================
+
+The software contained in this directory tree is dual licensed under both the
+University of Illinois "BSD-Like" license and the MIT license. As a user of
+this code you may choose to use it under either license. As a contributor,
+you agree to allow your code to be used under both. The full text of the
+relevant licenses is included below.
+
+In addition, a license agreement from the copyright/patent holders of the
+software contained in this directory tree is included below.
+
+==============================================================================
+
+University of Illinois/NCSA
+Open Source License
+
+Copyright (c) 1997-2016 Intel Corporation
+
+All rights reserved.
+
+Developed by:
+ OpenMP Runtime Team
+ Intel Corporation
+ http://www.openmprtl.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal with
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimers.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimers in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the names of Intel Corporation OpenMP Runtime Team nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this Software without specific prior written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+SOFTWARE.
+
+==============================================================================
+
+Copyright (c) 1997-2016 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+==============================================================================
+
+Intel Corporation
+
+Software Grant License Agreement ("Agreement")
+
+Except for the license granted herein to you, Intel Corporation ("Intel") reserves
+all right, title, and interest in and to the Software (defined below).
+
+Definition
+
+"Software" means the code and documentation as well as any original work of
+authorship, including any modifications or additions to an existing work, that
+is intentionally submitted by Intel to llvm.org (http://llvm.org) ("LLVM") for
+inclusion in, or documentation of, any of the products owned or managed by LLVM
+(the "Work"). For the purposes of this definition, "submitted" means any form of
+electronic, verbal, or written communication sent to LLVM or its
+representatives, including but not limited to communication on electronic
+mailing lists, source code control systems, and issue tracking systems that are
+managed by, or on behalf of, LLVM for the purpose of discussing and improving
+the Work, but excluding communication that is conspicuously marked otherwise.
+
+1. Grant of Copyright License. Subject to the terms and conditions of this
+ Agreement, Intel hereby grants to you and to recipients of the Software
+ distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge,
+ royalty-free, irrevocable copyright license to reproduce, prepare derivative
+ works of, publicly display, publicly perform, sublicense, and distribute the
+ Software and such derivative works.
+
+2. Grant of Patent License. Subject to the terms and conditions of this
+ Agreement, Intel hereby grants you and to recipients of the Software
+ distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge,
+ royalty-free, irrevocable (except as stated in this section) patent license
+ to make, have made, use, offer to sell, sell, import, and otherwise transfer
+ the Work, where such license applies only to those patent claims licensable
+ by Intel that are necessarily infringed by Intel's Software alone or by
+ combination of the Software with the Work to which such Software was
+ submitted. If any entity institutes patent litigation against Intel or any
+ other entity (including a cross-claim or counterclaim in a lawsuit) alleging
+ that Intel's Software, or the Work to which Intel has contributed constitutes
+ direct or contributory patent infringement, then any patent licenses granted
+ to that entity under this Agreement for the Software or Work shall terminate
+ as of the date such litigation is filed.
+
+Unless required by applicable law or agreed to in writing, the software is
+provided on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+either express or implied, including, without limitation, any warranties or
+conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+PARTICULAR PURPOSE.
+
+==============================================================================
+
+ARM Limited
+
+Software Grant License Agreement ("Agreement")
+
+Except for the license granted herein to you, ARM Limited ("ARM") reserves all
+right, title, and interest in and to the Software (defined below).
+
+Definition
+
+"Software" means the code and documentation as well as any original work of
+authorship, including any modifications or additions to an existing work, that
+is intentionally submitted by ARM to llvm.org (http://llvm.org) ("LLVM") for
+inclusion in, or documentation of, any of the products owned or managed by LLVM
+(the "Work"). For the purposes of this definition, "submitted" means any form of
+electronic, verbal, or written communication sent to LLVM or its
+representatives, including but not limited to communication on electronic
+mailing lists, source code control systems, and issue tracking systems that are
+managed by, or on behalf of, LLVM for the purpose of discussing and improving
+the Work, but excluding communication that is conspicuously marked otherwise.
+
+1. Grant of Copyright License. Subject to the terms and conditions of this
+ Agreement, ARM hereby grants to you and to recipients of the Software
+ distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge,
+ royalty-free, irrevocable copyright license to reproduce, prepare derivative
+ works of, publicly display, publicly perform, sublicense, and distribute the
+ Software and such derivative works.
+
+2. Grant of Patent License. Subject to the terms and conditions of this
+ Agreement, ARM hereby grants you and to recipients of the Software
+ distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge,
+ royalty-free, irrevocable (except as stated in this section) patent license
+ to make, have made, use, offer to sell, sell, import, and otherwise transfer
+ the Work, where such license applies only to those patent claims licensable
+ by ARM that are necessarily infringed by ARM's Software alone or by
+ combination of the Software with the Work to which such Software was
+ submitted. If any entity institutes patent litigation against ARM or any
+ other entity (including a cross-claim or counterclaim in a lawsuit) alleging
+ that ARM's Software, or the Work to which ARM has contributed constitutes
+ direct or contributory patent infringement, then any patent licenses granted
+ to that entity under this Agreement for the Software or Work shall terminate
+ as of the date such litigation is filed.
+
+Unless required by applicable law or agreed to in writing, the software is
+provided on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+either express or implied, including, without limitation, any warranties or
+conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+PARTICULAR PURPOSE.
+
+==============================================================================
diff --git a/final/README.rst b/final/README.rst
new file mode 100644
index 0000000..63afb70
--- /dev/null
+++ b/final/README.rst
@@ -0,0 +1,343 @@
+========================================
+How to Build the LLVM* OpenMP* Libraries
+========================================
+This repository requires `CMake <http://www.cmake.org/>`_ v2.8.0 or later. LLVM
+and Clang need a more recent version which also applies for in-tree builds. For
+more information than available in this document please see
+`LLVM's CMake documentation <http://llvm.org/docs/CMake.html>`_ and the
+`official documentation <https://cmake.org/cmake/help/v2.8.0/cmake.html>`_.
+
+.. contents::
+ :local:
+
+How to Call CMake Initially, then Repeatedly
+============================================
+- When calling CMake for the first time, all needed compiler options must be
+ specified on the command line. After this initial call to CMake, the compiler
+ definitions must not be included for further calls to CMake. Other options
+ can be specified on the command line multiple times including all definitions
+ in the build options section below.
+- Example of configuring, building, reconfiguring, rebuilding:
+
+ .. code-block:: console
+
+ $ mkdir build
+ $ cd build
+ $ cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ .. # Initial configuration
+ $ make
+ ...
+ $ make clean
+ $ cmake -DCMAKE_BUILD_TYPE=Debug .. # Second configuration
+ $ make
+ ...
+ $ rm -rf *
+ $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ .. # Third configuration
+ $ make
+
+- Notice in the example how the compiler definitions are only specified for an
+ empty build directory, but other build options are used at any time.
+- The file ``CMakeCache.txt`` which is created after the first call to CMake is
+ a configuration file which holds all values for the build options. These
+ values can be changed using a text editor to modify ``CMakeCache.txt`` as
+ opposed to using definitions on the command line.
+- To have CMake create a particular type of build generator file simply include
+ the ``-G <Generator name>`` option:
+
+ .. code-block:: console
+
+ $ cmake -G "Unix Makefiles" ...
+
+ You can see a list of generators CMake supports by executing the cmake command
+ with no arguments.
+
+Instructions to Build
+=====================
+.. code-block:: console
+
+ $ cd openmp_top_level/ [ this directory with libomptarget/, runtime/, etc. ]
+ $ mkdir build
+ $ cd build
+
+ [ Unix* Libraries ]
+ $ cmake -DCMAKE_C_COMPILER=<C Compiler> -DCMAKE_CXX_COMPILER=<C++ Compiler> ..
+
+ [ Windows* Libraries ]
+ $ cmake -G <Generator Type> -DCMAKE_C_COMPILER=<C Compiler> -DCMAKE_CXX_COMPILER=<C++ Compiler> -DCMAKE_ASM_MASM_COMPILER=[ml | ml64] -DCMAKE_BUILD_TYPE=Release ..
+
+ $ make
+ $ make install
+
+CMake Options
+=============
+Builds with CMake can be customized by means of options as already seen above.
+One possibility is to pass them via the command line:
+
+.. code-block:: console
+
+ $ cmake -DOPTION=<value> path/to/source
+
+.. note:: The first value listed is the respective default for that option.
+
+Generic Options
+---------------
+For full documentation consult the CMake manual or execute
+``cmake --help-variable VARIABLE_NAME`` to get information about a specific
+variable.
+
+**CMAKE_BUILD_TYPE** = ``Release|Debug|RelWithDebInfo``
+ Build type can be ``Release``, ``Debug``, or ``RelWithDebInfo`` which chooses
+ the optimization level and presence of debugging symbols.
+
+**CMAKE_C_COMPILER** = <C compiler name>
+ Specify the C compiler.
+
+**CMAKE_CXX_COMPILER** = <C++ compiler name>
+ Specify the C++ compiler.
+
+**CMAKE_Fortran_COMPILER** = <Fortran compiler name>
+ Specify the Fortran compiler. This option is only needed when
+ **LIBOMP_FORTRAN_MODULES** is ``ON`` (see below). So typically, a Fortran
+ compiler is not needed during the build.
+
+**CMAKE_ASM_MASM_COMPILER** = ``ml|ml64``
+ This option is only relevant for Windows*.
+
+Options for all Libraries
+-------------------------
+
+**OPENMP_ENABLE_WERROR** = ``OFF|ON``
+ Treat warnings as errors and fail, if a compiler warning is triggered.
+
+**OPENMP_LIBDIR_SUFFIX** = ``""``
+ Extra suffix to append to the directory where libraries are to be installed.
+
+**OPENMP_TEST_C_COMPILER** = ``${CMAKE_C_COMPILER}``
+ Compiler to use for testing. Defaults to the compiler that was also used for
+ building.
+
+**OPENMP_TEST_CXX_COMPILER** = ``${CMAKE_CXX_COMPILER}``
+ Compiler to use for testing. Defaults to the compiler that was also used for
+ building.
+
+**OPENMP_LLVM_TOOLS_DIR** = ``/path/to/built/llvm/tools``
+ Additional path to search for LLVM tools needed by tests.
+
+**OPENMP_LLVM_LIT_EXECUTABLE** = ``/path/to/llvm-lit``
+ Specify full path to ``llvm-lit`` executable for running tests. The default
+ is to search the ``PATH`` and the directory in **OPENMP_LLVM_TOOLS_DIR**.
+
+**OPENMP_FILECHECK_EXECUTABLE** = ``/path/to/FileCheck``
+ Specify full path to ``FileCheck`` executable for running tests. The default
+ is to search the ``PATH`` and the directory in **OPENMP_LLVM_TOOLS_DIR**.
+
+Options for ``libomp``
+----------------------
+
+**LIBOMP_ARCH** = ``aarch64|arm|i386|mic|mips|mips64|ppc64|ppc64le|x86_64``
+ The default value for this option is chosen based on probing the compiler for
+ architecture macros (e.g., is ``__x86_64__`` predefined by compiler?).
+
+**LIBOMP_MIC_ARCH** = ``knc|knf``
+ Intel(R) Many Integrated Core Architecture (Intel(R) MIC Architecture) to
+ build for. This value is ignored if **LIBOMP_ARCH** does not equal ``mic``.
+
+**LIBOMP_OMP_VERSION** = ``50|45|40|30``
+ OpenMP version to build for. Older versions will disable certain
+ functionality and entry points.
+
+**LIBOMP_LIB_TYPE** = ``normal|profile|stubs``
+ Library type can be ``normal``, ``profile``, or ``stubs``.
+
+**LIBOMP_USE_VERSION_SYMBOLS** = ``ON|OFF``
+ Use versioned symbols for building the library. This option only makes sense
+ for ELF based libraries where version symbols are supported (Linux*, some BSD*
+ variants). It is ``OFF`` by default for Windows* and macOS*, but ``ON`` for
+ other Unix based operating systems.
+
+**LIBOMP_ENABLE_SHARED** = ``ON|OFF``
+ Build a shared library. If this option is ``OFF``, static OpenMP libraries
+ will be built instead of dynamic ones.
+
+ .. note::
+
+ Static libraries are not supported on Windows*.
+
+**LIBOMP_FORTRAN_MODULES** = ``OFF|ON``
+ Create the Fortran modules (requires Fortran compiler).
+
+macOS* Fat Libraries
+""""""""""""""""""""
+On macOS* machines, it is possible to build universal (or fat) libraries which
+include both i386 and x86_64 architecture objects in a single archive.
+
+.. code-block:: console
+
+ $ cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_OSX_ARCHITECTURES='i386;x86_64' ..
+ $ make
+
+There is also an option **LIBOMP_OSX_ARCHITECTURES** which can be set in case
+this is an LLVM source tree build. It will only apply to the ``libomp`` library,
+which avoids having the entire LLVM/Clang build produce universal binaries.
+
+Optional Features
+"""""""""""""""""
+
+**LIBOMP_USE_ADAPTIVE_LOCKS** = ``ON|OFF``
+ Include adaptive locks, based on Intel(R) Transactional Synchronization
+ Extensions (Intel(R) TSX). This feature is x86 specific and turned ``ON``
+ by default for IA-32 architecture and Intel(R) 64 architecture.
+
+**LIBOMP_USE_INTERNODE_ALIGNMENT** = ``OFF|ON``
+ Align certain data structures on 4096-byte. This option is useful on
+ multi-node systems where a small ``CACHE_LINE`` setting leads to false sharing.
+
+**LIBOMP_OMPT_SUPPORT** = ``ON|OFF``
+ Include support for the OpenMP Tools Interface (OMPT).
+ This option is supported and ``ON`` by default for x86, x86_64, AArch64, and
+ PPC64 on Linux* and macOS*.
+ This option is ``OFF`` if this feature is not supported for the platform.
+
+**LIBOMP_OMPT_OPTIONAL** = ``ON|OFF``
+ Include support for optional OMPT functionality. This option is ignored if
+ **LIBOMP_OMPT_SUPPORT** is ``OFF``.
+
+**LIBOMP_STATS** = ``OFF|ON``
+ Include stats-gathering code.
+
+**LIBOMP_USE_DEBUGGER** = ``OFF|ON``
+ Include the friendly debugger interface.
+
+**LIBOMP_USE_HWLOC** = ``OFF|ON``
+ Use `OpenMPI's hwloc library <https://www.open-mpi.org/projects/hwloc/>`_ for
+ topology detection and affinity.
+
+**LIBOMP_HWLOC_INSTALL_DIR** = ``/path/to/hwloc/install/dir``
+ Specify install location of hwloc. The configuration system will look for
+ ``hwloc.h`` in ``${LIBOMP_HWLOC_INSTALL_DIR}/include`` and the library in
+ ``${LIBOMP_HWLOC_INSTALL_DIR}/lib``. The default is ``/usr/local``.
+ This option is only used if **LIBOMP_USE_HWLOC** is ``ON``.
+
+Additional Compiler Flags
+"""""""""""""""""""""""""
+
+These flags are **appended**, they do not overwrite any of the preset flags.
+
+**LIBOMP_CPPFLAGS** = <space-separated flags>
+ Additional C preprocessor flags.
+
+**LIBOMP_CFLAGS** = <space-separated flags>
+ Additional C compiler flags.
+
+**LIBOMP_CXXFLAGS** = <space-separated flags>
+ Additional C++ compiler flags.
+
+**LIBOMP_ASMFLAGS** = <space-separated flags>
+ Additional assembler flags.
+
+**LIBOMP_LDFLAGS** = <space-separated flags>
+ Additional linker flags.
+
+**LIBOMP_LIBFLAGS** = <space-separated flags>
+ Additional libraries to link.
+
+**LIBOMP_FFLAGS** = <space-separated flags>
+ Additional Fortran compiler flags.
+
+Options for ``libomptarget``
+----------------------------
+
+**LIBOMPTARGET_OPENMP_HEADER_FOLDER** = ``""``
+ Path of the folder that contains ``omp.h``. This is required for testing
+ out-of-tree builds.
+
+**LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER** = ``""``
+ Path of the folder that contains ``libomp.so``. This is required for testing
+ out-of-tree builds.
+
+Options for ``NVPTX device RTL``
+--------------------------------
+
+**LIBOMPTARGET_NVPTX_ENABLE_BCLIB** = ``ON|OFF``
+ Enable CUDA LLVM bitcode offloading device RTL. This is used for link time
+ optimization of the OMP runtime and application code. This option is enabled
+ by default if the build system determines that `CMAKE_C_COMPILER` is able to
+ compile and link the library.
+
+**LIBOMPTARGET_NVPTX_CUDA_COMPILER** = ``""``
+ Location of a CUDA compiler capable of emitting LLVM bitcode. Currently only
+ the Clang compiler is supported. This is only used when building the CUDA LLVM
+ bitcode offloading device RTL. If unspecified and the CMake C compiler is
+ Clang, then Clang is used.
+
+**LIBOMPTARGET_NVPTX_BC_LINKER** = ``""``
+ Location of a linker capable of linking LLVM bitcode objects. This is only
+ used when building the CUDA LLVM bitcode offloading device RTL. If unspecified
+ and the CMake C compiler is Clang and there exists a llvm-link binary in the
+ directory containing Clang, then this llvm-link binary is used.
+
+**LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER** = ``""``
+ Host compiler to use with NVCC. This compiler is not going to be used to
+ produce any binary. Instead, this is used to overcome the input compiler
+ checks done by NVCC. E.g. if using a default host compiler that is not
+ compatible with NVCC, this option can be use to pass to NVCC a valid compiler
+ to avoid the error.
+
+**LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES** = ``35``
+ List of CUDA compute capabilities that should be supported by the NVPTX
+ device RTL. E.g. for compute capabilities 6.0 and 7.0, the option "60,70"
+ should be used. Compute capability 3.5 is the minimum required.
+
+**LIBOMPTARGET_NVPTX_DEBUG** = ``OFF|ON``
+ Enable printing of debug messages from the NVPTX device RTL.
+
+Example Usages of CMake
+=======================
+
+Typical Invocations
+-------------------
+
+.. code-block:: console
+
+ $ cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..
+ $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ ..
+ $ cmake -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc ..
+
+Advanced Builds with Various Options
+------------------------------------
+
+- Build the i386 Linux* library using GCC*
+
+ .. code-block:: console
+
+ $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DLIBOMP_ARCH=i386 ..
+
+- Build the x86_64 debug Mac library using Clang*
+
+ .. code-block:: console
+
+ $ cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DLIBOMP_ARCH=x86_64 -DCMAKE_BUILD_TYPE=Debug ..
+
+- Build the library (architecture determined by probing compiler) using the
+ Intel(R) C Compiler and the Intel(R) C++ Compiler. Also, create Fortran
+ modules with the Intel(R) Fortran Compiler.
+
+ .. code-block:: console
+
+ $ cmake -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DCMAKE_Fortran_COMPILER=ifort -DLIBOMP_FORTRAN_MODULES=on ..
+
+- Have CMake find the C/C++ compiler and specify additional flags for the C
+ compiler, preprocessor, and C++ compiler.
+
+ .. code-block:: console
+
+ $ cmake -DLIBOMP_CFLAGS='-specific-flag' -DLIBOMP_CPPFLAGS='-DNEW_FEATURE=1 -DOLD_FEATURE=0' -DLIBOMP_CXXFLAGS='--one-specific-flag --two-specific-flag' ..
+
+- Build the stubs library
+
+ .. code-block:: console
+
+ $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DLIBOMP_LIB_TYPE=stubs ..
+
+**Footnotes**
+
+.. [*] Other names and brands may be claimed as the property of others.
diff --git a/final/cmake/DetectTestCompiler/CMakeLists.txt b/final/cmake/DetectTestCompiler/CMakeLists.txt
new file mode 100644
index 0000000..c2f408c
--- /dev/null
+++ b/final/cmake/DetectTestCompiler/CMakeLists.txt
@@ -0,0 +1,39 @@
+cmake_minimum_required(VERSION 2.8)
+project(DetectTestCompiler C CXX)
+
+include(CheckCCompilerFlag)
+include(CheckCXXCompilerFlag)
+
+# Record the probed ${lang} compiler into ${lang}CompilerInformation.txt as
+# four semicolon-separated fields: path;id;version;flags. The parent build
+# reads the file back and indexes the fields with list(GET) (see
+# extract_test_compiler_information in OpenMPTesting.cmake), so the
+# separators are written escaped to survive as literal `;` in the file.
+function(write_compiler_information lang)
+  set(information "${CMAKE_${lang}_COMPILER}")
+  set(information "${information}\\;${CMAKE_${lang}_COMPILER_ID}")
+  set(information "${information}\\;${CMAKE_${lang}_COMPILER_VERSION}")
+  set(information "${information}\\;${${lang}_FLAGS}")
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${lang}CompilerInformation.txt ${information})
+endfunction(write_compiler_information)
+
+# Detect the compiler's OpenMP flags; fall back to plain -fopenmp when
+# detection fails.
+# NOTE(fix): FindOpenMP reports OPENMP_FOUND — the original tested
+# `OpenMP_Found`, a variable that is never defined, so the fallback
+# unconditionally replaced the detected flags.
+find_package(OpenMP)
+if (NOT OPENMP_FOUND)
+  set(OpenMP_C_FLAGS "-fopenmp")
+  set(OpenMP_CXX_FLAGS "-fopenmp")
+endif()
+
+# ${flags} is not passed by the execute_process() invocation in
+# OpenMPTesting.cmake, so it normally expands to nothing — TODO confirm.
+set(C_FLAGS ${flags} ${OpenMP_C_FLAGS})
+set(CXX_FLAGS ${flags} ${OpenMP_CXX_FLAGS})
+
+# TODO: Implement blockaddress in GlobalISel and remove this flag!
+# Clang only: probe for -fno-experimental-isel and, for each language whose
+# compiler accepts it, prepend the flag to the recorded flag set.
+if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+  check_c_compiler_flag("-fno-experimental-isel" C_HAS_EXPERIMENTAL_ISEL_FLAG)
+  check_cxx_compiler_flag("-fno-experimental-isel" CXX_HAS_EXPERIMENTAL_ISEL_FLAG)
+
+  foreach(isel_lang C CXX)
+    if (${isel_lang}_HAS_EXPERIMENTAL_ISEL_FLAG)
+      set(${isel_lang}_FLAGS "-fno-experimental-isel ${${isel_lang}_FLAGS}")
+    endif()
+  endforeach()
+endif()
+
+# Persist the probed information; the driving configure step in
+# OpenMPTesting.cmake reads these files back after this sub-project finishes.
+write_compiler_information(C)
+write_compiler_information(CXX)
diff --git a/final/cmake/HandleOpenMPOptions.cmake b/final/cmake/HandleOpenMPOptions.cmake
new file mode 100644
index 0000000..5e5215d
--- /dev/null
+++ b/final/cmake/HandleOpenMPOptions.cmake
@@ -0,0 +1,16 @@
+if (OPENMP_STANDALONE_BUILD) # test the name, not ${...}: if() errors on an empty expansion
+  # From HandleLLVMOptions.cmake
+  # append_if(<condition-var> <value> <flag-vars...>): append <value> to each
+  # named variable in the caller's scope when <condition-var> is true
+  # (typically the result of a check_*_compiler_flag probe).
+  function(append_if condition value)
+    if (${condition})
+      foreach(variable ${ARGN})
+        set(${variable} "${${variable}} ${value}" PARENT_SCOPE)
+      endforeach()
+    endif()
+  endfunction()
+endif()
+
+# Only add -Werror when the compiler supports it; OPENMP_HAVE_WERROR_FLAG is
+# probed in config-ix.cmake.
+if (OPENMP_ENABLE_WERROR) # test the name, not ${...}: if() errors on an empty expansion
+  append_if(OPENMP_HAVE_WERROR_FLAG "-Werror" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+endif()
+
+append_if(OPENMP_HAVE_STD_CPP11_FLAG "-std=c++11" CMAKE_CXX_FLAGS) \ No newline at end of file
diff --git a/final/cmake/OpenMPTesting.cmake b/final/cmake/OpenMPTesting.cmake
new file mode 100644
index 0000000..1514d99
--- /dev/null
+++ b/final/cmake/OpenMPTesting.cmake
@@ -0,0 +1,181 @@
+# Keep track if we have all dependencies.
+set(ENABLE_CHECK_TARGETS TRUE)
+
+# Function to find required dependencies for testing.
+# Locates the tools a standalone build needs to run the testsuites: a Python
+# interpreter, llvm-lit, and FileCheck. On any miss it warns and sets
+# ENABLE_CHECK_TARGETS to FALSE in the caller's scope, which turns the check
+# targets into no-op stubs (see add_openmp_testsuite).
+function(find_standalone_test_dependencies)
+  include(FindPythonInterp)
+
+  if (NOT PYTHONINTERP_FOUND)
+    message(STATUS "Could not find Python.")
+    message(WARNING "The check targets will not be available!")
+    set(ENABLE_CHECK_TARGETS FALSE PARENT_SCOPE)
+    return()
+  endif()
+
+  # Find executables.
+  # find_program() results are cache variables, so user-provided
+  # OPENMP_LLVM_LIT_EXECUTABLE / OPENMP_FILECHECK_EXECUTABLE values take
+  # precedence over the PATH / OPENMP_LLVM_TOOLS_DIR search.
+  find_program(OPENMP_LLVM_LIT_EXECUTABLE
+    NAMES llvm-lit lit.py lit
+    PATHS ${OPENMP_LLVM_TOOLS_DIR})
+  if (NOT OPENMP_LLVM_LIT_EXECUTABLE)
+    message(STATUS "Cannot find llvm-lit.")
+    message(STATUS "Please put llvm-lit in your PATH, set OPENMP_LLVM_LIT_EXECUTABLE to its full path, or point OPENMP_LLVM_TOOLS_DIR to its directory.")
+    message(WARNING "The check targets will not be available!")
+    set(ENABLE_CHECK_TARGETS FALSE PARENT_SCOPE)
+    return()
+  endif()
+
+  find_program(OPENMP_FILECHECK_EXECUTABLE
+    NAMES FileCheck
+    PATHS ${OPENMP_LLVM_TOOLS_DIR})
+  if (NOT OPENMP_FILECHECK_EXECUTABLE)
+    message(STATUS "Cannot find FileCheck.")
+    message(STATUS "Please put FileCheck in your PATH, set OPENMP_FILECHECK_EXECUTABLE to its full path, or point OPENMP_LLVM_TOOLS_DIR to its directory.")
+    message(WARNING "The check targets will not be available!")
+    set(ENABLE_CHECK_TARGETS FALSE PARENT_SCOPE)
+    return()
+  endif()
+endfunction()
+
+if (${OPENMP_STANDALONE_BUILD})
+  find_standalone_test_dependencies()
+
+  # Make sure we can use the console pool for recent CMake and Ninja > 1.5.
+  if (CMAKE_VERSION VERSION_LESS 3.1.20141117)
+    set(cmake_3_2_USES_TERMINAL)  # too old: expand to nothing below
+  else()
+    set(cmake_3_2_USES_TERMINAL USES_TERMINAL)
+  endif()
+
+  # Set lit arguments.
+  set(DEFAULT_LIT_ARGS "-sv --show-unsupported --show-xfail")
+  if (MSVC OR XCODE)
+    set(DEFAULT_LIT_ARGS "${DEFAULT_LIT_ARGS} --no-progress-bar")
+  endif()
+  set(OPENMP_LIT_ARGS "${DEFAULT_LIT_ARGS}" CACHE STRING "Options for lit.")
+  # Split the single space-separated string into a list for use as separate
+  # command-line arguments in add_openmp_testsuite().
+  separate_arguments(OPENMP_LIT_ARGS)
+else()
+  # In-tree build: FileCheck is produced by the enclosing LLVM build.
+  set(OPENMP_FILECHECK_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/FileCheck)
+endif()
+
+# Macro to extract information about compiler from file. (no own scope)
+# Deliberately a macro rather than a function: the OPENMP_TEST_<lang>_*
+# variables must land in the caller's scope. ${file} contains a
+# semicolon-separated record "path;id;version;openmp-flags" as written by
+# DetectTestCompiler's write_compiler_information().
+macro(extract_test_compiler_information lang file)
+  file(READ ${file} information)
+  list(GET information 0 path)
+  list(GET information 1 id)
+  list(GET information 2 version)
+  list(GET information 3 openmp_flags)
+
+  set(OPENMP_TEST_${lang}_COMPILER_PATH ${path})
+  set(OPENMP_TEST_${lang}_COMPILER_ID ${id})
+  set(OPENMP_TEST_${lang}_COMPILER_VERSION ${version})
+  set(OPENMP_TEST_${lang}_COMPILER_OPENMP_FLAGS ${openmp_flags})
+endmacro()
+
+# Function to set variables with information about the test compiler.
+# Parses the two files produced by the DetectTestCompiler sub-configure in
+# ${dir} and exports OPENMP_TEST_COMPILER_{ID,VERSION,OPENMP_FLAGS} plus the
+# major and major.minor version components to the caller's scope. If the C
+# and C++ test compilers do not match, the check targets are disabled instead.
+function(set_test_compiler_information dir)
+  extract_test_compiler_information(C ${dir}/CCompilerInformation.txt)
+  extract_test_compiler_information(CXX ${dir}/CXXCompilerInformation.txt)
+  if (NOT("${OPENMP_TEST_C_COMPILER_ID}" STREQUAL "${OPENMP_TEST_CXX_COMPILER_ID}" AND
+          "${OPENMP_TEST_C_COMPILER_VERSION}" STREQUAL "${OPENMP_TEST_CXX_COMPILER_VERSION}"))
+    message(STATUS "Test compilers for C and C++ don't match.")
+    message(WARNING "The check targets will not be available!")
+    set(ENABLE_CHECK_TARGETS FALSE PARENT_SCOPE)
+  else()
+    set(OPENMP_TEST_COMPILER_ID "${OPENMP_TEST_C_COMPILER_ID}" PARENT_SCOPE)
+    set(OPENMP_TEST_COMPILER_VERSION "${OPENMP_TEST_C_COMPILER_VERSION}" PARENT_SCOPE)
+    set(OPENMP_TEST_COMPILER_OPENMP_FLAGS "${OPENMP_TEST_C_COMPILER_OPENMP_FLAGS}" PARENT_SCOPE)
+
+    # Determine major version.
+    string(REGEX MATCH "[0-9]+" major "${OPENMP_TEST_C_COMPILER_VERSION}")  # first number, e.g. "7"
+    string(REGEX MATCH "[0-9]+\\.[0-9]+" majorminor "${OPENMP_TEST_C_COMPILER_VERSION}")  # e.g. "7.0"
+    set(OPENMP_TEST_COMPILER_VERSION_MAJOR "${major}" PARENT_SCOPE)
+    set(OPENMP_TEST_COMPILER_VERSION_MAJOR_MINOR "${majorminor}" PARENT_SCOPE)
+  endif()
+endfunction()
+
+if (${OPENMP_STANDALONE_BUILD})
+  # Detect compiler that should be used for testing.
+  # We cannot use ExternalProject_Add() because its configuration runs when this
+  # project is built which is too late for detecting the compiler...
+  file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/DetectTestCompiler)
+  # Configure the DetectTestCompiler mini-project with the *test* compilers;
+  # it writes {C,CXX}CompilerInformation.txt into its binary directory.
+  execute_process(
+    COMMAND ${CMAKE_COMMAND} ${CMAKE_CURRENT_LIST_DIR}/DetectTestCompiler
+      -DCMAKE_C_COMPILER=${OPENMP_TEST_C_COMPILER}
+      -DCMAKE_CXX_COMPILER=${OPENMP_TEST_CXX_COMPILER}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/DetectTestCompiler
+    OUTPUT_VARIABLE DETECT_COMPILER_OUT
+    ERROR_VARIABLE DETECT_COMPILER_ERR
+    RESULT_VARIABLE DETECT_COMPILER_RESULT)
+  if (DETECT_COMPILER_RESULT)  # non-zero exit status: the sub-configure failed
+    message(STATUS "Could not detect test compilers.")
+    message(WARNING "The check targets will not be available!")
+    set(ENABLE_CHECK_TARGETS FALSE)
+  else()
+    set_test_compiler_information(${CMAKE_CURRENT_BINARY_DIR}/DetectTestCompiler)
+  endif()
+else()
+  # Set the information that we know.
+  # In-tree build: the test compiler is the just-built Clang.
+  set(OPENMP_TEST_COMPILER_ID "Clang")
+  # Cannot use CLANG_VERSION because we are not guaranteed that this is already set.
+  set(OPENMP_TEST_COMPILER_VERSION "${LLVM_VERSION}")
+  set(OPENMP_TEST_COMPILER_VERSION_MAJOR "${LLVM_MAJOR_VERSION}")
+  set(OPENMP_TEST_COMPILER_VERSION_MAJOR_MINOR "${LLVM_MAJOR_VERSION}.${LLVM_MINOR_VERSION}")
+  # TODO: Implement blockaddress in GlobalISel and remove this flag!
+  set(OPENMP_TEST_COMPILER_OPENMP_FLAGS "-fopenmp -fno-experimental-isel")
+endif()
+
+# Compute the lit feature list for the test compiler: the short compiler
+# name plus name-major, name-major.minor and name-full-version variants,
+# formatted as a Python list literal for lit.cfg consumption.
+function(set_test_compiler_features)
+  # Default to the lowercased compiler ID; GNU and Intel use their
+  # conventional short names instead.
+  string(TOLOWER "${OPENMP_TEST_COMPILER_ID}" comp)
+  if ("${OPENMP_TEST_COMPILER_ID}" STREQUAL "GNU")
+    set(comp "gcc")
+  elseif ("${OPENMP_TEST_COMPILER_ID}" STREQUAL "Intel")
+    set(comp "icc")
+  endif()
+  set(OPENMP_TEST_COMPILER_FEATURES "['${comp}', '${comp}-${OPENMP_TEST_COMPILER_VERSION_MAJOR}', '${comp}-${OPENMP_TEST_COMPILER_VERSION_MAJOR_MINOR}', '${comp}-${OPENMP_TEST_COMPILER_VERSION}']" PARENT_SCOPE)
+endfunction()
+set_test_compiler_features()
+
+# Function to add a testsuite for an OpenMP runtime library.
+# Usage: add_openmp_testsuite(<target> <comment> <testsuite-paths...>
+#                             [DEPENDS <targets...>])
+# A caller may set the plain variable EXCLUDE_FROM_ALL beforehand to keep the
+# suite out of the aggregate check-openmp target (see
+# construct_check_openmp_target). If dependencies were not found, the target
+# becomes a no-op stub instead.
+function(add_openmp_testsuite target comment)
+  if (NOT ENABLE_CHECK_TARGETS)
+    # Still create the target so `make <target>` works; it only prints a note.
+    add_custom_target(${target}
+      COMMAND ${CMAKE_COMMAND} -E echo "${target} does nothing, dependencies not found.")
+    message(STATUS "${target} does nothing.")
+    return()
+  endif()
+
+  cmake_parse_arguments(ARG "" "" "DEPENDS" ${ARGN})
+  # EXCLUDE_FROM_ALL excludes the test ${target} out of check-openmp.
+  if (NOT EXCLUDE_FROM_ALL)
+    # Register the testsuites and depends for the check-openmp rule.
+    set_property(GLOBAL APPEND PROPERTY OPENMP_LIT_TESTSUITES ${ARG_UNPARSED_ARGUMENTS})
+    set_property(GLOBAL APPEND PROPERTY OPENMP_LIT_DEPENDS ${ARG_DEPENDS})
+  endif()
+
+  if (${OPENMP_STANDALONE_BUILD})
+    # Standalone: invoke lit directly through the detected Python.
+    add_custom_target(${target}
+      COMMAND ${PYTHON_EXECUTABLE} ${OPENMP_LLVM_LIT_EXECUTABLE} ${OPENMP_LIT_ARGS} ${ARG_UNPARSED_ARGUMENTS}
+      COMMENT ${comment}
+      DEPENDS ${ARG_DEPENDS}
+      ${cmake_3_2_USES_TERMINAL}
+    )
+  else()
+    # In-tree: reuse LLVM's lit machinery and depend on the fresh toolchain.
+    add_lit_testsuite(${target}
+      ${comment}
+      ${ARG_UNPARSED_ARGUMENTS}
+      DEPENDS clang clang-headers FileCheck ${ARG_DEPENDS}
+    )
+  endif()
+endfunction()
+
+# Build the umbrella check-openmp target from every testsuite registered via
+# add_openmp_testsuite(). Must be called after all testsuites are added.
+function(construct_check_openmp_target)
+  get_property(OPENMP_LIT_TESTSUITES GLOBAL PROPERTY OPENMP_LIT_TESTSUITES)
+  get_property(OPENMP_LIT_DEPENDS GLOBAL PROPERTY OPENMP_LIT_DEPENDS)
+
+  # We already added the testsuites themselves, no need to do that again.
+  set(EXCLUDE_FROM_ALL True)
+  add_openmp_testsuite(check-openmp "Running OpenMP tests" ${OPENMP_LIT_TESTSUITES} DEPENDS ${OPENMP_LIT_DEPENDS})
+endfunction()
diff --git a/final/cmake/config-ix.cmake b/final/cmake/config-ix.cmake
new file mode 100644
index 0000000..912cbd0
--- /dev/null
+++ b/final/cmake/config-ix.cmake
@@ -0,0 +1,6 @@
+include(CheckCCompilerFlag)
+include(CheckCXXCompilerFlag)
+
+check_c_compiler_flag(-Werror OPENMP_HAVE_WERROR_FLAG)
+
+check_cxx_compiler_flag(-std=c++11 OPENMP_HAVE_STD_CPP11_FLAG) \ No newline at end of file
diff --git a/final/libomptarget/CMakeLists.txt b/final/libomptarget/CMakeLists.txt
new file mode 100644
index 0000000..3d9c78a
--- /dev/null
+++ b/final/libomptarget/CMakeLists.txt
@@ -0,0 +1,73 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build offloading library and related plugins.
+#
+##===----------------------------------------------------------------------===##
+
+if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
+ message(FATAL_ERROR "Direct configuration not supported, please use parent directory!")
+endif()
+
+# Add cmake directory to search for custom cmake functions.
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules ${CMAKE_MODULE_PATH})
+
+if(OPENMP_STANDALONE_BUILD)
+ # Build all libraries into a common place so that tests can find them.
+ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+endif()
+
+# Message utilities.
+include(LibomptargetUtils)
+
+# Get dependencies for the different components of the project.
+include(LibomptargetGetDependencies)
+
+# This is a list of all the targets that are supported/tested right now.
+set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} aarch64-unknown-linux-gnu")
+set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64le-ibm-linux-gnu")
+set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu")
+set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu")
+set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda")
+
+# Once the plugins for the different targets are validated, they will be added to
+# the list of supported targets in the current system.
+set (LIBOMPTARGET_SYSTEM_TARGETS "")
+
+# Set base directories - required for lit to locate the tests.
+set(LIBOMPTARGET_BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+set(LIBOMPTARGET_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+
+# If building this library in debug mode, we define a macro to enable
+# dumping progress messages at runtime.
+string( TOLOWER "${CMAKE_BUILD_TYPE}" LIBOMPTARGET_CMAKE_BUILD_TYPE)
+if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug)
+ add_definitions(-DOMPTARGET_DEBUG)
+ add_definitions(-g)
+ add_definitions(-O0)
+endif()
+
+include_directories(include)
+
+# Build target agnostic offloading library.
+add_subdirectory(src)
+
+# Retrieve the path to the resulting library so that it can be used for
+# testing.
+get_target_property(LIBOMPTARGET_LIBRARY_DIR omptarget LIBRARY_OUTPUT_DIRECTORY)
+if(NOT LIBOMPTARGET_LIBRARY_DIR)
+ set(LIBOMPTARGET_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+endif()
+
+# Build offloading plugins and device RTLs if they are available.
+add_subdirectory(plugins)
+add_subdirectory(deviceRTLs)
+
+# Add tests.
+add_subdirectory(test)
diff --git a/final/libomptarget/README.txt b/final/libomptarget/README.txt
new file mode 100644
index 0000000..8c0a837
--- /dev/null
+++ b/final/libomptarget/README.txt
@@ -0,0 +1,73 @@
+
+ README for the LLVM* OpenMP* Offloading Runtime Library (libomptarget)
+ ======================================================================
+
+How to Build the LLVM* OpenMP* Offloading Runtime Library (libomptarget)
+========================================================================
+In-tree build:
+
+$ cd where-you-want-to-live
+Check out openmp (libomptarget lives under ./libomptarget) into llvm/projects
+$ cd where-you-want-to-build
+$ mkdir build && cd build
+$ cmake path/to/llvm -DCMAKE_C_COMPILER=<C compiler> -DCMAKE_CXX_COMPILER=<C++ compiler>
+$ make omptarget
+
+Out-of-tree build:
+
+$ cd where-you-want-to-live
+Check out openmp (libomptarget lives under ./libomptarget)
+$ cd where-you-want-to-live/openmp/libomptarget
+$ mkdir build && cd build
+$ cmake path/to/openmp -DCMAKE_C_COMPILER=<C compiler> -DCMAKE_CXX_COMPILER=<C++ compiler>
+$ make
+
+For details about building, please look at README.rst in the parent directory.
+
+Architectures Supported
+=======================
+The current library has only been tested on the Linux operating system and the
+following host architectures:
+* Intel(R) 64 architecture
+* IBM(R) Power architecture (big endian)
+* IBM(R) Power architecture (little endian)
+* ARM(R) AArch64 architecture (little endian)
+
+The currently supported offloading device architectures are:
+* Intel(R) 64 architecture (generic 64-bit plugin - mostly for testing purposes)
+* IBM(R) Power architecture (big endian) (generic 64-bit plugin - mostly for testing purposes)
+* IBM(R) Power architecture (little endian) (generic 64-bit plugin - mostly for testing purposes)
+* ARM(R) AArch64 architecture (little endian) (generic 64-bit plugin - mostly for testing purposes)
+* CUDA(R) enabled 64-bit NVIDIA(R) GPU architectures
+
+Supported RTL Build Configurations
+==================================
+Supported Architectures: Intel(R) 64, IBM(R) Power 7 and Power 8
+
+ ---------------------------
+ | gcc | clang |
+--------------|------------|------------|
+| Linux* OS | Yes(1) | Yes(2) |
+-----------------------------------------
+
+(1) gcc version 4.8.2 or later is supported.
+(2) clang version 3.7 or later is supported.
+
+
+Front-end Compilers that work with this RTL
+===========================================
+
+The following compilers are known to do compatible code generation for
+this RTL:
+ - clang (from https://github.com/clang-ykt )
+ - clang (development branch at http://clang.llvm.org - several features still
+ under development)
+
+-----------------------------------------------------------------------
+
+Notices
+=======
+This library and related compiler support is still under development, so the
+employed interface is likely to change in the future.
+
+*Other names and brands may be claimed as the property of others.
diff --git a/final/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake b/final/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
new file mode 100644
index 0000000..71a6a5e
--- /dev/null
+++ b/final/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
@@ -0,0 +1,160 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Try to detect in the system several dependencies required by the different
+# components of libomptarget. These are the dependencies we have:
+#
+# libelf : required by some targets to handle the ELF files at runtime.
+# libffi : required to launch target kernels given function and argument
+# pointers.
+# CUDA : required to control offloading to NVIDIA GPUs.
+
+include (FindPackageHandleStandardArgs)
+
+################################################################################
+# Looking for libelf...
+################################################################################
+
+find_path (
+ LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR
+ NAMES
+ libelf.h
+ PATHS
+ /usr/include
+ /usr/local/include
+ /opt/local/include
+ /sw/include
+ ENV CPATH
+ PATH_SUFFIXES
+ libelf)
+
+find_library (
+ LIBOMPTARGET_DEP_LIBELF_LIBRARIES
+ NAMES
+ elf
+ PATHS
+ /usr/lib
+ /usr/local/lib
+ /opt/local/lib
+ /sw/lib
+ ENV LIBRARY_PATH
+ ENV LD_LIBRARY_PATH)
+
+set(LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR})
+find_package_handle_standard_args(
+ LIBOMPTARGET_DEP_LIBELF
+ DEFAULT_MSG
+ LIBOMPTARGET_DEP_LIBELF_LIBRARIES
+ LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS)
+
+mark_as_advanced(
+ LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS
+ LIBOMPTARGET_DEP_LIBELF_LIBRARIES)
+
+################################################################################
+# Looking for libffi...
+################################################################################
+find_package(PkgConfig)
+
+pkg_check_modules(LIBOMPTARGET_SEARCH_LIBFFI QUIET libffi)
+
+find_path (
+ LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR
+ NAMES
+ ffi.h
+ HINTS
+ ${LIBOMPTARGET_SEARCH_LIBFFI_INCLUDEDIR}
+ ${LIBOMPTARGET_SEARCH_LIBFFI_INCLUDE_DIRS}
+ PATHS
+ /usr/include
+ /usr/local/include
+ /opt/local/include
+ /sw/include
+ ENV CPATH)
+
+# Don't bother looking for the library if the header files were not found.
+if (LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR)
+ find_library (
+ LIBOMPTARGET_DEP_LIBFFI_LIBRARIES
+ NAMES
+ ffi
+ HINTS
+ ${LIBOMPTARGET_SEARCH_LIBFFI_LIBDIR}
+ ${LIBOMPTARGET_SEARCH_LIBFFI_LIBRARY_DIRS}
+ PATHS
+ /usr/lib
+ /usr/local/lib
+ /opt/local/lib
+ /sw/lib
+ ENV LIBRARY_PATH
+ ENV LD_LIBRARY_PATH)
+endif()
+
+set(LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR})
+find_package_handle_standard_args(
+ LIBOMPTARGET_DEP_LIBFFI
+ DEFAULT_MSG
+ LIBOMPTARGET_DEP_LIBFFI_LIBRARIES
+ LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS)
+
+mark_as_advanced(
+ LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS
+ LIBOMPTARGET_DEP_LIBFFI_LIBRARIES)
+
+################################################################################
+# Looking for CUDA...
+################################################################################
+find_package(CUDA QUIET)
+
+set(LIBOMPTARGET_DEP_CUDA_FOUND ${CUDA_FOUND})
+set(LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS})
+
+mark_as_advanced(
+ LIBOMPTARGET_DEP_CUDA_FOUND
+ LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS)
+
+################################################################################
+# Looking for CUDA Driver API... (needed for CUDA plugin)
+################################################################################
+
+find_library (
+ LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES
+ NAMES
+ cuda
+ PATHS
+ /lib64)
+
+# There is a libcuda.so in lib64/stubs that can be used for linking.
+if (NOT LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES AND CUDA_FOUND)
+ # Since CMake 3.3 FindCUDA.cmake defaults to using static libraries. In this
+ # case CUDA_LIBRARIES contains additional linker arguments which breaks
+ # get_filename_component below. Fortunately, since that change the module
+ # exports CUDA_cudart_static_LIBRARY which points to a single file in the
+ # right directory.
+ set(cuda_library ${CUDA_LIBRARIES})
+ if (DEFINED CUDA_cudart_static_LIBRARY)
+ set(cuda_library ${CUDA_cudart_static_LIBRARY})
+ endif()
+ get_filename_component(CUDA_LIBDIR ${cuda_library} DIRECTORY)
+ find_library (
+ LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES
+ NAMES
+ cuda
+ HINTS
+ "${CUDA_LIBDIR}/stubs")
+endif()
+
+find_package_handle_standard_args(
+ LIBOMPTARGET_DEP_CUDA_DRIVER
+ DEFAULT_MSG
+ LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES)
+
+mark_as_advanced(LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES)
diff --git a/final/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake b/final/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
new file mode 100644
index 0000000..5c69340
--- /dev/null
+++ b/final/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
@@ -0,0 +1,112 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# We use the compiler and linker provided by the user, attempt to use the one
+# used to build libomptarget or just fail.
+set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED FALSE)
+
+if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "")
+ set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER})
+elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
+ set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER})
+else()
+ return()
+endif()
+
+# Get compiler directory to try to locate a suitable linker.
+get_filename_component(compiler_dir ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} DIRECTORY)
+set(llvm_link "${compiler_dir}/llvm-link")
+
+if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "")
+ set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER})
+elseif (EXISTS "${llvm_link}")
+ # Use llvm-link from the compiler directory.
+ set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER "${llvm_link}")
+else()
+ return()
+endif()
+
+function(try_compile_bitcode output source)
+ set(srcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/src.cu)
+ file(WRITE ${srcfile} "${source}\n")
+ set(bcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/out.bc)
+
+ # The remaining arguments are the flags to be tested.
+ # FIXME: Don't hardcode GPU version. This is currently required because
+ # Clang refuses to compile its default of sm_20 with CUDA 9.
+ execute_process(
+ COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${ARGN}
+ --cuda-gpu-arch=sm_35 -c ${srcfile} -o ${bcfile}
+ RESULT_VARIABLE result
+ OUTPUT_QUIET ERROR_QUIET)
+ if (result EQUAL 0)
+ set(${output} TRUE PARENT_SCOPE)
+ else()
+ set(${output} FALSE PARENT_SCOPE)
+ endif()
+endfunction()
+
+# Save for which compiler we are going to do the following checks so that we
+# can discard cached values if the user specifies a different value.
+set(discard_cached FALSE)
+if (DEFINED LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER AND
+ NOT("${LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER}" STREQUAL "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}"))
+ set(discard_cached TRUE)
+endif()
+set(LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}" CACHE INTERNAL "" FORCE)
+
+function(check_bitcode_compilation output source)
+ if (${discard_cached} OR NOT DEFINED ${output})
+ message(STATUS "Performing Test ${output}")
+ # Forward additional arguments which contain the flags.
+ try_compile_bitcode(result "${source}" ${ARGN})
+ set(${output} ${result} CACHE INTERNAL "" FORCE)
+ if(${result})
+ message(STATUS "Performing Test ${output} - Success")
+ else()
+ message(STATUS "Performing Test ${output} - Failed")
+ endif()
+ endif()
+endfunction()
+
+# These flags are required to emit LLVM Bitcode. We check them together because
+# if any of them are not supported, there is no point in finding out which are.
+set(compiler_flags_required -emit-llvm -O1 --cuda-device-only --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
+set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }")
+check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required})
+
+# It makes no sense to continue given that the compiler doesn't support
+# emitting basic LLVM Bitcode
+if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED)
+ return()
+endif()
+
+set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS ${compiler_flags_required})
+
+# Declaring external shared device variables might need an additional flag
+# since Clang 7.0 and was entirely unsupported since version 4.0.
+set(extern_device_shared_src "extern __device__ __shared__ int test;")
+
+check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED "${extern_device_shared_src}" ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS})
+if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED)
+ set(compiler_flag_fcuda_rdc -fcuda-rdc)
+ set(compiler_flag_fcuda_rdc_full ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} ${compiler_flag_fcuda_rdc})
+ check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC "${extern_device_shared_src}" ${compiler_flag_fcuda_rdc_full})
+
+ if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC)
+ return()
+ endif()
+
+ set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS "${compiler_flag_fcuda_rdc_full}")
+endif()
+
+# We can compile LLVM Bitcode from CUDA source code!
+set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED TRUE)
diff --git a/final/libomptarget/cmake/Modules/LibomptargetUtils.cmake b/final/libomptarget/cmake/Modules/LibomptargetUtils.cmake
new file mode 100644
index 0000000..d964903
--- /dev/null
+++ b/final/libomptarget/cmake/Modules/LibomptargetUtils.cmake
@@ -0,0 +1,28 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# void libomptarget_say(string message_to_user);
+# - prints out message_to_user
+macro(libomptarget_say message_to_user)
+ message(STATUS "LIBOMPTARGET: ${message_to_user}")
+endmacro()
+
+# void libomptarget_warning_say(string message_to_user);
+# - prints out message_to_user with a warning
+macro(libomptarget_warning_say message_to_user)
+ message(WARNING "LIBOMPTARGET: ${message_to_user}")
+endmacro()
+
+# void libomptarget_error_say(string message_to_user);
+# - prints out message_to_user with an error and exits cmake
+macro(libomptarget_error_say message_to_user)
+ message(FATAL_ERROR "LIBOMPTARGET: ${message_to_user}")
+endmacro()
diff --git a/final/libomptarget/deviceRTLs/CMakeLists.txt b/final/libomptarget/deviceRTLs/CMakeLists.txt
new file mode 100644
index 0000000..7c75387
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/CMakeLists.txt
@@ -0,0 +1,14 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+# ##===----------------------------------------------------------------------===##
+#
+# Build a device RTL for each available machine.
+#
+##===----------------------------------------------------------------------===##
+
+add_subdirectory(nvptx)
diff --git a/final/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/final/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
new file mode 100644
index 0000000..4fc9ef0
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -0,0 +1,181 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build the NVPTX (CUDA) Device RTL if the CUDA tools are available
+#
+##===----------------------------------------------------------------------===##
+
+set(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER "" CACHE STRING
+ "Path to alternate NVCC host compiler to be used by the NVPTX device RTL.")
+
+if(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER)
+ find_program(ALTERNATE_CUDA_HOST_COMPILER NAMES ${LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER})
+ if(NOT ALTERNATE_CUDA_HOST_COMPILER)
+ libomptarget_say("Not building CUDA offloading device RTL: invalid NVPTX alternate host compiler.")
+ endif()
+ set(CUDA_HOST_COMPILER ${ALTERNATE_CUDA_HOST_COMPILER} CACHE FILEPATH "" FORCE)
+endif()
+
+# We can't use clang as nvcc host preprocessor, so we attempt to replace it with
+# gcc.
+if(CUDA_HOST_COMPILER MATCHES clang)
+
+ find_program(LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER NAMES gcc)
+
+ if(NOT LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER)
+ libomptarget_say("Not building CUDA offloading device RTL: clang is not supported as NVCC host compiler.")
+    libomptarget_say("Please include gcc in your path or set LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER to the full path of a valid compiler.")
+ return()
+ endif()
+ set(CUDA_HOST_COMPILER "${LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER}" CACHE FILEPATH "" FORCE)
+endif()
+
+if(LIBOMPTARGET_DEP_CUDA_FOUND)
+ libomptarget_say("Building CUDA offloading device RTL.")
+
+ # We really don't have any host code, so we don't need to care about
+ # propagating host flags.
+ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+
+ set(cuda_src_files
+ src/cancel.cu
+ src/critical.cu
+ src/data_sharing.cu
+ src/libcall.cu
+ src/loop.cu
+ src/omptarget-nvptx.cu
+ src/parallel.cu
+ src/reduction.cu
+ src/sync.cu
+ src/task.cu
+ )
+
+ set(omp_data_objects src/omp_data.cu)
+
+ # Get the compute capability the user requested or use SM_35 by default.
+ # SM_35 is what clang uses by default.
+ set(default_capabilities 35)
+ if (DEFINED LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY)
+ set(default_capabilities ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY})
+ libomptarget_warning_say("LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY is deprecated, please use LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES")
+ endif()
+ set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${default_capabilities} CACHE STRING
+ "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.")
+ string(REPLACE "," ";" nvptx_sm_list ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES})
+
+ foreach(sm ${nvptx_sm_list})
+ set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm})
+ endforeach()
+
+ # Activate RTL message dumps if requested by the user.
+ set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL
+ "Activate NVPTX device RTL debug messages.")
+ if(${LIBOMPTARGET_NVPTX_DEBUG})
+ set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g --ptxas-options=-v)
+ endif()
+
+ # NVPTX runtime library has to be statically linked. Dynamic linking is not
+ # yet supported by the CUDA toolchain on the device.
+ set(BUILD_SHARED_LIBS OFF)
+ set(CUDA_SEPARABLE_COMPILATION ON)
+
+ cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects}
+ OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG})
+
+ # Install device RTL under the lib destination folder.
+ install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+
+ target_link_libraries(omptarget-nvptx ${CUDA_LIBRARIES})
+
+
+ # Check if we can create an LLVM bitcode implementation of the runtime library
+ # that could be inlined in the user application. For that we need to find
+ # a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
+ # an LLVM linker.
+ set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING
+ "Location of a CUDA compiler capable of emitting LLVM bitcode.")
+ set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING
+ "Location of a linker capable of linking LLVM bitcode objects.")
+
+ include(LibomptargetNVPTXBitcodeLibrary)
+
+ set(bclib_default FALSE)
+ if (${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED})
+ set(bclib_default TRUE)
+ endif()
+ set(LIBOMPTARGET_NVPTX_ENABLE_BCLIB ${bclib_default} CACHE BOOL
+ "Enable CUDA LLVM bitcode offloading device RTL.")
+ if (${LIBOMPTARGET_NVPTX_ENABLE_BCLIB})
+ if (NOT ${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED})
+ libomptarget_error_say("Cannot build CUDA LLVM bitcode offloading device RTL!")
+ endif()
+ libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.")
+
+ # Set flags for LLVM Bitcode compilation.
+ set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} -DOMPTARGET_NVPTX_TEST=0)
+ if(${LIBOMPTARGET_NVPTX_DEBUG})
+ set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1)
+ else()
+ set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0)
+ endif()
+
+ # CUDA 9 header files use the nv_weak attribute which clang is not yet prepared
+ # to handle. Therefore, we use 'weak' instead. We are compiling only for the
+ # device, so it should be equivalent.
+ if(CUDA_VERSION_MAJOR GREATER 8)
+ set(bc_flags ${bc_flags} -Dnv_weak=weak)
+ endif()
+
+ # Generate a Bitcode library for all the compute capabilities the user requested.
+ foreach(sm ${nvptx_sm_list})
+ set(cuda_arch --cuda-gpu-arch=sm_${sm})
+
+ # Compile CUDA files to bitcode.
+ set(bc_files "")
+ foreach(src ${cuda_src_files})
+ get_filename_component(infile ${src} ABSOLUTE)
+ get_filename_component(outfile ${src} NAME)
+
+ add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc
+ COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch}
+ -c ${infile} -o ${outfile}-sm_${sm}.bc
+ DEPENDS ${infile}
+ IMPLICIT_DEPENDS CXX ${infile}
+ COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc"
+ VERBATIM
+ )
+ set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc)
+
+ list(APPEND bc_files ${outfile}-sm_${sm}.bc)
+ endforeach()
+
+ # Link to a bitcode library.
+ add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc
+ COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
+ -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files}
+ DEPENDS ${bc_files}
+ COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc"
+ )
+ set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc)
+
+ add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc)
+
+ # Copy library to destination.
+ add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc
+ $<TARGET_FILE_DIR:omptarget-nvptx>)
+
+ # Install bitcode library under the lib destination folder.
+ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+ endforeach()
+ endif()
+
+else()
+ libomptarget_say("Not building CUDA offloading device RTL: CUDA tools not found in the system.")
+endif()
diff --git a/final/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt b/final/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt
new file mode 100644
index 0000000..989a01f
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt
@@ -0,0 +1,523 @@
+
+**Design document for OpenMP reductions on the GPU**
+
+//Abstract: //In this document we summarize the new design for an OpenMP
+implementation of reductions on NVIDIA GPUs. This document comprises
+* a succinct background review,
+* an introduction to the decoupling of reduction algorithm and
+ data-structure-specific processing routines,
+* detailed illustrations of reduction algorithms used and
+* a brief overview of steps we have made beyond the last implementation.
+
+**Problem Review**
+
+Consider a typical OpenMP program with reduction pragma.
+
+```
+ double foo, bar;
+ #pragma omp parallel for reduction(+:foo, bar)
+ for (int i = 0; i < N; i++) {
+ foo+=A[i]; bar+=B[i];
+ }
+```
+where 'foo' and 'bar' are reduced across all threads in the parallel region.
+Our primary goal is to efficiently aggregate the values of foo and bar in
+such manner that
+* makes the compiler logically concise.
+* efficiently reduces within warps, threads, blocks and the device.
+
+**Introduction to Decoupling**
+In this section we address the problem of making the compiler
+//logically concise// by partitioning the task of reduction into two broad
+categories: data-structure specific routines and algorithmic routines.
+
+The previous reduction implementation was highly coupled with
+the specificity of the reduction element data structures (e.g., sizes, data
+types) and operators of the reduction (e.g., addition, multiplication). In
+our implementation we strive to decouple them. In our final implementations,
+we could remove all template functions in our runtime system.
+
+The (simplified) pseudo code generated by LLVM is as follows:
+
+```
+ 1. Create private copies of variables: foo_p, bar_p
+ 2. Each thread reduces the chunk of A and B assigned to it and writes
+ to foo_p and bar_p respectively.
+ 3. ret = kmpc_nvptx_reduce_nowait(..., reduceData, shuffleReduceFn,
+ interWarpCpyFn)
+ where:
+ struct ReduceData {
+ double *foo;
+ double *bar;
+ } reduceData
+ reduceData.foo = &foo_p
+ reduceData.bar = &bar_p
+
+ shuffleReduceFn and interWarpCpyFn are two auxiliary functions
+ generated to aid the runtime performing algorithmic steps
+ while being data-structure agnostic about ReduceData.
+
+ In particular, shuffleReduceFn is a function that takes the following
+ inputs:
+ a. local copy of ReduceData
+ b. its lane_id
+ c. the offset of the lane_id which hosts a remote ReduceData
+ relative to the current one
+ d. an algorithm version parameter determining which reduction
+ algorithm to use.
+ This shuffleReduceFn retrieves the remote ReduceData through shuffle
+ intrinsics and reduces, using the algorithm specified by the 4th
+ parameter, the local ReduceData and with the remote ReduceData element
+ wise, and places the resultant values into the local ReduceData.
+
+ Different reduction algorithms are implemented with different runtime
+ functions, but they all make calls to this same shuffleReduceFn to
+ perform the essential reduction step. Therefore, based on the 4th
+ parameter, this shuffleReduceFn will behave slightly differently to
+ cooperate with the runtime function to ensure correctness under
+ different circumstances.
+
+ InterWarpCpyFn, as the name suggests, is a function that copies data
+ across warps. Its function is to tunnel all the thread private
+ ReduceData that is already reduced within a warp to a lane in the first
+ warp with minimal shared memory footprint. This is an essential step to
+ prepare for the last step of a block reduction.
+
+ (Warp, block, device level reduction routines that utilize these
+ auxiliary functions will be discussed in the next section.)
+
+ 4. if ret == 1:
+ The master thread stores the reduced result in the globals.
+ foo += reduceData.foo; bar += reduceData.bar
+```
+
+**Reduction Algorithms**
+
+On the warp level, we have three versions of the algorithms:
+
+1. Full Warp Reduction
+
+```
+gpu_regular_warp_reduce(void *reduce_data,
+ kmp_ShuffleReductFctPtr ShuffleReduceFn) {
+ for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
+ ShuffleReduceFn(reduce_data, 0, offset, 0);
+}
+```
+ShuffleReduceFn is used here with lane_id set to 0 because it is not used;
+we therefore save instructions by not retrieving lane_id from the corresponding
+special registers. The 4th parameter, which represents the version of the
+algorithm being used here, is set to 0 to signify full warp reduction.
+
+In this version specified (=0), the ShuffleReduceFn behaves, per element, as
+follows:
+
+```
+//reduce_elem refers to an element in the local ReduceData
+//remote_elem is retrieved from a remote lane
+remote_elem = shuffle_down(reduce_elem, offset, 32);
+reduce_elem = reduce_elem @ remote_elem;
+
+```
+
+An illustration of this algorithm operating on a hypothetical 8-lane full-warp
+would be:
+{F74}
+The coloring invariant follows that elements with the same color will be
+combined and reduced in the next reduction step. As can be observed, no overhead
+is present, exactly log(2, N) steps are needed.
+
+2. Contiguous Full Warp Reduction
+```
+gpu_irregular_warp_reduce(void *reduce_data,
+ kmp_ShuffleReductFctPtr ShuffleReduceFn, int size,
+ int lane_id) {
+ int curr_size;
+ int offset;
+ curr_size = size;
+ offset = curr_size/2;
+ while (offset>0) {
+ ShuffleReduceFn(reduce_data, lane_id, offset, 1);
+ curr_size = (curr_size+1)/2;
+ offset = curr_size/2;
+ }
+}
+```
+
+In this version specified (=1), the ShuffleReduceFn behaves, per element, as
+follows:
+```
+//reduce_elem refers to an element in the local ReduceData
+//remote_elem is retrieved from a remote lane
+remote_elem = shuffle_down(reduce_elem, offset, 32);
+if (lane_id < offset) {
+ reduce_elem = reduce_elem @ remote_elem
+} else {
+ reduce_elem = remote_elem
+}
+```
+
+An important invariant (also a restriction on the starting state of the
+reduction) is that this algorithm assumes that all unused ReduceData are
+located in a contiguous subset of threads in a warp starting from lane 0.
+
+With the presence of a trailing active lane with an odd-numbered lane
+id, its value will not be aggregated with any other lane. Therefore,
+in order to preserve the invariant, such ReduceData is copied to the first lane
+whose thread-local ReduceData has already being used in a previous reduction
+and would therefore be useless otherwise.
+
+An illustration of this algorithm operating on a hypothetical 8-lane partial
+warp would be:
+{F75}
+
+As illustrated, this version of the algorithm introduces overhead whenever
+we have odd number of participating lanes in any reduction step to
+copy data between lanes.
+
+3. Dispersed Partial Warp Reduction
+```
+gpu_irregular_simt_reduce(void *reduce_data,
+ kmp_ShuffleReductFctPtr ShuffleReduceFn) {
+ int size, remote_id;
+ int logical_lane_id = find_number_of_dispersed_active_lanes_before_me() * 2;
+ do {
+ remote_id = find_the_next_active_lane_id_right_after_me();
+ // the above function returns 0 if no active lane
+ // is present right after the current thread.
+ size = get_number_of_active_lanes_in_this_warp();
+ logical_lane_id /= 2;
+ ShuffleReduceFn(reduce_data, logical_lane_id, remote_id-1-threadIdx.x, 2);
+ } while (logical_lane_id % 2 == 0 && size > 1);
+```
+
+There is no assumption made about the initial state of the reduction.
+Any number of lanes (>=1) could be active at any position. The reduction
+result is kept in the first active lane.
+
+In this version specified (=2), the ShuffleReduceFn behaves, per element, as
+follows:
+```
+//reduce_elem refers to an element in the local ReduceData
+//remote_elem is retrieved from a remote lane
+remote_elem = shuffle_down(reduce_elem, offset, 32);
+if (LaneId % 2 == 0 && Offset > 0) {
+ reduce_elem = reduce_elem @ remote_elem
+} else {
+ reduce_elem = remote_elem
+}
+```
+We will proceed with a brief explanation for some arguments passed in,
+it is important to notice that, in this section, we will introduce the
+concept of logical_lane_id, and it is important to distinguish it
+from physical lane_id as defined by nvidia.
+1. //logical_lane_id//: as the name suggests, it refers to the calculated
+ lane_id (instead of the physical one defined by nvidia) that would make
+ our algorithm logically concise. A thread with logical_lane_id k means
+ there are (k-1) threads before it.
+2. //remote_id-1-threadIdx.x//: remote_id is indeed the nvidia-defined lane
+ id of the remote lane from which we will retrieve the ReduceData. We
+ subtract (threadIdx+1) from it because we would like to maintain only one
+ underlying shuffle intrinsic (which is used to communicate among lanes in a
+ warp). This particular version of shuffle intrinsic we take accepts only
+ offsets, instead of absolute lane_id. Therefore the subtraction is performed
+ on the absolute lane_id we calculated to obtain the offset.
+
+This algorithm is slightly different in 2 ways and it is not, conceptually, a
+generalization of the above algorithms.
+1. It reduces elements close to each other. For instance, values in the 0th lane
+ is to be combined with that of the 1st lane; values in the 2nd lane is to be
+ combined with that of the 3rd lane. We did not use the previous algorithm
+ where the first half of the (partial) warp is reduced with the second half
+ of the (partial) warp. This is because, the mapping
+ f(x): logical_lane_id -> physical_lane_id;
+ can be easily calculated whereas its inverse
+ f^-1(x): physical_lane_id -> logical_lane_id
+ cannot and performing such reduction requires the inverse to be known.
+2. Because this algorithm is agnostic about the positions of the lanes that are
+ active, we do not need to perform the copying step as in the second
+ algorithm.
+An illustrative run would look like
+{F76}
+As observed, overhead is high because in each and every step of reduction,
+logical_lane_id is recalculated; so is the remote_id.
+
+On a block level, we have implemented the following block reduce algorithm:
+
+```
+gpu_irregular_block_reduce(void *reduce_data,
+ kmp_ShuffleReductFctPtr shuflReduceFn,
+ kmp_InterWarpCopyFctPtr interWarpCpyFn,
+ int size) {
+
+ int wid = threadIdx.x/WARPSIZE;
+ int lane_id = threadIdx.x%WARPSIZE;
+
+ int warp_needed = (size+WARPSIZE-1)/WARPSIZE; //ceiling of division
+
+ unsigned tnum = __ballot(1);
+ int thread_num = __popc(tnum);
+
+ //full warp reduction
+ if (thread_num == WARPSIZE) {
+ gpu_regular_warp_reduce(reduce_data, shuflReduceFn);
+ }
+ //partial warp reduction
+ if (thread_num < WARPSIZE) {
+ gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, thread_num,
+ lane_id);
+ }
+ //Gather all the reduced values from each warp
+ //to the first warp
+ //named_barrier inside this function to ensure
+ //correctness. It is effectively a sync_thread
+ //that won't deadlock.
+ interWarpCpyFn(reduce_data, warp_needed);
+
+ //This is to reduce data gathered from each "warp master".
+ if (wid==0) {
+ gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, warp_needed,
+ lane_id);
+ }
+
+ return;
+}
+```
+In this function, no ShuffleReduceFn is directly called as it makes calls
+to various versions of the warp-reduction functions. It first reduces
+ReduceData warp by warp; in the end, we end up with the number of
+ReduceData equal to the number of warps present in this thread
+block. We then proceed to gather all such ReduceData to the first warp.
+
+As observed, in this algorithm we make use of the function InterWarpCpyFn,
+which copies data from each of the "warp master" (0th lane of each warp, where
+a warp-reduced ReduceData is held) to the 0th warp. This step reduces (in a
+mathematical sense) the problem of reduction across warp masters in a block to
+the problem of warp reduction which we already have solutions to.
+
+We can thus completely avoid the use of atomics to reduce in a threadblock.
+
+**Efficient Cross Block Reduce**
+
+The next challenge is to reduce values across threadblocks. We aim to do this
+without atomics or critical sections.
+
+Let a kernel be started with TB threadblocks.
+Let the GPU have S SMs.
+There can be at most N active threadblocks per SM at any time.
+
+Consider a threadblock tb (tb < TB) running on SM s (s < SM). 'tb' is one of
+at most 'N' active threadblocks on SM s. Let each threadblock active on an SM
+be given an instance identifier id (0 <= id < N). Therefore, the tuple (s, id)
+uniquely identifies an active threadblock on the GPU.
+
+To efficiently implement cross block reduce, we first allocate an array for
+each value to be reduced of size S*N (which is the maximum number of active
+threadblocks at any time on the device).
+
+Each threadblock reduces its value to slot [s][id]. This can be done without
+locking since no other threadblock can write to the same slot concurrently.
+
+As a final stage, we reduce the values in the array as follows:
+
+```
+// Compiler generated wrapper function for each target region with a reduction
+clause.
+target_function_wrapper(map_args, reduction_array) <--- start with 1 team and 1
+ thread.
+ // Use dynamic parallelism to launch M teams, N threads as requested by the
+ user to execute the target region.
+
+ target_function<<M, N>>(map_args)
+
+ Reduce values in reduction_array
+
+```
+
+**Comparison with Last Version**
+
+
+The (simplified) pseudo code generated by LLVM on the host is as follows:
+
+
+```
+ 1. Create private copies of variables: foo_p, bar_p
+ 2. Each thread reduces the chunk of A and B assigned to it and writes
+ to foo_p and bar_p respectively.
+ 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock)
+ where:
+ struct ReduceData {
+ double *foo;
+ double *bar;
+ } reduceData
+ reduceData.foo = &foo_p
+ reduceData.bar = &bar_p
+
+ reduceFn is a pointer to a function that takes in two inputs
+ of type ReduceData, "reduces" them element wise, and places the
+ result in the first input:
+ reduceFn(ReduceData *a, ReduceData *b)
+ a = a @ b
+
+ Every thread in the parallel region calls kmpc_reduce_nowait with
+ its private copy of reduceData. The runtime reduces across the
+ threads (using tree reduction on the operator 'reduceFn') and stores
+ the final result in the master thread if successful.
+ 4. if ret == 1:
+ The master thread stores the reduced result in the globals.
+ foo += reduceData.foo; bar += reduceData.bar
+ 5. else if ret == 2:
+ In this case kmpc_reduce_nowait() could not use tree reduction,
+ so use atomics instead:
+ each thread atomically writes to foo
+ each thread atomically writes to bar
+```
+
+On a GPU, a similar reduction may need to be performed across SIMT threads,
+warps, and threadblocks. The challenge is to do so efficiently in a fashion
+that is compatible with the LLVM OpenMP implementation.
+
+In the previously released 0.1 version of the LLVM OpenMP compiler for GPUs,
+the salient steps of the code generated are as follows:
+
+
+```
+ 1. Create private copies of variables: foo_p, bar_p
+ 2. Each thread reduces the chunk of A and B assigned to it and writes
+ to foo_p and bar_p respectively.
+ 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock)
+ status = can_block_reduce()
+ if status == 1:
+ reduce efficiently to thread 0 using shuffles and shared memory.
+ return 1
+ else
+ cannot use efficient block reduction, fallback to atomics
+ return 2
+ 4. if ret == 1:
+ The master thread stores the reduced result in the globals.
+ foo += reduceData.foo; bar += reduceData.bar
+ 5. else if ret == 2:
+ In this case kmpc_reduce_nowait() could not use tree reduction,
+ so use atomics instead:
+ each thread atomically writes to foo
+ each thread atomically writes to bar
+```
+
+The function can_block_reduce() is defined as follows:
+
+
+```
+int32_t can_block_reduce() {
+ int tid = GetThreadIdInTeam();
+ int nt = GetNumberOfOmpThreads(tid);
+ if (nt != blockDim.x)
+ return 0;
+ unsigned tnum = __ballot(1);
+ if (tnum != (~0x0)) {
+ return 0;
+ }
+ return 1;
+}
+```
+
+This function permits the use of the efficient block reduction algorithm
+using shuffles and shared memory (return 1) only if (a) all SIMT threads in
+a warp are active (i.e., number of threads in the parallel region is a
+multiple of 32) and (b) the number of threads in the parallel region
+(set by the num_threads clause) equals blockDim.x.
+
+If either of these preconditions is not true, each thread in the threadblock
+updates the global value using atomics.
+
+Atomics and compare-and-swap operations are expensive on many threaded
+architectures such as GPUs and we must avoid them completely.
+
+
+**Appendix: Implementation Details**
+
+
+```
+// Compiler generated function.
+reduceFn(ReduceData *a, ReduceData *b)
+ a->foo = a->foo + b->foo
+ a->bar = a->bar + b->bar
+
+// Compiler generated function.
+swapAndReduceFn(ReduceData *thread_private, int lane)
+ ReduceData *remote = new ReduceData()
+ remote->foo = shuffle_double(thread_private->foo, lane)
+ remote->bar = shuffle_double(thread_private->bar, lane)
+ reduceFn(thread_private, remote)
+
+// OMP runtime function.
+warpReduce_regular(ReduceData *thread_private, Fn *swapAndReduceFn):
+ offset = 16
+ while (offset > 0)
+ swapAndReduceFn(thread_private, offset)
+ offset /= 2
+
+// OMP runtime function.
+warpReduce_irregular():
+ ...
+
+// OMP runtime function.
+kmpc_reduce_warp(reduceData, swapAndReduceFn)
+ if all_lanes_active:
+ warpReduce_regular(reduceData, swapAndReduceFn)
+ else:
+ warpReduce_irregular(reduceData, swapAndReduceFn)
+ if in_simd_region:
+ // all done, reduce to global in simd lane 0
+ return 1
+ else if in_parallel_region:
+ // done reducing to one value per warp, now reduce across warps
+ return 3
+
+// OMP runtime function; one for each basic type.
+kmpc_reduce_block_double(double *a)
+ if lane == 0:
+ shared[wid] = *a
+ named_barrier(1, num_threads)
+ if wid == 0
+ block_reduce(shared)
+ if lane == 0
+ *a = shared[0]
+ named_barrier(1, num_threads)
+ if wid == 0 and lane == 0
+ return 1 // write back reduced result
+ else
+ return 0 // don't do anything
+
+```
+
+
+
+```
+// Compiler generated code.
+ 1. Create private copies of variables: foo_p, bar_p
+ 2. Each thread reduces the chunk of A and B assigned to it and writes
+ to foo_p and bar_p respectively.
+ 3. ret = kmpc_reduce_warp(reduceData, swapAndReduceFn)
+ 4. if ret == 1:
+ The master thread stores the reduced result in the globals.
+ foo += reduceData.foo; bar += reduceData.bar
+ 5. else if ret == 3:
+ ret = block_reduce_double(reduceData.foo)
+ if ret == 1:
+ foo += reduceData.foo
+ ret = block_reduce_double(reduceData.bar)
+ if ret == 1:
+ bar += reduceData.bar
+```
+
+**Notes**
+
+ 1. This scheme requires that the CUDA OMP runtime can call llvm generated
+ functions. This functionality now works.
+ 2. If the user inlines the CUDA OMP runtime bitcode, all of the machinery
+ (including calls through function pointers) are optimized away.
+ 3. If we are reducing multiple variables in a parallel region,
+ the reduce operations are all performed in warpReduce_[ir]regular(). This
+ results in more instructions in the loop and should result in fewer
+ stalls due to data dependencies. Unfortunately we cannot do the same in
+ kmpc_reduce_block_double() without increasing shared memory usage.
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/cancel.cu b/final/libomptarget/deviceRTLs/nvptx/src/cancel.cu
new file mode 100644
index 0000000..77033db
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/cancel.cu
@@ -0,0 +1,28 @@
+//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Interface to be used in the implementation of OpenMP cancel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+EXTERN int32_t __kmpc_cancellationpoint(kmp_Indent *loc, int32_t global_tid,
+ int32_t cancelVal) {
+ PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", cancelVal);
+ // disabled
+ return FALSE;
+}
+
+EXTERN int32_t __kmpc_cancel(kmp_Indent *loc, int32_t global_tid,
+ int32_t cancelVal) {
+ PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", cancelVal);
+ // disabled
+ return FALSE;
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/counter_group.h b/final/libomptarget/deviceRTLs/nvptx/src/counter_group.h
new file mode 100644
index 0000000..b183871
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/counter_group.h
@@ -0,0 +1,51 @@
+//===------ counter_group.h - NVPTX OpenMP loop scheduling ------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Interface to implement OpenMP loop scheduling
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_NVPTX_COUNTER_GROUP_H_
+#define _OMPTARGET_NVPTX_COUNTER_GROUP_H_
+
+#include "option.h"
+
+// counter group type for synchronizations
+class omptarget_nvptx_CounterGroup {
+public:
+ // getters and setters
+ INLINE Counter &Event() { return v_event; }
+ INLINE volatile Counter &Start() { return v_start; }
+ INLINE Counter &Init() { return v_init; }
+
+ // Synchronization Interface
+
+ INLINE void Clear(); // first time start=event
+ INLINE void Reset(); // init = first
+ INLINE void Init(Counter &priv); // priv = init
+ INLINE Counter Next(); // just counts number of events
+
+ // set priv to n, to be used in later waitOrRelease
+ INLINE void Complete(Counter &priv, Counter n);
+
+ // check priv and decide if we have to wait or can free the other warps
+ INLINE void Release(Counter priv, Counter current_event_value);
+ INLINE void WaitOrRelease(Counter priv, Counter current_event_value);
+
+private:
+ Counter v_event; // counter of events (atomic)
+
+ // volatile is needed to force loads to read from global
+ // memory or L2 cache and see the write by the last master
+ volatile Counter v_start; // signal when events registered are finished
+
+ Counter v_init; // used to initialize local thread variables
+};
+
+#endif /* _OMPTARGET_NVPTX_COUNTER_GROUP_H_ */
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/counter_groupi.h b/final/libomptarget/deviceRTLs/nvptx/src/counter_groupi.h
new file mode 100644
index 0000000..f34de3e
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/counter_groupi.h
@@ -0,0 +1,82 @@
+//===----- counter_groupi.h - NVPTX OpenMP loop scheduling ------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Interface implementation for OpenMP loop scheduling
+//
+//===----------------------------------------------------------------------===//
+
+#include "option.h"
+
+INLINE void omptarget_nvptx_CounterGroup::Clear() {
+ PRINT0(LD_SYNCD, "clear counters\n")
+ v_event = 0;
+ v_start = 0;
+ // v_init does not need to be reset (its value is dead)
+}
+
+INLINE void omptarget_nvptx_CounterGroup::Reset() {
+ // done by master before entering parallel
+ ASSERT(LT_FUSSY, v_event == v_start,
+ "error, entry %lld !=start %lld at reset\n", P64(v_event),
+ P64(v_start));
+ v_init = v_start;
+}
+
+INLINE void omptarget_nvptx_CounterGroup::Init(Counter &priv) {
+ PRINT(LD_SYNCD, "init priv counter 0x%llx with val %lld\n", P64(&priv),
+ P64(v_start));
+ priv = v_start;
+}
+
+// just counts number of events
+INLINE Counter omptarget_nvptx_CounterGroup::Next() {
+ Counter oldVal = atomicAdd(&v_event, (Counter)1);
+ PRINT(LD_SYNCD, "next event counter 0x%llx with val %lld->%lld\n",
+ P64(&v_event), P64(oldVal), P64(oldVal + 1));
+
+ return oldVal;
+}
+
+// set priv to n, to be used in later waitOrRelease
+INLINE void omptarget_nvptx_CounterGroup::Complete(Counter &priv, Counter n) {
+ PRINT(LD_SYNCD, "complete priv counter 0x%llx with val %llu->%llu (+%llu)\n",
+ P64(&priv), P64(priv), P64(priv + n), n);
+ priv += n;
+}
+
+INLINE void omptarget_nvptx_CounterGroup::Release(Counter priv,
+ Counter current_event_value) {
+ if (priv - 1 == current_event_value) {
+ PRINT(LD_SYNCD, "Release start counter 0x%llx with val %lld->%lld\n",
+ P64(&v_start), P64(v_start), P64(priv));
+ v_start = priv;
+ }
+}
+
+// check priv and decide if we have to wait or can free the other warps
+INLINE void
+omptarget_nvptx_CounterGroup::WaitOrRelease(Counter priv,
+ Counter current_event_value) {
+ if (priv - 1 == current_event_value) {
+ PRINT(LD_SYNCD, "Release start counter 0x%llx with val %lld->%lld\n",
+ P64(&v_start), P64(v_start), P64(priv));
+ v_start = priv;
+ } else {
+ PRINT(LD_SYNCD,
+ "Start waiting while start counter 0x%llx with val %lld < %lld\n",
+ P64(&v_start), P64(v_start), P64(priv));
+ while (priv > v_start) {
+ // IDLE LOOP
+ // start is volatile: it will be re-loaded at each while loop
+ }
+ PRINT(LD_SYNCD,
+ "Done waiting as start counter 0x%llx with val %lld >= %lld\n",
+ P64(&v_start), P64(v_start), P64(priv));
+ }
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/critical.cu b/final/libomptarget/deviceRTLs/nvptx/src/critical.cu
new file mode 100644
index 0000000..fef8101
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/critical.cu
@@ -0,0 +1,32 @@
+//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of critical with KMPC interface
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdio.h>
+
+#include "omptarget-nvptx.h"
+
+EXTERN
+void __kmpc_critical(kmp_Indent *loc, int32_t global_tid,
+ kmp_CriticalName *lck) {
+ PRINT0(LD_IO, "call to kmpc_critical()\n");
+ omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
+ omp_set_lock(teamDescr.CriticalLock());
+}
+
+EXTERN
+void __kmpc_end_critical(kmp_Indent *loc, int32_t global_tid,
+ kmp_CriticalName *lck) {
+ PRINT0(LD_IO, "call to kmpc_end_critical()\n");
+ omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
+ omp_unset_lock(teamDescr.CriticalLock());
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/final/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
new file mode 100644
index 0000000..2a1709f
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -0,0 +1,513 @@
+//===----- data_sharing.cu - NVPTX OpenMP debug utilities -------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of data sharing environments.
+//
+//===----------------------------------------------------------------------===//
+#include "omptarget-nvptx.h"
+#include <stdio.h>
+
+// Number of threads in the CUDA block.
+__device__ static unsigned getNumThreads() { return blockDim.x; }
+// Thread ID in the CUDA block
+__device__ static unsigned getThreadId() { return threadIdx.x; }
+// Warp ID in the CUDA block
+__device__ static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
+// Lane ID in the CUDA warp.
+__device__ static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }
+
+// The CUDA thread ID of the master thread.
+__device__ static unsigned getMasterThreadId() {
+ unsigned Mask = WARPSIZE - 1;
+ return (getNumThreads() - 1) & (~Mask);
+}
+
+// Find the active threads in the warp - return a mask whose n-th bit is set if
+// the n-th thread in the warp is active.
+__device__ static unsigned getActiveThreadsMask() {
+ return __BALLOT_SYNC(0xFFFFFFFF, true);
+}
+
+// Return true if this is the first active thread in the warp.
+__device__ static bool IsWarpMasterActiveThread() {
+ unsigned long long Mask = getActiveThreadsMask();
+ unsigned long long ShNum = WARPSIZE - (getThreadId() % WARPSIZE);
+ unsigned long long Sh = Mask << ShNum;
+ // Truncate Sh to the 32 lower bits
+ return (unsigned)Sh == 0;
+}
+// Return true if this is the master thread.
+__device__ static bool IsMasterThread() {
+ return !isSPMDMode() && getMasterThreadId() == getThreadId();
+}
+
+/// Return the provided size aligned to the size of a pointer.
+__device__ static size_t AlignVal(size_t Val) {
+ const size_t Align = (size_t)sizeof(void *);
+ if (Val & (Align - 1)) {
+ Val += Align;
+ Val &= ~(Align - 1);
+ }
+ return Val;
+}
+
+#define DSFLAG 0
+#define DSFLAG_INIT 0
+#define DSPRINT(_flag, _str, _args...) \
+ { \
+ if (_flag) { \
+ /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/ \
+ } \
+ }
+#define DSPRINT0(_flag, _str) \
+ { \
+ if (_flag) { \
+ /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/ \
+ } \
+ }
+
+// Initialize the shared data structures. This is expected to be called for the
+// master thread and warp masters. \param RootS: A pointer to the root of the
+// data sharing stack. \param InitialDataSize: The initial size of the data in
+// the slot.
+EXTERN void
+__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
+ size_t InitialDataSize) {
+
+ DSPRINT0(DSFLAG_INIT,
+ "Entering __kmpc_initialize_data_sharing_environment\n");
+
+ unsigned WID = getWarpId();
+ DSPRINT(DSFLAG_INIT, "Warp ID: %d\n", WID);
+
+ omptarget_nvptx_TeamDescr *teamDescr =
+ &omptarget_nvptx_threadPrivateContext->TeamContext();
+ __kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID, IsMasterThread());
+
+ DataSharingState.SlotPtr[WID] = RootS;
+ DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
+
+ // We don't need to initialize the frame and active threads.
+
+ DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", InitialDataSize);
+ DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (long long)RootS);
+ DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n",
+ (long long)RootS->DataEnd);
+ DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n", (long long)RootS->Next);
+ DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n",
+ (long long)DataSharingState.SlotPtr[WID]);
+ DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n",
+ (long long)DataSharingState.StackPtr[WID]);
+
+ DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n");
+}
+
+EXTERN void *__kmpc_data_sharing_environment_begin(
+ __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
+ void **SavedSharedFrame, int32_t *SavedActiveThreads,
+ size_t SharingDataSize, size_t SharingDefaultDataSize,
+ int16_t IsOMPRuntimeInitialized) {
+
+ DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n");
+
+  // If the runtime has been elided, use __shared__ memory for master-worker
+ // data sharing.
+ if (!IsOMPRuntimeInitialized)
+ return (void *)&DataSharingState;
+
+ DSPRINT(DSFLAG, "Data Size %016llx\n", SharingDataSize);
+ DSPRINT(DSFLAG, "Default Data Size %016llx\n", SharingDefaultDataSize);
+
+ unsigned WID = getWarpId();
+ unsigned CurActiveThreads = getActiveThreadsMask();
+
+ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+ void *&StackP = DataSharingState.StackPtr[WID];
+ void *&FrameP = DataSharingState.FramePtr[WID];
+ int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
+
+ DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
+ // Save the current values.
+ *SavedSharedSlot = SlotP;
+ *SavedSharedStack = StackP;
+ *SavedSharedFrame = FrameP;
+ *SavedActiveThreads = ActiveT;
+
+ DSPRINT(DSFLAG, "Warp ID: %d\n", WID);
+ DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (long long)SlotP);
+ DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (long long)StackP);
+ DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP);
+ DSPRINT(DSFLAG, "Active threads: %08x \n", ActiveT);
+
+ // Only the warp active master needs to grow the stack.
+ if (IsWarpMasterActiveThread()) {
+ // Save the current active threads.
+ ActiveT = CurActiveThreads;
+
+ // Make sure we use aligned sizes to avoid rematerialization of data.
+ SharingDataSize = AlignVal(SharingDataSize);
+ // FIXME: The default data size can be assumed to be aligned?
+ SharingDefaultDataSize = AlignVal(SharingDefaultDataSize);
+
+ // Check if we have room for the data in the current slot.
+ const uintptr_t CurrentStartAddress = (uintptr_t)StackP;
+ const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd;
+ const uintptr_t RequiredEndAddress =
+ CurrentStartAddress + (uintptr_t)SharingDataSize;
+
+ DSPRINT(DSFLAG, "Data Size %016llx\n", SharingDataSize);
+ DSPRINT(DSFLAG, "Default Data Size %016llx\n", SharingDefaultDataSize);
+ DSPRINT(DSFLAG, "Current Start Address %016llx\n", CurrentStartAddress);
+ DSPRINT(DSFLAG, "Current End Address %016llx\n", CurrentEndAddress);
+ DSPRINT(DSFLAG, "Required End Address %016llx\n", RequiredEndAddress);
+ DSPRINT(DSFLAG, "Active Threads %08x\n", ActiveT);
+
+ // If we require a new slot, allocate it and initialize it (or attempt to
+ // reuse one). Also, set the shared stack and slot pointers to the new
+ // place. If we do not need to grow the stack, just adapt the stack and
+ // frame pointers.
+ if (CurrentEndAddress < RequiredEndAddress) {
+ size_t NewSize = (SharingDataSize > SharingDefaultDataSize)
+ ? SharingDataSize
+ : SharingDefaultDataSize;
+ __kmpc_data_sharing_slot *NewSlot = 0;
+
+ // Attempt to reuse an existing slot.
+ if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
+ uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
+ (uintptr_t)(&ExistingSlot->Data[0]);
+ if (ExistingSlotSize >= NewSize) {
+ DSPRINT(DSFLAG, "Reusing stack slot %016llx\n",
+ (long long)ExistingSlot);
+ NewSlot = ExistingSlot;
+ } else {
+ DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n",
+ (long long)SlotP->Next);
+ free(ExistingSlot);
+ }
+ }
+
+ if (!NewSlot) {
+ NewSlot = (__kmpc_data_sharing_slot *)malloc(
+ sizeof(__kmpc_data_sharing_slot) + NewSize);
+ DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n",
+ (long long)NewSlot, NewSize);
+ }
+
+ NewSlot->Next = 0;
+ NewSlot->DataEnd = &NewSlot->Data[NewSize];
+
+ SlotP->Next = NewSlot;
+ SlotP = NewSlot;
+ StackP = &NewSlot->Data[SharingDataSize];
+ FrameP = &NewSlot->Data[0];
+ } else {
+
+      // Clean up any old slot that we may still have. The slot producers do
+      // not eliminate them because they may be used to return data.
+ if (SlotP->Next) {
+ DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n",
+ (long long)SlotP->Next);
+ free(SlotP->Next);
+ SlotP->Next = 0;
+ }
+
+ FrameP = StackP;
+ StackP = (void *)RequiredEndAddress;
+ }
+ }
+
+ // FIXME: Need to see the impact of doing it here.
+ __threadfence_block();
+
+ DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n");
+
+ // All the threads in this warp get the frame they should work with.
+ return FrameP;
+}
+
+EXTERN void __kmpc_data_sharing_environment_end(
+ __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
+ void **SavedSharedFrame, int32_t *SavedActiveThreads,
+ int32_t IsEntryPoint) {
+
+ DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n");
+
+ unsigned WID = getWarpId();
+
+ if (IsEntryPoint) {
+ if (IsWarpMasterActiveThread()) {
+ DSPRINT0(DSFLAG, "Doing clean up\n");
+
+ // The master thread cleans the saved slot, because this is an environment
+ // only for the master.
+ __kmpc_data_sharing_slot *S =
+ IsMasterThread() ? *SavedSharedSlot : DataSharingState.SlotPtr[WID];
+
+ if (S->Next) {
+ free(S->Next);
+ S->Next = 0;
+ }
+ }
+
+ DSPRINT0(DSFLAG, "Exiting Exiting __kmpc_data_sharing_environment_end\n");
+ return;
+ }
+
+ int32_t CurActive = getActiveThreadsMask();
+
+ // Only the warp master can restore the stack and frame information, and only
+ // if there are no other threads left behind in this environment (i.e. the
+ // warp diverged and returns in different places). This only works if we
+ // assume that threads will converge right after the call site that started
+ // the environment.
+ if (IsWarpMasterActiveThread()) {
+ int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
+
+ DSPRINT0(DSFLAG, "Before restoring the stack\n");
+ // Zero the bits in the mask. If it is still different from zero, then we
+ // have other threads that will return after the current ones.
+ ActiveT &= ~CurActive;
+
+ DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n", CurActive,
+ ActiveT);
+
+ if (!ActiveT) {
+ // No other active threads? Great, lets restore the stack.
+
+ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+ void *&StackP = DataSharingState.StackPtr[WID];
+ void *&FrameP = DataSharingState.FramePtr[WID];
+
+ SlotP = *SavedSharedSlot;
+ StackP = *SavedSharedStack;
+ FrameP = *SavedSharedFrame;
+ ActiveT = *SavedActiveThreads;
+
+ DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n", (long long)SlotP);
+ DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n", (long long)StackP);
+ DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n", (long long)FrameP);
+ DSPRINT(DSFLAG, "Active threads: %08x \n", ActiveT);
+ }
+ }
+
+ // FIXME: Need to see the impact of doing it here.
+ __threadfence_block();
+
+ DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n");
+ return;
+}
+
+EXTERN void *
+__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
+ int16_t IsOMPRuntimeInitialized) {
+ DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n");
+
+ // If the runtime has been elided, use __shared__ memory for master-worker
+ // data sharing. We're reusing the statically allocated data structure
+ // that is used for standard data sharing.
+ if (!IsOMPRuntimeInitialized)
+ return (void *)&DataSharingState;
+
+ // Get the frame used by the requested thread.
+
+ unsigned SourceWID = SourceThreadID / WARPSIZE;
+
+ DSPRINT(DSFLAG, "Source warp: %d\n", SourceWID);
+
+ void *P = DataSharingState.FramePtr[SourceWID];
+ DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
+ return P;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Runtime functions for trunk data sharing scheme.
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE void data_sharing_init_stack_common() {
+ omptarget_nvptx_TeamDescr *teamDescr =
+ &omptarget_nvptx_threadPrivateContext->TeamContext();
+
+ for (int WID = 0; WID < WARPSIZE; WID++) {
+ __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID);
+ DataSharingState.SlotPtr[WID] = RootS;
+ DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
+ }
+}
+
+// Initialize data sharing data structure. This function needs to be called
+// once at the beginning of a data sharing context (coincides with the kernel
+// initialization). This function is called only by the MASTER thread of each
+// team in non-SPMD mode.
+EXTERN void __kmpc_data_sharing_init_stack() {
+ // This function initializes the stack pointer with the pointer to the
+ // statically allocated shared memory slots. The size of a shared memory
+ // slot is pre-determined to be 256 bytes.
+ data_sharing_init_stack_common();
+ omptarget_nvptx_globalArgs.Init();
+}
+
+// Initialize data sharing data structure. This function needs to be called
+// once at the beginning of a data sharing context (coincides with the kernel
+// initialization). This function is called in SPMD mode only.
+EXTERN void __kmpc_data_sharing_init_stack_spmd() {
+ // This function initializes the stack pointer with the pointer to the
+ // statically allocated shared memory slots. The size of a shared memory
+ // slot is pre-determined to be 256 bytes.
+ if (threadIdx.x == 0)
+ data_sharing_init_stack_common();
+
+ __threadfence_block();
+}
+
+// Called at the time of the kernel initialization. This is used to initialize
+// the list of references to shared variables and to pre-allocate global storage
+// for holding the globalized variables.
+//
+// By default the globalized variables are stored in global memory. If the
+// UseSharedMemory is set to true, the runtime will attempt to use shared memory
+// as long as the size requested fits the pre-allocated size.
+EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
+ int16_t UseSharedMemory) {
+ // Frame pointer must be visible to all workers in the same warp.
+ unsigned WID = getWarpId();
+ void *&FrameP = DataSharingState.FramePtr[WID];
+
+ // Only warp active master threads manage the stack.
+ if (IsWarpMasterActiveThread()) {
+ // SlotP will point to either the shared memory slot or an existing
+ // global memory slot.
+ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+ void *&StackP = DataSharingState.StackPtr[WID];
+
+ // Compute the total memory footprint of the requested data.
+ // The master thread requires a stack only for itself. A worker
+ // thread (which at this point is a warp master) will require
+ // space for the variables of each thread in the warp,
+ // i.e. one DataSize chunk per warp lane.
+ // TODO: change WARPSIZE to the number of active threads in the warp.
+ size_t PushSize = IsMasterThread() ? DataSize : WARPSIZE * DataSize;
+
+ // Check if we have room for the data in the current slot.
+ const uintptr_t StartAddress = (uintptr_t)StackP;
+ const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
+ const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize;
+
+ // If we requested more data than there is room for in the rest
+ // of the slot then we need to either re-use the next slot, if one exists,
+ // or create a new slot.
+ if (EndAddress < RequestedEndAddress) {
+ __kmpc_data_sharing_slot *NewSlot = 0;
+ size_t NewSize = PushSize;
+
+ // Allocate at least the default size for each type of slot.
+ // Master is a special case and even though there is only one thread,
+ // it can share more things with the workers. For uniformity, it uses
+ // the full size of a worker warp slot.
+ size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size;
+ if (DefaultSlotSize > NewSize)
+ NewSize = DefaultSlotSize;
+ NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc(
+ sizeof(__kmpc_data_sharing_slot) + NewSize,
+ "Global memory slot allocation.");
+
+ NewSlot->Next = 0;
+ NewSlot->Prev = SlotP;
+ NewSlot->PrevSlotStackPtr = StackP;
+ NewSlot->DataEnd = &NewSlot->Data[0] + NewSize;
+
+ // Make previous slot point to the newly allocated slot.
+ SlotP->Next = NewSlot;
+ // The current slot becomes the new slot.
+ SlotP = NewSlot;
+ // The stack pointer always points to the next free stack frame.
+ StackP = &NewSlot->Data[0] + PushSize;
+ // The frame pointer always points to the beginning of the frame.
+ FrameP = &NewSlot->Data[0];
+ } else {
+ // Add the data chunk to the current slot. The frame pointer is set to
+ // point to the start of the new frame held in StackP.
+ FrameP = StackP;
+ // Reset stack pointer to the requested address.
+ StackP = (void *)RequestedEndAddress;
+ }
+ }
+
+ __threadfence_block();
+
+ // Compute the start address of the frame of each thread in the warp.
+ uintptr_t FrameStartAddress = (uintptr_t)FrameP;
+ FrameStartAddress += (uintptr_t) (getLaneId() * DataSize);
+ return (void *)FrameStartAddress;
+}
+
+// Pop the stack and free any memory which can be reclaimed.
+//
+// When the pop operation removes the last global memory slot,
+// reclaim all outstanding global memory slots since it is
+// likely we have reached the end of the kernel.
+EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
+ if (IsWarpMasterActiveThread()) {
+ unsigned WID = getWarpId();
+
+ // Current slot
+ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+
+ // Pointer to next available stack.
+ void *&StackP = DataSharingState.StackPtr[WID];
+
+ // If the current slot is empty, we need to free the slot after the
+ // pop.
+ bool SlotEmpty = (StackP == &SlotP->Data[0]);
+
+ // Pop the frame.
+ StackP = FrameStart;
+
+ if (SlotEmpty && SlotP->Prev) {
+ // Before removing the slot we need to reset StackP.
+ StackP = SlotP->PrevSlotStackPtr;
+
+ // Remove the slot.
+ SlotP = SlotP->Prev;
+ SafeFree(SlotP->Next, "Free slot.");
+ SlotP->Next = 0;
+ }
+ }
+
+ __threadfence_block();
+}
+
+// Begin a data sharing context. Maintain a list of references to shared
+// variables. This list of references to shared variables will be passed
+// to one or more threads.
+// In L0 data sharing this is called by master thread.
+// In L1 data sharing this is called by active warp master thread.
+EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
+ omptarget_nvptx_globalArgs.EnsureSize(nArgs);
+ *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
+}
+
+// End a data sharing context. There is no need to have a list of refs
+// to shared variables because the context in which those variables were
+// shared has now ended. This should clean-up the list of references only
+// without affecting the actual global storage of the variables.
+// In L0 data sharing this is called by master thread.
+// In L1 data sharing this is called by active warp master thread.
+EXTERN void __kmpc_end_sharing_variables() {
+ omptarget_nvptx_globalArgs.DeInit();
+}
+
+// This function will return a list of references to global variables. This
+// is how the workers will get a reference to the globalized variable. The
+// members of this list will be passed to the outlined parallel function
+// preserving the order.
+// Called by all workers.
+EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
+ *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/debug.h b/final/libomptarget/deviceRTLs/nvptx/src/debug.h
new file mode 100644
index 0000000..9f59d66
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/debug.h
@@ -0,0 +1,276 @@
+//===------------- debug.h - NVPTX OpenMP debug macros ----------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains debug macros to be used in the application.
+//
+// Usage guide
+//
+// PRINT0(flag, str) : if debug flag is on, print (no arguments)
+// PRINT(flag, str, args) : if debug flag is on, print (arguments)
+// DON(flag) : return true if debug flag is on
+//
+// ASSERT(flag, cond, str, args): if test flag is on, test the condition
+// if the condition is false, print str+args
+// and assert.
+//      CAUTION: cond may be evaluated twice
+// AON(flag) : return true if test flag is on
+//
+// WARNING(flag, str, args) : if warning flag is on, print the warning
+// WON(flag) : return true if warning flag is on
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_NVPTX_DEBUG_H_
+#define _OMPTARGET_NVPTX_DEBUG_H_
+
+////////////////////////////////////////////////////////////////////////////////
+// set desired level of debugging
+////////////////////////////////////////////////////////////////////////////////
+
+#define LD_SET_NONE 0ULL /* none */
+#define LD_SET_ALL -1ULL /* all */
+
+// pos 1
+#define LD_SET_LOOP 0x1ULL /* basic loop */
+#define LD_SET_LOOPD 0x2ULL /* basic loop */
+#define LD_SET_PAR 0x4ULL /* basic parallel */
+#define LD_SET_PARD 0x8ULL /* basic parallel */
+
+// pos 2
+#define LD_SET_SYNC 0x10ULL /* sync info */
+#define LD_SET_SYNCD 0x20ULL /* sync info */
+#define LD_SET_WAIT 0x40ULL /* state when waiting */
+#define LD_SET_TASK 0x80ULL /* print task info (high level) */
+
+// pos 3
+#define LD_SET_IO 0x100ULL /* big region io (excl atomic) */
+#define LD_SET_IOD 0x200ULL /* big region io (excl atomic) */
+#define LD_SET_ENV 0x400ULL /* env info */
+#define LD_SET_CANCEL 0x800ULL /* print cancel info */
+
+// pos 4
+#define LD_SET_MEM 0x1000ULL /* malloc / free */
+
+////////////////////////////////////////////////////////////////////////////////
+// set the desired flags to print selected output.
+
+// these are some examples of possible definitions that can be used for
+// debugging.
+//#define OMPTARGET_NVPTX_DEBUG (LD_SET_ALL)
+//#define OMPTARGET_NVPTX_DEBUG (LD_SET_LOOP) // limit to loop printfs to save
+// on cuda buffer
+//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO)
+//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO | LD_SET_ENV)
+//#define OMPTARGET_NVPTX_DEBUG (LD_SET_PAR)
+
+#ifndef OMPTARGET_NVPTX_DEBUG
+#define OMPTARGET_NVPTX_DEBUG LD_SET_NONE
+#elif OMPTARGET_NVPTX_DEBUG
+#warning debug is used, not good for measurements
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// set desired level of asserts
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// available flags
+
+#define LT_SET_NONE 0x0 /* unsafe */
+#define LT_SET_SAFETY \
+ 0x1 /* check malloc type of stuff, input at creation, cheap */
+#define LT_SET_INPUT 0x2 /* check also all runtime inputs */
+#define LT_SET_FUSSY 0x4 /* fussy checks, expensive */
+
+////////////////////////////////////////////////////////////////////////////////
+// set the desired flags
+
+#ifndef OMPTARGET_NVPTX_TEST
+#if OMPTARGET_NVPTX_DEBUG
+#define OMPTARGET_NVPTX_TEST (LT_SET_FUSSY)
+#else
+#define OMPTARGET_NVPTX_TEST (LT_SET_SAFETY)
+#endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// set desired level of warnings
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// available flags
+
+#define LW_SET_ALL -1
+#define LW_SET_NONE 0x0
+#define LW_SET_ENV 0x1
+#define LW_SET_INPUT 0x2
+#define LW_SET_FUSSY 0x4
+
+////////////////////////////////////////////////////////////////////////////////
+// set the desired flags
+
+#if OMPTARGET_NVPTX_DEBUG
+#define OMPTARGET_NVPTX_WARNING (LW_SET_NONE)
+#else
+#define OMPTARGET_NVPTX_WARNING (LW_SET_FUSSY)
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// implementation for debug
+////////////////////////////////////////////////////////////////////////////////
+
+#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING
+#include <stdio.h>
+#endif
+#if OMPTARGET_NVPTX_TEST
+#include <assert.h>
+#endif
+
+// set flags that are tested (inclusion properties)
+
+#define LD_ALL (LD_SET_ALL)
+
+#define LD_LOOP (LD_SET_LOOP | LD_SET_LOOPD)
+#define LD_LOOPD (LD_SET_LOOPD)
+#define LD_PAR (LD_SET_PAR | LD_SET_PARD)
+#define LD_PARD (LD_SET_PARD)
+
+// pos 2
+#define LD_SYNC (LD_SET_SYNC | LD_SET_SYNCD)
+#define LD_SYNCD (LD_SET_SYNCD)
+#define LD_WAIT (LD_SET_WAIT)
+#define LD_TASK (LD_SET_TASK)
+
+// pos 3
+#define LD_IO (LD_SET_IO | LD_SET_IOD)
+#define LD_IOD (LD_SET_IOD)
+#define LD_ENV (LD_SET_ENV)
+#define LD_CANCEL (LD_SET_CANCEL)
+
+// pos 3
+#define LD_MEM (LD_SET_MEM)
+
+// implement
+#if OMPTARGET_NVPTX_DEBUG
+
+#define DON(_flag) ((unsigned)(OMPTARGET_NVPTX_DEBUG) & (_flag))
+
+#define PRINT0(_flag, _str) \
+ { \
+ if (omptarget_device_environment.debug_level && DON(_flag)) { \
+ printf("<b %2d, t %4d, w %2d, l %2d>: " _str, blockIdx.x, threadIdx.x, \
+ threadIdx.x / WARPSIZE, threadIdx.x & 0x1F); \
+ } \
+ }
+
+#define PRINT(_flag, _str, _args...) \
+ { \
+ if (omptarget_device_environment.debug_level && DON(_flag)) { \
+ printf("<b %2d, t %4d, w %2d, l %2d>: " _str, blockIdx.x, threadIdx.x, \
+ threadIdx.x / WARPSIZE, threadIdx.x & 0x1F, _args); \
+ } \
+ }
+#else
+
+#define DON(_flag) (FALSE)
+#define PRINT0(flag, str)
+#define PRINT(flag, str, _args...)
+
+#endif
+
+// for printing without worrying about precision, pointers...
+#define P64(_x) ((unsigned long long)(_x))
+
+////////////////////////////////////////////////////////////////////////////////
+// early defs for test
+////////////////////////////////////////////////////////////////////////////////
+
+#define LT_SAFETY (LT_SET_SAFETY | LT_SET_INPUT | LT_SET_FUSSY)
+#define LT_INPUT (LT_SET_INPUT | LT_SET_FUSSY)
+#define LT_FUSSY (LT_SET_FUSSY)
+
+#if OMPTARGET_NVPTX_TEST == LT_SET_SAFETY
+
+#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag))
+#define ASSERT0(_flag, _cond, _str) \
+ { \
+ if (TON(_flag)) { \
+ assert(_cond); \
+ } \
+ }
+#define ASSERT(_flag, _cond, _str, _args...) \
+ { \
+ if (TON(_flag)) { \
+ assert(_cond); \
+ } \
+ }
+
+#elif OMPTARGET_NVPTX_TEST >= LT_SET_INPUT
+
+#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag))
+#define ASSERT0(_flag, _cond, _str) \
+ { \
+ if (TON(_flag) && !(_cond)) { \
+ printf("<b %3d, t %4d, w %2d, l %2d> ASSERT: " _str "\n", blockIdx.x, \
+ threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F); \
+ assert(_cond); \
+ } \
+ }
+#define ASSERT(_flag, _cond, _str, _args...)                                   \
+  {                                                                            \
+    if (TON(_flag) && !(_cond)) {                                              \
+      /* Fixed format specifier: "%d2" printed the lane id followed by a */    \
+      /* literal '2'; "%2d" (width 2) matches the sibling PRINT/ASSERT0 */     \
+      /* macros in this header. */                                             \
+      printf("<b %3d, t %4d, w %2d, l %2d> ASSERT: " _str "\n", blockIdx.x,    \
+             threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F, _args);  \
+      assert(_cond);                                                           \
+    }                                                                          \
+  }
+
+#else
+
+#define TON(_flag) (FALSE)
+#define ASSERT0(_flag, _cond, _str)
+#define ASSERT(_flag, _cond, _str, _args...)
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// early defs for warning
+
+#define LW_ALL (LW_SET_ALL)
+#define LW_ENV (LW_SET_FUSSY | LW_SET_INPUT | LW_SET_ENV)
+#define LW_INPUT (LW_SET_FUSSY | LW_SET_INPUT)
+#define LW_FUSSY (LW_SET_FUSSY)
+
+#if OMPTARGET_NVPTX_WARNING
+
+#define WON(_flag) ((OMPTARGET_NVPTX_WARNING) & (_flag))
+#define WARNING0(_flag, _str) \
+ { \
+ if (WON(_flag)) { \
+ printf("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str, blockIdx.x, \
+ threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F); \
+ } \
+ }
+#define WARNING(_flag, _str, _args...) \
+ { \
+ if (WON(_flag)) { \
+ printf("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str, blockIdx.x, \
+ threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F, _args); \
+ } \
+ }
+
+#else
+
+#define WON(_flag) (FALSE)
+#define WARNING0(_flag, _str)
+#define WARNING(_flag, _str, _args...)
+
+#endif
+
+#endif
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/interface.h b/final/libomptarget/deviceRTLs/nvptx/src/interface.h
new file mode 100644
index 0000000..680df48
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -0,0 +1,523 @@
+//===------- interface.h - NVPTX OpenMP interface definitions ---- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains debug macros to be used in the application.
+//
+// This file contains all the definitions that are relevant to
+// the interface. The first section contains the interface as
+// declared by OpenMP. A second section includes library private calls
+// (mostly debug, temporary?) The third section includes the compiler
+// specific interfaces.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _INTERFACES_H_
+#define _INTERFACES_H_
+
+////////////////////////////////////////////////////////////////////////////////
+// OpenMP interface
+////////////////////////////////////////////////////////////////////////////////
+
+// Lock words: plain integers wide enough for the runtime's CAS-based locks.
+typedef uint32_t omp_lock_t; /* arbitrary type of the right length */
+typedef uint64_t omp_nest_lock_t; /* arbitrary type of the right length */
+
+// Schedule kinds accepted by omp_set_schedule()/omp_get_schedule().
+typedef enum omp_sched_t {
+  omp_sched_static = 1, /* chunkSize >0 */
+  omp_sched_dynamic = 2, /* chunkSize >0 */
+  omp_sched_guided = 3, /* chunkSize >0 */
+  omp_sched_auto = 4, /* no chunkSize */
+} omp_sched_t;
+
+// Thread affinity policies (this runtime always reports omp_proc_bind_true).
+typedef enum omp_proc_bind_t {
+  omp_proc_bind_false = 0,
+  omp_proc_bind_true = 1,
+  omp_proc_bind_master = 2,
+  omp_proc_bind_close = 3,
+  omp_proc_bind_spread = 4
+} omp_proc_bind_t;
+
+EXTERN double omp_get_wtick(void);
+EXTERN double omp_get_wtime(void);
+
+EXTERN void omp_set_num_threads(int num);
+EXTERN int omp_get_num_threads(void);
+EXTERN int omp_get_max_threads(void);
+EXTERN int omp_get_thread_limit(void);
+EXTERN int omp_get_thread_num(void);
+EXTERN int omp_get_num_procs(void);
+EXTERN int omp_in_parallel(void);
+EXTERN int omp_in_final(void);
+EXTERN void omp_set_dynamic(int flag);
+EXTERN int omp_get_dynamic(void);
+EXTERN void omp_set_nested(int flag);
+EXTERN int omp_get_nested(void);
+EXTERN void omp_set_max_active_levels(int level);
+EXTERN int omp_get_max_active_levels(void);
+EXTERN int omp_get_level(void);
+EXTERN int omp_get_active_level(void);
+EXTERN int omp_get_ancestor_thread_num(int level);
+EXTERN int omp_get_team_size(int level);
+
+EXTERN void omp_init_lock(omp_lock_t *lock);
+EXTERN void omp_init_nest_lock(omp_nest_lock_t *lock);
+EXTERN void omp_destroy_lock(omp_lock_t *lock);
+EXTERN void omp_destroy_nest_lock(omp_nest_lock_t *lock);
+EXTERN void omp_set_lock(omp_lock_t *lock);
+EXTERN void omp_set_nest_lock(omp_nest_lock_t *lock);
+EXTERN void omp_unset_lock(omp_lock_t *lock);
+EXTERN void omp_unset_nest_lock(omp_nest_lock_t *lock);
+EXTERN int omp_test_lock(omp_lock_t *lock);
+EXTERN int omp_test_nest_lock(omp_nest_lock_t *lock);
+
+EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier);
+EXTERN void omp_set_schedule(omp_sched_t kind, int modifier);
+EXTERN omp_proc_bind_t omp_get_proc_bind(void);
+EXTERN int omp_get_cancellation(void);
+EXTERN void omp_set_default_device(int deviceId);
+EXTERN int omp_get_default_device(void);
+EXTERN int omp_get_num_devices(void);
+EXTERN int omp_get_num_teams(void);
+EXTERN int omp_get_team_num(void);
+EXTERN int omp_is_initial_device(void);
+EXTERN int omp_get_initial_device(void);
+EXTERN int omp_get_max_task_priority(void);
+
+////////////////////////////////////////////////////////////////////////////////
+// OMPTARGET_NVPTX private (debug / temporary?) interface
+////////////////////////////////////////////////////////////////////////////////
+
+// for debug
+EXTERN void __kmpc_print_str(char *title);
+EXTERN void __kmpc_print_title_int(char *title, int data);
+EXTERN void __kmpc_print_index(char *title, int i);
+EXTERN void __kmpc_print_int(int data);
+EXTERN void __kmpc_print_double(double data);
+EXTERN void __kmpc_print_address_int64(int64_t data);
+
+////////////////////////////////////////////////////////////////////////////////
+// file below is swiped from kmpc host interface
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// kmp specific types
+////////////////////////////////////////////////////////////////////////////////
+
+// Loop schedule encodings passed by the compiler to __kmpc_for_static_init_*
+// and __kmpc_dispatch_init_*.
+// NOTE(review): values appear to mirror the host runtime's sched_type
+// encoding -- confirm against the host kmp.h before changing any of them.
+typedef enum kmp_sched_t {
+  // unordered kinds
+  kmp_sched_static_chunk = 33,
+  kmp_sched_static_nochunk = 34,
+  kmp_sched_dynamic = 35,
+  kmp_sched_guided = 36,
+  kmp_sched_runtime = 37,
+  kmp_sched_auto = 38,
+
+  // ordered kinds
+  kmp_sched_static_ordered = 65,
+  kmp_sched_static_nochunk_ordered = 66,
+  kmp_sched_dynamic_ordered = 67,
+  kmp_sched_guided_ordered = 68,
+  kmp_sched_runtime_ordered = 69,
+  kmp_sched_auto_ordered = 70,
+
+  // distribute kinds
+  kmp_sched_distr_static_chunk = 91,
+  kmp_sched_distr_static_nochunk = 92,
+  kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
+
+  // aliases marking the default and the range boundaries of each group
+  kmp_sched_default = kmp_sched_static_nochunk,
+  kmp_sched_unordered_first = kmp_sched_static_chunk,
+  kmp_sched_unordered_last = kmp_sched_auto,
+  kmp_sched_ordered_first = kmp_sched_static_ordered,
+  kmp_sched_ordered_last = kmp_sched_auto_ordered,
+  kmp_sched_distribute_first = kmp_sched_distr_static_chunk,
+  kmp_sched_distribute_last =
+      kmp_sched_distr_static_chunk_sched_static_chunkone,
+
+  /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers.
+   * Since we need to distinguish the three possible cases (no modifier,
+   * monotonic modifier, nonmonotonic modifier), we need separate bits for
+   * each modifier. The absence of monotonic does not imply nonmonotonic,
+   * especially since 4.5 says that the behaviour of the "no modifier" case
+   * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0.
+   *
+   * Since we're passing a full 32 bit value, we can use a couple of high
+   * bits for these flags; out of paranoia we avoid the sign bit.
+   *
+   * These modifiers can be or-ed into non-static schedules by the compiler
+   * to pass the additional information. They will be stripped early in the
+   * processing in __kmp_dispatch_init when setting up schedules, so
+   * most of the code won't ever see schedules with these bits set.
+   */
+  kmp_sched_modifier_monotonic = (1 << 29),
+  /**< Set if the monotonic schedule modifier was present */
+  kmp_sched_modifier_nonmonotonic = (1 << 30),
+/**< Set if the nonmonotonic schedule modifier was present */
+
+// Helper macros to strip/test the modifier bits. Note: deliberately defined
+// inside the enum's braces so they sit next to the values they manipulate.
+#define SCHEDULE_WITHOUT_MODIFIERS(s) \
+  (enum kmp_sched_t)( \
+      (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic))
+#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0)
+#define SCHEDULE_HAS_NONMONOTONIC(s) \
+  (((s)&kmp_sched_modifier_nonmonotonic) != 0)
+#define SCHEDULE_HAS_NO_MODIFIERS(s) \
+  (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \
+   0)
+
+} kmp_sched_t;
+
+// parallel defs
+// Opaque source-location descriptor passed by the compiler; presumably the
+// host runtime's ident_t -- TODO confirm.
+typedef void kmp_Indent;
+// Outlined parallel-region entry point: receives global/bound tids by pointer.
+typedef void (*kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...);
+// Pairwise reduction of two private copies.
+typedef void (*kmp_ReductFctPtr)(void *lhsData, void *rhsData);
+typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num);
+typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id,
+                                        int16_t lane_offset,
+                                        int16_t shortCircuit);
+typedef void (*kmp_CopyToScratchpadFctPtr)(void *reduceData, void *scratchpad,
+                                           int32_t index, int32_t width);
+typedef void (*kmp_LoadReduceFctPtr)(void *reduceData, void *scratchpad,
+                                     int32_t index, int32_t width,
+                                     int32_t reduce);
+
+// task defs
+typedef struct kmp_TaskDescr kmp_TaskDescr;
+// Task entry point, returning a runtime status code.
+typedef int32_t (*kmp_TaskFctPtr)(int32_t global_tid, kmp_TaskDescr *taskDescr);
+typedef struct kmp_TaskDescr {
+  void *sharedPointerTable;   // ptr to a table of shared var ptrs
+  kmp_TaskFctPtr sub;         // task subroutine
+  int32_t partId;             // unused
+  kmp_TaskFctPtr destructors; // destructor of c++ first private
+} kmp_TaskDescr;
+// task dep defs
+#define KMP_TASKDEP_IN 0x1u
+#define KMP_TASKDEP_OUT 0x2u
+typedef struct kmp_TaskDep_Public {
+  void *addr;
+  size_t len;
+  uint8_t flags; // bit 0: in, bit 1: out
+} kmp_TaskDep_Public;
+
+// flags that interpret the interface part of tasking flags
+#define KMP_TASK_IS_TIED 0x1
+#define KMP_TASK_FINAL 0x2
+#define KMP_TASK_MERGED_IF0 0x4 /* unused */
+#define KMP_TASK_DESTRUCTOR_THUNK 0x8
+
+// flags for task setup return
+#define KMP_CURRENT_TASK_NOT_SUSPENDED 0
+#define KMP_CURRENT_TASK_SUSPENDED 1
+
+// sync defs
+// Opaque storage backing a named critical section's lock.
+typedef int32_t kmp_CriticalName[8];
+
+////////////////////////////////////////////////////////////////////////////////
+// flags for kstate (all bits initially off)
+////////////////////////////////////////////////////////////////////////////////
+
+// first 2 bits used by kmp_Reduction (defined in kmp_reduction.cpp)
+#define KMP_REDUCTION_MASK 0x3
+#define KMP_SKIP_NEXT_CALL 0x4
+#define KMP_SKIP_NEXT_CANCEL_BARRIER 0x8
+
+////////////////////////////////////////////////////////////////////////////////
+// data
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// external interface
+////////////////////////////////////////////////////////////////////////////////
+
+// query
+EXTERN int32_t __kmpc_global_num_threads(kmp_Indent *loc); // missing
+EXTERN int32_t __kmpc_bound_thread_num(kmp_Indent *loc); // missing
+EXTERN int32_t __kmpc_bound_num_threads(kmp_Indent *loc); // missing
+EXTERN int32_t __kmpc_in_parallel(kmp_Indent *loc); // missing
+
+// parallel
+EXTERN int32_t __kmpc_global_thread_num(kmp_Indent *loc);
+EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t global_tid,
+ int32_t num_threads);
+// simd
+EXTERN void __kmpc_push_simd_limit(kmp_Indent *loc, int32_t global_tid,
+ int32_t simd_limit);
+// aee ... not supported
+// EXTERN void __kmpc_fork_call(kmp_Indent *loc, int32_t argc, kmp_ParFctPtr
+// microtask, ...);
+EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid);
+EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc,
+ uint32_t global_tid);
+EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid);
+
+// proc bind
+EXTERN void __kmpc_push_proc_bind(kmp_Indent *loc, uint32_t global_tid,
+ int proc_bind);
+EXTERN int omp_get_num_places(void);
+EXTERN int omp_get_place_num_procs(int place_num);
+EXTERN void omp_get_place_proc_ids(int place_num, int *ids);
+EXTERN int omp_get_place_num(void);
+EXTERN int omp_get_partition_num_places(void);
+EXTERN void omp_get_partition_place_nums(int *place_nums);
+
+// for static (no chunk or chunk)
+EXTERN void __kmpc_for_static_init_4(kmp_Indent *loc, int32_t global_tid,
+ int32_t sched, int32_t *plastiter,
+ int32_t *plower, int32_t *pupper,
+ int32_t *pstride, int32_t incr,
+ int32_t chunk);
+EXTERN void __kmpc_for_static_init_4u(kmp_Indent *loc, int32_t global_tid,
+ int32_t sched, int32_t *plastiter,
+ uint32_t *plower, uint32_t *pupper,
+ int32_t *pstride, int32_t incr,
+ int32_t chunk);
+EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid,
+ int32_t sched, int32_t *plastiter,
+ int64_t *plower, int64_t *pupper,
+ int64_t *pstride, int64_t incr,
+ int64_t chunk);
+EXTERN void __kmpc_for_static_init_8u(kmp_Indent *loc, int32_t global_tid,
+ int32_t sched, int32_t *plastiter1,
+ uint64_t *plower, uint64_t *pupper,
+ int64_t *pstride, int64_t incr,
+ int64_t chunk);
+EXTERN
+void __kmpc_for_static_init_4_simple_spmd(kmp_Indent *loc, int32_t global_tid,
+ int32_t sched, int32_t *plastiter,
+ int32_t *plower, int32_t *pupper,
+ int32_t *pstride, int32_t incr,
+ int32_t chunk);
+EXTERN
+void __kmpc_for_static_init_4u_simple_spmd(kmp_Indent *loc, int32_t global_tid,
+ int32_t sched, int32_t *plastiter,
+ uint32_t *plower, uint32_t *pupper,
+ int32_t *pstride, int32_t incr,
+ int32_t chunk);
+EXTERN
+void __kmpc_for_static_init_8_simple_spmd(kmp_Indent *loc, int32_t global_tid,
+ int32_t sched, int32_t *plastiter,
+ int64_t *plower, int64_t *pupper,
+ int64_t *pstride, int64_t incr,
+ int64_t chunk);
+EXTERN
+void __kmpc_for_static_init_8u_simple_spmd(kmp_Indent *loc, int32_t global_tid,
+ int32_t sched, int32_t *plastiter1,
+ uint64_t *plower, uint64_t *pupper,
+ int64_t *pstride, int64_t incr,
+ int64_t chunk);
+EXTERN
+void __kmpc_for_static_init_4_simple_generic(kmp_Indent *loc,
+ int32_t global_tid, int32_t sched,
+ int32_t *plastiter,
+ int32_t *plower, int32_t *pupper,
+ int32_t *pstride, int32_t incr,
+ int32_t chunk);
+EXTERN
+void __kmpc_for_static_init_4u_simple_generic(
+ kmp_Indent *loc, int32_t global_tid, int32_t sched, int32_t *plastiter,
+ uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
+ int32_t chunk);
+EXTERN
+void __kmpc_for_static_init_8_simple_generic(kmp_Indent *loc,
+ int32_t global_tid, int32_t sched,
+ int32_t *plastiter,
+ int64_t *plower, int64_t *pupper,
+ int64_t *pstride, int64_t incr,
+ int64_t chunk);
+EXTERN
+void __kmpc_for_static_init_8u_simple_generic(
+ kmp_Indent *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1,
+ uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
+ int64_t chunk);
+
+EXTERN void __kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid);
+
+// for dynamic
+EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t global_tid,
+ int32_t sched, int32_t lower, int32_t upper,
+ int32_t incr, int32_t chunk);
+EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t global_tid,
+ int32_t sched, uint32_t lower,
+ uint32_t upper, int32_t incr,
+ int32_t chunk);
+EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t global_tid,
+ int32_t sched, int64_t lower, int64_t upper,
+ int64_t incr, int64_t chunk);
+EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t global_tid,
+ int32_t sched, uint64_t lower,
+ uint64_t upper, int64_t incr,
+ int64_t chunk);
+
+EXTERN int __kmpc_dispatch_next_4(kmp_Indent *loc, int32_t global_tid,
+ int32_t *plastiter, int32_t *plower,
+ int32_t *pupper, int32_t *pstride);
+EXTERN int __kmpc_dispatch_next_4u(kmp_Indent *loc, int32_t global_tid,
+ int32_t *plastiter, uint32_t *plower,
+ uint32_t *pupper, int32_t *pstride);
+EXTERN int __kmpc_dispatch_next_8(kmp_Indent *loc, int32_t global_tid,
+ int32_t *plastiter, int64_t *plower,
+ int64_t *pupper, int64_t *pstride);
+EXTERN int __kmpc_dispatch_next_8u(kmp_Indent *loc, int32_t global_tid,
+ int32_t *plastiter, uint64_t *plower,
+ uint64_t *pupper, int64_t *pstride);
+
+EXTERN void __kmpc_dispatch_fini_4(kmp_Indent *loc, int32_t global_tid);
+EXTERN void __kmpc_dispatch_fini_4u(kmp_Indent *loc, int32_t global_tid);
+EXTERN void __kmpc_dispatch_fini_8(kmp_Indent *loc, int32_t global_tid);
+EXTERN void __kmpc_dispatch_fini_8u(kmp_Indent *loc, int32_t global_tid);
+
+// Support for reducing conditional lastprivate variables
+EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Indent *loc,
+ int32_t global_tid,
+ int32_t varNum, void *array);
+
+// reduction
+EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid);
+EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
+EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait(
+ int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
+EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
+ int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
+EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
+ int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
+EXTERN int32_t __kmpc_nvptx_simd_reduce_nowait(
+ int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
+EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait(
+ int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
+ kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
+EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
+ int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
+ kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
+EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
+ int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
+ kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
+EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
+EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
+
+// sync barrier
+EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid);
+EXTERN void __kmpc_barrier_simple_spmd(kmp_Indent *loc_ref, int32_t tid);
+EXTERN void __kmpc_barrier_simple_generic(kmp_Indent *loc_ref, int32_t tid);
+EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc, int32_t global_tid);
+
+// single
+EXTERN int32_t __kmpc_single(kmp_Indent *loc, int32_t global_tid);
+EXTERN void __kmpc_end_single(kmp_Indent *loc, int32_t global_tid);
+
+// sync
+EXTERN int32_t __kmpc_master(kmp_Indent *loc, int32_t global_tid);
+EXTERN void __kmpc_end_master(kmp_Indent *loc, int32_t global_tid);
+EXTERN void __kmpc_ordered(kmp_Indent *loc, int32_t global_tid);
+EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t global_tid);
+EXTERN void __kmpc_critical(kmp_Indent *loc, int32_t global_tid,
+ kmp_CriticalName *crit);
+EXTERN void __kmpc_end_critical(kmp_Indent *loc, int32_t global_tid,
+ kmp_CriticalName *crit);
+EXTERN void __kmpc_flush(kmp_Indent *loc);
+
+// vote
+EXTERN int32_t __kmpc_warp_active_thread_mask();
+
+// tasks
+EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Indent *loc,
+ uint32_t global_tid, int32_t flag,
+ size_t sizeOfTaskInclPrivate,
+ size_t sizeOfSharedTable,
+ kmp_TaskFctPtr sub);
+EXTERN int32_t __kmpc_omp_task(kmp_Indent *loc, uint32_t global_tid,
+ kmp_TaskDescr *newLegacyTaskDescr);
+EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, uint32_t global_tid,
+ kmp_TaskDescr *newLegacyTaskDescr,
+ int32_t depNum, void *depList,
+ int32_t noAliasDepNum,
+ void *noAliasDepList);
+EXTERN void __kmpc_omp_task_begin_if0(kmp_Indent *loc, uint32_t global_tid,
+ kmp_TaskDescr *newLegacyTaskDescr);
+EXTERN void __kmpc_omp_task_complete_if0(kmp_Indent *loc, uint32_t global_tid,
+ kmp_TaskDescr *newLegacyTaskDescr);
+EXTERN void __kmpc_omp_wait_deps(kmp_Indent *loc, uint32_t global_tid,
+ int32_t depNum, void *depList,
+ int32_t noAliasDepNum, void *noAliasDepList);
+EXTERN void __kmpc_taskgroup(kmp_Indent *loc, uint32_t global_tid);
+EXTERN void __kmpc_end_taskgroup(kmp_Indent *loc, uint32_t global_tid);
+EXTERN int32_t __kmpc_omp_taskyield(kmp_Indent *loc, uint32_t global_tid,
+ int end_part);
+EXTERN int32_t __kmpc_omp_taskwait(kmp_Indent *loc, uint32_t global_tid);
+EXTERN void __kmpc_taskloop(kmp_Indent *loc, uint32_t global_tid,
+ kmp_TaskDescr *newKmpTaskDescr, int if_val,
+ uint64_t *lb, uint64_t *ub, int64_t st, int nogroup,
+ int32_t sched, uint64_t grainsize, void *task_dup);
+
+// cancel
+EXTERN int32_t __kmpc_cancellationpoint(kmp_Indent *loc, int32_t global_tid,
+ int32_t cancelVal);
+EXTERN int32_t __kmpc_cancel(kmp_Indent *loc, int32_t global_tid,
+ int32_t cancelVal);
+
+// non standard
+EXTERN void __kmpc_kernel_init_params(void *ReductionScratchpadPtr);
+EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime);
+EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
+EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
+ int16_t RequiresDataSharing);
+EXTERN void __kmpc_spmd_kernel_deinit();
+EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
+ int16_t IsOMPRuntimeInitialized);
+EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
+ int16_t IsOMPRuntimeInitialized);
+EXTERN void __kmpc_kernel_end_parallel();
+EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
+ bool *IsFinal,
+ int32_t *LaneSource);
+EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer);
+EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
+ bool *IsFinal, int32_t *LaneSource,
+ int32_t *LaneId, int32_t *NumLanes);
+EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer);
+
+
+EXTERN void __kmpc_data_sharing_init_stack();
+EXTERN void __kmpc_data_sharing_init_stack_spmd();
+EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
+EXTERN void __kmpc_data_sharing_pop_stack(void *a);
+EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
+EXTERN void __kmpc_end_sharing_variables();
+EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs);
+
+// The slot used for data sharing by the master and worker threads. We use a
+// complete (default size version and an incomplete one so that we allow sizes
+// greater than the default).
+struct __kmpc_data_sharing_slot {
+  __kmpc_data_sharing_slot *Next; // next slot in the doubly linked chain
+  __kmpc_data_sharing_slot *Prev; // previous slot in the chain
+  void *PrevSlotStackPtr;         // stack pointer saved from the previous slot
+  void *DataEnd;                  // end of this slot's Data region
+  char Data[]; // flexible array member: payload sized at allocation time
+};
+EXTERN void
+__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *RootS,
+ size_t InitialDataSize);
+EXTERN void *__kmpc_data_sharing_environment_begin(
+ __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
+ void **SavedSharedFrame, int32_t *SavedActiveThreads,
+ size_t SharingDataSize, size_t SharingDefaultDataSize,
+ int16_t IsOMPRuntimeInitialized);
+EXTERN void __kmpc_data_sharing_environment_end(
+ __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
+ void **SavedSharedFrame, int32_t *SavedActiveThreads, int32_t IsEntryPoint);
+
+EXTERN void *
+__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
+ int16_t IsOMPRuntimeInitialized);
+
+// SPMD execution mode interrogation function.
+EXTERN int8_t __kmpc_is_spmd_exec_mode();
+#endif
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/final/libomptarget/deviceRTLs/nvptx/src/libcall.cu
new file mode 100644
index 0000000..15040db
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/libcall.cu
@@ -0,0 +1,462 @@
+//===------------ libcall.cu - NVPTX OpenMP user calls ----------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the OpenMP runtime functions that can be
+// invoked by the user in an OpenMP region
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+// Timer precision is 1ns
+#define TIMER_PRECISION ((double)1E-9)
+
+EXTERN double omp_get_wtick(void) {
+  // One tick of the device timer, in seconds.
+  PRINT(LD_IO, "omp_get_wtick() returns %g\n", TIMER_PRECISION);
+  return TIMER_PRECISION;
+}
+
+EXTERN double omp_get_wtime(void) {
+  // Read the PTX %globaltimer register (nanosecond resolution, per the
+  // 1ns precision above) and convert to seconds.
+  unsigned long long nsecs;
+  asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
+  double rc = (double)nsecs * TIMER_PRECISION;
+  PRINT(LD_IO, "call omp_get_wtime() returns %g\n", rc);
+  return rc;
+}
+
+// Record the requested thread count on the current task descriptor.
+// Non-positive requests are rejected with a warning and otherwise ignored.
+EXTERN void omp_set_num_threads(int num) {
+  PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num);
+  if (num > 0) {
+    getMyTopTaskDescriptor()->NThreads() = num;
+    return;
+  }
+  WARNING0(LW_INPUT, "expected positive num; ignore\n");
+}
+
+// Team size for the calling thread, as tracked by the runtime.
+EXTERN int omp_get_num_threads(void) {
+  int logicalTid = GetLogicalThreadIdInBlock();
+  int numThreads =
+      GetNumberOfOmpThreads(logicalTid, isSPMDMode(), isRuntimeUninitialized());
+  PRINT(LD_IO, "call omp_get_num_threads() return %d\n", numThreads);
+  return numThreads;
+}
+
+// Upper bound on threads for the next parallel region.
+EXTERN int omp_get_max_threads(void) {
+  omptarget_nvptx_TaskDescr *taskDescr = getMyTopTaskDescriptor();
+  // Inside a parallel region no further threads can be recruited.
+  int maxThreads = 1;
+  if (!taskDescr->InParallelRegion()) {
+    // Sequential part: the whole team is available.
+    maxThreads = GetNumberOfProcsInTeam();
+    ASSERT0(LT_FUSSY, maxThreads >= 0, "bad number of threads");
+  }
+  PRINT(LD_IO, "call omp_get_max_threads() return %d\n", maxThreads);
+  return maxThreads;
+}
+
+// Thread limit of the contention group, i.e. of the current team.
+EXTERN int omp_get_thread_limit(void) {
+  int limit = getMyTopTaskDescriptor()->ThreadLimit();
+  PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", limit);
+  return limit;
+}
+
+// OpenMP thread id of the caller within its team.
+EXTERN int omp_get_thread_num() {
+  int logicalTid = GetLogicalThreadIdInBlock();
+  int ompTid =
+      GetOmpThreadId(logicalTid, isSPMDMode(), isRuntimeUninitialized());
+  PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", ompTid);
+  return ompTid;
+}
+
+// Processor count of the device, as reported by the runtime.
+EXTERN int omp_get_num_procs(void) {
+  int nprocs = GetNumberOfProcsInDevice();
+  PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", nprocs);
+  return nprocs;
+}
+
+// True iff the current task sits inside an active parallel region.
+EXTERN int omp_in_parallel(void) {
+  omptarget_nvptx_TaskDescr *taskDescr = getMyTopTaskDescriptor();
+  int inPar = taskDescr->InParallelRegion() ? 1 : 0;
+  PRINT(LD_IO, "call omp_in_parallel() returns %d\n", inPar);
+  return inPar;
+}
+
+// Every task is reported as final. The spec leaves the runtime free to
+// promote a non-final task to final, so always answering 1 is conforming
+// even without tracking the user-provided final clause precisely.
+EXTERN int omp_in_final(void) {
+  int rc = 1;
+  PRINT(LD_IO, "call omp_in_final() returns %d\n", rc);
+  return rc;
+}
+
+// Toggle the dynamic-adjustment flag on the current task descriptor.
+EXTERN void omp_set_dynamic(int flag) {
+  PRINT(LD_IO, "call omp_set_dynamic(%d)\n", flag);
+  omptarget_nvptx_TaskDescr *taskDescr = getMyTopTaskDescriptor();
+  if (flag)
+    taskDescr->SetDynamic();
+  else
+    taskDescr->ClearDynamic();
+}
+
+// Report the dynamic-adjustment flag of the current task descriptor.
+EXTERN int omp_get_dynamic(void) {
+  int dyn = getMyTopTaskDescriptor()->IsDynamic() ? 1 : 0;
+  PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", dyn);
+  return dyn;
+}
+
+// Nested parallelism is unavailable on this target; the request is logged
+// and dropped.
+EXTERN void omp_set_nested(int flag) {
+  PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n",
+        flag);
+}
+
+// Nesting is never enabled.
+EXTERN int omp_get_nested(void) {
+  int nested = 0;
+  PRINT(LD_IO, "call omp_get_nested() returns %d\n", nested);
+  return nested;
+}
+
+// With no nesting support the active-level maximum is fixed; the request is
+// logged and dropped.
+EXTERN void omp_set_max_active_levels(int level) {
+  PRINT(LD_IO,
+        "call omp_set_max_active_levels(%d) is ignored (no nested support)\n",
+        level);
+}
+
+// Exactly one active level can ever exist.
+EXTERN int omp_get_max_active_levels(void) {
+  int maxLevels = 1;
+  PRINT(LD_IO, "call omp_get_max_active_levels() returns %d\n", maxLevels);
+  return maxLevels;
+}
+
+EXTERN int omp_get_level(void) {
+  // Count enclosing parallel constructs (active or not) by walking the
+  // task-descriptor chain from the current task back to the root.
+  int level = 0;
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  ASSERT0(LT_FUSSY, currTaskDescr,
+          "do not expect fct to be called in a non-active thread");
+  do {
+    if (currTaskDescr->IsParallelConstruct()) {
+      level++;
+    }
+    currTaskDescr = currTaskDescr->GetPrevTaskDescr();
+  } while (currTaskDescr);
+  PRINT(LD_IO, "call omp_get_level() returns %d\n", level);
+  return level;
+}
+
+EXTERN int omp_get_active_level(void) {
+  // Returns 1 if any enclosing task ran with a team of more than one thread,
+  // 0 otherwise (at most one active level exists without nesting support).
+  int level = 0; // no active level parallelism
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+  ASSERT0(LT_FUSSY, currTaskDescr,
+          "do not expect fct to be called in a non-active thread");
+  do {
+    if (currTaskDescr->ThreadsInTeam() > 1) {
+      // has a parallel with more than one thread in team
+      level = 1;
+      break;
+    }
+    currTaskDescr = currTaskDescr->GetPrevTaskDescr();
+  } while (currTaskDescr);
+  // NOTE(review): no ';' after PRINT -- relies on the macro expanding to a
+  // complete statement.
+  PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level)
+  return level;
+}
+
+EXTERN int omp_get_ancestor_thread_num(int level) {
+  // Thread id of this thread's ancestor at the given nesting level; level 0
+  // (and any out-of-range request) reports the sequential default of 0.
+  int rc = 0; // default at level 0
+  if (level >= 0) {
+    int totLevel = omp_get_level();
+    if (level <= totLevel) {
+      omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+      // Number of parallel constructs to walk past before reaching 'level'.
+      int steps = totLevel - level;
+      PRINT(LD_IO, "backtrack %d steps\n", steps);
+      ASSERT0(LT_FUSSY, currTaskDescr,
+              "do not expect fct to be called in a non-active thread");
+      do {
+        if (DON(LD_IOD)) {
+          // print current state
+          omp_sched_t sched = currTaskDescr->GetRuntimeSched();
+          PRINT(LD_ALL,
+                "task descr %s %d: %s, in par %d, dyn %d, rt sched %d,"
+                " chunk %" PRIu64 "; tid %d, tnum %d, nthreads %d\n",
+                "ancestor", steps,
+                (currTaskDescr->IsParallelConstruct() ? "par" : "task"),
+                currTaskDescr->InParallelRegion(), currTaskDescr->IsDynamic(),
+                sched, currTaskDescr->RuntimeChunkSize(),
+                currTaskDescr->ThreadId(), currTaskDescr->ThreadsInTeam(),
+                currTaskDescr->NThreads());
+        }
+
+        // Only parallel constructs consume a step; other tasks are skipped.
+        if (currTaskDescr->IsParallelConstruct()) {
+          // found the level
+          if (!steps) {
+            rc = currTaskDescr->ThreadId();
+            break;
+          }
+          steps--;
+        }
+        currTaskDescr = currTaskDescr->GetPrevTaskDescr();
+      } while (currTaskDescr);
+      ASSERT0(LT_FUSSY, !steps, "expected to find all steps");
+    }
+  }
+  // NOTE(review): no ';' after PRINT -- relies on the macro expanding to a
+  // complete statement.
+  PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level,
+        rc)
+  return rc;
+}
+
+EXTERN int omp_get_team_size(int level) {
+  // Team size of the parallel region at the requested nesting level; level 0
+  // (and any out-of-range request) reports the sequential default of 1.
+  int rc = 1; // default at level 0
+  if (level >= 0) {
+    int totLevel = omp_get_level();
+    if (level <= totLevel) {
+      omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+      // Number of parallel constructs to skip before reaching 'level'.
+      int steps = totLevel - level;
+      ASSERT0(LT_FUSSY, currTaskDescr,
+              "do not expect fct to be called in a non-active thread");
+      do {
+        if (currTaskDescr->IsParallelConstruct()) {
+          if (!steps) {
+            // found the level
+            rc = currTaskDescr->ThreadsInTeam();
+            break;
+          }
+          steps--;
+        }
+        currTaskDescr = currTaskDescr->GetPrevTaskDescr();
+      } while (currTaskDescr);
+      ASSERT0(LT_FUSSY, !steps, "expected to find all steps");
+    }
+  }
+  // NOTE(review): no ';' after PRINT -- relies on the macro expanding to a
+  // complete statement.
+  PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc)
+  return rc;
+}
+
+// Report the runtime schedule and chunk stored on the current task.
+EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) {
+  omptarget_nvptx_TaskDescr *taskDescr = getMyTopTaskDescriptor();
+  *modifier = taskDescr->RuntimeChunkSize();
+  *kind = taskDescr->GetRuntimeSched();
+  PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n",
+        (int)*kind, *modifier);
+}
+
+EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) {
+  // Record the runtime schedule kind and chunk on the current task.
+  PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind,
+        modifier);
+  // NOTE(review): the range check uses '<', so omp_sched_auto (and any
+  // out-of-range kind) is silently ignored -- confirm this is intentional.
+  if (kind >= omp_sched_static && kind < omp_sched_auto) {
+    omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+    currTaskDescr->SetRuntimeSched(kind);
+    currTaskDescr->RuntimeChunkSize() = modifier;
+    PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %" PRIu64 "\n",
+          (int)currTaskDescr->GetRuntimeSched(),
+          currTaskDescr->RuntimeChunkSize());
+  }
+}
+
+// Thread placement is fixed on the device, so binding is always reported as
+// true regardless of any requested policy.
+// Fixed: the trace message previously misspelled the function name as
+// "omp_get_proc_bin".
+EXTERN omp_proc_bind_t omp_get_proc_bind(void) {
+  PRINT0(LD_IO, "call omp_get_proc_bind() is true, regardless on state\n");
+  return omp_proc_bind_true;
+}
+
+EXTERN int omp_get_num_places(void) {
+  // Places are not modeled on the device.
+  PRINT0(LD_IO, "call omp_get_num_places() returns 0\n");
+  return 0;
+}
+
+EXTERN int omp_get_place_num_procs(int place_num) {
+  // No places, hence no processors per place.
+  PRINT0(LD_IO, "call omp_get_place_num_procs() returns 0\n");
+  return 0;
+}
+
+EXTERN void omp_get_place_proc_ids(int place_num, int *ids) {
+  // No-op: there are no place/processor ids to fill in.
+  PRINT0(LD_IO, "call to omp_get_place_proc_ids()\n");
+}
+
+EXTERN int omp_get_place_num(void) {
+  // No current place on the device.
+  PRINT0(LD_IO, "call to omp_get_place_num() returns 0\n");
+  return 0;
+}
+
+EXTERN int omp_get_partition_num_places(void) {
+  // The execution environment exposes no place partition.
+  PRINT0(LD_IO, "call to omp_get_partition_num_places() returns 0\n");
+  return 0;
+}
+
+EXTERN void omp_get_partition_place_nums(int *place_nums) {
+  // No-op: the place partition is empty.
+  PRINT0(LD_IO, "call to omp_get_partition_place_nums()\n");
+}
+
+// Cancellation is not implemented on this target.
+EXTERN int omp_get_cancellation(void) {
+  int canCancel = FALSE;
+  PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", canCancel);
+  return canCancel;
+}
+
+// No-op on the device; the default device is a host-side concept.
+// Fixed: the trace message previously named omp_get_default_device().
+EXTERN void omp_set_default_device(int deviceId) {
+  PRINT0(LD_IO, "call omp_set_default_device() is undef on device\n");
+}
+
+EXTERN int omp_get_default_device(void) {
+  // Undefined on the device; conventionally reports 0.
+  PRINT0(LD_IO,
+         "call omp_get_default_device() is undef on device, returns 0\n");
+  return 0;
+}
+
+EXTERN int omp_get_num_devices(void) {
+  // Undefined on the device; a device-side query has no view of peers.
+  PRINT0(LD_IO, "call omp_get_num_devices() is undef on device, returns 0\n");
+  return 0;
+}
+
+// Number of teams in the current league, per the runtime.
+EXTERN int omp_get_num_teams(void) {
+  int numTeams = GetNumberOfOmpTeams();
+  PRINT(LD_IO, "call omp_get_num_teams() returns %d\n", numTeams);
+  return numTeams;
+}
+
+// Id of the calling thread's team within the league.
+EXTERN int omp_get_team_num() {
+  int teamId = GetOmpTeamId();
+  PRINT(LD_IO, "call omp_get_team_num() returns %d\n", teamId);
+  return teamId;
+}
+
+EXTERN int omp_is_initial_device(void) {
+  // This code executes on the target device, never on the host.
+  PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n");
+  return 0; // 0 by def on device
+}
+
+// Unspecified on the device.
+EXTERN int omp_get_initial_device(void) {
+  PRINT0(LD_IO, "call omp_get_initial_device() returns 0\n");
+  return 0;
+}
+
+// Unused for now.
+EXTERN int omp_get_max_task_priority(void) {
+  PRINT0(LD_IO, "call omp_get_max_task_priority() returns 0\n");
+  return 0;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// locks
+////////////////////////////////////////////////////////////////////////////////
+
+// Spin back-off budget (in clock cycles, scaled by block id) and the two
+// states a lock word can hold.
+#define __OMP_SPIN 1000
+#define UNSET 0
+#define SET 1
+
+EXTERN void omp_init_lock(omp_lock_t *lock) {
+  // A lock is just a word holding UNSET/SET; initialization releases it.
+  *lock = UNSET;
+  PRINT0(LD_IO, "call omp_init_lock()\n");
+}
+
+EXTERN void omp_destroy_lock(omp_lock_t *lock) {
+  // Nothing to tear down; the lock word needs no cleanup.
+  PRINT0(LD_IO, "call omp_destroy_lock()\n");
+}
+
+EXTERN void omp_set_lock(omp_lock_t *lock) {
+  // Acquire the lock: repeatedly try to CAS it from UNSET to SET.
+  // int atomicCAS(int* address, int compare, int val);
+  // (old == compare ? val : old)
+  int compare = UNSET;
+  int val = SET;
+
+  // TODO: not sure spinning is a good idea here..
+  while (atomicCAS(lock, compare, val) != UNSET) {
+
+    // Back off between attempts: busy-wait a per-block number of cycles.
+    // The subtraction below handles clock() wrapping around 32 bits.
+    // NOTE(review): block 0 gets a zero-cycle back-off since the budget is
+    // scaled by blockIdx.x -- confirm the staggering is intentional.
+    clock_t start = clock();
+    clock_t now;
+    for (;;) {
+      now = clock();
+      clock_t cycles = now > start ? now - start : now + (0xffffffff - start);
+      if (cycles >= __OMP_SPIN * blockIdx.x) {
+        break;
+      }
+    }
+  } // wait for 0 to be the read value
+
+  PRINT0(LD_IO, "call omp_set_lock()\n");
+}
+
+// Release the lock by swinging it from SET back to UNSET. The value returned
+// by atomicCAS is deliberately discarded (the unused local it was stored in
+// has been removed): unsetting an already-unset lock stays a tolerated no-op,
+// matching the original behavior.
+EXTERN void omp_unset_lock(omp_lock_t *lock) {
+  (void)atomicCAS(lock, SET, UNSET);
+
+  PRINT0(LD_IO, "call omp_unset_lock()\n");
+}
+
+// Attempt a single UNSET -> SET transition. atomicCAS yields the value
+// observed before the swap, i.e. UNSET (0) when the lock was acquired and
+// the previous holder's SET value otherwise; that observation is returned.
+EXTERN int omp_test_lock(omp_lock_t *lock) {
+  int observed = atomicCAS(lock, UNSET, SET);
+
+  PRINT(LD_IO, "call omp_test_lock() return %d\n", observed);
+
+  return observed;
+}
+
+// for xlf Fortran
+// Fortran expects a LOGICAL result, returned here as FLOGICAL
+#define FLOGICAL long
+EXTERN FLOGICAL __xlf_omp_is_initial_device_i8() {
+  // Normalize omp_is_initial_device() to a Fortran LOGICAL 0/1.
+  return omp_is_initial_device() == 0 ? (FLOGICAL)0 : (FLOGICAL)1;
+}
+
+EXTERN int __xlf_omp_is_initial_device_i4() {
+  // Same query with a 4-byte integer result.
+  return omp_is_initial_device() == 0 ? 0 : 1;
+}
+
+EXTERN long __xlf_omp_get_team_num_i4() {
+  // Widen the team id to the Fortran binding's long result type.
+  return (long)omp_get_team_num();
+}
+
+EXTERN long __xlf_omp_get_num_teams_i4() {
+  // Widen the team count to the Fortran binding's long result type.
+  return (long)omp_get_num_teams();
+}
+
+// Debug helpers for the xlf bindings: print the team id, the pointer, and
+// (when non-null) the pointed-to value.
+EXTERN void xlf_debug_print_int(int *p) {
+  printf("xlf DEBUG %d): %p %d\n", omp_get_team_num(), p, p ? *p : 0);
+}
+
+EXTERN void xlf_debug_print_long(long *p) {
+  printf("xlf DEBUG %d): %p %ld\n", omp_get_team_num(), p, p ? *p : 0);
+}
+
+EXTERN void xlf_debug_print_float(float *p) {
+  printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p ? *p : 0);
+}
+
+EXTERN void xlf_debug_print_double(double *p) {
+  printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p ? *p : 0);
+}
+
+EXTERN void xlf_debug_print_addr(void *p) {
+  printf("xlf DEBUG %d): %p \n", omp_get_team_num(), p);
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/loop.cu b/final/libomptarget/deviceRTLs/nvptx/src/loop.cu
new file mode 100644
index 0000000..f3e475d
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -0,0 +1,769 @@
+//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the KMPC interface
+// for the loop construct plus other worksharing constructs that use the same
+// interface as loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+// template class that encapsulate all the helper functions
+//
+// T is loop iteration type (32 | 64) (unsigned | signed)
+// ST is the signed version of T
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+template <typename T, typename ST> class omptarget_nvptx_LoopSupport {
+public:
+ ////////////////////////////////////////////////////////////////////////////////
+ // Loop with static scheduling with chunk
+
+ // Generic implementation of OMP loop scheduling with static policy
+ /*! \brief Calculate initial bounds for static loop and stride
+ * @param[in] loc location in code of the call (not used here)
+ * @param[in] global_tid global thread id
+ * @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
+ * @param[in] plastiter pointer to last iteration
+ * @param[in,out] pointer to loop lower bound. it will contain value of
+ * lower bound of first chunk
+ * @param[in,out] pointer to loop upper bound. It will contain value of
+ * upper bound of first chunk
+ * @param[in,out] pointer to loop stride. It will contain value of stride
+ * between two successive chunks executed by the same thread
+ * @param[in] loop increment bump
+ * @param[in] chunk size
+ */
+
+ // helper function for static chunk
+  // Compute, for one entity (thread or team), the bounds of its first
+  // chunk and the stride to its next chunk under schedule(static, chunk).
+  // 'last' is set when this entity will execute the final chunk.
+  INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride,
+                                    ST chunk, T entityId, T numberOfEntities) {
+    // each thread executes multiple chunks all of the same size, except
+    // the last one
+
+    // distance between two successive chunks
+    stride = numberOfEntities * chunk;
+    lb = lb + entityId * chunk;
+    T inputUb = ub;
+    ub = lb + chunk - 1; // Clang uses i <= ub
+    // Say ub' is the beginning of the last chunk. Then whoever has a
+    // lower bound plus a multiple of the increment equal to ub' is
+    // the last one.
+    T beginingLastChunk = inputUb - (inputUb % chunk);
+    last = ((beginingLastChunk - lb) % stride) == 0;
+  }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Loop with static scheduling without chunk
+
+  // helper function for static no chunk: divide the iteration space into
+  // at most one near-equal chunk per entity; the first 'leftOver' entities
+  // receive one extra iteration. Also computes the chunk size it chose.
+  INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride,
+                                      ST &chunk, T entityId,
+                                      T numberOfEntities) {
+    // No chunk size specified. Each thread or warp gets at most one
+    // chunk; chunks are all almost of equal size
+    T loopSize = ub - lb + 1;
+
+    chunk = loopSize / numberOfEntities;
+    T leftOver = loopSize - chunk * numberOfEntities;
+
+    if (entityId < leftOver) {
+      chunk++;
+      lb = lb + entityId * chunk;
+    } else {
+      // entities past the remainder start after all the larger chunks
+      lb = lb + entityId * chunk + leftOver;
+    }
+
+    T inputUb = ub;
+    ub = lb + chunk - 1; // Clang uses i <= ub
+    // last iff the original upper bound falls inside this entity's chunk
+    last = lb <= inputUb && inputUb <= ub;
+    stride = loopSize; // make sure we only do 1 chunk per warp
+  }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Support for Static Init
+
+  // Entry point for __kmpc_for_static_init_*: given a static schedule kind,
+  // compute this thread's (or team's, for distribute schedules) first chunk
+  // bounds and stride, writing results back through plastiter/plower/
+  // pupper/pstride.
+  INLINE static void for_static_init(int32_t schedtype, int32_t *plastiter,
+                                     T *plower, T *pupper, ST *pstride,
+                                     ST chunk, bool IsSPMDExecutionMode,
+                                     bool IsRuntimeUninitialized) {
+    // When IsRuntimeUninitialized is true, we assume that the caller is
+    // in an L0 parallel region and that all worker threads participate.
+
+    int tid = GetLogicalThreadIdInBlock();
+
+    // Assume we are in teams region or that we use a single block
+    // per target region
+    // NOTE(review): numberOfActiveOMPThreads is not read below -- confirm
+    // whether it is intentionally kept (e.g. for debugging) before removing.
+    ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(
+        tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
+
+    // All warps that are in excess of the maximum requested, do
+    // not execute the loop
+    PRINT(LD_LOOP,
+          "OMP Thread %d: schedule type %d, chunk size = %lld, mytid "
+          "%d, num tids %d\n",
+          GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized),
+          schedtype, P64(chunk),
+          GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized),
+          GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
+                                IsRuntimeUninitialized));
+    ASSERT0(
+        LT_FUSSY,
+        (GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized)) <
+            (GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
+                                   IsRuntimeUninitialized)),
+        "current thread is not needed here; error");
+
+    // copy the in/out parameters into locals
+    int lastiter = 0;
+    T lb = *plower;
+    T ub = *pupper;
+    ST stride = *pstride;
+    T entityId, numberOfEntities;
+    // init: select the entity (thread vs. team) and schedule helper
+    switch (schedtype) {
+    case kmp_sched_static_chunk: {
+      if (chunk > 0) {
+        entityId =
+            GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
+        numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
+                                                 IsRuntimeUninitialized);
+        ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
+                       numberOfEntities);
+        break;
+      }
+    } // note: if chunk <=0, use nochunk (intentional fall-through)
+    case kmp_sched_static_nochunk: {
+      entityId =
+          GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
+      numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
+                                               IsRuntimeUninitialized);
+      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, entityId,
+                       numberOfEntities);
+      break;
+    }
+    case kmp_sched_distr_static_chunk: {
+      if (chunk > 0) {
+        // distribute: chunks are assigned per team, not per thread
+        entityId = GetOmpTeamId();
+        numberOfEntities = GetNumberOfOmpTeams();
+        ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
+                       numberOfEntities);
+        break;
+      } // note: if chunk <=0, use nochunk (intentional fall-through)
+    }
+    case kmp_sched_distr_static_nochunk: {
+      entityId = GetOmpTeamId();
+      numberOfEntities = GetNumberOfOmpTeams();
+
+      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, entityId,
+                       numberOfEntities);
+      break;
+    }
+    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
+      // combined distribute+for: flatten (team, thread) into one entity id
+      entityId =
+          GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
+                                IsRuntimeUninitialized) *
+              GetOmpTeamId() +
+          GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
+      numberOfEntities = GetNumberOfOmpTeams() *
+                         GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
+                                               IsRuntimeUninitialized);
+      ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
+                     numberOfEntities);
+      break;
+    }
+    default: {
+      // unknown schedule: assert in fussy builds, then fall back to the
+      // per-thread static chunk policy so execution can continue
+      ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", schedtype);
+      PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n",
+            schedtype);
+      entityId =
+          GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized);
+      numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
+                                               IsRuntimeUninitialized);
+      ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
+                     numberOfEntities);
+    }
+    }
+    // copy the computed schedule back to the caller
+    *plastiter = lastiter;
+    *plower = lb;
+    *pupper = ub;
+    *pstride = stride;
+    PRINT(LD_LOOP,
+          "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld\n",
+          GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
+                                IsRuntimeUninitialized),
+          GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper),
+          P64(*pstride));
+  }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Support for dispatch Init
+
+  // True for any of the ordered schedule kinds; relies on them occupying
+  // the contiguous [kmp_sched_ordered_first, kmp_sched_ordered_last] range
+  // of the kmp_sched_t enumeration.
+  INLINE static int OrderedSchedule(kmp_sched_t schedule) {
+    return schedule >= kmp_sched_ordered_first &&
+           schedule <= kmp_sched_ordered_last;
+  }
+
+  // Entry point for __kmpc_dispatch_init_*: normalize the requested
+  // schedule (runtime/auto/ordered/degenerate trip counts collapse to a
+  // concrete static or dynamic policy) and record the loop descriptor in
+  // the thread-private (static) or team-wide (dynamic) context.
+  INLINE static void dispatch_init(kmp_Indent *loc, int32_t threadId,
+                                   kmp_sched_t schedule, T lb, T ub, ST st,
+                                   ST chunk) {
+    int tid = GetLogicalThreadIdInBlock();
+    omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
+    T tnum = currTaskDescr->ThreadsInTeam();
+    T tripCount = ub - lb + 1; // +1 because ub is inclusive
+    ASSERT0(
+        LT_FUSSY,
+        GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()) <
+            GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
+        "current thread is not needed here; error");
+
+    /* Currently just ignore the monotonic and non-monotonic modifiers
+     * (the compiler isn't producing them yet anyway).
+     * When it is we'll want to look at them somewhere here and use that
+     * information to add to our schedule choice. We shouldn't need to pass
+     * them on, they merely affect which schedule we can legally choose for
+     * various dynamic cases. (In particular, whether or not a stealing scheme
+     * is legal).
+     */
+    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
+
+    // Process schedule.
+    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
+      if (OrderedSchedule(schedule))
+        __kmpc_barrier(loc, threadId);
+      PRINT(LD_LOOP,
+            "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
+            (long)tnum, P64(tripCount), schedule);
+      schedule = kmp_sched_static_chunk;
+      chunk = tripCount; // one thread gets the whole loop
+    } else if (schedule == kmp_sched_runtime) {
+      // process runtime: map the task's runtime schedule to a kmp one
+      omp_sched_t rtSched = currTaskDescr->GetRuntimeSched();
+      chunk = currTaskDescr->RuntimeChunkSize();
+      switch (rtSched) {
+      case omp_sched_static: {
+        if (chunk > 0)
+          schedule = kmp_sched_static_chunk;
+        else
+          schedule = kmp_sched_static_nochunk;
+        break;
+      }
+      case omp_sched_auto: {
+        schedule = kmp_sched_static_chunk;
+        chunk = 1;
+        break;
+      }
+      case omp_sched_dynamic:
+      case omp_sched_guided: {
+        // guided degrades to dynamic on this target
+        schedule = kmp_sched_dynamic;
+        break;
+      }
+      }
+      PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", schedule,
+            P64(chunk));
+    } else if (schedule == kmp_sched_auto) {
+      schedule = kmp_sched_static_chunk;
+      chunk = 1;
+      PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", schedule,
+            P64(chunk));
+    } else {
+      PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", schedule, P64(chunk));
+      ASSERT(LT_FUSSY,
+             schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
+             "unknown schedule %d & chunk %lld\n", schedule, P64(chunk));
+    }
+
+    // init schedules
+    if (schedule == kmp_sched_static_chunk) {
+      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
+      // save sched state
+      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+      // save ub
+      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+      // compute static chunk
+      ST stride;
+      int lastiter = 0;
+      ForStaticChunk(
+          lastiter, lb, ub, stride, chunk,
+          GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum);
+      // save computed params
+      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
+      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+      omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
+      PRINT(LD_LOOP,
+            "dispatch init (static chunk) : num threads = %d, ub = %" PRId64
+            ", next lower bound = %llu, stride = %llu\n",
+            GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
+            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+            omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+            omptarget_nvptx_threadPrivateContext->Stride(tid));
+
+    } else if (schedule == kmp_sched_static_nochunk) {
+      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
+      // save sched state
+      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+      // save ub
+      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+      // compute static chunk
+      ST stride;
+      int lastiter = 0;
+      ForStaticNoChunk(
+          lastiter, lb, ub, stride, chunk,
+          GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()), tnum);
+      // save computed params
+      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
+      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+      omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
+      PRINT(LD_LOOP,
+            "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
+            ", next lower bound = %llu, stride = %llu\n",
+            GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
+            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+            omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+            omptarget_nvptx_threadPrivateContext->Stride(tid));
+
+    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
+      __kmpc_barrier(loc, threadId);
+      // save sched state
+      int teamId = GetOmpTeamId();
+      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+      // only one thread publishes the team-wide loop descriptor, fenced
+      // by barriers on both sides
+      if (GetThreadIdInBlock() == 0) {
+        if (chunk < 1)
+          chunk = 1;
+        omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk;
+        omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
+        omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
+      }
+      __kmpc_barrier(loc, threadId);
+      PRINT(LD_LOOP,
+            "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
+            ", chunk %" PRIu64 "\n",
+            GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
+            omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
+            omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId),
+            omptarget_nvptx_threadPrivateContext->Chunk(teamId));
+    }
+  }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Support for dispatch next
+
+  // Claim the next dynamic chunk by atomically advancing the shared lower
+  // bound; returns NOT_FINISHED, LAST_CHUNK, or FINISHED and writes the
+  // claimed [lb, ub] (inclusive) into the reference arguments.
+  INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
+                                     int64_t &loopLowerBound,
+                                     T loopUpperBound) {
+    // calculate lower bound for all lanes in the warp
+    lb = atomicAdd((unsigned long long *)&loopLowerBound,
+                   (unsigned long long)chunkSize);
+    ub = lb + chunkSize - 1; // Clang uses i <= ub
+
+    // 3 result cases:
+    //  a. lb and ub < loopUpperBound --> NOT_FINISHED
+    //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
+    //     NOT_FINISHED
+    //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
+    // a.
+    if (lb <= loopUpperBound && ub < loopUpperBound) {
+      PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb),
+            P64(ub), P64(loopUpperBound));
+      return NOT_FINISHED;
+    }
+    // b. chunk straddles the end of the loop: clip and report last chunk
+    if (lb <= loopUpperBound) {
+      PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n",
+            P64(lb), P64(ub), P64(loopUpperBound));
+      ub = loopUpperBound;
+      return LAST_CHUNK;
+    }
+    // c. if we are here, we are in case 'c': report an empty range
+    lb = loopUpperBound + 2;
+    ub = loopUpperBound + 1;
+    PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb),
+          P64(ub), P64(loopUpperBound));
+    return FINISHED;
+  }
+
+  // On Pascal, with inlining of the runtime into the user application,
+  // this code deadlocks. This is probably because different threads
+  // in a warp cannot make independent progress.
+  //
+  // Entry point for __kmpc_dispatch_next_*: hand out the calling thread's
+  // next chunk according to the schedule recorded by dispatch_init.
+  // Returns DISPATCH_FINISHED or DISPATCH_NOTFINISHED.
+  NOINLINE static int dispatch_next(int32_t *plast, T *plower, T *pupper,
+                                    ST *pstride) {
+    // ID of a thread in its own warp
+
+    // automatically selects thread or warp ID based on selected implementation
+    int tid = GetLogicalThreadIdInBlock();
+    ASSERT0(
+        LT_FUSSY,
+        GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()) <
+            GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
+        "current thread is not needed here; error");
+    // retrieve schedule
+    kmp_sched_t schedule =
+        omptarget_nvptx_threadPrivateContext->ScheduleType(tid);
+
+    // xxx reduce to one
+    if (schedule == kmp_sched_static_chunk ||
+        schedule == kmp_sched_static_nochunk) {
+      T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid);
+      T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid);
+      // finished?
+      if (myLb > ub) {
+        PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n",
+              P64(myLb), P64(ub));
+        return DISPATCH_FINISHED;
+      }
+      // not finished, save current bounds
+      ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid);
+      *plower = myLb;
+      T myUb = myLb + chunk - 1; // Clang uses i <= ub
+      if (myUb > ub)
+        myUb = ub; // clip the final chunk to the loop's upper bound
+      *pupper = myUb;
+      *plast = (int32_t)(myUb == ub);
+
+      // increment next lower bound by the stride
+      ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid);
+      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride;
+      PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n",
+            P64(*plower), P64(*pupper));
+      return DISPATCH_NOTFINISHED;
+    }
+    ASSERT0(LT_FUSSY,
+            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
+            "bad sched");
+    // dynamic/guided: claim a chunk from the team-wide shared state
+    T myLb, myUb;
+    int teamId = GetOmpTeamId();
+    int finished = DynamicNextChunk(
+        myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(teamId),
+        omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
+        omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId));
+
+    if (finished == FINISHED)
+      return DISPATCH_FINISHED;
+
+    // not finished (either not finished or last chunk)
+    *plast = (int32_t)(finished == LAST_CHUNK);
+    *plower = myLb;
+    *pupper = myUb;
+    *pstride = 1;
+
+    PRINT(LD_LOOP,
+          "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld\n",
+          GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
+          GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper),
+          P64(*pstride));
+    return DISPATCH_NOTFINISHED;
+  }
+
+  // Entry point for __kmpc_dispatch_fini_*: no per-loop teardown is
+  // needed on this target.
+  INLINE static void dispatch_fini() {
+    // nothing
+  }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // end of template class that encapsulate all the helper functions
+ ////////////////////////////////////////////////////////////////////////////////
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (dyn loops)
+////////////////////////////////////////////////////////////////////////////////
+
+// init
+// __kmpc_dispatch_init_* entry points: one omptarget_nvptx_LoopSupport
+// instantiation per iteration-variable type (i32/u32/i64/u64).
+EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t tid,
+                                   int32_t schedule, int32_t lb, int32_t ub,
+                                   int32_t st, int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+}
+
+EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid,
+                                    int32_t schedule, uint32_t lb, uint32_t ub,
+                                    int32_t st, int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+}
+
+EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid,
+                                   int32_t schedule, int64_t lb, int64_t ub,
+                                   int64_t st, int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+}
+
+EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid,
+                                    int32_t schedule, uint64_t lb, uint64_t ub,
+                                    int64_t st, int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
+}
+
+// next
+// __kmpc_dispatch_next_* entry points: forward to the matching template
+// instantiation; return value distinguishes finished vs. not finished.
+EXTERN int __kmpc_dispatch_next_4(kmp_Indent *loc, int32_t tid, int32_t *p_last,
+                                  int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
+  PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
+  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
+      p_last, p_lb, p_ub, p_st);
+}
+
+EXTERN int __kmpc_dispatch_next_4u(kmp_Indent *loc, int32_t tid,
+                                   int32_t *p_last, uint32_t *p_lb,
+                                   uint32_t *p_ub, int32_t *p_st) {
+  PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
+  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
+      p_last, p_lb, p_ub, p_st);
+}
+
+EXTERN int __kmpc_dispatch_next_8(kmp_Indent *loc, int32_t tid, int32_t *p_last,
+                                  int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
+  PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
+  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
+      p_last, p_lb, p_ub, p_st);
+}
+
+EXTERN int __kmpc_dispatch_next_8u(kmp_Indent *loc, int32_t tid,
+                                   int32_t *p_last, uint64_t *p_lb,
+                                   uint64_t *p_ub, int64_t *p_st) {
+  PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
+  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
+      p_last, p_lb, p_ub, p_st);
+}
+
+// fini
+// __kmpc_dispatch_fini_* entry points: dispatch_fini is a no-op, so these
+// only emit the IO trace.
+EXTERN void __kmpc_dispatch_fini_4(kmp_Indent *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
+}
+
+EXTERN void __kmpc_dispatch_fini_4u(kmp_Indent *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
+}
+
+EXTERN void __kmpc_dispatch_fini_8(kmp_Indent *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
+}
+
+EXTERN void __kmpc_dispatch_fini_8u(kmp_Indent *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (static loops)
+////////////////////////////////////////////////////////////////////////////////
+
+// __kmpc_for_static_init_* entry points: execution mode and runtime state
+// are queried from the current context (isSPMDMode/isRuntimeUninitialized).
+EXTERN void __kmpc_for_static_init_4(kmp_Indent *loc, int32_t global_tid,
+                                     int32_t schedtype, int32_t *plastiter,
+                                     int32_t *plower, int32_t *pupper,
+                                     int32_t *pstride, int32_t incr,
+                                     int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+      schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(),
+      isRuntimeUninitialized());
+}
+
+EXTERN void __kmpc_for_static_init_4u(kmp_Indent *loc, int32_t global_tid,
+                                      int32_t schedtype, int32_t *plastiter,
+                                      uint32_t *plower, uint32_t *pupper,
+                                      int32_t *pstride, int32_t incr,
+                                      int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
+      schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(),
+      isRuntimeUninitialized());
+}
+
+EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid,
+                                     int32_t schedtype, int32_t *plastiter,
+                                     int64_t *plower, int64_t *pupper,
+                                     int64_t *pstride, int64_t incr,
+                                     int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
+      schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(),
+      isRuntimeUninitialized());
+}
+
+EXTERN void __kmpc_for_static_init_8u(kmp_Indent *loc, int32_t global_tid,
+                                      int32_t schedtype, int32_t *plastiter,
+                                      uint64_t *plower, uint64_t *pupper,
+                                      int64_t *pstride, int64_t incr,
+                                      int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
+      schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(),
+      isRuntimeUninitialized());
+}
+
+// _simple_spmd variants: bypass the mode queries by hard-coding SPMD
+// execution with an uninitialized runtime.
+EXTERN
+void __kmpc_for_static_init_4_simple_spmd(kmp_Indent *loc, int32_t global_tid,
+                                          int32_t schedtype, int32_t *plastiter,
+                                          int32_t *plower, int32_t *pupper,
+                                          int32_t *pstride, int32_t incr,
+                                          int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+      schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/true,
+      /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_4u_simple_spmd(kmp_Indent *loc, int32_t global_tid,
+                                           int32_t schedtype,
+                                           int32_t *plastiter, uint32_t *plower,
+                                           uint32_t *pupper, int32_t *pstride,
+                                           int32_t incr, int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
+      schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/true,
+      /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_8_simple_spmd(kmp_Indent *loc, int32_t global_tid,
+                                          int32_t schedtype, int32_t *plastiter,
+                                          int64_t *plower, int64_t *pupper,
+                                          int64_t *pstride, int64_t incr,
+                                          int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
+      schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/true,
+      /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_8u_simple_spmd(kmp_Indent *loc, int32_t global_tid,
+                                           int32_t schedtype,
+                                           int32_t *plastiter, uint64_t *plower,
+                                           uint64_t *pupper, int64_t *pstride,
+                                           int64_t incr, int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
+      schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/true,
+      /*IsRuntimeUninitialized=*/true);
+}
+
+// _simple_generic variants: hard-code generic (non-SPMD) execution with an
+// uninitialized runtime.
+EXTERN
+void __kmpc_for_static_init_4_simple_generic(
+    kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+    int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr,
+    int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+      schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/false,
+      /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_4u_simple_generic(
+    kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+    uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
+    int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
+      schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/false,
+      /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_8_simple_generic(
+    kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+    int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr,
+    int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
+      schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/false,
+      /*IsRuntimeUninitialized=*/true);
+}
+
+EXTERN
+void __kmpc_for_static_init_8u_simple_generic(
+    kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+    uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
+    int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
+      schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/false,
+      /*IsRuntimeUninitialized=*/true);
+}
+
+// Nothing to tear down for static loops on this target; only trace.
+EXTERN void __kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid) {
+  PRINT0(LD_IO, "call kmpc_for_static_fini\n");
+}
+
+namespace {
+// Barrier across the worker threads of a generic-mode team; the
+// participant count is rounded up to whole warps because named_sync
+// operates at warp granularity.
+INLINE void syncWorkersInGenericMode(uint32_t NumThreads) {
+  int NumWarps = ((NumThreads + WARPSIZE - 1) / WARPSIZE);
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  // On Volta and newer architectures we require that all lanes in
+  // a warp (at least, all present for the kernel launch) participate in the
+  // barrier. This is enforced when launching the parallel region. An
+  // exception is when there are < WARPSIZE workers. In this case only 1 worker
+  // is started, so we don't need a barrier.
+  if (NumThreads > 1) {
+#endif
+    named_sync(L1_BARRIER, WARPSIZE * NumWarps);
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  }
+#endif
+}
+}; // namespace (NOTE(review): trailing ';' after the namespace is redundant)
+
+// For each of the varNum conditional-lastprivate variables in 'array',
+// compute the team-wide maximum iteration value via an atomic max on a
+// shared buffer, then write that maximum back into every thread's slot.
+// Barriers separate the reset / accumulate / read phases.
+EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Indent *loc, int32_t gtid,
+                                                  int32_t varNum, void *array) {
+  PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n");
+
+  omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
+  int tid = GetOmpThreadId(GetLogicalThreadIdInBlock(), isSPMDMode(),
+                           isRuntimeUninitialized());
+  uint32_t NumThreads = GetNumberOfOmpThreads(
+      GetLogicalThreadIdInBlock(), isSPMDMode(), isRuntimeUninitialized());
+  uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
+  // NOTE(review): 'i' is unsigned while varNum is int32_t; a negative
+  // varNum would wrap -- confirm callers always pass varNum >= 0.
+  for (unsigned i = 0; i < varNum; i++) {
+    // Reset buffer.
+    if (tid == 0)
+      *Buffer = 0; // Reset to minimum loop iteration value.
+
+    // Barrier.
+    syncWorkersInGenericMode(NumThreads);
+
+    // Atomic max of iterations.
+    uint64_t *varArray = (uint64_t *)array;
+    uint64_t elem = varArray[i];
+    (void)atomicMax((unsigned long long int *)Buffer,
+                    (unsigned long long int)elem);
+
+    // Barrier.
+    syncWorkersInGenericMode(NumThreads);
+
+    // Read max value and update thread private array.
+    varArray[i] = *Buffer;
+
+    // Barrier.
+    syncWorkersInGenericMode(NumThreads);
+  }
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/final/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
new file mode 100644
index 0000000..149af8d
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -0,0 +1,59 @@
+//===------------ omp_data.cu - NVPTX OpenMP GPU objects --------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the data objects used on the GPU device.
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// global device environment
+////////////////////////////////////////////////////////////////////////////////
+
+__device__ omptarget_device_environmentTy omptarget_device_environment;
+
+////////////////////////////////////////////////////////////////////////////////
+// global data holding OpenMP state information; one queue of reusable
+// thread-private contexts per SM
+////////////////////////////////////////////////////////////////////////////////
+
+__device__
+    omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
+        omptarget_nvptx_device_State[MAX_SM];
+
+// Pointer to this team's OpenMP state object
+__device__ __shared__
+    omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
+
+////////////////////////////////////////////////////////////////////////////////
+// The team master sets the outlined parallel function in this variable to
+// communicate with the workers. Since it is in shared memory, there is one
+// copy of these variables for each kernel, instance, and team.
+////////////////////////////////////////////////////////////////////////////////
+volatile __device__ __shared__ omptarget_nvptx_WorkFn omptarget_nvptx_workFn;
+
+////////////////////////////////////////////////////////////////////////////////
+// OpenMP kernel execution parameters
+////////////////////////////////////////////////////////////////////////////////
+__device__ __shared__ uint32_t execution_param;
+
+////////////////////////////////////////////////////////////////////////////////
+// Data sharing state
+////////////////////////////////////////////////////////////////////////////////
+__device__ __shared__ DataSharingStateTy DataSharingState;
+
+////////////////////////////////////////////////////////////////////////////////
+// Scratchpad for teams reduction.
+////////////////////////////////////////////////////////////////////////////////
+__device__ __shared__ void *ReductionScratchpadPtr;
+
+////////////////////////////////////////////////////////////////////////////////
+// Data sharing related variables.
+////////////////////////////////////////////////////////////////////////////////
+__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
new file mode 100644
index 0000000..677654d
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -0,0 +1,194 @@
+//===--- omptarget-nvptx.cu - NVPTX OpenMP GPU initialization ---- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the initialization code for the GPU
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// global data tables
+////////////////////////////////////////////////////////////////////////////////
+
+extern __device__
+ omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
+ omptarget_nvptx_device_State[MAX_SM];
+
+extern __device__ __shared__
+ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
+
+//
+// The team master sets the outlined function and its arguments in these
+// variables to communicate with the workers. Since they are in shared memory,
+// there is one copy of these variables for each kernel, instance, and team.
+//
+extern volatile __device__ __shared__ omptarget_nvptx_WorkFn
+ omptarget_nvptx_workFn;
+extern __device__ __shared__ uint32_t execution_param;
+
+////////////////////////////////////////////////////////////////////////////////
+// init entry points
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE unsigned smid() {
+ unsigned id;
+ asm("mov.u32 %0, %%smid;" : "=r"(id));
+ return id;
+}
+
+EXTERN void __kmpc_kernel_init_params(void *Ptr) {
+ PRINT(LD_IO, "call to __kmpc_kernel_init_params with version %f\n",
+ OMPTARGET_NVPTX_VERSION);
+
+ SetTeamsReductionScratchpadPtr(Ptr);
+}
+
+EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
+ PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
+ OMPTARGET_NVPTX_VERSION);
+
+ if (!RequiresOMPRuntime) {
+ // If OMP runtime is not required don't initialize OMP state.
+ setExecutionParameters(Generic, RuntimeUninitialized);
+ return;
+ }
+ setExecutionParameters(Generic, RuntimeInitialized);
+
+ int threadIdInBlock = GetThreadIdInBlock();
+ ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(),
+ "__kmpc_kernel_init() must be called by team master warp only!");
+ PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n");
+
+ // Get a state object from the queue.
+ int slot = smid() % MAX_SM;
+ omptarget_nvptx_threadPrivateContext =
+ omptarget_nvptx_device_State[slot].Dequeue();
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+ omptarget_nvptx_threadPrivateContext->SetSourceQueue(slot);
+#endif
+
+ // init thread private
+ int threadId = GetLogicalThreadIdInBlock();
+ omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId);
+
+ // init team context
+ omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
+ currTeamDescr.InitTeamDescr();
+ // this thread will start execution... has to update its task ICV
+ // to point to the level zero task ICV. That ICV was init in
+ // InitTeamDescr()
+ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
+ threadId, currTeamDescr.LevelZeroTaskDescr());
+
+ // set number of threads and thread limit in team to started value
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+ currTaskDescr->NThreads() = GetNumberOfWorkersInTeam();
+ currTaskDescr->ThreadLimit() = ThreadLimit;
+}
+
+EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {
+ if (IsOMPRuntimeInitialized) {
+ // Enqueue omp state object for use by another team.
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+ int slot = omptarget_nvptx_threadPrivateContext->GetSourceQueue();
+#else
+ int slot = smid() % MAX_SM;
+#endif
+ omptarget_nvptx_device_State[slot].Enqueue(
+ omptarget_nvptx_threadPrivateContext);
+ }
+ // Done with work. Kill the workers.
+ omptarget_nvptx_workFn = 0;
+}
+
+EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
+ int16_t RequiresDataSharing) {
+ PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n");
+
+ if (!RequiresOMPRuntime) {
+ // If OMP runtime is not required don't initialize OMP state.
+ setExecutionParameters(Spmd, RuntimeUninitialized);
+ return;
+ }
+ setExecutionParameters(Spmd, RuntimeInitialized);
+
+ //
+ // Team Context Initialization.
+ //
+ // In SPMD mode there is no master thread so use any cuda thread for team
+ // context initialization.
+ int threadId = GetThreadIdInBlock();
+ if (threadId == 0) {
+ // Get a state object from the queue.
+ int slot = smid() % MAX_SM;
+ omptarget_nvptx_threadPrivateContext =
+ omptarget_nvptx_device_State[slot].Dequeue();
+
+ omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
+ omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
+ // init team context
+ currTeamDescr.InitTeamDescr();
+ // init counters (copy start to init)
+ workDescr.CounterGroup().Reset();
+ }
+ __syncthreads();
+
+ omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
+ omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
+
+ //
+ // Initialize task descr for each thread.
+ //
+ omptarget_nvptx_TaskDescr *newTaskDescr =
+ omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
+ ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
+ newTaskDescr->InitLevelOneTaskDescr(ThreadLimit,
+ currTeamDescr.LevelZeroTaskDescr());
+ newTaskDescr->ThreadLimit() = ThreadLimit;
+ // install new top descriptor
+ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
+ newTaskDescr);
+
+ // init thread private from init value
+ workDescr.CounterGroup().Init(
+ omptarget_nvptx_threadPrivateContext->Priv(threadId));
+ PRINT(LD_PAR,
+ "thread will execute parallel region with id %d in a team of "
+ "%d threads\n",
+ newTaskDescr->ThreadId(), newTaskDescr->ThreadsInTeam());
+
+ if (RequiresDataSharing && threadId % WARPSIZE == 0) {
+    // Warp master initializes data sharing environment.
+ unsigned WID = threadId / WARPSIZE;
+ __kmpc_data_sharing_slot *RootS = currTeamDescr.RootS(
+ WID, WID == WARPSIZE - 1);
+ DataSharingState.SlotPtr[WID] = RootS;
+ DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
+ }
+}
+
+EXTERN void __kmpc_spmd_kernel_deinit() {
+ // We're not going to pop the task descr stack of each thread since
+ // there are no more parallel regions in SPMD mode.
+ __syncthreads();
+ int threadId = GetThreadIdInBlock();
+ if (threadId == 0) {
+ // Enqueue omp state object for use by another team.
+ int slot = smid() % MAX_SM;
+ omptarget_nvptx_device_State[slot].Enqueue(
+ omptarget_nvptx_threadPrivateContext);
+ }
+}
+
+// Return true if the current target region is executed in SPMD mode.
+EXTERN int8_t __kmpc_is_spmd_exec_mode() {
+ return isSPMDMode();
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
new file mode 100644
index 0000000..84c61f9
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -0,0 +1,441 @@
+//===---- omptarget-nvptx.h - NVPTX OpenMP GPU initialization ---- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of all library macros, types,
+// and functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __OMPTARGET_NVPTX_H
+#define __OMPTARGET_NVPTX_H
+
+// std includes
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <inttypes.h>
+
+// cuda includes
+#include <cuda.h>
+#include <math.h>
+
+// local includes
+#include "counter_group.h"
+#include "debug.h" // debug
+#include "interface.h" // interfaces with omp, compiler, and user
+#include "option.h" // choices we have
+#include "state-queue.h"
+#include "support.h"
+
+#define OMPTARGET_NVPTX_VERSION 1.1
+
+// used by the library for the interface with the app
+#define DISPATCH_FINISHED 0
+#define DISPATCH_NOTFINISHED 1
+
+// used by dynamic scheduling
+#define FINISHED 0
+#define NOT_FINISHED 1
+#define LAST_CHUNK 2
+
+#define BARRIER_COUNTER 0
+#define ORDERED_COUNTER 1
+
+// Macros for Cuda intrinsics
+// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
+// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask().
+#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
+#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane))
+#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
+ __shfl_down_sync((mask), (var), (delta), (width))
+#define __BALLOT_SYNC(mask, predicate) __ballot_sync((mask), (predicate))
+#define __ACTIVEMASK() __activemask()
+#else
+#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
+#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
+ __shfl_down((var), (delta), (width))
+#define __BALLOT_SYNC(mask, predicate) __ballot((predicate))
+#define __ACTIVEMASK() __ballot(1)
+#endif
+
+// arguments needed for L0 parallelism only.
+class omptarget_nvptx_SharedArgs {
+public:
+ // All these methods must be called by the master thread only.
+ INLINE void Init() {
+ args = buffer;
+ nArgs = MAX_SHARED_ARGS;
+ }
+ INLINE void DeInit() {
+ // Free any memory allocated for outlined parallel function with a large
+ // number of arguments.
+ if (nArgs > MAX_SHARED_ARGS) {
+ SafeFree(args, (char *)"new extended args");
+ Init();
+ }
+ }
+ INLINE void EnsureSize(size_t size) {
+ if (size > nArgs) {
+ if (nArgs > MAX_SHARED_ARGS) {
+ SafeFree(args, (char *)"new extended args");
+ }
+ args = (void **) SafeMalloc(size * sizeof(void *),
+ (char *)"new extended args");
+ nArgs = size;
+ }
+ }
+ // Called by all threads.
+ INLINE void **GetArgs() { return args; };
+private:
+ // buffer of pre-allocated arguments.
+ void *buffer[MAX_SHARED_ARGS];
+ // pointer to arguments buffer.
+ // starts off as a pointer to 'buffer' but can be dynamically allocated.
+ void **args;
+ // starts off as MAX_SHARED_ARGS but can increase in size.
+ uint32_t nArgs;
+};
+
+extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+
+// Data sharing related quantities, need to match what is used in the compiler.
+enum DATA_SHARING_SIZES {
+ // The maximum number of workers in a kernel.
+ DS_Max_Worker_Threads = 992,
+ // The size reserved for data in a shared memory slot.
+ DS_Slot_Size = 256,
+ // The slot size that should be reserved for a working warp.
+ DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
+ // The maximum number of warps in use
+ DS_Max_Warp_Number = 32,
+};
+
+// Data structure to keep in shared memory that traces the current slot, stack,
+// and frame pointer as well as the active threads that didn't exit the current
+// environment.
+struct DataSharingStateTy {
+ __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
+ void *StackPtr[DS_Max_Warp_Number];
+ void *FramePtr[DS_Max_Warp_Number];
+ int32_t ActiveThreads[DS_Max_Warp_Number];
+};
+// Additional worker slot type which is initialized with the default worker slot
+// size of 4*32 bytes.
+struct __kmpc_data_sharing_worker_slot_static {
+ __kmpc_data_sharing_slot *Next;
+ __kmpc_data_sharing_slot *Prev;
+ void *PrevSlotStackPtr;
+ void *DataEnd;
+ char Data[DS_Worker_Warp_Slot_Size];
+};
+// Additional master slot type which is initialized with the default master slot
+// size of 4 bytes.
+struct __kmpc_data_sharing_master_slot_static {
+ __kmpc_data_sharing_slot *Next;
+ __kmpc_data_sharing_slot *Prev;
+ void *PrevSlotStackPtr;
+ void *DataEnd;
+ char Data[DS_Slot_Size];
+};
+extern __device__ __shared__ DataSharingStateTy DataSharingState;
+
+////////////////////////////////////////////////////////////////////////////////
+// task ICV and (implicit & explicit) task state
+
+class omptarget_nvptx_TaskDescr {
+public:
+ // methods for flags
+ INLINE omp_sched_t GetRuntimeSched();
+ INLINE void SetRuntimeSched(omp_sched_t sched);
+ INLINE int IsDynamic() { return items.flags & TaskDescr_IsDynamic; }
+ INLINE void SetDynamic() {
+ items.flags = items.flags | TaskDescr_IsDynamic;
+ }
+ INLINE void ClearDynamic() {
+ items.flags = items.flags & (~TaskDescr_IsDynamic);
+ }
+ INLINE int InParallelRegion() { return items.flags & TaskDescr_InPar; }
+ INLINE int InL2OrHigherParallelRegion() {
+ return items.flags & TaskDescr_InParL2P;
+ }
+ INLINE int IsParallelConstruct() {
+ return items.flags & TaskDescr_IsParConstr;
+ }
+ INLINE int IsTaskConstruct() { return !IsParallelConstruct(); }
+ // methods for other fields
+ INLINE uint16_t &NThreads() { return items.nthreads; }
+ INLINE uint16_t &ThreadLimit() { return items.threadlimit; }
+ INLINE uint16_t &ThreadId() { return items.threadId; }
+ INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; }
+ INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
+ INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() { return prev; }
+ INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
+ prev = taskDescr;
+ }
+ // init & copy
+ INLINE void InitLevelZeroTaskDescr();
+ INLINE void InitLevelOneTaskDescr(uint16_t tnum,
+ omptarget_nvptx_TaskDescr *parentTaskDescr);
+ INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
+ INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
+ INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
+ INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
+ INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr,
+ uint16_t tnum);
+ INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
+ INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
+ uint16_t tid, uint16_t tnum);
+ INLINE void SaveLoopData();
+ INLINE void RestoreLoopData() const;
+
+private:
+ // bits for flags: (7 used, 1 free)
+ // 3 bits (SchedMask) for runtime schedule
+ // 1 bit (IsDynamic) for dynamic schedule (false = static)
+ // 1 bit (InPar) if this thread has encountered one or more parallel region
+ // 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task)
+ // 1 bit (InParL2+) if this thread has encountered L2 or higher parallel
+ // region
+ static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4);
+ static const uint8_t TaskDescr_IsDynamic = 0x8;
+ static const uint8_t TaskDescr_InPar = 0x10;
+ static const uint8_t TaskDescr_IsParConstr = 0x20;
+ static const uint8_t TaskDescr_InParL2P = 0x40;
+
+ struct SavedLoopDescr_items {
+ int64_t loopUpperBound;
+ int64_t nextLowerBound;
+ int64_t chunk;
+ int64_t stride;
+ kmp_sched_t schedule;
+ } loopData;
+
+ struct TaskDescr_items {
+    uint8_t flags; // 7 bits used (see flags above)
+ uint8_t unused;
+ uint16_t nthreads; // thread num for subsequent parallel regions
+ uint16_t threadlimit; // thread limit ICV
+ uint16_t threadId; // thread id
+ uint16_t threadsInTeam; // threads in current team
+ uint64_t runtimeChunkSize; // runtime chunk size
+ } items;
+ omptarget_nvptx_TaskDescr *prev;
+};
+
+// build on kmp
+typedef struct omptarget_nvptx_ExplicitTaskDescr {
+ omptarget_nvptx_TaskDescr
+ taskDescr; // omptarget_nvptx task description (must be first)
+ kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last)
+} omptarget_nvptx_ExplicitTaskDescr;
+
+////////////////////////////////////////////////////////////////////////////////
+// Descriptor of a parallel region (worksharing in general)
+
+class omptarget_nvptx_WorkDescr {
+
+public:
+ // access to data
+ INLINE omptarget_nvptx_CounterGroup &CounterGroup() { return cg; }
+ INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; }
+ // init
+ INLINE void InitWorkDescr();
+
+private:
+ omptarget_nvptx_CounterGroup cg; // for barrier (no other needed)
+ omptarget_nvptx_TaskDescr masterTaskICV;
+ bool hasCancel;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+class omptarget_nvptx_TeamDescr {
+public:
+ // access to data
+ INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
+ return &levelZeroTaskDescr;
+ }
+ INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
+ return workDescrForActiveParallel;
+ }
+ INLINE omp_lock_t *CriticalLock() { return &criticalLock; }
+ INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }
+
+ // init
+ INLINE void InitTeamDescr();
+
+ INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
+    // If this is invoked by the master thread of the master warp then initialize
+ // it with a smaller slot.
+ if (IsMasterThread) {
+      // Do not initialize this slot again if it has already been initialized.
+ if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size)
+ return 0;
+ // Initialize the pointer to the end of the slot given the size of the
+ // data section. DataEnd is non-inclusive.
+ master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size;
+ // We currently do not have a next slot.
+ master_rootS[0].Next = 0;
+ master_rootS[0].Prev = 0;
+ master_rootS[0].PrevSlotStackPtr = 0;
+ return (__kmpc_data_sharing_slot *)&master_rootS[0];
+ }
+    // Do not initialize this slot again if it has already been initialized.
+ if (worker_rootS[wid].DataEnd ==
+ &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size)
+ return 0;
+ // Initialize the pointer to the end of the slot given the size of the data
+ // section. DataEnd is non-inclusive.
+ worker_rootS[wid].DataEnd =
+ &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
+ // We currently do not have a next slot.
+ worker_rootS[wid].Next = 0;
+ worker_rootS[wid].Prev = 0;
+ worker_rootS[wid].PrevSlotStackPtr = 0;
+ return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
+ }
+
+ INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
+ worker_rootS[wid].DataEnd =
+ &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
+ // We currently do not have a next slot.
+ worker_rootS[wid].Next = 0;
+ worker_rootS[wid].Prev = 0;
+ worker_rootS[wid].PrevSlotStackPtr = 0;
+ return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
+ }
+
+private:
+ omptarget_nvptx_TaskDescr
+ levelZeroTaskDescr; // icv for team master initial thread
+ omptarget_nvptx_WorkDescr
+ workDescrForActiveParallel; // one, ONLY for the active par
+ omp_lock_t criticalLock;
+ uint64_t lastprivateIterBuffer;
+
+ __align__(16)
+ __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
+ __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// thread private data (struct of arrays for better coalescing)
+// tid refers here to the global thread id
+// do not support multiple concurrent kernels at this time
+class omptarget_nvptx_ThreadPrivateContext {
+public:
+ // task
+ INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) {
+ return &levelOneTaskDescr[tid];
+ }
+ INLINE void SetTopLevelTaskDescr(int tid,
+ omptarget_nvptx_TaskDescr *taskICV) {
+ topTaskDescr[tid] = taskICV;
+ }
+ INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid);
+ // parallel
+ INLINE uint16_t &NumThreadsForNextParallel(int tid) {
+ return nextRegion.tnum[tid];
+ }
+ // simd
+ INLINE uint16_t &SimdLimitForNextSimd(int tid) {
+ return nextRegion.slim[tid];
+ }
+ // sync
+ INLINE Counter &Priv(int tid) { return priv[tid]; }
+ INLINE void IncrementPriv(int tid, Counter val) { priv[tid] += val; }
+ // schedule (for dispatch)
+ INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
+ INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
+ INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
+ INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; }
+ INLINE int64_t &Stride(int tid) { return stride[tid]; }
+
+ INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
+
+ INLINE void InitThreadPrivateContext(int tid);
+ INLINE void SetSourceQueue(uint64_t Src) { SourceQueue = Src; }
+ INLINE uint64_t GetSourceQueue() { return SourceQueue; }
+
+private:
+ // team context for this team
+ omptarget_nvptx_TeamDescr teamContext;
+  // task ICV for implicit threads in the only parallel region
+ omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
+ // pointer where to find the current task ICV (top of the stack)
+ omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
+ union {
+ // Only one of the two is live at the same time.
+ // parallel
+ uint16_t tnum[MAX_THREADS_PER_TEAM];
+ // simd limit
+ uint16_t slim[MAX_THREADS_PER_TEAM];
+ } nextRegion;
+ // sync
+ Counter priv[MAX_THREADS_PER_TEAM];
+ // schedule (for dispatch)
+ kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
+ int64_t chunk[MAX_THREADS_PER_TEAM];
+ int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
+ // state for dispatch with dyn/guided OR static (never use both at a time)
+ int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
+ int64_t stride[MAX_THREADS_PER_TEAM];
+ // Queue to which this object must be returned.
+ uint64_t SourceQueue;
+};
+
+/// Device environment data
+struct omptarget_device_environmentTy {
+ int32_t debug_level;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// global device environment
+////////////////////////////////////////////////////////////////////////////////
+
+extern __device__ omptarget_device_environmentTy omptarget_device_environment;
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// global data tables
+////////////////////////////////////////////////////////////////////////////////
+
+extern __device__ __shared__
+ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
+extern __device__ __shared__ uint32_t execution_param;
+extern __device__ __shared__ void *ReductionScratchpadPtr;
+
+////////////////////////////////////////////////////////////////////////////////
+// work function (outlined parallel/simd functions) and arguments.
+// needed for L1 parallelism only.
+////////////////////////////////////////////////////////////////////////////////
+
+typedef void *omptarget_nvptx_WorkFn;
+extern volatile __device__ __shared__ omptarget_nvptx_WorkFn
+ omptarget_nvptx_workFn;
+
+////////////////////////////////////////////////////////////////////////////////
+// get private data structures
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
+INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
+INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor();
+INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
+
+////////////////////////////////////////////////////////////////////////////////
+// inlined implementation
+////////////////////////////////////////////////////////////////////////////////
+
+#include "counter_groupi.h"
+#include "omptarget-nvptxi.h"
+#include "supporti.h"
+
+#endif
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
new file mode 100644
index 0000000..086f4c5
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
@@ -0,0 +1,218 @@
+//===---- omptarget-nvptxi.h - NVPTX OpenMP GPU initialization --- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of all library macros, types,
+// and functions.
+//
+//===----------------------------------------------------------------------===//
+
+////////////////////////////////////////////////////////////////////////////////
+// Task Descriptor
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() {
+ // sched starts from 1..4; encode it as 0..3; so add 1 here
+ uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1;
+ return (omp_sched_t)rc;
+}
+
+INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
+ // sched starts from 1..4; encode it as 0..3; so sub 1 here
+ uint8_t val = ((uint8_t)sched) - 1;
+ // clear current sched
+ items.flags &= ~TaskDescr_SchedMask;
+ // set new sched
+ items.flags |= val;
+}
+
+INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
+ // slow method
+ // flag:
+ // default sched is static,
+ // dyn is off (unused now anyway, but may need to sample from host ?)
+ // not in parallel
+
+ items.flags = 0;
+ items.nthreads = GetNumberOfProcsInTeam();
+ ; // threads: whatever was alloc by kernel
+ items.threadId = 0; // is master
+ items.threadsInTeam = 1; // sequential
+  items.runtimeChunkSize = 1; // preferred chunking: static with chunk 1
+}
+
+// This is called when all threads are started together in SPMD mode.
+// OMP directives include target parallel, target distribute parallel for, etc.
+INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
+ uint16_t tnum, omptarget_nvptx_TaskDescr *parentTaskDescr) {
+ // slow method
+ // flag:
+ // default sched is static,
+ // dyn is off (unused now anyway, but may need to sample from host ?)
+ // in L1 parallel
+
+ items.flags =
+ TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
+ items.nthreads = 0; // # threads for subsequent parallel region
+ items.threadId =
+ GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
+ items.threadsInTeam = tnum;
+  items.runtimeChunkSize = 1; // preferred chunking: static with chunk 1
+ prev = parentTaskDescr;
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyData(
+ omptarget_nvptx_TaskDescr *sourceTaskDescr) {
+ items = sourceTaskDescr->items;
+}
+
+INLINE void
+omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) {
+ CopyData(sourceTaskDescr);
+ prev = sourceTaskDescr->prev;
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyParent(
+ omptarget_nvptx_TaskDescr *parentTaskDescr) {
+ CopyData(parentTaskDescr);
+ prev = parentTaskDescr;
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
+ omptarget_nvptx_TaskDescr *parentTaskDescr) {
+ CopyParent(parentTaskDescr);
+ items.flags = items.flags & ~TaskDescr_IsParConstr;
+ ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task");
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
+ omptarget_nvptx_TaskDescr *masterTaskDescr, uint16_t tnum) {
+ CopyParent(masterTaskDescr);
+  // overwrite specific items;
+ items.flags |=
+ TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
+ items.threadsInTeam = tnum; // set number of threads
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
+ omptarget_nvptx_TaskDescr *workTaskDescr) {
+ Copy(workTaskDescr);
+ //
+  // overwrite specific items;
+ //
+ // The threadID should be GetThreadIdInBlock() % GetMasterThreadID().
+ // This is so that the serial master (first lane in the master warp)
+ // gets a threadId of 0.
+ // However, we know that this function is always called in a parallel
+ // region where only workers are active. The serial master thread
+ // never enters this region. When a parallel region is executed serially,
+ // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions
+ // are called, which never activate this region.
+ items.threadId =
+ GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
+ omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
+ CopyParent(parentTaskDescr);
+ items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
+ items.threadsInTeam = tnum; // set number of threads
+ items.threadId = tid;
+}
+
+INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() {
+ loopData.loopUpperBound =
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId);
+ loopData.nextLowerBound =
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId);
+ loopData.schedule =
+ omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId);
+ loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId);
+ loopData.stride =
+ omptarget_nvptx_threadPrivateContext->Stride(items.threadId);
+}
+
+INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const {
+ omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk;
+ omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) =
+ loopData.loopUpperBound;
+ omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) =
+ loopData.nextLowerBound;
+ omptarget_nvptx_threadPrivateContext->Stride(items.threadId) =
+ loopData.stride;
+ omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) =
+ loopData.schedule;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Thread Private Context
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE omptarget_nvptx_TaskDescr *
+omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) {
+ ASSERT0(
+ LT_FUSSY, tid < MAX_THREADS_PER_TEAM,
+ "Getting top level, tid is larger than allocated data structure size");
+ return topTaskDescr[tid];
+}
+
+INLINE void
+omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
+ // levelOneTaskDescr is init when starting the parallel region
+ // top task descr is NULL (team master version will be fixed separately)
+ topTaskDescr[tid] = NULL;
+ // no num threads value has been pushed
+ nextRegion.tnum[tid] = 0;
+ // priv counter init to zero
+ priv[tid] = 0;
+ // the following don't need to be init here; they are init when using dyn
+ // sched
+ // current_Event, events_Number, chunk, num_Iterations, schedule
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Work Descriptor
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE void omptarget_nvptx_WorkDescr::InitWorkDescr() {
+ cg.Clear(); // start and stop to zero too
+ // threadsInParallelTeam does not need to be init (done in start parallel)
+ hasCancel = FALSE;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Team Descriptor
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
+ levelZeroTaskDescr.InitLevelZeroTaskDescr();
+ workDescrForActiveParallel.InitWorkDescr();
+ // omp_init_lock(criticalLock);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Get private data structure for thread
+////////////////////////////////////////////////////////////////////////////////
+
+// Utility routines for CUDA threads
+INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() {
+ return omptarget_nvptx_threadPrivateContext->TeamContext();
+}
+
+INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() {
+ omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
+ return currTeamDescr.WorkDescr();
+}
+
+INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
+ return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+}
+
+INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() {
+ return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/option.h b/final/libomptarget/deviceRTLs/nvptx/src/option.h
new file mode 100644
index 0000000..43172ad
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/option.h
@@ -0,0 +1,70 @@
+//===------------ option.h - NVPTX OpenMP GPU options ------------ CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// GPU default options
+//
+//===----------------------------------------------------------------------===//
+#ifndef _OPTION_H_
+#define _OPTION_H_
+
+////////////////////////////////////////////////////////////////////////////////
+// Kernel options
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// The following def must match the absolute limit hardwired in the host RTL
+// max number of threads per team
+#define MAX_THREADS_PER_TEAM 1024
+
+#define WARPSIZE 32
+
+// The named barrier for active parallel threads of a team in an L1 parallel
+// region to synchronize with each other.
+#define L1_BARRIER (1)
+
+// Maximum number of preallocated arguments to an outlined parallel/simd function.
+// Anything more requires dynamic memory allocation.
+#define MAX_SHARED_ARGS 20
+
+// Maximum number of omp state objects per SM allocated statically in global
+// memory.
+#if __CUDA_ARCH__ >= 600
+#define OMP_STATE_COUNT 32
+#define MAX_SM 56
+#else
+#define OMP_STATE_COUNT 16
+#define MAX_SM 16
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// algo options
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// data options
+////////////////////////////////////////////////////////////////////////////////
+
+// decide if counters are 32 or 64 bit
+#define Counter unsigned long long
+
+////////////////////////////////////////////////////////////////////////////////
+// misc options (by default everything here is device)
+////////////////////////////////////////////////////////////////////////////////
+
+#define EXTERN extern "C" __device__
+#define INLINE __inline__ __device__
+#define NOINLINE __noinline__ __device__
+#ifndef TRUE
+#define TRUE 1
+#endif
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+#endif
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/final/libomptarget/deviceRTLs/nvptx/src/parallel.cu
new file mode 100644
index 0000000..33509b6
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -0,0 +1,479 @@
+//===---- parallel.cu - NVPTX OpenMP parallel implementation ----- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Parallel implementation in the GPU. Here is the pattern:
+//
+// while (not finished) {
+//
+// if (master) {
+// sequential code, decide which par loop to do, or if finished
+// __kmpc_kernel_prepare_parallel() // exec by master only
+// }
+// syncthreads // A
+// __kmpc_kernel_parallel() // exec by all
+// if (this thread is included in the parallel) {
+// switch () for all parallel loops
+// __kmpc_kernel_end_parallel() // exec only by threads in parallel
+// }
+//
+//
+// The reason we don't exec end_parallel for the threads not included
+// in the parallel loop is that for each barrier in the parallel
+// region, these non-included threads will cycle through the
+// syncthread A. Thus they must preserve their current threadId that
+// is larger than thread in team.
+//
+// To make a long story short...
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+typedef struct ConvergentSimdJob {
+ omptarget_nvptx_TaskDescr taskDescr;
+ omptarget_nvptx_TaskDescr *convHeadTaskDescr;
+ uint16_t slimForNextSimd;
+} ConvergentSimdJob;
+
+////////////////////////////////////////////////////////////////////////////////
+// support for convergent simd (team of threads in a warp only)
+////////////////////////////////////////////////////////////////////////////////
+EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
+ bool *IsFinal, int32_t *LaneSource,
+ int32_t *LaneId, int32_t *NumLanes) {
+ PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
+ uint32_t ConvergentMask = Mask;
+ int32_t ConvergentSize = __popc(ConvergentMask);
+ uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
+ *LaneSource += __ffs(WorkRemaining);
+ *IsFinal = __popc(WorkRemaining) == 1;
+ uint32_t lanemask_lt;
+ asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
+ *LaneId = __popc(ConvergentMask & lanemask_lt);
+
+ int threadId = GetLogicalThreadIdInBlock();
+ int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
+
+ ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
+ int32_t SimdLimit =
+ omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
+ job->slimForNextSimd = SimdLimit;
+
+ int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource);
+ // reset simdlimit to avoid propagating to successive #simd
+ if (SimdLimitSource > 0 && threadId == sourceThreadId)
+ omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;
+
+ // We cannot have more than the # of convergent threads.
+ if (SimdLimitSource > 0)
+ *NumLanes = min(ConvergentSize, SimdLimitSource);
+ else
+ *NumLanes = ConvergentSize;
+ ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
+ *NumLanes);
+
+ // Set to true for lanes participating in the simd region.
+ bool isActive = false;
+ // Initialize state for active threads.
+ if (*LaneId < *NumLanes) {
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+ omptarget_nvptx_TaskDescr *sourceTaskDescr =
+ omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
+ sourceThreadId);
+ job->convHeadTaskDescr = currTaskDescr;
+ // install top descriptor from the thread for which the lanes are working.
+ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
+ sourceTaskDescr);
+ isActive = true;
+ }
+
+ // requires a memory fence between threads of a warp
+ return isActive;
+}
+
+EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) {
+ PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
+ // pop stack
+ int threadId = GetLogicalThreadIdInBlock();
+ ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
+ omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) =
+ job->slimForNextSimd;
+ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
+ threadId, job->convHeadTaskDescr);
+}
+
+typedef struct ConvergentParallelJob {
+ omptarget_nvptx_TaskDescr taskDescr;
+ omptarget_nvptx_TaskDescr *convHeadTaskDescr;
+ uint16_t tnumForNextPar;
+} ConvergentParallelJob;
+
+////////////////////////////////////////////////////////////////////////////////
+// support for convergent parallelism (team of threads in a warp only)
+////////////////////////////////////////////////////////////////////////////////
+EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
+ bool *IsFinal,
+ int32_t *LaneSource) {
+ PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
+ uint32_t ConvergentMask = Mask;
+ int32_t ConvergentSize = __popc(ConvergentMask);
+ uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
+ *LaneSource += __ffs(WorkRemaining);
+ *IsFinal = __popc(WorkRemaining) == 1;
+ uint32_t lanemask_lt;
+ asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
+ uint32_t OmpId = __popc(ConvergentMask & lanemask_lt);
+
+ int threadId = GetLogicalThreadIdInBlock();
+ int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
+
+ ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
+ int32_t NumThreadsClause =
+ omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
+ job->tnumForNextPar = NumThreadsClause;
+
+ int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource);
+ // reset numthreads to avoid propagating to successive #parallel
+ if (NumThreadsSource > 0 && threadId == sourceThreadId)
+ omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
+ 0;
+
+ // We cannot have more than the # of convergent threads.
+ uint16_t NumThreads;
+ if (NumThreadsSource > 0)
+ NumThreads = min(ConvergentSize, NumThreadsSource);
+ else
+ NumThreads = ConvergentSize;
+ ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
+ NumThreads);
+
+ // Set to true for workers participating in the parallel region.
+ bool isActive = false;
+ // Initialize state for active threads.
+ if (OmpId < NumThreads) {
+ // init L2 task descriptor and storage for the L1 parallel task descriptor.
+ omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr;
+ ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+ omptarget_nvptx_TaskDescr *sourceTaskDescr =
+ omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
+ sourceThreadId);
+ job->convHeadTaskDescr = currTaskDescr;
+ newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads);
+ // install new top descriptor
+ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
+ newTaskDescr);
+ isActive = true;
+ }
+
+ // requires a memory fence between threads of a warp
+ return isActive;
+}
+
+EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
+ PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
+ // pop stack
+ int threadId = GetLogicalThreadIdInBlock();
+ ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
+ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
+ threadId, job->convHeadTaskDescr);
+ omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
+ job->tnumForNextPar;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// support for parallel that goes parallel (1 static level only)
+////////////////////////////////////////////////////////////////////////////////
+
+// return number of cuda threads that participate to parallel
+// calculation has to consider simd implementation in nvptx
+// i.e. (num omp threads * num lanes)
+//
+// cudathreads =
+// if(num_threads != 0) {
+// if(thread_limit > 0) {
+// min (num_threads*numLanes ; thread_limit*numLanes);
+// } else {
+// min (num_threads*numLanes; blockDim.x)
+// }
+// } else {
+// if (thread_limit != 0) {
+// min (thread_limit*numLanes; blockDim.x)
+// } else { // no thread_limit, no num_threads, use all cuda threads
+// blockDim.x;
+// }
+// }
+//
+// This routine is always called by the team master.
+EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
+ int16_t IsOMPRuntimeInitialized) {
+ PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
+ omptarget_nvptx_workFn = WorkFn;
+
+ if (!IsOMPRuntimeInitialized)
+ return;
+
+ // This routine is only called by the team master. The team master is
+ // the first thread of the last warp. It always has the logical thread
+ // id of 0 (since it is a shadow for the first worker thread).
+ int threadId = 0;
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+ ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
+ ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
+ "cannot be called in a parallel region.");
+ if (currTaskDescr->InParallelRegion()) {
+ PRINT0(LD_PAR, "already in parallel: go seq\n");
+ return;
+ }
+
+ uint16_t CudaThreadsForParallel = 0;
+ uint16_t NumThreadsClause =
+ omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
+
+ // we cannot have more than block size
+ uint16_t CudaThreadsAvail = GetNumberOfWorkersInTeam();
+
+ // currTaskDescr->ThreadLimit(): If non-zero, this is the limit as
+ // specified by the thread_limit clause on the target directive.
+ // GetNumberOfWorkersInTeam(): This is the number of workers available
+ // in this kernel instance.
+ //
+ // E.g: If thread_limit is 33, the kernel is launched with 33+32=65
+ // threads. The last warp is the master warp so in this case
+ // GetNumberOfWorkersInTeam() returns 64.
+
+ // this is different from ThreadAvail of OpenMP because we may be
+ // using some of the CUDA threads as SIMD lanes
+ int NumLanes = 1;
+ if (NumThreadsClause != 0) {
+ // reset request to avoid propagating to successive #parallel
+ omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
+ 0;
+
+ // assume that thread_limit*numlanes is already <= CudaThreadsAvail
+ // because that is already checked on the host side (CUDA offloading rtl)
+ if (currTaskDescr->ThreadLimit() != 0)
+ CudaThreadsForParallel =
+ NumThreadsClause * NumLanes < currTaskDescr->ThreadLimit() * NumLanes
+ ? NumThreadsClause * NumLanes
+ : currTaskDescr->ThreadLimit() * NumLanes;
+ else {
+ CudaThreadsForParallel = (NumThreadsClause * NumLanes > CudaThreadsAvail)
+ ? CudaThreadsAvail
+ : NumThreadsClause * NumLanes;
+ }
+ } else {
+ if (currTaskDescr->ThreadLimit() != 0) {
+ CudaThreadsForParallel =
+ (currTaskDescr->ThreadLimit() * NumLanes > CudaThreadsAvail)
+ ? CudaThreadsAvail
+ : currTaskDescr->ThreadLimit() * NumLanes;
+ } else
+ CudaThreadsForParallel = CudaThreadsAvail;
+ }
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+ // On Volta and newer architectures we require that all lanes in
+ // a warp participate in the parallel region. Round down to a
+ // multiple of WARPSIZE since it is legal to do so in OpenMP.
+ // CudaThreadsAvail is the number of workers available in this
+ // kernel instance and is greater than or equal to
+ // currTaskDescr->ThreadLimit().
+ if (CudaThreadsForParallel < CudaThreadsAvail) {
+ CudaThreadsForParallel =
+ (CudaThreadsForParallel < WARPSIZE)
+ ? 1
+ : CudaThreadsForParallel & ~((uint16_t)WARPSIZE - 1);
+ }
+#endif
+
+ ASSERT(LT_FUSSY, CudaThreadsForParallel > 0,
+ "bad thread request of %d threads", CudaThreadsForParallel);
+ ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
+ "only team master can create parallel");
+
+ // set number of threads on work descriptor
+ // this is different from the number of cuda threads required for the parallel
+ // region
+ omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
+ workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr,
+ CudaThreadsForParallel / NumLanes);
+ // init counters (copy start to init)
+ workDescr.CounterGroup().Reset();
+}
+
+// All workers call this function. Deactivate those not needed.
+// Fn - the outlined work function to execute.
+// returns True if this thread is active, else False.
+//
+// Only the worker threads call this routine.
+EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
+ int16_t IsOMPRuntimeInitialized) {
+ PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");
+
+ // Work function and arguments for L1 parallel region.
+ *WorkFn = omptarget_nvptx_workFn;
+
+ if (!IsOMPRuntimeInitialized)
+ return true;
+
+ // If this is the termination signal from the master, quit early.
+ if (!*WorkFn)
+ return false;
+
+ // Only the worker threads call this routine and the master warp
+ // never arrives here. Therefore, use the nvptx thread id.
+ int threadId = GetThreadIdInBlock();
+ omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
+ // Set to true for workers participating in the parallel region.
+ bool isActive = false;
+ // Initialize state for active threads.
+ if (threadId < workDescr.WorkTaskDescr()->ThreadsInTeam()) {
+    // init task descriptor from the work descriptor
+ omptarget_nvptx_TaskDescr *newTaskDescr =
+ omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
+ ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
+ newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr());
+ // install new top descriptor
+ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
+ newTaskDescr);
+ // init private from int value
+ workDescr.CounterGroup().Init(
+ omptarget_nvptx_threadPrivateContext->Priv(threadId));
+ PRINT(LD_PAR,
+ "thread will execute parallel region with id %d in a team of "
+ "%d threads\n",
+ newTaskDescr->ThreadId(), newTaskDescr->NThreads());
+
+ isActive = true;
+ }
+
+ return isActive;
+}
+
+EXTERN void __kmpc_kernel_end_parallel() {
+ // pop stack
+ PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
+ // Only the worker threads call this routine and the master warp
+ // never arrives here. Therefore, use the nvptx thread id.
+ int threadId = GetThreadIdInBlock();
+ omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
+ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
+ threadId, currTaskDescr->GetPrevTaskDescr());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// support for parallel that goes sequential
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) {
+ PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");
+
+ // assume this is only called for nested parallel
+ int threadId = GetLogicalThreadIdInBlock();
+
+ // unlike actual parallel, threads in the same team do not share
+ // the workTaskDescr in this case and num threads is fixed to 1
+
+ // get current task
+ omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
+ currTaskDescr->SaveLoopData();
+
+ // allocate new task descriptor and copy value from current one, set prev to
+ // it
+ omptarget_nvptx_TaskDescr *newTaskDescr =
+ (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr),
+ (char *)"new seq parallel task");
+ newTaskDescr->CopyParent(currTaskDescr);
+
+ // tweak values for serialized parallel case:
+ // - each thread becomes ID 0 in its serialized parallel, and
+ // - there is only one thread per team
+ newTaskDescr->ThreadId() = 0;
+ newTaskDescr->ThreadsInTeam() = 1;
+
+ // set new task descriptor as top
+ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
+ newTaskDescr);
+}
+
+EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc,
+ uint32_t global_tid) {
+ PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");
+
+ // pop stack
+ int threadId = GetLogicalThreadIdInBlock();
+ omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
+ // set new top
+ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
+ threadId, currTaskDescr->GetPrevTaskDescr());
+ // free
+ SafeFree(currTaskDescr, (char *)"new seq parallel task");
+ currTaskDescr = getMyTopTaskDescriptor(threadId);
+ currTaskDescr->RestoreLoopData();
+}
+
+EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) {
+ PRINT0(LD_IO, "call to __kmpc_parallel_level\n");
+
+ int threadId = GetLogicalThreadIdInBlock();
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+ if (currTaskDescr->InL2OrHigherParallelRegion())
+ return 2;
+ else if (currTaskDescr->InParallelRegion())
+ return 1;
+ else
+ return 0;
+}
+
+// This kmpc call returns the thread id across all teams. Its value is
+// cached by the compiler and used when calling the runtime. On nvptx
+// it's cheap to recalculate this value so we never use the result
+// of this call.
+EXTERN int32_t __kmpc_global_thread_num(kmp_Indent *loc) {
+ return GetLogicalThreadIdInBlock();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// push params
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t tid,
+ int32_t num_threads) {
+ PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
+ tid = GetLogicalThreadIdInBlock();
+ omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
+ num_threads;
+}
+
+EXTERN void __kmpc_push_simd_limit(kmp_Indent *loc, int32_t tid,
+ int32_t simd_limit) {
+ PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", simd_limit);
+ tid = GetLogicalThreadIdInBlock();
+ omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
+}
+
+// Do nothing. The host guarantees we started the requested number of
+// teams and we only need inspection of gridDim.
+
+EXTERN void __kmpc_push_num_teams(kmp_Indent *loc, int32_t tid,
+ int32_t num_teams, int32_t thread_limit) {
+ PRINT(LD_IO, "call kmpc_push_num_teams %d\n", num_teams);
+ ASSERT0(LT_FUSSY, FALSE,
+ "should never have anything with new teams on device");
+}
+
+EXTERN void __kmpc_push_proc_bind(kmp_Indent *loc, uint32_t tid,
+ int proc_bind) {
+ PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", proc_bind);
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/final/libomptarget/deviceRTLs/nvptx/src/reduction.cu
new file mode 100644
index 0000000..b813a11
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -0,0 +1,429 @@
+//===---- reduction.cu - NVPTX OpenMP reduction implementation ---- CUDA
+//-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of reduction with KMPC interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include <complex.h>
+#include <stdio.h>
+
+#include "omptarget-nvptx.h"
+
+// may eventually remove this
+EXTERN
+int32_t __gpu_block_reduce() {
+ int tid = GetLogicalThreadIdInBlock();
+ int nt = GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
+ if (nt != blockDim.x)
+ return 0;
+ unsigned tnum = __ACTIVEMASK();
+  if (tnum != (~0x0)) // assume warp size is 32
+ return 0;
+ return 1;
+}
+
+EXTERN
+int32_t __kmpc_reduce_gpu(kmp_Indent *loc, int32_t global_tid, int32_t num_vars,
+ size_t reduce_size, void *reduce_data,
+ void *reduce_array_size, kmp_ReductFctPtr *reductFct,
+ kmp_CriticalName *lck) {
+ int threadId = GetLogicalThreadIdInBlock();
+ omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
+ int numthread;
+ if (currTaskDescr->IsParallelConstruct()) {
+ numthread =
+ GetNumberOfOmpThreads(threadId, isSPMDMode(), isRuntimeUninitialized());
+ } else {
+ numthread = GetNumberOfOmpTeams();
+ }
+
+ if (numthread == 1)
+ return 1;
+ if (!__gpu_block_reduce())
+ return 2;
+ if (threadIdx.x == 0)
+ return 1;
+ return 0;
+}
+
+EXTERN
+int32_t __kmpc_reduce_combined(kmp_Indent *loc) {
+ return threadIdx.x == 0 ? 2 : 0;
+}
+
+EXTERN
+int32_t __kmpc_reduce_simd(kmp_Indent *loc) {
+ return (threadIdx.x % 32 == 0) ? 1 : 0;
+}
+
+EXTERN
+void __kmpc_nvptx_end_reduce(int32_t global_tid) {}
+
+EXTERN
+void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}
+
+EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
+ return __SHFL_DOWN_SYNC(0xFFFFFFFF, val, delta, size);
+}
+
+EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
+ int lo, hi;
+ asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
+ hi = __SHFL_DOWN_SYNC(0xFFFFFFFF, hi, delta, size);
+ lo = __SHFL_DOWN_SYNC(0xFFFFFFFF, lo, delta, size);
+ asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
+ return val;
+}
+
+static INLINE void gpu_regular_warp_reduce(void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct) {
+ for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
+ shflFct(reduce_data, /*LaneId - not used= */ 0,
+ /*Offset = */ mask, /*AlgoVersion=*/0);
+ }
+}
+
+static INLINE void gpu_irregular_warp_reduce(void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct,
+ uint32_t size, uint32_t tid) {
+ uint32_t curr_size;
+ uint32_t mask;
+ curr_size = size;
+ mask = curr_size / 2;
+ while (mask > 0) {
+ shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
+ curr_size = (curr_size + 1) / 2;
+ mask = curr_size / 2;
+ }
+}
+
+static INLINE uint32_t
+gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
+ uint32_t lanemask_lt;
+ uint32_t lanemask_gt;
+ uint32_t size, remote_id, physical_lane_id;
+ physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
+ asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
+ uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+ uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2;
+ asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt));
+ do {
+ Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+ remote_id = __ffs(Liveness & lanemask_gt);
+ size = __popc(Liveness);
+ logical_lane_id /= 2;
+ shflFct(reduce_data, /*LaneId =*/logical_lane_id,
+ /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
+ } while (logical_lane_id % 2 == 0 && size > 1);
+ return (logical_lane_id == 0);
+}
+
+EXTERN
+int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars,
+ size_t reduce_size, void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct,
+ kmp_InterWarpCopyFctPtr cpyFct) {
+ uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+ if (Liveness == 0xffffffff) {
+ gpu_regular_warp_reduce(reduce_data, shflFct);
+ return GetThreadIdInBlock() % WARPSIZE ==
+ 0; // Result on lane 0 of the simd warp.
+ } else {
+ return gpu_irregular_simd_reduce(
+ reduce_data, shflFct); // Result on the first active lane.
+ }
+}
+
+INLINE
+int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars,
+ size_t reduce_size, void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct,
+ kmp_InterWarpCopyFctPtr cpyFct,
+ bool isSPMDExecutionMode,
+ bool isRuntimeUninitialized = false) {
+ uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
+ uint32_t NumThreads = GetNumberOfOmpThreads(
+ BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
+ if (NumThreads == 1)
+ return 1;
+ /*
+ * This reduce function handles reduction within a team. It handles
+ * parallel regions in both L1 and L2 parallelism levels. It also
+ * supports Generic, SPMD, and NoOMP modes.
+ *
+ * 1. Reduce within a warp.
+ * 2. Warp master copies value to warp 0 via shared memory.
+ * 3. Warp 0 reduces to a single value.
+ * 4. The reduced value is available in the thread that returns 1.
+ */
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+ uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
+ uint32_t WarpId = BlockThreadId / WARPSIZE;
+
+ // Volta execution model:
+ // For the Generic execution mode a parallel region either has 1 thread and
+ // beyond that, always a multiple of 32. For the SPMD execution mode we may
+ // have any number of threads.
+ if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
+ gpu_regular_warp_reduce(reduce_data, shflFct);
+ else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
+ gpu_irregular_warp_reduce(reduce_data, shflFct,
+ /*LaneCount=*/NumThreads % WARPSIZE,
+ /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
+
+ // When we have more than [warpsize] number of threads
+ // a block reduction is performed here.
+ //
+ // Only L1 parallel region can enter this if condition.
+ if (NumThreads > WARPSIZE) {
+ // Gather all the reduced values from each warp
+ // to the first warp.
+ cpyFct(reduce_data, WarpsNeeded);
+
+ if (WarpId == 0)
+ gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+ BlockThreadId);
+
+ return BlockThreadId == 0;
+ }
+ return BlockThreadId == 0;
+#else
+ uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+ if (Liveness == 0xffffffff) // Full warp
+ gpu_regular_warp_reduce(reduce_data, shflFct);
+ else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
+ gpu_irregular_warp_reduce(reduce_data, shflFct,
+ /*LaneCount=*/__popc(Liveness),
+ /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
+ else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
+ // parallel region may enter here; return
+ // early.
+ return gpu_irregular_simd_reduce(reduce_data, shflFct);
+
+ // When we have more than [warpsize] number of threads
+ // a block reduction is performed here.
+ //
+ // Only L1 parallel region can enter this if condition.
+ if (NumThreads > WARPSIZE) {
+ uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
+ // Gather all the reduced values from each warp
+ // to the first warp.
+ cpyFct(reduce_data, WarpsNeeded);
+
+ uint32_t WarpId = BlockThreadId / WARPSIZE;
+ if (WarpId == 0)
+ gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+ BlockThreadId);
+
+ return BlockThreadId == 0;
+ } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) {
+ return BlockThreadId == 0;
+ }
+
+ // Get the OMP thread Id. This is different from BlockThreadId in the case of
+ // an L2 parallel region.
+ return GetOmpThreadId(BlockThreadId, isSPMDExecutionMode,
+ isRuntimeUninitialized) == 0;
+#endif // __CUDA_ARCH__ >= 700
+}
+
+EXTERN
+int32_t __kmpc_nvptx_parallel_reduce_nowait(
+ int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
+ return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
+ reduce_data, shflFct, cpyFct,
+ /*isSPMDExecutionMode=*/isSPMDMode());
+}
+
+EXTERN
+int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
+ int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
+ return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
+ reduce_data, shflFct, cpyFct,
+ /*isSPMDExecutionMode=*/true,
+ /*isRuntimeUninitialized=*/true);
+}
+
+EXTERN
+int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
+ int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
+ return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
+ reduce_data, shflFct, cpyFct,
+ /*isSPMDExecutionMode=*/false,
+ /*isRuntimeUninitialized=*/true);
+}
+
+INLINE
+int32_t nvptx_teams_reduce_nowait(
+ int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
+ kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct,
+ bool isSPMDExecutionMode, bool isRuntimeUninitialized = false) {
+ uint32_t ThreadId = GetLogicalThreadIdInBlock();
+ // In non-generic mode all workers participate in the teams reduction.
+ // In generic mode only the team master participates in the teams
+ // reduction because the workers are waiting for parallel work.
+ uint32_t NumThreads =
+ isSPMDExecutionMode
+ ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true,
+ isRuntimeUninitialized)
+ : /*Master thread only*/ 1;
+ uint32_t TeamId = GetBlockIdInKernel();
+ uint32_t NumTeams = GetNumberOfBlocksInKernel();
+ __shared__ volatile bool IsLastTeam;
+
+ // Team masters of all teams write to the scratchpad.
+ if (ThreadId == 0) {
+ unsigned int *timestamp = GetTeamsReductionTimestamp();
+ char *scratchpad = GetTeamsReductionScratchpad();
+
+ scratchFct(reduce_data, scratchpad, TeamId, NumTeams);
+ __threadfence();
+
+ // atomicInc increments 'timestamp' and has a range [0, NumTeams-1].
+ // It resets 'timestamp' back to 0 once the last team increments
+ // this counter.
+ unsigned val = atomicInc(timestamp, NumTeams - 1);
+ IsLastTeam = val == NumTeams - 1;
+ }
+
+ // We have to wait on L1 barrier because in GENERIC mode the workers
+ // are waiting on barrier 0 for work.
+ //
+ // If we guard this barrier as follows it leads to deadlock, probably
+ // because of a compiler bug: if (!IsGenericMode()) __syncthreads();
+ uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE;
+ named_sync(L1_BARRIER, SyncWarps * WARPSIZE);
+
+ // If this team is not the last, quit.
+ if (/* Volatile read by all threads */ !IsLastTeam)
+ return 0;
+
+ //
+ // Last team processing.
+ //
+
+ // Threads in excess of #teams do not participate in reduction of the
+ // scratchpad values.
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+ uint32_t ActiveThreads = NumThreads;
+ if (NumTeams < NumThreads) {
+ ActiveThreads =
+ (NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1);
+ }
+ if (ThreadId >= ActiveThreads)
+ return 0;
+
+ // Load from scratchpad and reduce.
+ char *scratchpad = GetTeamsReductionScratchpad();
+ ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
+ for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads)
+ ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
+
+ uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
+ uint32_t WarpId = ThreadId / WARPSIZE;
+
+ // Reduce across warps to the warp master.
+ if ((ActiveThreads % WARPSIZE == 0) ||
+ (WarpId < WarpsNeeded - 1)) // Full warp
+ gpu_regular_warp_reduce(reduce_data, shflFct);
+ else if (ActiveThreads > 1) // Partial warp but contiguous lanes
+      // Only SPMD execution mode comes through this case.
+ gpu_irregular_warp_reduce(reduce_data, shflFct,
+ /*LaneCount=*/ActiveThreads % WARPSIZE,
+ /*LaneId=*/ThreadId % WARPSIZE);
+
+ // When we have more than [warpsize] number of threads
+ // a block reduction is performed here.
+ if (ActiveThreads > WARPSIZE) {
+ // Gather all the reduced values from each warp
+ // to the first warp.
+ cpyFct(reduce_data, WarpsNeeded);
+
+ if (WarpId == 0)
+ gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
+ }
+#else
+ if (ThreadId >= NumTeams)
+ return 0;
+
+ // Load from scratchpad and reduce.
+ char *scratchpad = GetTeamsReductionScratchpad();
+ ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
+ for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
+ ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
+
+ // Reduce across warps to the warp master.
+ uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
+ if (Liveness == 0xffffffff) // Full warp
+ gpu_regular_warp_reduce(reduce_data, shflFct);
+ else // Partial warp but contiguous lanes
+ gpu_irregular_warp_reduce(reduce_data, shflFct,
+ /*LaneCount=*/__popc(Liveness),
+ /*LaneId=*/ThreadId % WARPSIZE);
+
+ // When we have more than [warpsize] number of threads
+ // a block reduction is performed here.
+ uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads;
+ if (ActiveThreads > WARPSIZE) {
+ uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
+ // Gather all the reduced values from each warp
+ // to the first warp.
+ cpyFct(reduce_data, WarpsNeeded);
+
+ uint32_t WarpId = ThreadId / WARPSIZE;
+ if (WarpId == 0)
+ gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
+ }
+#endif // __CUDA_ARCH__ >= 700
+
+ return ThreadId == 0;
+}
+
+EXTERN
+int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
+ size_t reduce_size, void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct,
+ kmp_InterWarpCopyFctPtr cpyFct,
+ kmp_CopyToScratchpadFctPtr scratchFct,
+ kmp_LoadReduceFctPtr ldFct) {
+ return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
+ reduce_data, shflFct, cpyFct, scratchFct,
+ ldFct, /*isSPMDExecutionMode=*/isSPMDMode());
+}
+
+EXTERN
+int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
+ int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
+ kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
+ return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
+ reduce_data, shflFct, cpyFct, scratchFct,
+ ldFct,
+ /*isSPMDExecutionMode=*/true,
+ /*isRuntimeUninitialized=*/true);
+}
+
+EXTERN
+int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
+ int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+ kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
+ kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
+ return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
+ reduce_data, shflFct, cpyFct, scratchFct,
+ ldFct,
+ /*isSPMDExecutionMode=*/false,
+ /*isRuntimeUninitialized=*/true);
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/state-queue.h b/final/libomptarget/deviceRTLs/nvptx/src/state-queue.h
new file mode 100644
index 0000000..accb1f7
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/state-queue.h
@@ -0,0 +1,52 @@
+//===-------- state-queue.h - NVPTX OpenMP GPU State Queue ------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a queue to hand out OpenMP state objects to teams of
+// one or more kernels.
+//
+// Reference:
+// Thomas R.W. Scogland and Wu-chun Feng. 2015.
+// Design and Evaluation of Scalable Concurrent Queues for Many-Core
+// Architectures. International Conference on Performance Engineering.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __STATE_QUEUE_H
+#define __STATE_QUEUE_H
+
+#include <stdint.h>
+
+#include "option.h" // choices we have
+
+template <typename ElementType, uint32_t SIZE> class omptarget_nvptx_Queue {
+private:
+ ElementType elements[SIZE];
+ volatile ElementType *elementQueue[SIZE];
+ volatile uint32_t head;
+ volatile uint32_t ids[SIZE];
+ volatile uint32_t tail;
+
+ static const uint32_t MAX_ID = (1u << 31) / SIZE / 2;
+ INLINE uint32_t ENQUEUE_TICKET();
+ INLINE uint32_t DEQUEUE_TICKET();
+ INLINE uint32_t ID(uint32_t ticket);
+ INLINE bool IsServing(uint32_t slot, uint32_t id);
+ INLINE void PushElement(uint32_t slot, ElementType *element);
+ INLINE ElementType *PopElement(uint32_t slot);
+ INLINE void DoneServing(uint32_t slot, uint32_t id);
+
+public:
+ INLINE omptarget_nvptx_Queue(){};
+ INLINE void Enqueue(ElementType *element);
+ INLINE ElementType *Dequeue();
+};
+
+#include "state-queuei.h"
+
+#endif
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/state-queuei.h b/final/libomptarget/deviceRTLs/nvptx/src/state-queuei.h
new file mode 100644
index 0000000..c9ffd54
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/state-queuei.h
@@ -0,0 +1,89 @@
+//===------- state-queuei.h - NVPTX OpenMP GPU State Queue ------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of a queue to hand out OpenMP state
+// objects to teams of one or more kernels.
+//
+// Reference:
+// Thomas R.W. Scogland and Wu-chun Feng. 2015.
+// Design and Evaluation of Scalable Concurrent Queues for Many-Core
+// Architectures. International Conference on Performance Engineering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "state-queue.h"
+
+template <typename ElementType, uint32_t SIZE>
+INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ENQUEUE_TICKET() {
+ return atomicAdd((unsigned int *)&tail, 1);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::DEQUEUE_TICKET() {
+ return atomicAdd((unsigned int *)&head, 1);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
+ return (ticket / SIZE) * 2;
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE bool omptarget_nvptx_Queue<ElementType, SIZE>::IsServing(uint32_t slot,
+ uint32_t id) {
+ return atomicAdd((unsigned int *)&ids[slot], 0) == id;
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE void
+omptarget_nvptx_Queue<ElementType, SIZE>::PushElement(uint32_t slot,
+ ElementType *element) {
+ atomicExch((unsigned long long *)&elementQueue[slot],
+ (unsigned long long)element);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE ElementType *
+omptarget_nvptx_Queue<ElementType, SIZE>::PopElement(uint32_t slot) {
+ return (ElementType *)atomicAdd((unsigned long long *)&elementQueue[slot],
+ (unsigned long long)0);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE void omptarget_nvptx_Queue<ElementType, SIZE>::DoneServing(uint32_t slot,
+ uint32_t id) {
+ atomicExch((unsigned int *)&ids[slot], (id + 1) % MAX_ID);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE void
+omptarget_nvptx_Queue<ElementType, SIZE>::Enqueue(ElementType *element) {
+ uint32_t ticket = ENQUEUE_TICKET();
+ uint32_t slot = ticket % SIZE;
+ uint32_t id = ID(ticket) + 1;
+ while (!IsServing(slot, id))
+ ;
+ PushElement(slot, element);
+ DoneServing(slot, id);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE ElementType *omptarget_nvptx_Queue<ElementType, SIZE>::Dequeue() {
+ uint32_t ticket = DEQUEUE_TICKET();
+ uint32_t slot = ticket % SIZE;
+ uint32_t id = ID(ticket);
+ while (!IsServing(slot, id))
+ ;
+ ElementType *element = PopElement(slot);
+ // This is to populate the queue because of the lack of GPU constructors.
+ if (element == 0)
+ element = &elements[slot];
+ DoneServing(slot, id);
+ return element;
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/support.h b/final/libomptarget/deviceRTLs/nvptx/src/support.h
new file mode 100644
index 0000000..44298f4
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/support.h
@@ -0,0 +1,92 @@
+//===--------- support.h - NVPTX OpenMP support functions -------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Wrapper to some functions natively supported by the GPU.
+//
+//===----------------------------------------------------------------------===//
+
+////////////////////////////////////////////////////////////////////////////////
+// Execution Parameters
+////////////////////////////////////////////////////////////////////////////////
+enum ExecutionMode {
+ Generic = 0x00u,
+ Spmd = 0x01u,
+ ModeMask = 0x01u,
+};
+
+enum RuntimeMode {
+ RuntimeInitialized = 0x00u,
+ RuntimeUninitialized = 0x02u,
+ RuntimeMask = 0x02u,
+};
+
+INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode);
+INLINE bool isGenericMode();
+INLINE bool isSPMDMode();
+INLINE bool isRuntimeUninitialized();
+INLINE bool isRuntimeInitialized();
+
+////////////////////////////////////////////////////////////////////////////////
+// get info from machine
+////////////////////////////////////////////////////////////////////////////////
+
+// get low level ids of resources
+INLINE int GetThreadIdInBlock();
+INLINE int GetBlockIdInKernel();
+INLINE int GetNumberOfBlocksInKernel();
+INLINE int GetNumberOfThreadsInBlock();
+
+// get global ids to locate thread/team info (constant regardless of OMP)
+INLINE int GetLogicalThreadIdInBlock();
+INLINE int GetMasterThreadID();
+INLINE int GetNumberOfWorkersInTeam();
+
+// get OpenMP thread and team ids
+INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode,
+ bool isRuntimeUninitialized); // omp_thread_num
+INLINE int GetOmpTeamId(); // omp_team_num
+
+// get OpenMP number of threads and team
+INLINE int
+GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode,
+ bool isRuntimeUninitialized); // omp_num_threads
+INLINE int GetNumberOfOmpTeams(); // omp_num_teams
+
+// get OpenMP number of procs
+INLINE int GetNumberOfProcsInTeam();
+INLINE int GetNumberOfProcsInDevice();
+
+// masters
+INLINE int IsTeamMaster(int ompThreadId);
+
+////////////////////////////////////////////////////////////////////////////////
+// Memory
+////////////////////////////////////////////////////////////////////////////////
+
+// safe alloc and free
+INLINE void *SafeMalloc(size_t size, const char *msg); // check if success
+INLINE void *SafeFree(void *ptr, const char *msg);
+// pad to an alignment (power of 2 only)
+INLINE unsigned long PadBytes(unsigned long size, unsigned long alignment);
+#define ADD_BYTES(_addr, _bytes) \
+ ((void *)((char *)((void *)(_addr)) + (_bytes)))
+#define SUB_BYTES(_addr, _bytes) \
+ ((void *)((char *)((void *)(_addr)) - (_bytes)))
+
+////////////////////////////////////////////////////////////////////////////////
+// Named Barrier Routines
+////////////////////////////////////////////////////////////////////////////////
+INLINE void named_sync(const int barrier, const int num_threads);
+
+////////////////////////////////////////////////////////////////////////////////
+// Teams Reduction Scratchpad Helpers
+////////////////////////////////////////////////////////////////////////////////
+INLINE unsigned int *GetTeamsReductionTimestamp();
+INLINE char *GetTeamsReductionScratchpad();
+INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/supporti.h b/final/libomptarget/deviceRTLs/nvptx/src/supporti.h
new file mode 100644
index 0000000..4de2039
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -0,0 +1,215 @@
+//===--------- supporti.h - NVPTX OpenMP support functions ------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Wrapper implementation to some functions natively supported by the GPU.
+//
+//===----------------------------------------------------------------------===//
+
+////////////////////////////////////////////////////////////////////////////////
+// Execution Parameters
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) {
+ execution_param = EMode;
+ execution_param |= RMode;
+}
+
+INLINE bool isGenericMode() { return (execution_param & ModeMask) == Generic; }
+
+INLINE bool isSPMDMode() { return (execution_param & ModeMask) == Spmd; }
+
+INLINE bool isRuntimeUninitialized() {
+ return (execution_param & RuntimeMask) == RuntimeUninitialized;
+}
+
+INLINE bool isRuntimeInitialized() {
+ return (execution_param & RuntimeMask) == RuntimeInitialized;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// support: get info from machine
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Calls to the NVPTX layer (assuming 1D layout)
+//
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE int GetThreadIdInBlock() { return threadIdx.x; }
+
+INLINE int GetBlockIdInKernel() { return blockIdx.x; }
+
+INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; }
+
+INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Calls to the Generic Scheme Implementation Layer (assuming 1D layout)
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// The master thread id is the first thread (lane) of the last warp.
+// Thread id is 0 indexed.
+// E.g: If NumThreads is 33, master id is 32.
+// If NumThreads is 64, master id is 32.
+// If NumThreads is 97, master id is 96.
+// If NumThreads is 1024, master id is 992.
+//
+// Called in Generic Execution Mode only.
+INLINE int GetMasterThreadID() { return (blockDim.x - 1) & ~(WARPSIZE - 1); }
+
+// The last warp is reserved for the master; other warps are workers.
+// Called in Generic Execution Mode only.
+INLINE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }
+
+////////////////////////////////////////////////////////////////////////////////
+// get thread id in team
+
+// This function may be called in a parallel region by the workers
+// or a serial region by the master. If the master (whose CUDA thread
+// id is GetMasterThreadID()) calls this routine, we return 0 because
+// it is a shadow for the first worker.
+INLINE int GetLogicalThreadIdInBlock() {
+ // return GetThreadIdInBlock() % GetMasterThreadID();
+
+ // Implemented using control flow (predication) instead of with a modulo
+ // operation.
+ int tid = GetThreadIdInBlock();
+ if (isGenericMode() && tid >= GetMasterThreadID())
+ return 0;
+ else
+ return tid;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// OpenMP Thread Support Layer
+//
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode,
+ bool isRuntimeUninitialized) {
+ // omp_thread_num
+ int rc;
+
+ if (isRuntimeUninitialized) {
+ rc = GetThreadIdInBlock();
+ if (!isSPMDExecutionMode && rc >= GetMasterThreadID())
+ rc = 0;
+ } else {
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+ rc = currTaskDescr->ThreadId();
+ }
+ return rc;
+}
+
+INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode,
+ bool isRuntimeUninitialized) {
+ // omp_num_threads
+ int rc;
+
+ if (isRuntimeUninitialized) {
+ rc = isSPMDExecutionMode ? GetNumberOfThreadsInBlock()
+ : GetNumberOfThreadsInBlock() - WARPSIZE;
+ } else {
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+ ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
+ rc = currTaskDescr->ThreadsInTeam();
+ }
+
+ return rc;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Team id linked to OpenMP
+
+INLINE int GetOmpTeamId() {
+ // omp_team_num
+ return GetBlockIdInKernel(); // assume 1 block per team
+}
+
+INLINE int GetNumberOfOmpTeams() {
+ // omp_num_teams
+ return GetNumberOfBlocksInKernel(); // assume 1 block per team
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Masters
+
+INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
+
+////////////////////////////////////////////////////////////////////////////////
+// get OpenMP number of procs
+
+// Get the number of processors in the device.
+INLINE int GetNumberOfProcsInDevice() {
+ if (isGenericMode())
+ return GetNumberOfWorkersInTeam();
+ return GetNumberOfThreadsInBlock();
+}
+
+INLINE int GetNumberOfProcsInTeam() { return GetNumberOfProcsInDevice(); }
+
+////////////////////////////////////////////////////////////////////////////////
+// Memory
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE unsigned long PadBytes(unsigned long size,
+ unsigned long alignment) // must be a power of 2
+{
+ // compute the necessary padding to satisfy alignment constraint
+ ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0,
+ "alignment %ld is not a power of 2\n", alignment);
+ return (~(unsigned long)size + 1) & (alignment - 1);
+}
+
+INLINE void *SafeMalloc(size_t size, const char *msg) // check if success
+{
+ void *ptr = malloc(size);
+ PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr));
+ ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg);
+ return ptr;
+}
+
+INLINE void *SafeFree(void *ptr, const char *msg) {
+ PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", P64(ptr), msg);
+ free(ptr);
+ return NULL;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Named Barrier Routines
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE void named_sync(const int barrier, const int num_threads) {
+ asm volatile("bar.sync %0, %1;"
+ :
+ : "r"(barrier), "r"(num_threads)
+ : "memory");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Teams Reduction Scratchpad Helpers
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE unsigned int *GetTeamsReductionTimestamp() {
+ return static_cast<unsigned int *>(ReductionScratchpadPtr);
+}
+
+INLINE char *GetTeamsReductionScratchpad() {
+ return static_cast<char *>(ReductionScratchpadPtr) + 256;
+}
+
+INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr) {
+ ReductionScratchpadPtr = ScratchpadPtr;
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/sync.cu b/final/libomptarget/deviceRTLs/nvptx/src/sync.cu
new file mode 100644
index 0000000..7e55df8
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -0,0 +1,153 @@
+//===----------- sync.cu - NVPTX OpenMP synchronizations --------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Include all synchronization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP Ordered calls
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN void __kmpc_ordered(kmp_Indent *loc, int32_t tid) {
+ PRINT0(LD_IO, "call kmpc_ordered\n");
+}
+
+EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t tid) {
+ PRINT0(LD_IO, "call kmpc_end_ordered\n");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP Barriers
+////////////////////////////////////////////////////////////////////////////////
+
+// a team is a block: we can use CUDA native synchronization mechanism
+// FIXME: what if not all threads (warps) participate to the barrier?
+// We may need to implement it differently
+
+EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) {
+ PRINT0(LD_IO, "call kmpc_cancel_barrier\n");
+ __kmpc_barrier(loc_ref, tid);
+ PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n");
+ return 0;
+}
+
+EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
+ if (isRuntimeUninitialized()) {
+ if (isSPMDMode())
+ __kmpc_barrier_simple_spmd(loc_ref, tid);
+ else
+ __kmpc_barrier_simple_generic(loc_ref, tid);
+ } else {
+ tid = GetLogicalThreadIdInBlock();
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
+ int numberOfActiveOMPThreads = GetNumberOfOmpThreads(
+ tid, isSPMDMode(), /*isRuntimeUninitialized=*/false);
+ if (numberOfActiveOMPThreads > 1) {
+ if (isSPMDMode()) {
+ __kmpc_barrier_simple_spmd(loc_ref, tid);
+ } else {
+ // The #threads parameter must be rounded up to the WARPSIZE.
+ int threads =
+ WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
+
+ PRINT(LD_SYNC,
+ "call kmpc_barrier with %d omp threads, sync parameter %d\n",
+ numberOfActiveOMPThreads, threads);
+ // Barrier #1 is for synchronization among active threads.
+ named_sync(L1_BARRIER, threads);
+ }
+ } // numberOfActiveOMPThreads > 1
+ PRINT0(LD_SYNC, "completed kmpc_barrier\n");
+ }
+}
+
+// Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0
+// parallel region and that all worker threads participate.
+EXTERN void __kmpc_barrier_simple_spmd(kmp_Indent *loc_ref, int32_t tid) {
+ PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n");
+ __syncthreads();
+ PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n");
+}
+
+// Emit a simple barrier call in Generic mode. Assumes the caller is in an L0
+// parallel region and that all worker threads participate.
+EXTERN void __kmpc_barrier_simple_generic(kmp_Indent *loc_ref, int32_t tid) {
+ int numberOfActiveOMPThreads = GetNumberOfThreadsInBlock() - WARPSIZE;
+ // The #threads parameter must be rounded up to the WARPSIZE.
+ int threads =
+ WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
+
+ PRINT(LD_SYNC,
+ "call kmpc_barrier_simple_generic with %d omp threads, sync parameter "
+ "%d\n",
+ numberOfActiveOMPThreads, threads);
+ // Barrier #1 is for synchronization among active threads.
+ named_sync(L1_BARRIER, threads);
+ PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP MASTER
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE int32_t IsMaster() {
+ // only the team master updates the state
+ int tid = GetLogicalThreadIdInBlock();
+ int ompThreadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
+ return IsTeamMaster(ompThreadId);
+}
+
+EXTERN int32_t __kmpc_master(kmp_Indent *loc, int32_t global_tid) {
+ PRINT0(LD_IO, "call kmpc_master\n");
+ return IsMaster();
+}
+
+EXTERN void __kmpc_end_master(kmp_Indent *loc, int32_t global_tid) {
+ PRINT0(LD_IO, "call kmpc_end_master\n");
+ ASSERT0(LT_FUSSY, IsMaster(), "expected only master here");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP SINGLE
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN int32_t __kmpc_single(kmp_Indent *loc, int32_t global_tid) {
+ PRINT0(LD_IO, "call kmpc_single\n");
+  // decide to implement single with master; the master gets the single
+ return IsMaster();
+}
+
+EXTERN void __kmpc_end_single(kmp_Indent *loc, int32_t global_tid) {
+ PRINT0(LD_IO, "call kmpc_end_single\n");
+  // decide to implement single with master: the master gets the single
+ ASSERT0(LT_FUSSY, IsMaster(), "expected only master here");
+  // sync barrier is explicitly called... so that is not a problem
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Flush
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN void __kmpc_flush(kmp_Indent *loc) {
+ PRINT0(LD_IO, "call kmpc_flush\n");
+ __threadfence_block();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Vote
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN int32_t __kmpc_warp_active_thread_mask() {
+ PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");
+ return __ACTIVEMASK();
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/task.cu b/final/libomptarget/deviceRTLs/nvptx/src/task.cu
new file mode 100644
index 0000000..8d47967
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/task.cu
@@ -0,0 +1,208 @@
+//===------------ task.cu - NVPTX OpenMP tasks support ----------- CUDA -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Task implementation support.
+//
+// explicit task structure uses
+// omptarget_nvptx task
+// kmp_task
+//
+// where kmp_task is
+// - klegacy_TaskDescr <- task pointer
+// shared -> X
+// routine
+// part_id
+// descr
+// - private (of size given by task_alloc call). Accessed by
+// task+sizeof(klegacy_TaskDescr)
+// * private data *
+// - shared: X. Accessed by shared ptr in klegacy_TaskDescr
+// * pointer table to shared variables *
+// - end
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(
+ kmp_Indent *loc, // unused
+ uint32_t global_tid, // unused
+    int32_t flag, // unused (because in our impl, all are immediately executed)
+ size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable,
+ kmp_TaskFctPtr taskSub) {
+ PRINT(LD_IO,
+ "call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, "
+ "fct 0x%llx)\n",
+ P64(sizeOfTaskInclPrivate), P64(sizeOfSharedTable), P64(taskSub));
+ // want task+priv to be a multiple of 8 bytes
+ size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void *));
+ sizeOfTaskInclPrivate += padForTaskInclPriv;
+ size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable;
+ ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0,
+ "need task descr of size %d to be a multiple of %d\n",
+ sizeof(omptarget_nvptx_TaskDescr), sizeof(void *));
+ size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize;
+ omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
+ (omptarget_nvptx_ExplicitTaskDescr *)SafeMalloc(
+ totSize, "explicit task descriptor");
+ kmp_TaskDescr *newKmpTaskDescr = &newExplicitTaskDescr->kmpTaskDescr;
+ ASSERT0(LT_FUSSY,
+ (uint64_t)newKmpTaskDescr ==
+ (uint64_t)ADD_BYTES(newExplicitTaskDescr,
+ sizeof(omptarget_nvptx_TaskDescr)),
+ "bad size assumptions");
+ // init kmp_TaskDescr
+ newKmpTaskDescr->sharedPointerTable =
+ (void *)((char *)newKmpTaskDescr + sizeOfTaskInclPrivate);
+ newKmpTaskDescr->sub = taskSub;
+ newKmpTaskDescr->destructors = NULL;
+ PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n",
+ P64(newKmpTaskDescr), P64(newExplicitTaskDescr));
+
+ return newKmpTaskDescr;
+}
+
+EXTERN int32_t __kmpc_omp_task(kmp_Indent *loc, uint32_t global_tid,
+ kmp_TaskDescr *newKmpTaskDescr) {
+ return __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0,
+ 0);
+}
+
+EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, uint32_t global_tid,
+ kmp_TaskDescr *newKmpTaskDescr,
+ int32_t depNum, void *depList,
+ int32_t noAliasDepNum,
+ void *noAliasDepList) {
+ PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n",
+ P64(newKmpTaskDescr));
+  // 1. get explicit task descr from kmp task descr
+ omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
+ (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
+ newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
+ ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
+ "bad assumptions");
+ omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
+ ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
+ "bad assumptions");
+
+ // 2. push new context: update new task descriptor
+ int tid = GetLogicalThreadIdInBlock();
+ omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
+ newTaskDescr->CopyForExplicitTask(parentTaskDescr);
+ // set new task descriptor as top
+ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr);
+
+ // 3. call sub
+ PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n",
+ P64(newKmpTaskDescr->sub), P64(newKmpTaskDescr));
+ newKmpTaskDescr->sub(0, newKmpTaskDescr);
+ PRINT(LD_TASK, "return from call task sub 0x%llx()\n",
+ P64(newKmpTaskDescr->sub));
+
+ // 4. pop context
+ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
+ parentTaskDescr);
+ // 5. free
+ SafeFree(newExplicitTaskDescr, "explicit task descriptor");
+ return 0;
+}
+
+EXTERN void __kmpc_omp_task_begin_if0(kmp_Indent *loc, uint32_t global_tid,
+ kmp_TaskDescr *newKmpTaskDescr) {
+ PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n",
+ P64(newKmpTaskDescr));
+  // 1. get explicit task descr from kmp task descr
+ omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
+ (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
+ newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
+ ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
+ "bad assumptions");
+ omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
+ ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
+ "bad assumptions");
+
+ // 2. push new context: update new task descriptor
+ int tid = GetLogicalThreadIdInBlock();
+ omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
+ newTaskDescr->CopyForExplicitTask(parentTaskDescr);
+ // set new task descriptor as top
+ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr);
+  // 3... nothing to call... is inline
+ // 4 & 5 ... done in complete
+}
+
+EXTERN void __kmpc_omp_task_complete_if0(kmp_Indent *loc, uint32_t global_tid,
+ kmp_TaskDescr *newKmpTaskDescr) {
+ PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n",
+ P64(newKmpTaskDescr));
+  // 1. get explicit task descr from kmp task descr
+ omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
+ (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
+ newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
+ ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
+ "bad assumptions");
+ omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
+ ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
+ "bad assumptions");
+ // 2. get parent
+ omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr();
+  // 3... nothing to call... is inline
+ // 4. pop context
+ int tid = GetLogicalThreadIdInBlock();
+ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
+ parentTaskDescr);
+ // 5. free
+ SafeFree(newExplicitTaskDescr, "explicit task descriptor");
+}
+
+EXTERN void __kmpc_omp_wait_deps(kmp_Indent *loc, uint32_t global_tid,
+ int32_t depNum, void *depList,
+ int32_t noAliasDepNum, void *noAliasDepList) {
+ PRINT0(LD_IO, "call to __kmpc_omp_wait_deps(..)\n");
+ // nothing to do as all our tasks are executed as final
+}
+
+EXTERN void __kmpc_taskgroup(kmp_Indent *loc, uint32_t global_tid) {
+ PRINT0(LD_IO, "call to __kmpc_taskgroup(..)\n");
+ // nothing to do as all our tasks are executed as final
+}
+
+EXTERN void __kmpc_end_taskgroup(kmp_Indent *loc, uint32_t global_tid) {
+ PRINT0(LD_IO, "call to __kmpc_end_taskgroup(..)\n");
+ // nothing to do as all our tasks are executed as final
+}
+
+EXTERN int32_t __kmpc_omp_taskyield(kmp_Indent *loc, uint32_t global_tid,
+ int end_part) {
+ PRINT0(LD_IO, "call to __kmpc_taskyield()\n");
+ // do nothing: tasks are executed immediately, no yielding allowed
+ return 0;
+}
+
+EXTERN int32_t __kmpc_omp_taskwait(kmp_Indent *loc, uint32_t global_tid) {
+ PRINT0(LD_IO, "call to __kmpc_taskwait()\n");
+ // nothing to do as all our tasks are executed as final
+ return 0;
+}
+
+EXTERN void __kmpc_taskloop(kmp_Indent *loc, uint32_t global_tid,
+ kmp_TaskDescr *newKmpTaskDescr, int if_val,
+ uint64_t *lb, uint64_t *ub, int64_t st, int nogroup,
+ int32_t sched, uint64_t grainsize, void *task_dup) {
+
+ // skip task entirely if empty iteration space
+ if (*lb > *ub)
+ return;
+
+ // the compiler has already stored lb and ub in the kmp_TaskDescr structure
+ // as we are using a single task to execute the entire loop, we can leave
+ // the initial task_t untouched
+
+ __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0);
+}
diff --git a/final/libomptarget/include/omptarget.h b/final/libomptarget/include/omptarget.h
new file mode 100644
index 0000000..e92a94b
--- /dev/null
+++ b/final/libomptarget/include/omptarget.h
@@ -0,0 +1,233 @@
+//===-------- omptarget.h - Target independent OpenMP target RTL -- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Interface to be used by Clang during the codegen of a
+// target region.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_H_
+#define _OMPTARGET_H_
+
+#include <stdint.h>
+#include <stddef.h>
+
+#define OFFLOAD_SUCCESS (0)
+#define OFFLOAD_FAIL (~0)
+
+#define OFFLOAD_DEVICE_DEFAULT -1
+#define HOST_DEVICE -10
+
+/// Data attributes for each data reference used in an OpenMP target region.
+enum tgt_map_type {
+ // No flags
+ OMP_TGT_MAPTYPE_NONE = 0x000,
+ // copy data from host to device
+ OMP_TGT_MAPTYPE_TO = 0x001,
+ // copy data from device to host
+ OMP_TGT_MAPTYPE_FROM = 0x002,
+ // copy regardless of the reference count
+ OMP_TGT_MAPTYPE_ALWAYS = 0x004,
+ // force unmapping of data
+ OMP_TGT_MAPTYPE_DELETE = 0x008,
+ // map the pointer as well as the pointee
+ OMP_TGT_MAPTYPE_PTR_AND_OBJ = 0x010,
+ // pass device base address to kernel
+ OMP_TGT_MAPTYPE_TARGET_PARAM = 0x020,
+ // return base device address of mapped data
+ OMP_TGT_MAPTYPE_RETURN_PARAM = 0x040,
+ // private variable - not mapped
+ OMP_TGT_MAPTYPE_PRIVATE = 0x080,
+ // copy by value - not mapped
+ OMP_TGT_MAPTYPE_LITERAL = 0x100,
+ // mapping is implicit
+ OMP_TGT_MAPTYPE_IMPLICIT = 0x200,
+ // member of struct, member given by [16 MSBs] - 1
+ OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000
+};
+
+enum OpenMPOffloadingDeclareTargetFlags {
+ /// Mark the entry as having a 'link' attribute.
+ OMP_DECLARE_TARGET_LINK = 0x01,
+ /// Mark the entry as being a global constructor.
+ OMP_DECLARE_TARGET_CTOR = 0x02,
+ /// Mark the entry as being a global destructor.
+ OMP_DECLARE_TARGET_DTOR = 0x04
+};
+
+/// This struct is a record of an entry point or global. For a function
+/// entry point the size is expected to be zero
+struct __tgt_offload_entry {
+ void *addr; // Pointer to the offload entry info (function or global)
+ char *name; // Name of the function or global
+ size_t size; // Size of the entry info (0 if it is a function)
+ int32_t flags; // Flags associated with the entry, e.g. 'link'.
+ int32_t reserved; // Reserved, to be used by the runtime library.
+};
+
+/// This struct is a record of the device image information
+struct __tgt_device_image {
+ void *ImageStart; // Pointer to the target code start
+ void *ImageEnd; // Pointer to the target code end
+ __tgt_offload_entry *EntriesBegin; // Begin of table with all target entries
+ __tgt_offload_entry *EntriesEnd; // End of table (non inclusive)
+};
+
+/// This struct is a record of all the host code that may be offloaded to a
+/// target.
+struct __tgt_bin_desc {
+ int32_t NumDeviceImages; // Number of device types supported
+ __tgt_device_image *DeviceImages; // Array of device images (1 per dev. type)
+ __tgt_offload_entry *HostEntriesBegin; // Begin of table with all host entries
+ __tgt_offload_entry *HostEntriesEnd; // End of table (non inclusive)
+};
+
+/// This struct contains the offload entries identified by the target runtime
+struct __tgt_target_table {
+ __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries
+ __tgt_offload_entry
+ *EntriesEnd; // End of the table with all the entries (non inclusive)
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int omp_get_num_devices(void);
+int omp_get_initial_device(void);
+void *omp_target_alloc(size_t size, int device_num);
+void omp_target_free(void *device_ptr, int device_num);
+int omp_target_is_present(void *ptr, int device_num);
+int omp_target_memcpy(void *dst, void *src, size_t length, size_t dst_offset,
+ size_t src_offset, int dst_device, int src_device);
+int omp_target_memcpy_rect(void *dst, void *src, size_t element_size,
+ int num_dims, const size_t *volume, const size_t *dst_offsets,
+ const size_t *src_offsets, const size_t *dst_dimensions,
+ const size_t *src_dimensions, int dst_device, int src_device);
+int omp_target_associate_ptr(void *host_ptr, void *device_ptr, size_t size,
+ size_t device_offset, int device_num);
+int omp_target_disassociate_ptr(void *host_ptr, int device_num);
+
+/// adds a target shared library to the target execution image
+void __tgt_register_lib(__tgt_bin_desc *desc);
+
+/// removes a target shared library from the target execution image
+void __tgt_unregister_lib(__tgt_bin_desc *desc);
+
+// creates the host to target data mapping, stores it in the
+// libomptarget.so internal structure (an entry in a stack of data maps) and
+// passes the data to the device;
+void __tgt_target_data_begin(int64_t device_id, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes,
+ int64_t *arg_types);
+void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num,
+ void **args_base, void **args,
+ int64_t *arg_sizes, int64_t *arg_types,
+ int32_t depNum, void *depList,
+ int32_t noAliasDepNum,
+ void *noAliasDepList);
+
+// passes data from the target, release target memory and destroys the
+// host-target mapping (top entry from the stack of data maps) created by
+// the last __tgt_target_data_begin
+void __tgt_target_data_end(int64_t device_id, int32_t arg_num, void **args_base,
+ void **args, int64_t *arg_sizes, int64_t *arg_types);
+void __tgt_target_data_end_nowait(int64_t device_id, int32_t arg_num,
+ void **args_base, void **args,
+ int64_t *arg_sizes, int64_t *arg_types,
+ int32_t depNum, void *depList,
+ int32_t noAliasDepNum, void *noAliasDepList);
+
+/// passes data to/from the target
+void __tgt_target_data_update(int64_t device_id, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes,
+ int64_t *arg_types);
+void __tgt_target_data_update_nowait(int64_t device_id, int32_t arg_num,
+ void **args_base, void **args,
+ int64_t *arg_sizes, int64_t *arg_types,
+ int32_t depNum, void *depList,
+ int32_t noAliasDepNum,
+ void *noAliasDepList);
+
+// Performs the same actions as data_begin in case arg_num is non-zero
+// and initiates run of offloaded region on target platform; if arg_num
+// is non-zero after the region execution is done it also performs the
+// same action as data_end above. The following types are used; this
+// function returns 0 if it was able to transfer the execution to a
+// target and an int different from zero otherwise.
+int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes,
+ int64_t *arg_types);
+int __tgt_target_nowait(int64_t device_id, void *host_ptr, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes,
+ int64_t *arg_types, int32_t depNum, void *depList,
+ int32_t noAliasDepNum, void *noAliasDepList);
+
+int __tgt_target_teams(int64_t device_id, void *host_ptr, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes,
+ int64_t *arg_types, int32_t num_teams,
+ int32_t thread_limit);
+int __tgt_target_teams_nowait(int64_t device_id, void *host_ptr,
+ int32_t arg_num, void **args_base, void **args,
+ int64_t *arg_sizes, int64_t *arg_types,
+ int32_t num_teams, int32_t thread_limit,
+ int32_t depNum, void *depList,
+ int32_t noAliasDepNum, void *noAliasDepList);
+void __kmpc_push_target_tripcount(int64_t device_id, uint64_t loop_tripcount);
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef OMPTARGET_DEBUG
+#include <stdio.h>
+#define DEBUGP(prefix, ...) \
+ { \
+ fprintf(stderr, "%s --> ", prefix); \
+ fprintf(stderr, __VA_ARGS__); \
+ }
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#define DPxMOD "0x%0*" PRIxPTR
+#define DPxPTR(ptr) ((int)(2*sizeof(uintptr_t))), ((uintptr_t) (ptr))
+
+/*
+ * To printf a pointer in hex with a fixed width of 16 digits and a leading 0x,
+ * use printf("ptr=" DPxMOD "...\n", DPxPTR(ptr));
+ *
+ * DPxMOD expands to:
+ * "0x%0*" PRIxPTR
+ * where PRIxPTR expands to an appropriate modifier for the type uintptr_t on a
+ * specific platform, e.g. "lu" if uintptr_t is typedef'd as unsigned long:
+ * "0x%0*lu"
+ *
+ * Ultimately, the whole statement expands to:
+ * printf("ptr=0x%0*lu...\n", // the 0* modifier expects an extra argument
+ * // specifying the width of the output
+ * (int)(2*sizeof(uintptr_t)), // the extra argument specifying the width
+ * // 8 digits for 32bit systems
+ * // 16 digits for 64bit
+ * (uintptr_t) ptr);
+ */
+#else
+#define DEBUGP(prefix, ...) \
+ {}
+#endif
+
+#ifdef __cplusplus
+#define EXTERN extern "C"
+#else
+#define EXTERN extern
+#endif
+
+#endif // _OMPTARGET_H_
diff --git a/final/libomptarget/include/omptargetplugin.h b/final/libomptarget/include/omptargetplugin.h
new file mode 100644
index 0000000..35fa059
--- /dev/null
+++ b/final/libomptarget/include/omptargetplugin.h
@@ -0,0 +1,92 @@
+//===-- omptargetplugin.h - Target dependent OpenMP Plugin API --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an interface between target independent OpenMP offload
+// runtime library libomptarget and target dependent plugin.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGETPLUGIN_H_
+#define _OMPTARGETPLUGIN_H_
+
+#include <omptarget.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Return the number of available devices of the type supported by the
+// target RTL.
+int32_t __tgt_rtl_number_of_devices(void);
+
+// Return an integer different from zero if the provided device image can be
+// supported by the runtime. The functionality is similar to comparing the
+// result of __tgt_rtl_load_binary to NULL. However, this is meant to be a
+// lightweight query to determine if the RTL is suitable for an image without
+// having to load the library, which can be expensive.
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image);
+
+// Initialize the specified device. In case of success return 0; otherwise
+// return an error code.
+int32_t __tgt_rtl_init_device(int32_t ID);
+
+// Pass an executable image section described by image to the specified
+// device and prepare an address table of target entities. In case of error,
+// return NULL. Otherwise, return a pointer to the built address table.
+// Individual entries in the table may also be NULL, when the corresponding
+// offload region is not supported on the target device.
+__tgt_target_table *__tgt_rtl_load_binary(int32_t ID,
+ __tgt_device_image *Image);
+
+// Allocate data on the particular target device, of the specified size.
+// HostPtr is an address of the host data the allocated target data
+// will be associated with (HostPtr may be NULL if it is not known at
+// allocation time, like for example it would be for target data that
+// is allocated by omp_target_alloc() API). Return address of the
+// allocated data on the target that will be used by libomptarget.so to
+// initialize the target data mapping structures. These addresses are
+// used to generate a table of target variables to pass to
+// __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in
+// case an error occurred on the target device.
+void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr);
+
+// Pass the data content to the target device using the target address.
+// In case of success, return zero. Otherwise, return an error code.
+int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr,
+ int64_t Size);
+
+// Retrieve the data content from the target device using its address.
+// In case of success, return zero. Otherwise, return an error code.
+int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr,
+ int64_t Size);
+
+// De-allocate the data referenced by target ptr on the device. In case of
+// success, return zero. Otherwise, return an error code.
+int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr);
+
+// Transfer control to the offloaded entry Entry on the target device.
+// Args and Offsets are arrays of NumArgs size of target addresses and
+// offsets. An offset should be added to the target address before passing it
+// to the outlined function on device side. In case of success, return zero.
+// Otherwise, return an error code.
+int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args,
+ ptrdiff_t *Offsets, int32_t NumArgs);
+
+// Similar to __tgt_rtl_run_target_region, but additionally specify the
+// number of teams to be created and a number of threads in each team.
+int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args,
+ ptrdiff_t *Offsets, int32_t NumArgs,
+ int32_t NumTeams, int32_t ThreadLimit,
+ uint64_t loop_tripcount);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _OMPTARGETPLUGIN_H_
diff --git a/final/libomptarget/plugins/CMakeLists.txt b/final/libomptarget/plugins/CMakeLists.txt
new file mode 100644
index 0000000..8c3d571
--- /dev/null
+++ b/final/libomptarget/plugins/CMakeLists.txt
@@ -0,0 +1,72 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build plugins for the user system if available.
+#
+##===----------------------------------------------------------------------===##
+
+# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string tmachine_triple, string elf_machine_id);
+# - build a plugin for an ELF based generic 64-bit target based on libffi.
+# - tmachine: name of the machine processor as used in the cmake build system.
+# - tmachine_name: name of the machine to be printed with the debug messages.
+# - tmachine_libname: machine name to be appended to the plugin library name.
+macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id)
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
+ if(LIBOMPTARGET_DEP_LIBELF_FOUND)
+ if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+
+ libomptarget_say("Building ${tmachine_name} offloading plugin.")
+
+ include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR})
+ include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR})
+
+ # Define macro to be used as prefix of the runtime messages for this target.
+ add_definitions("-DTARGET_NAME=${tmachine_name}")
+
+ # Define macro with the ELF ID for this target.
+ add_definitions("-DTARGET_ELF_ID=${elf_machine_id}")
+
+ add_library("omptarget.rtl.${tmachine_libname}" SHARED
+ ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp)
+
+ # Install plugin under the lib destination folder.
+ install(TARGETS "omptarget.rtl.${tmachine_libname}"
+ LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+
+ target_link_libraries(
+ "omptarget.rtl.${tmachine_libname}"
+ ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES}
+ ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
+ dl
+ "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
+
+ # Report to the parent scope that we are building a plugin.
+ set(LIBOMPTARGET_SYSTEM_TARGETS
+ "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE)
+
+ else(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+ libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.")
+ endif(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+ else(LIBOMPTARGET_DEP_LIBELF_FOUND)
+ libomptarget_say("Not building ${tmachine_name} offloading plugin: libelf dependency not found.")
+ endif(LIBOMPTARGET_DEP_LIBELF_FOUND)
+else()
+ libomptarget_say("Not building ${tmachine_name} offloading plugin: machine not found in the system.")
+endif()
+endmacro()
+
+add_subdirectory(aarch64)
+add_subdirectory(cuda)
+add_subdirectory(ppc64)
+add_subdirectory(ppc64le)
+add_subdirectory(x86_64)
+
+# Make sure the parent scope can see the plugins that will be created.
+set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
+
diff --git a/final/libomptarget/plugins/aarch64/CMakeLists.txt b/final/libomptarget/plugins/aarch64/CMakeLists.txt
new file mode 100644
index 0000000..e3a76b9
--- /dev/null
+++ b/final/libomptarget/plugins/aarch64/CMakeLists.txt
@@ -0,0 +1,18 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for an aarch64 machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+ build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183")
+else()
+ libomptarget_say("Not building aarch64 offloading plugin: machine not found in the system.")
+endif()
diff --git a/final/libomptarget/plugins/common/elf_common.c b/final/libomptarget/plugins/common/elf_common.c
new file mode 100644
index 0000000..dd85575
--- /dev/null
+++ b/final/libomptarget/plugins/common/elf_common.c
@@ -0,0 +1,73 @@
+//===-- elf_common.c - Common ELF functionality -------------------*- C -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Common ELF functionality for target plugins.
+// Must be included in the plugin source file AFTER omptarget.h has been
+// included and macro DP(...) has been defined.
+// .
+//
+//===----------------------------------------------------------------------===//
+
+#if !(defined(_OMPTARGET_H_) && defined(DP))
+#error Include elf_common.c in the plugin source AFTER omptarget.h has been\
+ included and macro DP(...) has been defined.
+#endif
+
+#include <elf.h>
+#include <libelf.h>
+
+// Check whether an image is valid for execution on target_id
+static inline int32_t elf_check_machine(__tgt_device_image *image,
+ uint16_t target_id) {
+
+ // Is the library version incompatible with the header file?
+ if (elf_version(EV_CURRENT) == EV_NONE) {
+ DP("Incompatible ELF library!\n");
+ return 0;
+ }
+
+ char *img_begin = (char *)image->ImageStart;
+ char *img_end = (char *)image->ImageEnd;
+ size_t img_size = img_end - img_begin;
+
+ // Obtain elf handler
+ Elf *e = elf_memory(img_begin, img_size);
+ if (!e) {
+ DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
+ return 0;
+ }
+
+ // Check if ELF is the right kind.
+ if (elf_kind(e) != ELF_K_ELF) {
+ DP("Unexpected ELF type!\n");
+ return 0;
+ }
+ Elf64_Ehdr *eh64 = elf64_getehdr(e);
+ Elf32_Ehdr *eh32 = elf32_getehdr(e);
+
+ if (!eh64 && !eh32) {
+ DP("Unable to get machine ID from ELF file!\n");
+ elf_end(e);
+ return 0;
+ }
+
+ uint16_t MachineID;
+ if (eh64 && !eh32)
+ MachineID = eh64->e_machine;
+ else if (eh32 && !eh64)
+ MachineID = eh32->e_machine;
+ else {
+ DP("Ambiguous ELF header!\n");
+ elf_end(e);
+ return 0;
+ }
+
+ elf_end(e);
+ return MachineID == target_id;
+}
diff --git a/final/libomptarget/plugins/cuda/CMakeLists.txt b/final/libomptarget/plugins/cuda/CMakeLists.txt
new file mode 100644
index 0000000..7210eec
--- /dev/null
+++ b/final/libomptarget/plugins/cuda/CMakeLists.txt
@@ -0,0 +1,50 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a CUDA machine if available.
+#
+##===----------------------------------------------------------------------===##
+if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
+ libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64 or ppc64le hosts.")
+ return()
+elseif (NOT LIBOMPTARGET_DEP_LIBELF_FOUND)
+ libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.")
+ return()
+elseif(NOT LIBOMPTARGET_DEP_CUDA_FOUND)
+ libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.")
+ return()
+elseif(NOT LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND)
+ libomptarget_say("Not building CUDA offloading plugin: CUDA Driver API not found in system.")
+ return()
+endif()
+
+libomptarget_say("Building CUDA offloading plugin.")
+
+# Define the suffix for the runtime messaging dumps.
+add_definitions(-DTARGET_NAME=CUDA)
+
+if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug)
+ add_definitions(-DCUDA_ERROR_REPORT)
+endif()
+
+include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
+include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS})
+
+add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
+
+# Install plugin under the lib destination folder.
+install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+
+target_link_libraries(omptarget.rtl.cuda
+ ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES}
+ ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
+ "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
+
+# Report to the parent scope that we are building a plugin for CUDA.
+set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda" PARENT_SCOPE)
diff --git a/final/libomptarget/plugins/cuda/src/rtl.cpp b/final/libomptarget/plugins/cuda/src/rtl.cpp
new file mode 100644
index 0000000..872e7f0
--- /dev/null
+++ b/final/libomptarget/plugins/cuda/src/rtl.cpp
@@ -0,0 +1,758 @@
+//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for CUDA machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstddef>
+#include <cuda.h>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "omptargetplugin.h"
+
+#ifndef TARGET_NAME
+#define TARGET_NAME CUDA
+#endif
+
+#ifdef OMPTARGET_DEBUG
+static int DebugLevel = 0;
+
+#define GETNAME2(name) #name
+#define GETNAME(name) GETNAME2(name)
+#define DP(...) \
+ do { \
+ if (DebugLevel > 0) { \
+ DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
+ } \
+ } while (false)
+#else // OMPTARGET_DEBUG
+#define DP(...) {}
+#endif // OMPTARGET_DEBUG
+
+#include "../../common/elf_common.c"
+
+// Utility for retrieving and printing CUDA error string.
+#ifdef CUDA_ERROR_REPORT
+#define CUDA_ERR_STRING(err) \
+ do { \
+ const char *errStr; \
+ cuGetErrorString(err, &errStr); \
+ DP("CUDA error is: %s\n", errStr); \
+ } while (0)
+#else
+#define CUDA_ERR_STRING(err) \
+ {}
+#endif
+
+/// Keep entries table per device.
+struct FuncOrGblEntryTy {
+ __tgt_target_table Table;
+ std::vector<__tgt_offload_entry> Entries;
+};
+
+enum ExecutionModeType {
+ SPMD, // constructors, destructors,
+ // combined constructs (`teams distribute parallel for [simd]`)
+ GENERIC, // everything else
+ NONE
+};
+
+/// Use a single entity to encode a kernel and a set of flags
+struct KernelTy {
+ CUfunction Func;
+
+ // execution mode of kernel
+ // 0 - SPMD mode (without master warp)
+ // 1 - Generic mode (with master warp)
+ int8_t ExecutionMode;
+
+ KernelTy(CUfunction _Func, int8_t _ExecutionMode)
+ : Func(_Func), ExecutionMode(_ExecutionMode) {}
+};
+
+/// Device environment data
+/// Manually sync with the deviceRTL side for now, move to a dedicated header file later.
+struct omptarget_device_environmentTy {
+ int32_t debug_level;
+};
+
+/// List that contains all the kernels.
+/// FIXME: we may need this to be per device and per library.
+std::list<KernelTy> KernelsList;
+
+/// Class containing all the device information.
+class RTLDeviceInfoTy {
+ std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
+
+public:
+ int NumberOfDevices;
+ std::vector<CUmodule> Modules;
+ std::vector<CUcontext> Contexts;
+
+ // Device properties
+ std::vector<int> ThreadsPerBlock;
+ std::vector<int> BlocksPerGrid;
+ std::vector<int> WarpSize;
+
+ // OpenMP properties
+ std::vector<int> NumTeams;
+ std::vector<int> NumThreads;
+
+ // OpenMP Environment properties
+ int EnvNumTeams;
+ int EnvTeamLimit;
+
+ //static int EnvNumThreads;
+ static const int HardTeamLimit = 1<<16; // 64k
+ static const int HardThreadLimit = 1024;
+ static const int DefaultNumTeams = 128;
+ static const int DefaultNumThreads = 128;
+
+ // Record entry point associated with device
+ void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+ E.Entries.push_back(entry);
+ }
+
+ // Return true if the entry is associated with device
+ bool findOffloadEntry(int32_t device_id, void *addr) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+ for (auto &it : E.Entries) {
+ if (it.addr == addr)
+ return true;
+ }
+
+ return false;
+ }
+
+ // Return the pointer to the target entries table
+ __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+ int32_t size = E.Entries.size();
+
+ // Table is empty
+ if (!size)
+ return 0;
+
+ __tgt_offload_entry *begin = &E.Entries[0];
+ __tgt_offload_entry *end = &E.Entries[size - 1];
+
+ // Update table info according to the entries and return the pointer
+ E.Table.EntriesBegin = begin;
+ E.Table.EntriesEnd = ++end;
+
+ return &E.Table;
+ }
+
+ // Clear entries table for a device
+ void clearOffloadEntriesTable(int32_t device_id) {
+ assert(device_id < (int32_t)FuncGblEntries.size() &&
+ "Unexpected device id!");
+ FuncGblEntries[device_id].emplace_back();
+ FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+ E.Entries.clear();
+ E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
+ }
+
+ RTLDeviceInfoTy() {
+#ifdef OMPTARGET_DEBUG
+ if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
+ DebugLevel = std::stoi(envStr);
+ }
+#endif // OMPTARGET_DEBUG
+
+ DP("Start initializing CUDA\n");
+
+ CUresult err = cuInit(0);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when initializing CUDA\n");
+ CUDA_ERR_STRING(err);
+ return;
+ }
+
+ NumberOfDevices = 0;
+
+ err = cuDeviceGetCount(&NumberOfDevices);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when getting CUDA device count\n");
+ CUDA_ERR_STRING(err);
+ return;
+ }
+
+ if (NumberOfDevices == 0) {
+ DP("There are no devices supporting CUDA.\n");
+ return;
+ }
+
+ FuncGblEntries.resize(NumberOfDevices);
+ Contexts.resize(NumberOfDevices);
+ ThreadsPerBlock.resize(NumberOfDevices);
+ BlocksPerGrid.resize(NumberOfDevices);
+ WarpSize.resize(NumberOfDevices);
+ NumTeams.resize(NumberOfDevices);
+ NumThreads.resize(NumberOfDevices);
+
+ // Get environment variables regarding teams
+ char *envStr = getenv("OMP_TEAM_LIMIT");
+ if (envStr) {
+ // OMP_TEAM_LIMIT has been set
+ EnvTeamLimit = std::stoi(envStr);
+ DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
+ } else {
+ EnvTeamLimit = -1;
+ }
+ envStr = getenv("OMP_NUM_TEAMS");
+ if (envStr) {
+ // OMP_NUM_TEAMS has been set
+ EnvNumTeams = std::stoi(envStr);
+ DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
+ } else {
+ EnvNumTeams = -1;
+ }
+ }
+
+ ~RTLDeviceInfoTy() {
+ // Close modules
+ for (auto &module : Modules)
+ if (module) {
+ CUresult err = cuModuleUnload(module);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when unloading CUDA module\n");
+ CUDA_ERR_STRING(err);
+ }
+ }
+
+ // Destroy contexts
+ for (auto &ctx : Contexts)
+ if (ctx) {
+ CUresult err = cuCtxDestroy(ctx);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when destroying CUDA context\n");
+ CUDA_ERR_STRING(err);
+ }
+ }
+ }
+};
+
+static RTLDeviceInfoTy DeviceInfo;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+ return elf_check_machine(image, 190); // EM_CUDA = 190.
+}
+
+int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }
+
+int32_t __tgt_rtl_init_device(int32_t device_id) {
+
+ CUdevice cuDevice;
+ DP("Getting device %d\n", device_id);
+ CUresult err = cuDeviceGet(&cuDevice, device_id);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when getting CUDA device with id = %d\n", device_id);
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ // Create the context and save it to use whenever this device is selected.
+ err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
+ cuDevice);
+ if (err != CUDA_SUCCESS) {
+ DP("Error when creating a CUDA context\n");
+ CUDA_ERR_STRING(err);
+ return OFFLOAD_FAIL;
+ }
+
+ // scan properties to determine number of threads/block and blocks/grid.
+ CUdevprop Properties;
+ err = cuDeviceGetProperties(&Properties, cuDevice);
+ if (err != CUDA_SUCCESS) {
+ DP("Error getting device Properties, use defaults\n");
+ DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+ DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
+ DeviceInfo.WarpSize[device_id] = 32;
+ } else {
+ // Get blocks per grid
+ if (Properties.maxGridSize[0] <= RTLDeviceInfoTy::HardTeamLimit) {
+ DeviceInfo.BlocksPerGrid[device_id] = Properties.maxGridSize[0];
+ DP("Using %d CUDA blocks per grid\n", Properties.maxGridSize[0]);
+ } else {
+ DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
+ DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
+ "at the hard limit\n", Properties.maxGridSize[0],
+ RTLDeviceInfoTy::HardTeamLimit);
+ }
+
+ // Get threads per block, exploit threads only along x axis
+ if (Properties.maxThreadsDim[0] <= RTLDeviceInfoTy::HardThreadLimit) {
+ DeviceInfo.ThreadsPerBlock[device_id] = Properties.maxThreadsDim[0];
+ DP("Using %d CUDA threads per block\n", Properties.maxThreadsDim[0]);
+ if (Properties.maxThreadsDim[0] < Properties.maxThreadsPerBlock) {
+ DP("(fewer than max per block along all xyz dims %d)\n",
+ Properties.maxThreadsPerBlock);
+ }
+ } else {
+ DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
+ DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
+ "capping at the hard limit\n", Properties.maxThreadsDim[0],
+ RTLDeviceInfoTy::HardThreadLimit);
+ }
+
+ // According to the documentation, SIMDWidth is "Warp size in threads".
+ DeviceInfo.WarpSize[device_id] = Properties.SIMDWidth;
+ }
+
+ // Adjust teams to the env variables
+ if (DeviceInfo.EnvTeamLimit > 0 &&
+ DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
+ DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
+ DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
+ DeviceInfo.EnvTeamLimit);
+ }
+
+ DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
+ DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
+ DeviceInfo.WarpSize[device_id]);
+
+ // Set default number of teams
+ if (DeviceInfo.EnvNumTeams > 0) {
+ DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
+ DP("Default number of teams set according to environment %d\n",
+ DeviceInfo.EnvNumTeams);
+ } else {
+ DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+ DP("Default number of teams set according to library's default %d\n",
+ RTLDeviceInfoTy::DefaultNumTeams);
+ }
+ if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
+ DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
+ DP("Default number of teams exceeds device limit, capping at %d\n",
+ DeviceInfo.BlocksPerGrid[device_id]);
+ }
+
+ // Set default number of threads
+ DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
+ DP("Default number of threads set according to library's default %d\n",
+ RTLDeviceInfoTy::DefaultNumThreads);
+ if (DeviceInfo.NumThreads[device_id] >
+ DeviceInfo.ThreadsPerBlock[device_id]) {
+ DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
+ DP("Default number of threads exceeds device limit, capping at %d\n",
+ DeviceInfo.ThreadsPerBlock[device_id]);
+ }
+
+ return OFFLOAD_SUCCESS;
+}
+
+// Load a device image (cubin) onto device \p device_id, resolve every host
+// offload entry against the module's symbols (globals and kernels), push the
+// device environment, and return the resulting entries table (NULL on error).
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+                                          __tgt_device_image *image) {
+
+  // Set the context we are using.
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting a CUDA context for device %d\n", device_id);
+    CUDA_ERR_STRING(err);
+    return NULL;
+  }
+
+  // Clear the offload table as we are going to create a new one.
+  DeviceInfo.clearOffloadEntriesTable(device_id);
+
+  // Create the module and extract the function pointers.
+  CUmodule cumod;
+  DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
+  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when loading CUDA module\n");
+    CUDA_ERR_STRING(err);
+    return NULL;
+  }
+
+  DP("CUDA module successfully loaded!\n");
+  DeviceInfo.Modules.push_back(cumod);
+
+  // Find the symbols in the module by name.
+  __tgt_offload_entry *HostBegin = image->EntriesBegin;
+  __tgt_offload_entry *HostEnd = image->EntriesEnd;
+
+  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
+
+    if (!e->addr) {
+      // We return NULL when something like this happens, the host should have
+      // always something in the address to uniquely identify the target region.
+      DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);
+      return NULL;
+    }
+
+    // Entries with a non-zero size are global variables, not kernels.
+    if (e->size) {
+      __tgt_offload_entry entry = *e;
+
+      CUdeviceptr cuptr;
+      size_t cusize;
+      err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);
+
+      if (err != CUDA_SUCCESS) {
+        DP("Loading global '%s' (Failed)\n", e->name);
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      // The device-side global must match the host-declared size exactly.
+      if (cusize != e->size) {
+        DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
+           cusize, e->size);
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
+         DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
+      entry.addr = (void *)cuptr;
+
+      DeviceInfo.addOffloadEntry(device_id, entry);
+
+      continue;
+    }
+
+    // Otherwise the entry is a kernel; resolve its CUfunction.
+    CUfunction fun;
+    err = cuModuleGetFunction(&fun, cumod, e->name);
+
+    if (err != CUDA_SUCCESS) {
+      DP("Loading '%s' (Failed)\n", e->name);
+      CUDA_ERR_STRING(err);
+      return NULL;
+    }
+
+    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
+       DPxPTR(e - HostBegin), e->name, DPxPTR(fun));
+
+    // default value GENERIC (in case symbol is missing from cubin file)
+    int8_t ExecModeVal = ExecutionModeType::GENERIC;
+    std::string ExecModeNameStr(e->name);
+    ExecModeNameStr += "_exec_mode";
+    const char *ExecModeName = ExecModeNameStr.c_str();
+
+    CUdeviceptr ExecModePtr;
+    size_t cusize;
+    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
+    if (err == CUDA_SUCCESS) {
+      if ((size_t)cusize != sizeof(int8_t)) {
+        DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
+           ExecModeName, cusize, sizeof(int8_t));
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
+      if (err != CUDA_SUCCESS) {
+        DP("Error when copying data from device to host. Pointers: "
+           "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
+           DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      // Only the two known execution modes are accepted.
+      if (ExecModeVal < 0 || ExecModeVal > 1) {
+        DP("Error wrong exec_mode value specified in cubin file: %d\n",
+           ExecModeVal);
+        return NULL;
+      }
+    } else {
+      DP("Loading global exec_mode '%s' - symbol missing, using default value "
+         "GENERIC (1)\n", ExecModeName);
+      CUDA_ERR_STRING(err);
+    }
+
+    KernelsList.push_back(KernelTy(fun, ExecModeVal));
+
+    __tgt_offload_entry entry = *e;
+    entry.addr = (void *)&KernelsList.back();
+    DeviceInfo.addOffloadEntry(device_id, entry);
+  }
+
+  // send device environment data to the device
+  {
+    omptarget_device_environmentTy device_env;
+
+    device_env.debug_level = 0;
+
+#ifdef OMPTARGET_DEBUG
+    if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
+      device_env.debug_level = std::stoi(envStr);
+    }
+#endif
+
+    const char *device_env_Name = "omptarget_device_environment";
+    CUdeviceptr device_env_Ptr;
+    size_t cusize;
+
+    err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name);
+
+    if (err == CUDA_SUCCESS) {
+      if ((size_t)cusize != sizeof(device_env)) {
+        // BUGFIX: report sizeof(device_env) — the value actually compared
+        // against — instead of the unrelated sizeof(int32_t).
+        DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
+           device_env_Name, cusize, sizeof(device_env));
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize);
+      if (err != CUDA_SUCCESS) {
+        DP("Error when copying data from host to device. Pointers: "
+           "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
+           DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize);
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      DP("Sending global device environment data %zu bytes\n", (size_t)cusize);
+    } else {
+      // Symbol absence is tolerated: older device RTLs have no environment.
+      DP("Finding global device environment '%s' - symbol missing.\n",
+         device_env_Name);
+      DP("Continue, considering this is a device RTL which does not accept "
+         "environment setting.\n");
+    }
+  }
+
+  return DeviceInfo.getOffloadEntriesTable(device_id);
+}
+
+// Allocate \p size bytes of device memory on \p device_id; hst_ptr is unused.
+// Returns the device pointer as a void*, or NULL on failure.
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
+  // A zero-sized request yields no allocation.
+  if (size == 0)
+    return NULL;
+
+  // Activate the context associated with this device.
+  CUresult rc = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (rc != CUDA_SUCCESS) {
+    DP("Error while trying to set CUDA current context\n");
+    CUDA_ERR_STRING(rc);
+    return NULL;
+  }
+
+  // Allocate device memory and hand the raw pointer back to libomptarget.
+  CUdeviceptr DevicePtr;
+  rc = cuMemAlloc(&DevicePtr, size);
+  if (rc != CUDA_SUCCESS) {
+    DP("Error while trying to allocate %d\n", rc);
+    CUDA_ERR_STRING(rc);
+    return NULL;
+  }
+
+  return (void *)DevicePtr;
+}
+
+// Copy \p size bytes from host memory \p hst_ptr to device memory \p tgt_ptr.
+int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
+                              int64_t size) {
+  // Make this device's context current before issuing the copy.
+  CUresult rc = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (rc != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(rc);
+    return OFFLOAD_FAIL;
+  }
+
+  // Synchronous host-to-device transfer.
+  rc = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
+  if (rc == CUDA_SUCCESS)
+    return OFFLOAD_SUCCESS;
+
+  DP("Error when copying data from host to device. Pointers: host = " DPxMOD
+     ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
+     DPxPTR(tgt_ptr), size);
+  CUDA_ERR_STRING(rc);
+  return OFFLOAD_FAIL;
+}
+
+// Copy \p size bytes from device memory \p tgt_ptr back to host \p hst_ptr.
+int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr,
+                                void *tgt_ptr, int64_t size) {
+  // Activate the context owning tgt_ptr before copying.
+  CUresult rc = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (rc != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(rc);
+    return OFFLOAD_FAIL;
+  }
+
+  // Synchronous device-to-host transfer.
+  rc = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
+  if (rc == CUDA_SUCCESS)
+    return OFFLOAD_SUCCESS;
+
+  DP("Error when copying data from device to host. Pointers: host = " DPxMOD
+     ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
+     DPxPTR(tgt_ptr), size);
+  CUDA_ERR_STRING(rc);
+  return OFFLOAD_FAIL;
+}
+
+// Release the device allocation at \p tgt_ptr on device \p device_id.
+int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
+  // Activate the context that owns the allocation.
+  CUresult rc = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (rc != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(rc);
+    return OFFLOAD_FAIL;
+  }
+
+  rc = cuMemFree((CUdeviceptr)tgt_ptr);
+  if (rc == CUDA_SUCCESS)
+    return OFFLOAD_SUCCESS;
+
+  DP("Error when freeing CUDA memory\n");
+  CUDA_ERR_STRING(rc);
+  return OFFLOAD_FAIL;
+}
+
+// Launch the kernel behind \p tgt_entry_ptr with the requested team/thread
+// counts, clamped to device and per-kernel limits, then synchronize so that
+// kernel errors surface here. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
+    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
+    int32_t thread_limit, uint64_t loop_tripcount) {
+  // Set the context we are using.
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  // All args are references: each kernel argument is the address of the
+  // offset-adjusted pointer stored in ptrs.
+  std::vector<void *> args(arg_num);
+  std::vector<void *> ptrs(arg_num);
+
+  for (int32_t i = 0; i < arg_num; ++i) {
+    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
+    args[i] = &ptrs[i];
+  }
+
+  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
+
+  int cudaThreadsPerBlock;
+
+  if (thread_limit > 0) {
+    cudaThreadsPerBlock = thread_limit;
+    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
+    // Add master warp if necessary
+    if (KernelInfo->ExecutionMode == GENERIC) {
+      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
+      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
+    }
+  } else {
+    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
+    DP("Setting CUDA threads per block to default %d\n",
+       DeviceInfo.NumThreads[device_id]);
+  }
+
+  // Never exceed the per-device thread limit recorded at init time.
+  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
+    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
+    DP("Threads per block capped at device limit %d\n",
+       DeviceInfo.ThreadsPerBlock[device_id]);
+  }
+
+  // A kernel may have a tighter limit (e.g. due to register pressure);
+  // failure to query the attribute is ignored and the device limit stands.
+  int kernel_limit;
+  err = cuFuncGetAttribute(&kernel_limit,
+      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
+  if (err == CUDA_SUCCESS) {
+    if (kernel_limit < cudaThreadsPerBlock) {
+      cudaThreadsPerBlock = kernel_limit;
+      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
+    }
+  }
+
+  int cudaBlocksPerGrid;
+  if (team_num <= 0) {
+    // No explicit team count: derive one from the trip count unless the
+    // user forced a team count via the environment (EnvNumTeams >= 0).
+    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
+      if (KernelInfo->ExecutionMode == SPMD) {
+        // We have a combined construct, i.e. `target teams distribute parallel
+        // for [simd]`. We launch so many teams so that each thread will
+        // execute one iteration of the loop.
+        // round up to the nearest integer
+        cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
+      } else {
+        // If we reach this point, then we have a non-combined construct, i.e.
+        // `teams distribute` with a nested `parallel for` and each team is
+        // assigned one iteration of the `distribute` loop. E.g.:
+        //
+        // #pragma omp target teams distribute
+        // for(...loop_tripcount...) {
+        //   #pragma omp parallel for
+        //   for(...) {}
+        // }
+        //
+        // Threads within a team will execute the iterations of the `parallel`
+        // loop.
+        cudaBlocksPerGrid = loop_tripcount;
+      }
+      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
+         "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
+         cudaThreadsPerBlock);
+    } else {
+      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
+      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
+    }
+  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
+    // Requested team count exceeds what the device supports.
+    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
+    DP("Capping number of teams to team limit %d\n",
+       DeviceInfo.BlocksPerGrid[device_id]);
+  } else {
+    cudaBlocksPerGrid = team_num;
+    DP("Using requested number of teams %d\n", team_num);
+  }
+
+  // Run on the device.
+  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
+     cudaThreadsPerBlock);
+
+  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
+      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0);
+  if (err != CUDA_SUCCESS) {
+    DP("Device kernel launch failed!\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  DP("Launch of entry point at " DPxMOD " successful!\n",
+     DPxPTR(tgt_entry_ptr));
+
+  // Block until the kernel finishes so execution errors are reported here.
+  CUresult sync_err = cuCtxSynchronize();
+  if (sync_err != CUDA_SUCCESS) {
+    DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
+    CUDA_ERR_STRING(sync_err);
+    return OFFLOAD_FAIL;
+  } else {
+    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
+// A non-teams target region is a team region with exactly one team; a
+// thread_limit of 0 lets the team-region launcher pick the default count.
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
+  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+                                          tgt_offsets, arg_num,
+                                          /*team_num=*/1, /*thread_limit=*/0,
+                                          /*loop_tripcount=*/0);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/final/libomptarget/plugins/exports b/final/libomptarget/plugins/exports
new file mode 100644
index 0000000..3f9f7d4
--- /dev/null
+++ b/final/libomptarget/plugins/exports
@@ -0,0 +1,15 @@
+VERS1.0 {
+ global:
+ __tgt_rtl_is_valid_binary;
+ __tgt_rtl_number_of_devices;
+ __tgt_rtl_init_device;
+ __tgt_rtl_load_binary;
+ __tgt_rtl_data_alloc;
+ __tgt_rtl_data_submit;
+ __tgt_rtl_data_retrieve;
+ __tgt_rtl_data_delete;
+ __tgt_rtl_run_target_team_region;
+ __tgt_rtl_run_target_region;
+ local:
+ *;
+};
diff --git a/final/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/final/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
new file mode 100644
index 0000000..951710a
--- /dev/null
+++ b/final/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
@@ -0,0 +1,340 @@
+//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for generic 64-bit machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <dlfcn.h>
+#include <ffi.h>
+#include <gelf.h>
+#include <link.h>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "omptargetplugin.h"
+
+#ifndef TARGET_NAME
+#define TARGET_NAME Generic ELF - 64bit
+#endif
+
+#ifndef TARGET_ELF_ID
+#define TARGET_ELF_ID 0
+#endif
+
+#ifdef OMPTARGET_DEBUG
+static int DebugLevel = 0;
+
+#define GETNAME2(name) #name
+#define GETNAME(name) GETNAME2(name)
+#define DP(...) \
+ do { \
+ if (DebugLevel > 0) { \
+ DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
+ } \
+ } while (false)
+#else // OMPTARGET_DEBUG
+#define DP(...) {}
+#endif // OMPTARGET_DEBUG
+
+#include "../../common/elf_common.c"
+
+#define NUMBER_OF_DEVICES 4
+#define OFFLOADSECTIONNAME ".omp_offloading.entries"
+
+/// Array of Dynamic libraries loaded for this target.
+struct DynLibTy {
+  // Path of the temporary file backing the library; the RTLDeviceInfoTy
+  // destructor passes it to remove(). NOTE(review): the storage behind this
+  // pointer must outlive the record — verify callers do not store a pointer
+  // to a stack buffer here.
+  char *FileName;
+  // Handle returned by dlopen(), or NULL if the library failed to load.
+  void *Handle;
+};
+
+/// Keep entries table per device.
+struct FuncOrGblEntryTy {
+  // Begin/end pointers into the offload entries of one loaded image.
+  __tgt_target_table Table;
+};
+
+/// Class containing all the device information.
+class RTLDeviceInfoTy {
+  // One list of entry tables per device; back() is the table belonging to
+  // the most recently loaded image.
+  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
+
+public:
+  // Dynamic libraries opened on behalf of the devices; closed (and their
+  // backing files removed) in the destructor.
+  std::list<DynLibTy> DynLibs;
+
+  // Record entry point associated with device.
+  void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin,
+                          __tgt_offload_entry *end) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncGblEntries[device_id].emplace_back();
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    E.Table.EntriesBegin = begin;
+    E.Table.EntriesEnd = end;
+  }
+
+  // Return true if the entry is associated with device.
+  bool findOffloadEntry(int32_t device_id, void *addr) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    // Linear scan over the current image's entries.
+    for (__tgt_offload_entry *i = E.Table.EntriesBegin, *e = E.Table.EntriesEnd;
+         i < e; ++i) {
+      if (i->addr == addr)
+        return true;
+    }
+
+    return false;
+  }
+
+  // Return the pointer to the target entries table.
+  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    return &E.Table;
+  }
+
+  // Read LIBOMPTARGET_DEBUG (debug builds only) and size the per-device
+  // entry-table storage.
+  RTLDeviceInfoTy(int32_t num_devices) {
+#ifdef OMPTARGET_DEBUG
+    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
+      DebugLevel = std::stoi(envStr);
+    }
+#endif // OMPTARGET_DEBUG
+
+    FuncGblEntries.resize(num_devices);
+  }
+
+  ~RTLDeviceInfoTy() {
+    // Close dynamic libraries and delete the temporary files backing them.
+    for (auto &lib : DynLibs) {
+      if (lib.Handle) {
+        dlclose(lib.Handle);
+        remove(lib.FileName);
+      }
+    }
+  }
+};
+
+// Single process-wide instance holding all per-device state for this plugin.
+static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Decide whether \p image can run on this RTL by checking its ELF machine
+// field against TARGET_ELF_ID (configured per plugin at build time).
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+// If we don't have a valid ELF ID we can just fail.
+#if TARGET_ELF_ID < 1
+  return 0;
+#else
+  return elf_check_machine(image, TARGET_ELF_ID);
+#endif
+}
+
+// Number of devices this plugin exposes (a fixed compile-time constant).
+int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; }
+
+// No per-device state needs initialization for the host-as-device plugin.
+int32_t __tgt_rtl_init_device(int32_t device_id) { return OFFLOAD_SUCCESS; }
+
+// Load the ELF image for \p device_id: locate the offload entries section,
+// dump the image to a temporary file, dlopen it, and compute the in-memory
+// entries table from the library base address. Returns NULL on any failure.
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+                                          __tgt_device_image *image) {
+
+  DP("Dev %d: load binary from " DPxMOD " image\n", device_id,
+     DPxPTR(image->ImageStart));
+
+  assert(device_id >= 0 && device_id < NUMBER_OF_DEVICES && "bad dev id");
+
+  size_t ImageSize = (size_t)image->ImageEnd - (size_t)image->ImageStart;
+  size_t NumEntries = (size_t)(image->EntriesEnd - image->EntriesBegin);
+  DP("Expecting to have %zd entries defined.\n", NumEntries);
+
+  // Is the library version incompatible with the header file?
+  if (elf_version(EV_CURRENT) == EV_NONE) {
+    DP("Incompatible ELF library!\n");
+    return NULL;
+  }
+
+  // Obtain elf handler
+  Elf *e = elf_memory((char *)image->ImageStart, ImageSize);
+  if (!e) {
+    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
+    return NULL;
+  }
+
+  if (elf_kind(e) != ELF_K_ELF) {
+    DP("Invalid Elf kind!\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  // Find the entries section offset
+  Elf_Scn *section = 0;
+  Elf64_Off entries_offset = 0;
+
+  size_t shstrndx;
+
+  if (elf_getshdrstrndx(e, &shstrndx)) {
+    DP("Unable to get ELF strings index!\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  // Scan the section headers for the offload entries table.
+  while ((section = elf_nextscn(e, section))) {
+    GElf_Shdr hdr;
+    gelf_getshdr(section, &hdr);
+
+    if (!strcmp(elf_strptr(e, shstrndx, hdr.sh_name), OFFLOADSECTIONNAME)) {
+      entries_offset = hdr.sh_addr;
+      break;
+    }
+  }
+
+  if (!entries_offset) {
+    DP("Entries Section Offset Not Found\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  DP("Offset of entries section is (" DPxMOD ").\n", DPxPTR(entries_offset));
+
+  // load dynamic library and get the entry points. We use the dl library
+  // to do the loading of the library, but we could do it directly to avoid the
+  // dump to the temporary file.
+  //
+  // 1) Create tmp file with the library contents.
+  // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
+  char tmp_name[] = "/tmp/tmpfile_XXXXXX";
+  int tmp_fd = mkstemp(tmp_name);
+
+  if (tmp_fd == -1) {
+    elf_end(e);
+    return NULL;
+  }
+
+  FILE *ftmp = fdopen(tmp_fd, "wb");
+
+  if (!ftmp) {
+    elf_end(e);
+    return NULL;
+  }
+
+  // BUGFIX: verify the image was actually written before dlopen'ing the
+  // file; the original code ignored fwrite's return value.
+  if (fwrite(image->ImageStart, ImageSize, 1, ftmp) != 1) {
+    DP("Failed to write image to temporary file %s\n", tmp_name);
+    fclose(ftmp);
+    elf_end(e);
+    return NULL;
+  }
+  fclose(ftmp);
+
+  // BUGFIX: duplicate the file name. tmp_name is a stack array of this
+  // frame, but the DynLibTy record outlives the call — the RTLDeviceInfoTy
+  // destructor later passes FileName to remove(), which would dereference a
+  // dangling pointer.
+  DynLibTy Lib = {strdup(tmp_name), dlopen(tmp_name, RTLD_LAZY)};
+
+  if (!Lib.Handle) {
+    DP("Target library loading error: %s\n", dlerror());
+    free(Lib.FileName);
+    elf_end(e);
+    return NULL;
+  }
+
+  DeviceInfo.DynLibs.push_back(Lib);
+
+  struct link_map *libInfo = (struct link_map *)Lib.Handle;
+
+  // The place where the entries info is loaded is the library base address
+  // plus the offset determined from the ELF file.
+  Elf64_Addr entries_addr = libInfo->l_addr + entries_offset;
+
+  DP("Pointer to first entry to be loaded is (" DPxMOD ").\n",
+     DPxPTR(entries_addr));
+
+  // Table of pointers to all the entries in the target.
+  __tgt_offload_entry *entries_table = (__tgt_offload_entry *)entries_addr;
+
+  __tgt_offload_entry *entries_begin = &entries_table[0];
+  __tgt_offload_entry *entries_end = entries_begin + NumEntries;
+
+  if (!entries_begin) {
+    DP("Can't obtain entries begin\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  DP("Entries table range is (" DPxMOD ")->(" DPxMOD ")\n",
+     DPxPTR(entries_begin), DPxPTR(entries_end));
+  DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end);
+
+  elf_end(e);
+
+  return DeviceInfo.getOffloadEntriesTable(device_id);
+}
+
+// The "device" is the host itself, so plain heap memory backs the mapping;
+// hst_ptr is unused.
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
+  return malloc(size);
+}
+
+// Host-to-"device" transfer is a plain memory copy on this plugin.
+int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
+                              int64_t size) {
+  memcpy(tgt_ptr, hst_ptr, size);
+  return OFFLOAD_SUCCESS;
+}
+
+// "Device"-to-host transfer is a plain memory copy on this plugin.
+int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr,
+                                void *tgt_ptr, int64_t size) {
+  memcpy(hst_ptr, tgt_ptr, size);
+  return OFFLOAD_SUCCESS;
+}
+
+// Allocations came from malloc(), so free() releases them.
+int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
+  free(tgt_ptr);
+  return OFFLOAD_SUCCESS;
+}
+
+// Execute the target entry point in the current process via libffi. Team
+// count and thread limit are ignored — this plugin runs everything on the
+// host thread. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
+    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
+    int32_t thread_limit, uint64_t loop_tripcount /*not used*/) {
+  // ignore team num and thread limit.
+
+  // Use libffi to launch execution.
+  ffi_cif cif;
+
+  // All args are references.
+  std::vector<ffi_type *> args_types(arg_num, &ffi_type_pointer);
+  std::vector<void *> args(arg_num);
+  std::vector<void *> ptrs(arg_num);
+
+  // Apply the per-argument offsets; ffi receives the address of each
+  // adjusted pointer.
+  for (int32_t i = 0; i < arg_num; ++i) {
+    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
+    args[i] = &ptrs[i];
+  }
+
+  // Describe the call interface: void return, arg_num pointer arguments.
+  ffi_status status = ffi_prep_cif(&cif, FFI_DEFAULT_ABI, arg_num,
+                                   &ffi_type_void, &args_types[0]);
+
+  assert(status == FFI_OK && "Unable to prepare target launch!");
+
+  if (status != FFI_OK)
+    return OFFLOAD_FAIL;
+
+  DP("Running entry point at " DPxMOD "...\n", DPxPTR(tgt_entry_ptr));
+
+  // Reinterpret the entry address as a function pointer and invoke it.
+  void (*entry)(void);
+  *((void**) &entry) = tgt_entry_ptr;
+  ffi_call(&cif, entry, NULL, &args[0]);
+  return OFFLOAD_SUCCESS;
+}
+
+// A plain target region is a team region with one team and one thread; the
+// trip count is unused by this plugin.
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
+  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
+                                          tgt_offsets, arg_num,
+                                          /*team_num=*/1, /*thread_limit=*/1,
+                                          /*loop_tripcount=*/0);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/final/libomptarget/plugins/ppc64/CMakeLists.txt b/final/libomptarget/plugins/ppc64/CMakeLists.txt
new file mode 100644
index 0000000..6849a03
--- /dev/null
+++ b/final/libomptarget/plugins/ppc64/CMakeLists.txt
@@ -0,0 +1,18 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a ppc64 machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+# CMAKE_SYSTEM_NAME is an exact value; use STREQUAL instead of a regex
+# substring match.
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+  build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
+else()
+  libomptarget_say("Not building ppc64 offloading plugin: machine not found in the system.")
+endif()
diff --git a/final/libomptarget/plugins/ppc64le/CMakeLists.txt b/final/libomptarget/plugins/ppc64le/CMakeLists.txt
new file mode 100644
index 0000000..87cefdf
--- /dev/null
+++ b/final/libomptarget/plugins/ppc64le/CMakeLists.txt
@@ -0,0 +1,18 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a ppc64le machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+# CMAKE_SYSTEM_NAME is an exact value; use STREQUAL instead of a regex
+# substring match.
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+  build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21")
+else()
+  libomptarget_say("Not building ppc64le offloading plugin: machine not found in the system.")
+endif()
diff --git a/final/libomptarget/plugins/x86_64/CMakeLists.txt b/final/libomptarget/plugins/x86_64/CMakeLists.txt
new file mode 100644
index 0000000..bdd5bba
--- /dev/null
+++ b/final/libomptarget/plugins/x86_64/CMakeLists.txt
@@ -0,0 +1,18 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a x86_64 machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+# CMAKE_SYSTEM_NAME is an exact value; use STREQUAL instead of a regex
+# substring match.
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+  build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
+else()
+  libomptarget_say("Not building x86_64 offloading plugin: machine not found in the system.")
+endif()
diff --git a/final/libomptarget/src/CMakeLists.txt b/final/libomptarget/src/CMakeLists.txt
new file mode 100644
index 0000000..be099f3
--- /dev/null
+++ b/final/libomptarget/src/CMakeLists.txt
@@ -0,0 +1,31 @@
+##===----------------------------------------------------------------------===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is dual licensed under the MIT and the University of Illinois Open
+# Source Licenses. See LICENSE.txt for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# Build offloading library libomptarget.so.
+#
+##===----------------------------------------------------------------------===##
+
+libomptarget_say("Building offloading runtime library libomptarget.")
+
+# Sources of the host-side offload runtime.
+set(src_files
+  api.cpp
+  device.cpp
+  interface.cpp
+  rtl.cpp
+  omptarget.cpp
+)
+
+# Build libomptarget library with libdl dependency.
+# NOTE(review): target_link_libraries uses the keyword-less (plain)
+# signature; PRIVATE would be preferable, but every other use of the
+# 'omptarget' target would have to be converted consistently — verify
+# before changing.
+add_library(omptarget SHARED ${src_files})
+target_link_libraries(omptarget
+  ${CMAKE_DL_LIBS}
+  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports")
+
+# Install libomptarget under the lib destination folder.
+install(TARGETS omptarget LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
diff --git a/final/libomptarget/src/api.cpp b/final/libomptarget/src/api.cpp
new file mode 100644
index 0000000..15c1d2c
--- /dev/null
+++ b/final/libomptarget/src/api.cpp
@@ -0,0 +1,283 @@
+//===----------- api.cpp - Target independent OpenMP target RTL -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of OpenMP API interface functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include <omptarget.h>
+
+#include "device.h"
+#include "private.h"
+#include "rtl.h"
+
+#include <climits>
+#include <cstring>
+#include <cstdlib>
+
+// Return the number of non-host devices currently registered.
+EXTERN int omp_get_num_devices(void) {
+  // Snapshot the device count under the RTL lock.
+  RTLsMtx.lock();
+  size_t NumDevices = Devices.size();
+  RTLsMtx.unlock();
+
+  DP("Call to omp_get_num_devices returning %zd\n", NumDevices);
+
+  return NumDevices;
+}
+
+// The initial device is the host; report its fixed device number.
+EXTERN int omp_get_initial_device(void) {
+  DP("Call to omp_get_initial_device returning %d\n", HOST_DEVICE);
+  return HOST_DEVICE;
+}
+
+// Allocate \p size bytes on device \p device_num (the host when device_num is
+// the initial device). Returns the new pointer, or NULL on failure.
+EXTERN void *omp_target_alloc(size_t size, int device_num) {
+  DP("Call to omp_target_alloc for device %d requesting %zu bytes\n",
+     device_num, size);
+
+  // size_t is unsigned, so the original `size <= 0` could only ever be true
+  // for zero; make the intent explicit.
+  if (size == 0) {
+    DP("Call to omp_target_alloc with non-positive length\n");
+    return NULL;
+  }
+
+  void *rc = NULL;
+
+  // Host allocations come straight from malloc().
+  if (device_num == omp_get_initial_device()) {
+    rc = malloc(size);
+    DP("omp_target_alloc returns host ptr " DPxMOD "\n", DPxPTR(rc));
+    return rc;
+  }
+
+  // The device must exist and have been initialized successfully.
+  if (!device_is_ready(device_num)) {
+    DP("omp_target_alloc returns NULL ptr\n");
+    return NULL;
+  }
+
+  // Delegate to the device's plugin.
+  DeviceTy &Device = Devices[device_num];
+  rc = Device.RTL->data_alloc(Device.RTLDeviceID, size, NULL);
+  DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc));
+  return rc;
+}
+
+// Free a pointer previously returned by omp_target_alloc for the same
+// device number. NULL pointers and unready devices are silent no-ops.
+EXTERN void omp_target_free(void *device_ptr, int device_num) {
+  DP("Call to omp_target_free for device %d and address " DPxMOD "\n",
+     device_num, DPxPTR(device_ptr));
+
+  // Freeing NULL is a no-op, mirroring free().
+  if (!device_ptr) {
+    DP("Call to omp_target_free with NULL ptr\n");
+    return;
+  }
+
+  // Host allocations were obtained from malloc().
+  if (device_num == omp_get_initial_device()) {
+    free(device_ptr);
+    DP("omp_target_free deallocated host ptr\n");
+    return;
+  }
+
+  if (!device_is_ready(device_num)) {
+    DP("omp_target_free returns, nothing to do\n");
+    return;
+  }
+
+  // Forward real device memory to the owning plugin.
+  DeviceTy &Dev = Devices[device_num];
+  Dev.RTL->data_delete(Dev.RTLDeviceID, (void *)device_ptr);
+  DP("omp_target_free deallocated device ptr\n");
+}
+
+// Return whether \p ptr is mapped on device \p device_num. Host pointers are
+// trivially present; NULL and invalid device IDs report false.
+EXTERN int omp_target_is_present(void *ptr, int device_num) {
+  DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n",
+     device_num, DPxPTR(ptr));
+
+  if (!ptr) {
+    DP("Call to omp_target_is_present with NULL ptr, returning false\n");
+    return false;
+  }
+
+  if (device_num == omp_get_initial_device()) {
+    DP("Call to omp_target_is_present on host, returning true\n");
+    return true;
+  }
+
+  // Snapshot the device count under the RTL lock before validating the ID.
+  RTLsMtx.lock();
+  size_t NumDevices = Devices.size();
+  RTLsMtx.unlock();
+  if (NumDevices <= (size_t)device_num) {
+    DP("Call to omp_target_is_present with invalid device ID, returning "
+       "false\n");
+    return false;
+  }
+
+  // A zero-length lookup in the device's mapping table checks only the
+  // base pointer.
+  bool IsLast; // not used
+  int rc = (Devices[device_num].getTgtPtrBegin(ptr, 0, IsLast, false) != NULL);
+  DP("Call to omp_target_is_present returns %d\n", rc);
+  return rc;
+}
+
+// Copy \p length bytes between two devices (the host counts as the initial
+// device), applying the given byte offsets to the raw pointers. Returns
+// OFFLOAD_SUCCESS or OFFLOAD_FAIL.
+EXTERN int omp_target_memcpy(void *dst, void *src, size_t length,
+    size_t dst_offset, size_t src_offset, int dst_device, int src_device) {
+  DP("Call to omp_target_memcpy, dst device %d, src device %d, "
+      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
+      "src offset %zu, length %zu\n", dst_device, src_device, DPxPTR(dst),
+      DPxPTR(src), dst_offset, src_offset, length);
+
+  if (!dst || !src || length <= 0) {
+    DP("Call to omp_target_memcpy with invalid arguments\n");
+    return OFFLOAD_FAIL;
+  }
+
+  // Both endpoints must be either the host or a ready device.
+  if (src_device != omp_get_initial_device() && !device_is_ready(src_device)) {
+    DP("omp_target_memcpy returns OFFLOAD_FAIL\n");
+    return OFFLOAD_FAIL;
+  }
+
+  if (dst_device != omp_get_initial_device() && !device_is_ready(dst_device)) {
+    DP("omp_target_memcpy returns OFFLOAD_FAIL\n");
+    return OFFLOAD_FAIL;
+  }
+
+  int rc = OFFLOAD_SUCCESS;
+  void *srcAddr = (char *)src + src_offset;
+  void *dstAddr = (char *)dst + dst_offset;
+
+  if (src_device == omp_get_initial_device() &&
+      dst_device == omp_get_initial_device()) {
+    DP("copy from host to host\n");
+    const void *p = memcpy(dstAddr, srcAddr, length);
+    if (p == NULL)
+      rc = OFFLOAD_FAIL;
+  } else if (src_device == omp_get_initial_device()) {
+    DP("copy from host to device\n");
+    DeviceTy& DstDev = Devices[dst_device];
+    rc = DstDev.data_submit(dstAddr, srcAddr, length);
+  } else if (dst_device == omp_get_initial_device()) {
+    DP("copy from device to host\n");
+    DeviceTy& SrcDev = Devices[src_device];
+    rc = SrcDev.data_retrieve(dstAddr, srcAddr, length);
+  } else {
+    // Device-to-device copies are staged through a temporary host buffer.
+    DP("copy from device to device\n");
+    void *buffer = malloc(length);
+    if (!buffer) {
+      // BUGFIX: the original code did not check the staging allocation.
+      rc = OFFLOAD_FAIL;
+    } else {
+      DeviceTy& SrcDev = Devices[src_device];
+      DeviceTy& DstDev = Devices[dst_device];
+      rc = SrcDev.data_retrieve(buffer, srcAddr, length);
+      if (rc == OFFLOAD_SUCCESS)
+        rc = DstDev.data_submit(dstAddr, buffer, length);
+      // BUGFIX: the original code leaked the staging buffer.
+      free(buffer);
+    }
+  }
+
+  DP("omp_target_memcpy returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_target_memcpy_rect(void *dst, void *src, size_t element_size,
+ int num_dims, const size_t *volume, const size_t *dst_offsets,
+ const size_t *src_offsets, const size_t *dst_dimensions,
+ const size_t *src_dimensions, int dst_device, int src_device) {
+ DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, "
+ "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
+ "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
+ "volume " DPxMOD ", element size %zu, num_dims %d\n", dst_device,
+ src_device, DPxPTR(dst), DPxPTR(src), DPxPTR(dst_offsets),
+ DPxPTR(src_offsets), DPxPTR(dst_dimensions), DPxPTR(src_dimensions),
+ DPxPTR(volume), element_size, num_dims);
+
+ if (!(dst || src)) {
+ DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n",
+ INT_MAX);
+ return INT_MAX;
+ }
+
+ if (!dst || !src || element_size < 1 || num_dims < 1 || !volume ||
+ !dst_offsets || !src_offsets || !dst_dimensions || !src_dimensions) {
+ DP("Call to omp_target_memcpy_rect with invalid arguments\n");
+ return OFFLOAD_FAIL;
+ }
+
+ int rc;
+ if (num_dims == 1) {
+ rc = omp_target_memcpy(dst, src, element_size * volume[0],
+ element_size * dst_offsets[0], element_size * src_offsets[0],
+ dst_device, src_device);
+ } else {
+ size_t dst_slice_size = element_size;
+ size_t src_slice_size = element_size;
+ for (int i=1; i<num_dims; ++i) {
+ dst_slice_size *= dst_dimensions[i];
+ src_slice_size *= src_dimensions[i];
+ }
+
+ size_t dst_off = dst_offsets[0] * dst_slice_size;
+ size_t src_off = src_offsets[0] * src_slice_size;
+ for (size_t i=0; i<volume[0]; ++i) {
+ rc = omp_target_memcpy_rect((char *) dst + dst_off + dst_slice_size * i,
+ (char *) src + src_off + src_slice_size * i, element_size,
+ num_dims - 1, volume + 1, dst_offsets + 1, src_offsets + 1,
+ dst_dimensions + 1, src_dimensions + 1, dst_device, src_device);
+
+ if (rc) {
+ DP("Recursive call to omp_target_memcpy_rect returns unsuccessfully\n");
+ return rc;
+ }
+ }
+ }
+
+ DP("omp_target_memcpy_rect returns %d\n", rc);
+ return rc;
+}
+
+EXTERN int omp_target_associate_ptr(void *host_ptr, void *device_ptr,
+ size_t size, size_t device_offset, int device_num) {
+ DP("Call to omp_target_associate_ptr with host_ptr " DPxMOD ", "
+ "device_ptr " DPxMOD ", size %zu, device_offset %zu, device_num %d\n",
+ DPxPTR(host_ptr), DPxPTR(device_ptr), size, device_offset, device_num);
+
+ if (!host_ptr || !device_ptr || size <= 0) {
+ DP("Call to omp_target_associate_ptr with invalid arguments\n");
+ return OFFLOAD_FAIL;
+ }
+
+ if (device_num == omp_get_initial_device()) {
+ DP("omp_target_associate_ptr: no association possible on the host\n");
+ return OFFLOAD_FAIL;
+ }
+
+ if (!device_is_ready(device_num)) {
+ DP("omp_target_associate_ptr returns OFFLOAD_FAIL\n");
+ return OFFLOAD_FAIL;
+ }
+
+ DeviceTy& Device = Devices[device_num];
+ void *device_addr = (void *)((uint64_t)device_ptr + (uint64_t)device_offset);
+ int rc = Device.associatePtr(host_ptr, device_addr, size);
+ DP("omp_target_associate_ptr returns %d\n", rc);
+ return rc;
+}
+
+EXTERN int omp_target_disassociate_ptr(void *host_ptr, int device_num) {
+ DP("Call to omp_target_disassociate_ptr with host_ptr " DPxMOD ", "
+ "device_num %d\n", DPxPTR(host_ptr), device_num);
+
+ if (!host_ptr) {
+ DP("Call to omp_target_associate_ptr with invalid host_ptr\n");
+ return OFFLOAD_FAIL;
+ }
+
+ if (device_num == omp_get_initial_device()) {
+ DP("omp_target_disassociate_ptr: no association possible on the host\n");
+ return OFFLOAD_FAIL;
+ }
+
+ if (!device_is_ready(device_num)) {
+ DP("omp_target_disassociate_ptr returns OFFLOAD_FAIL\n");
+ return OFFLOAD_FAIL;
+ }
+
+ DeviceTy& Device = Devices[device_num];
+ int rc = Device.disassociatePtr(host_ptr);
+ DP("omp_target_disassociate_ptr returns %d\n", rc);
+ return rc;
+}
diff --git a/final/libomptarget/src/device.cpp b/final/libomptarget/src/device.cpp
new file mode 100644
index 0000000..bac6127
--- /dev/null
+++ b/final/libomptarget/src/device.cpp
@@ -0,0 +1,365 @@
+//===--------- device.cpp - Target independent OpenMP target RTL ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Functionality for managing devices that are handled by RTL plugins.
+//
+//===----------------------------------------------------------------------===//
+
+#include "device.h"
+#include "private.h"
+#include "rtl.h"
+
+#include <cassert>
+#include <climits>
+#include <string>
+
+/// Map between Device ID (i.e. openmp device id) and its DeviceTy.
+DevicesTy Devices;
+
+int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) {
+ DataMapMtx.lock();
+
+ // Check if entry exists
+ for (auto &HT : HostDataToTargetMap) {
+ if ((uintptr_t)HstPtrBegin == HT.HstPtrBegin) {
+ // Mapping already exists
+ bool isValid = HT.HstPtrBegin == (uintptr_t) HstPtrBegin &&
+ HT.HstPtrEnd == (uintptr_t) HstPtrBegin + Size &&
+ HT.TgtPtrBegin == (uintptr_t) TgtPtrBegin;
+ DataMapMtx.unlock();
+ if (isValid) {
+ DP("Attempt to re-associate the same device ptr+offset with the same "
+ "host ptr, nothing to do\n");
+ return OFFLOAD_SUCCESS;
+ } else {
+ DP("Not allowed to re-associate a different device ptr+offset with the "
+ "same host ptr\n");
+ return OFFLOAD_FAIL;
+ }
+ }
+ }
+
+ // Mapping does not exist, allocate it
+ HostDataToTargetTy newEntry;
+
+ // Set up missing fields
+ newEntry.HstPtrBase = (uintptr_t) HstPtrBegin;
+ newEntry.HstPtrBegin = (uintptr_t) HstPtrBegin;
+ newEntry.HstPtrEnd = (uintptr_t) HstPtrBegin + Size;
+ newEntry.TgtPtrBegin = (uintptr_t) TgtPtrBegin;
+ // refCount must be infinite
+ newEntry.RefCount = INF_REF_CNT;
+
+ DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", HstEnd="
+ DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(newEntry.HstPtrBase),
+ DPxPTR(newEntry.HstPtrBegin), DPxPTR(newEntry.HstPtrEnd),
+ DPxPTR(newEntry.TgtPtrBegin));
+ HostDataToTargetMap.push_front(newEntry);
+
+ DataMapMtx.unlock();
+
+ return OFFLOAD_SUCCESS;
+}
+
+int DeviceTy::disassociatePtr(void *HstPtrBegin) {
+ DataMapMtx.lock();
+
+ // Check if entry exists
+ for (HostDataToTargetListTy::iterator ii = HostDataToTargetMap.begin();
+ ii != HostDataToTargetMap.end(); ++ii) {
+ if ((uintptr_t)HstPtrBegin == ii->HstPtrBegin) {
+ // Mapping exists
+ if (CONSIDERED_INF(ii->RefCount)) {
+ DP("Association found, removing it\n");
+ HostDataToTargetMap.erase(ii);
+ DataMapMtx.unlock();
+ return OFFLOAD_SUCCESS;
+ } else {
+ DP("Trying to disassociate a pointer which was not mapped via "
+ "omp_target_associate_ptr\n");
+ break;
+ }
+ }
+ }
+
+ // Mapping not found
+ DataMapMtx.unlock();
+ DP("Association not found\n");
+ return OFFLOAD_FAIL;
+}
+
+// Get ref count of map entry containing HstPtrBegin
+long DeviceTy::getMapEntryRefCnt(void *HstPtrBegin) {
+ uintptr_t hp = (uintptr_t)HstPtrBegin;
+ long RefCnt = -1;
+
+ DataMapMtx.lock();
+ for (auto &HT : HostDataToTargetMap) {
+ if (hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd) {
+ DP("DeviceTy::getMapEntry: requested entry found\n");
+ RefCnt = HT.RefCount;
+ break;
+ }
+ }
+ DataMapMtx.unlock();
+
+ if (RefCnt < 0) {
+ DP("DeviceTy::getMapEntry: requested entry not found\n");
+ }
+
+ return RefCnt;
+}
+
+LookupResult DeviceTy::lookupMapping(void *HstPtrBegin, int64_t Size) {
+ uintptr_t hp = (uintptr_t)HstPtrBegin;
+ LookupResult lr;
+
+ DP("Looking up mapping(HstPtrBegin=" DPxMOD ", Size=%ld)...\n", DPxPTR(hp),
+ Size);
+ for (lr.Entry = HostDataToTargetMap.begin();
+ lr.Entry != HostDataToTargetMap.end(); ++lr.Entry) {
+ auto &HT = *lr.Entry;
+ // Is it contained?
+ lr.Flags.IsContained = hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd &&
+ (hp+Size) <= HT.HstPtrEnd;
+ // Does it extend into an already mapped region?
+ lr.Flags.ExtendsBefore = hp < HT.HstPtrBegin && (hp+Size) > HT.HstPtrBegin;
+ // Does it extend beyond the mapped region?
+ lr.Flags.ExtendsAfter = hp < HT.HstPtrEnd && (hp+Size) > HT.HstPtrEnd;
+
+ if (lr.Flags.IsContained || lr.Flags.ExtendsBefore ||
+ lr.Flags.ExtendsAfter) {
+ break;
+ }
+ }
+
+ if (lr.Flags.ExtendsBefore) {
+ DP("WARNING: Pointer is not mapped but section extends into already "
+ "mapped data\n");
+ }
+ if (lr.Flags.ExtendsAfter) {
+ DP("WARNING: Pointer is already mapped but section extends beyond mapped "
+ "region\n");
+ }
+
+ return lr;
+}
+
+// Used by target_data_begin
+// Return the target pointer begin (where the data will be moved).
+// Allocate memory if this is the first occurrence if this mapping.
+// Increment the reference counter.
+// If NULL is returned, then either data allocation failed or the user tried
+// to do an illegal mapping.
+void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase,
+ int64_t Size, bool &IsNew, bool IsImplicit, bool UpdateRefCount) {
+ void *rc = NULL;
+ DataMapMtx.lock();
+ LookupResult lr = lookupMapping(HstPtrBegin, Size);
+
+ // Check if the pointer is contained.
+ if (lr.Flags.IsContained ||
+ ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && IsImplicit)) {
+ auto &HT = *lr.Entry;
+ IsNew = false;
+
+ if (UpdateRefCount)
+ ++HT.RefCount;
+
+ uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin);
+ DP("Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", "
+ "Size=%ld,%s RefCount=%s\n", (IsImplicit ? " (implicit)" : ""),
+ DPxPTR(HstPtrBegin), DPxPTR(tp), Size,
+ (UpdateRefCount ? " updated" : ""),
+ (CONSIDERED_INF(HT.RefCount)) ? "INF" :
+ std::to_string(HT.RefCount).c_str());
+ rc = (void *)tp;
+ } else if ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && !IsImplicit) {
+ // Explicit extension of mapped data - not allowed.
+ DP("Explicit extension of mapping is not allowed.\n");
+ } else if (Size) {
+ // If it is not contained and Size > 0 we should create a new entry for it.
+ IsNew = true;
+ uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin);
+ DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", "
+ "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase),
+ DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp));
+ HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase,
+ (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp));
+ rc = (void *)tp;
+ }
+
+ DataMapMtx.unlock();
+ return rc;
+}
+
+// Used by target_data_begin, target_data_end, target_data_update and target.
+// Return the target pointer begin (where the data will be moved).
+// Decrement the reference counter if called from target_data_end.
+void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
+ bool UpdateRefCount) {
+ void *rc = NULL;
+ DataMapMtx.lock();
+ LookupResult lr = lookupMapping(HstPtrBegin, Size);
+
+ if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) {
+ auto &HT = *lr.Entry;
+ IsLast = !(HT.RefCount > 1);
+
+ if (HT.RefCount > 1 && UpdateRefCount)
+ --HT.RefCount;
+
+ uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin);
+ DP("Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", "
+ "Size=%ld,%s RefCount=%s\n", DPxPTR(HstPtrBegin), DPxPTR(tp), Size,
+ (UpdateRefCount ? " updated" : ""),
+ (CONSIDERED_INF(HT.RefCount)) ? "INF" :
+ std::to_string(HT.RefCount).c_str());
+ rc = (void *)tp;
+ } else {
+ IsLast = false;
+ }
+
+ DataMapMtx.unlock();
+ return rc;
+}
+
+// Return the target pointer begin (where the data will be moved).
+// Lock-free version called when loading global symbols from the fat binary.
+void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size) {
+ uintptr_t hp = (uintptr_t)HstPtrBegin;
+ LookupResult lr = lookupMapping(HstPtrBegin, Size);
+ if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) {
+ auto &HT = *lr.Entry;
+ uintptr_t tp = HT.TgtPtrBegin + (hp - HT.HstPtrBegin);
+ return (void *)tp;
+ }
+
+ return NULL;
+}
+
+int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete) {
+ // Check if the pointer is contained in any sub-nodes.
+ int rc;
+ DataMapMtx.lock();
+ LookupResult lr = lookupMapping(HstPtrBegin, Size);
+ if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) {
+ auto &HT = *lr.Entry;
+ if (ForceDelete)
+ HT.RefCount = 1;
+ if (--HT.RefCount <= 0) {
+ assert(HT.RefCount == 0 && "did not expect a negative ref count");
+ DP("Deleting tgt data " DPxMOD " of size %ld\n",
+ DPxPTR(HT.TgtPtrBegin), Size);
+ RTL->data_delete(RTLDeviceID, (void *)HT.TgtPtrBegin);
+ DP("Removing%s mapping with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD
+ ", Size=%ld\n", (ForceDelete ? " (forced)" : ""),
+ DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size);
+ HostDataToTargetMap.erase(lr.Entry);
+ }
+ rc = OFFLOAD_SUCCESS;
+ } else {
+ DP("Section to delete (hst addr " DPxMOD ") does not exist in the allocated"
+ " memory\n", DPxPTR(HstPtrBegin));
+ rc = OFFLOAD_FAIL;
+ }
+
+ DataMapMtx.unlock();
+ return rc;
+}
+
+/// Init device, should not be called directly.
+void DeviceTy::init() {
+ int32_t rc = RTL->init_device(RTLDeviceID);
+ if (rc == OFFLOAD_SUCCESS) {
+ IsInit = true;
+ }
+}
+
+/// Thread-safe method to initialize the device only once.
+int32_t DeviceTy::initOnce() {
+ std::call_once(InitFlag, &DeviceTy::init, this);
+
+ // At this point, if IsInit is true, then either this thread or some other
+ // thread in the past successfully initialized the device, so we can return
+ // OFFLOAD_SUCCESS. If this thread executed init() via call_once() and it
+ // failed, return OFFLOAD_FAIL. If call_once did not invoke init(), it means
+ // that some other thread already attempted to execute init() and if IsInit
+ // is still false, return OFFLOAD_FAIL.
+ if (IsInit)
+ return OFFLOAD_SUCCESS;
+ else
+ return OFFLOAD_FAIL;
+}
+
+// Load binary to device.
+__tgt_target_table *DeviceTy::load_binary(void *Img) {
+ RTL->Mtx.lock();
+ __tgt_target_table *rc = RTL->load_binary(RTLDeviceID, Img);
+ RTL->Mtx.unlock();
+ return rc;
+}
+
+// Submit data to device.
+int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin,
+ int64_t Size) {
+ return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
+}
+
+// Retrieve data from device.
+int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin,
+ int64_t Size) {
+ return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
+}
+
+// Run region on device
+int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr,
+ ptrdiff_t *TgtOffsets, int32_t TgtVarsSize) {
+ return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
+ TgtVarsSize);
+}
+
+// Run team region on device.
+int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
+ ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams,
+ int32_t ThreadLimit, uint64_t LoopTripCount) {
+ return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
+ TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount);
+}
+
+/// Check whether a device has an associated RTL and initialize it if it's not
+/// already initialized.
+bool device_is_ready(int device_num) {
+ DP("Checking whether device %d is ready.\n", device_num);
+ // Devices.size() can only change while registering a new
+ // library, so try to acquire the lock of RTLs' mutex.
+ RTLsMtx.lock();
+ size_t Devices_size = Devices.size();
+ RTLsMtx.unlock();
+ if (Devices_size <= (size_t)device_num) {
+ DP("Device ID %d does not have a matching RTL\n", device_num);
+ return false;
+ }
+
+ // Get device info
+ DeviceTy &Device = Devices[device_num];
+
+ DP("Is the device %d (local ID %d) initialized? %d\n", device_num,
+ Device.RTLDeviceID, Device.IsInit);
+
+ // Init the device if not done before
+ if (!Device.IsInit && Device.initOnce() != OFFLOAD_SUCCESS) {
+ DP("Failed to init device %d\n", device_num);
+ return false;
+ }
+
+ DP("Device %d is ready to use.\n", device_num);
+
+ return true;
+}
diff --git a/final/libomptarget/src/device.h b/final/libomptarget/src/device.h
new file mode 100644
index 0000000..3c205d6
--- /dev/null
+++ b/final/libomptarget/src/device.h
@@ -0,0 +1,167 @@
+//===----------- device.h - Target independent OpenMP target RTL ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declarations for managing devices that are handled by RTL plugins.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_DEVICE_H
+#define _OMPTARGET_DEVICE_H
+
+#include <cstddef>
+#include <climits>
+#include <list>
+#include <map>
+#include <mutex>
+#include <vector>
+
+// Forward declarations.
+struct RTLInfoTy;
+struct __tgt_bin_desc;
+struct __tgt_target_table;
+
+#define INF_REF_CNT (LONG_MAX>>1) // leave room for additions/subtractions
+#define CONSIDERED_INF(x) (x > (INF_REF_CNT>>1))
+
+/// Map between host data and target data.
+struct HostDataToTargetTy {
+ uintptr_t HstPtrBase; // host info.
+ uintptr_t HstPtrBegin;
+ uintptr_t HstPtrEnd; // non-inclusive.
+
+ uintptr_t TgtPtrBegin; // target info.
+
+ long RefCount;
+
+ HostDataToTargetTy()
+ : HstPtrBase(0), HstPtrBegin(0), HstPtrEnd(0),
+ TgtPtrBegin(0), RefCount(0) {}
+ HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB)
+ : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E),
+ TgtPtrBegin(TB), RefCount(1) {}
+ HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB,
+ long RF)
+ : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E),
+ TgtPtrBegin(TB), RefCount(RF) {}
+};
+
+typedef std::list<HostDataToTargetTy> HostDataToTargetListTy;
+
+struct LookupResult {
+ struct {
+ unsigned IsContained : 1;
+ unsigned ExtendsBefore : 1;
+ unsigned ExtendsAfter : 1;
+ } Flags;
+
+ HostDataToTargetListTy::iterator Entry;
+
+ LookupResult() : Flags({0,0,0}), Entry() {}
+};
+
+/// Map for shadow pointers
+struct ShadowPtrValTy {
+ void *HstPtrVal;
+ void *TgtPtrAddr;
+ void *TgtPtrVal;
+};
+typedef std::map<void *, ShadowPtrValTy> ShadowPtrListTy;
+
+///
+struct PendingCtorDtorListsTy {
+ std::list<void *> PendingCtors;
+ std::list<void *> PendingDtors;
+};
+typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy>
+ PendingCtorsDtorsPerLibrary;
+
+struct DeviceTy {
+ int32_t DeviceID;
+ RTLInfoTy *RTL;
+ int32_t RTLDeviceID;
+
+ bool IsInit;
+ std::once_flag InitFlag;
+ bool HasPendingGlobals;
+
+ HostDataToTargetListTy HostDataToTargetMap;
+ PendingCtorsDtorsPerLibrary PendingCtorsDtors;
+
+ ShadowPtrListTy ShadowPtrMap;
+
+ std::mutex DataMapMtx, PendingGlobalsMtx, ShadowMtx;
+
+ uint64_t loopTripCnt;
+
+ DeviceTy(RTLInfoTy *RTL)
+ : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(),
+ HasPendingGlobals(false), HostDataToTargetMap(),
+ PendingCtorsDtors(), ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(),
+ ShadowMtx(), loopTripCnt(0) {}
+
+ // The existence of mutexes makes DeviceTy non-copyable. We need to
+ // provide a copy constructor and an assignment operator explicitly.
+ DeviceTy(const DeviceTy &d)
+ : DeviceID(d.DeviceID), RTL(d.RTL), RTLDeviceID(d.RTLDeviceID),
+ IsInit(d.IsInit), InitFlag(), HasPendingGlobals(d.HasPendingGlobals),
+ HostDataToTargetMap(d.HostDataToTargetMap),
+ PendingCtorsDtors(d.PendingCtorsDtors), ShadowPtrMap(d.ShadowPtrMap),
+ DataMapMtx(), PendingGlobalsMtx(),
+ ShadowMtx(), loopTripCnt(d.loopTripCnt) {}
+
+ DeviceTy& operator=(const DeviceTy &d) {
+ DeviceID = d.DeviceID;
+ RTL = d.RTL;
+ RTLDeviceID = d.RTLDeviceID;
+ IsInit = d.IsInit;
+ HasPendingGlobals = d.HasPendingGlobals;
+ HostDataToTargetMap = d.HostDataToTargetMap;
+ PendingCtorsDtors = d.PendingCtorsDtors;
+ ShadowPtrMap = d.ShadowPtrMap;
+ loopTripCnt = d.loopTripCnt;
+
+ return *this;
+ }
+
+ long getMapEntryRefCnt(void *HstPtrBegin);
+ LookupResult lookupMapping(void *HstPtrBegin, int64_t Size);
+ void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size,
+ bool &IsNew, bool IsImplicit, bool UpdateRefCount = true);
+ void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size);
+ void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
+ bool UpdateRefCount);
+ int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete);
+ int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
+ int disassociatePtr(void *HstPtrBegin);
+
+ // calls to RTL
+ int32_t initOnce();
+ __tgt_target_table *load_binary(void *Img);
+
+ int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size);
+ int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
+
+ int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr,
+ ptrdiff_t *TgtOffsets, int32_t TgtVarsSize);
+ int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
+ ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams,
+ int32_t ThreadLimit, uint64_t LoopTripCount);
+
+private:
+ // Call to RTL
+ void init(); // To be called only via DeviceTy::initOnce()
+};
+
+/// Map between Device ID (i.e. openmp device id) and its DeviceTy.
+typedef std::vector<DeviceTy> DevicesTy;
+extern DevicesTy Devices;
+
+extern bool device_is_ready(int device_num);
+
+#endif
diff --git a/final/libomptarget/src/exports b/final/libomptarget/src/exports
new file mode 100644
index 0000000..8114751
--- /dev/null
+++ b/final/libomptarget/src/exports
@@ -0,0 +1,28 @@
+VERS1.0 {
+ global:
+ __tgt_register_lib;
+ __tgt_unregister_lib;
+ __tgt_target_data_begin;
+ __tgt_target_data_end;
+ __tgt_target_data_update;
+ __tgt_target;
+ __tgt_target_teams;
+ __tgt_target_data_begin_nowait;
+ __tgt_target_data_end_nowait;
+ __tgt_target_data_update_nowait;
+ __tgt_target_nowait;
+ __tgt_target_teams_nowait;
+ omp_get_num_devices;
+ omp_get_initial_device;
+ omp_target_alloc;
+ omp_target_free;
+ omp_target_is_present;
+ omp_target_memcpy;
+ omp_target_memcpy_rect;
+ omp_target_associate_ptr;
+ omp_target_disassociate_ptr;
+ __kmpc_push_target_tripcount;
+ local:
+ *;
+};
+
diff --git a/final/libomptarget/src/interface.cpp b/final/libomptarget/src/interface.cpp
new file mode 100644
index 0000000..0f32f4e
--- /dev/null
+++ b/final/libomptarget/src/interface.cpp
@@ -0,0 +1,251 @@
+//===-------- interface.cpp - Target independent OpenMP target RTL --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the interface to be used by Clang during the codegen of a
+// target region.
+//
+//===----------------------------------------------------------------------===//
+
+#include <omptarget.h>
+
+#include "device.h"
+#include "private.h"
+#include "rtl.h"
+
+#include <cassert>
+#include <cstdlib>
+
+////////////////////////////////////////////////////////////////////////////////
+/// adds a target shared library to the target execution image
+EXTERN void __tgt_register_lib(__tgt_bin_desc *desc) {
+ RTLs.RegisterLib(desc);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// unloads a target shared library
+EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc) {
+ RTLs.UnregisterLib(desc);
+}
+
+/// creates host-to-target data mapping, stores it in the
+/// libomptarget.so internal structure (an entry in a stack of data maps)
+/// and passes the data to the device.
+EXTERN void __tgt_target_data_begin(int64_t device_id, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
+ DP("Entering data begin region for device %" PRId64 " with %d mappings\n",
+ device_id, arg_num);
+
+ // No devices available?
+ if (device_id == OFFLOAD_DEVICE_DEFAULT) {
+ device_id = omp_get_default_device();
+ DP("Use default device id %" PRId64 "\n", device_id);
+ }
+
+ if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
+ DP("Failed to get device %" PRId64 " ready\n", device_id);
+ return;
+ }
+
+ DeviceTy& Device = Devices[device_id];
+
+#ifdef OMPTARGET_DEBUG
+ for (int i=0; i<arg_num; ++i) {
+ DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+ ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
+ arg_sizes[i], arg_types[i]);
+ }
+#endif
+
+ target_data_begin(Device, arg_num, args_base, args, arg_sizes, arg_types);
+}
+
+EXTERN void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
+ int32_t depNum, void *depList, int32_t noAliasDepNum,
+ void *noAliasDepList) {
+ if (depNum + noAliasDepNum > 0)
+ __kmpc_omp_taskwait(NULL, 0);
+
+ __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes,
+ arg_types);
+}
+
+/// passes data from the target, releases target memory and destroys
+/// the host-target mapping (top entry from the stack of data maps)
+/// created by the last __tgt_target_data_begin.
+EXTERN void __tgt_target_data_end(int64_t device_id, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
+ DP("Entering data end region with %d mappings\n", arg_num);
+
+ // No devices available?
+ if (device_id == OFFLOAD_DEVICE_DEFAULT) {
+ device_id = omp_get_default_device();
+ }
+
+ RTLsMtx.lock();
+ size_t Devices_size = Devices.size();
+ RTLsMtx.unlock();
+ if (Devices_size <= (size_t)device_id) {
+ DP("Device ID %" PRId64 " does not have a matching RTL.\n", device_id);
+ return;
+ }
+
+ DeviceTy &Device = Devices[device_id];
+ if (!Device.IsInit) {
+ DP("Uninit device: ignore");
+ return;
+ }
+
+#ifdef OMPTARGET_DEBUG
+ for (int i=0; i<arg_num; ++i) {
+ DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+ ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
+ arg_sizes[i], arg_types[i]);
+ }
+#endif
+
+ target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types);
+}
+
+EXTERN void __tgt_target_data_end_nowait(int64_t device_id, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
+ int32_t depNum, void *depList, int32_t noAliasDepNum,
+ void *noAliasDepList) {
+ if (depNum + noAliasDepNum > 0)
+ __kmpc_omp_taskwait(NULL, 0);
+
+ __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes,
+ arg_types);
+}
+
+EXTERN void __tgt_target_data_update(int64_t device_id, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
+ DP("Entering data update with %d mappings\n", arg_num);
+
+ // No devices available?
+ if (device_id == OFFLOAD_DEVICE_DEFAULT) {
+ device_id = omp_get_default_device();
+ }
+
+ if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
+ DP("Failed to get device %" PRId64 " ready\n", device_id);
+ return;
+ }
+
+ DeviceTy& Device = Devices[device_id];
+ target_data_update(Device, arg_num, args_base, args, arg_sizes, arg_types);
+}
+
+EXTERN void __tgt_target_data_update_nowait(
+ int64_t device_id, int32_t arg_num, void **args_base, void **args,
+ int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList,
+ int32_t noAliasDepNum, void *noAliasDepList) {
+ if (depNum + noAliasDepNum > 0)
+ __kmpc_omp_taskwait(NULL, 0);
+
+ __tgt_target_data_update(device_id, arg_num, args_base, args, arg_sizes,
+ arg_types);
+}
+
+EXTERN int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
+ DP("Entering target region with entry point " DPxMOD " and device Id %"
+ PRId64 "\n", DPxPTR(host_ptr), device_id);
+
+ if (device_id == OFFLOAD_DEVICE_DEFAULT) {
+ device_id = omp_get_default_device();
+ }
+
+ if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
+ DP("Failed to get device %" PRId64 " ready\n", device_id);
+ return OFFLOAD_FAIL;
+ }
+
+#ifdef OMPTARGET_DEBUG
+ for (int i=0; i<arg_num; ++i) {
+ DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+ ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
+ arg_sizes[i], arg_types[i]);
+ }
+#endif
+
+ int rc = target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
+ arg_types, 0, 0, false /*team*/);
+
+ return rc;
+}
+
+EXTERN int __tgt_target_nowait(int64_t device_id, void *host_ptr,
+ int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
+ int64_t *arg_types, int32_t depNum, void *depList, int32_t noAliasDepNum,
+ void *noAliasDepList) {
+ if (depNum + noAliasDepNum > 0)
+ __kmpc_omp_taskwait(NULL, 0);
+
+ return __tgt_target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
+ arg_types);
+}
+
+EXTERN int __tgt_target_teams(int64_t device_id, void *host_ptr,
+ int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
+ int64_t *arg_types, int32_t team_num, int32_t thread_limit) {
+ DP("Entering target region with entry point " DPxMOD " and device Id %"
+ PRId64 "\n", DPxPTR(host_ptr), device_id);
+
+ if (device_id == OFFLOAD_DEVICE_DEFAULT) {
+ device_id = omp_get_default_device();
+ }
+
+ if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
+ DP("Failed to get device %" PRId64 " ready\n", device_id);
+ return OFFLOAD_FAIL;
+ }
+
+#ifdef OMPTARGET_DEBUG
+ for (int i=0; i<arg_num; ++i) {
+ DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+ ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
+ arg_sizes[i], arg_types[i]);
+ }
+#endif
+
+ int rc = target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
+ arg_types, team_num, thread_limit, true /*team*/);
+
+ return rc;
+}
+
+EXTERN int __tgt_target_teams_nowait(int64_t device_id, void *host_ptr,
+ int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
+ int64_t *arg_types, int32_t team_num, int32_t thread_limit, int32_t depNum,
+ void *depList, int32_t noAliasDepNum, void *noAliasDepList) {
+ if (depNum + noAliasDepNum > 0)
+ __kmpc_omp_taskwait(NULL, 0);
+
+ return __tgt_target_teams(device_id, host_ptr, arg_num, args_base, args,
+ arg_sizes, arg_types, team_num, thread_limit);
+}
+
+
+// The trip count mechanism will be revised - this scheme is not thread-safe.
+EXTERN void __kmpc_push_target_tripcount(int64_t device_id,
+ uint64_t loop_tripcount) {
+ if (device_id == OFFLOAD_DEVICE_DEFAULT) {
+ device_id = omp_get_default_device();
+ }
+
+ if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
+ DP("Failed to get device %" PRId64 " ready\n", device_id);
+ return;
+ }
+
+ DP("__kmpc_push_target_tripcount(%" PRId64 ", %" PRIu64 ")\n", device_id,
+ loop_tripcount);
+ Devices[device_id].loopTripCnt = loop_tripcount;
+}
diff --git a/final/libomptarget/src/omptarget.cpp b/final/libomptarget/src/omptarget.cpp
new file mode 100644
index 0000000..3cc09b4
--- /dev/null
+++ b/final/libomptarget/src/omptarget.cpp
@@ -0,0 +1,714 @@
+//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the interface to be used by Clang during the codegen of a
+// target region.
+//
+//===----------------------------------------------------------------------===//
+
+#include <omptarget.h>
+
+#include "device.h"
+#include "private.h"
+#include "rtl.h"
+
+#include <cassert>
+#include <vector>
+
+#ifdef OMPTARGET_DEBUG
+int DebugLevel = 0;
+#endif // OMPTARGET_DEBUG
+
+/* All begin addresses for partially mapped structs must be 8-aligned in order
+ * to ensure proper alignment of members. E.g.
+ *
+ * struct S {
+ * int a; // 4-aligned
+ * int b; // 4-aligned
+ * int *p; // 8-aligned
+ * } s1;
+ * ...
+ * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
+ * {
+ * s1.b = 5;
+ * for (int i...) s1.p[i] = ...;
+ * }
+ *
+ * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
+ * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
+ * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
+ * requirements for its type. Now, when we allocate memory on the device, in
+ * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
+ * This means that the chunk of the struct on the device will start at a
+ * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
+ * address of p will be a misaligned 0x204 (on the host there was no need to add
+ * padding between b and p, so p comes exactly 4 bytes after b). If the device
+ * kernel tries to access s1.p, a misaligned address error occurs (as reported
+ * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
+ * extending the size of the allocated chunk accordingly, the chunk on the
+ * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
+ * &s1.p=0x208, as they should be to satisfy the alignment requirements.
+ */
+static const int64_t alignment = 8;
+
+/// Map global data and execute pending ctors
+static int InitLibrary(DeviceTy& Device) {
+ /*
+ * Map global data
+ */
+ int32_t device_id = Device.DeviceID;
+ int rc = OFFLOAD_SUCCESS;
+
+ Device.PendingGlobalsMtx.lock();
+ TrlTblMtx.lock();
+ for (HostEntriesBeginToTransTableTy::iterator
+ ii = HostEntriesBeginToTransTable.begin();
+ ii != HostEntriesBeginToTransTable.end(); ++ii) {
+ TranslationTable *TransTable = &ii->second;
+ if (TransTable->TargetsTable[device_id] != 0) {
+ // Library entries have already been processed
+ continue;
+ }
+
+ // 1) get image.
+ assert(TransTable->TargetsImages.size() > (size_t)device_id &&
+ "Not expecting a device ID outside the table's bounds!");
+ __tgt_device_image *img = TransTable->TargetsImages[device_id];
+ if (!img) {
+ DP("No image loaded for device id %d.\n", device_id);
+ rc = OFFLOAD_FAIL;
+ break;
+ }
+ // 2) load image into the target table.
+ __tgt_target_table *TargetTable =
+ TransTable->TargetsTable[device_id] = Device.load_binary(img);
+ // Unable to get table for this image: invalidate image and fail.
+ if (!TargetTable) {
+ DP("Unable to generate entries table for device id %d.\n", device_id);
+ TransTable->TargetsImages[device_id] = 0;
+ rc = OFFLOAD_FAIL;
+ break;
+ }
+
+ // Verify whether the two table sizes match.
+ size_t hsize =
+ TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
+ size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;
+
+ // Invalid image for these host entries!
+ if (hsize != tsize) {
+ DP("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
+ device_id, hsize, tsize);
+ TransTable->TargetsImages[device_id] = 0;
+ TransTable->TargetsTable[device_id] = 0;
+ rc = OFFLOAD_FAIL;
+ break;
+ }
+
+ // process global data that needs to be mapped.
+ Device.DataMapMtx.lock();
+ __tgt_target_table *HostTable = &TransTable->HostTable;
+ for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
+ *CurrHostEntry = HostTable->EntriesBegin,
+ *EntryDeviceEnd = TargetTable->EntriesEnd;
+ CurrDeviceEntry != EntryDeviceEnd;
+ CurrDeviceEntry++, CurrHostEntry++) {
+ if (CurrDeviceEntry->size != 0) {
+ // has data.
+ assert(CurrDeviceEntry->size == CurrHostEntry->size &&
+ "data size mismatch");
+
+ // Fortran may use multiple weak declarations for the same symbol,
+ // therefore we must allow for multiple weak symbols to be loaded from
+ // the fat binary. Treat these mappings as any other "regular" mapping.
+ // Add entry to map.
+ if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
+ continue;
+ DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
+ "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
+ CurrDeviceEntry->size);
+ Device.HostDataToTargetMap.push_front(HostDataToTargetTy(
+ (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
+ (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
+ (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
+ (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
+ INF_REF_CNT /*RefCount*/));
+ }
+ }
+ Device.DataMapMtx.unlock();
+ }
+ TrlTblMtx.unlock();
+
+ if (rc != OFFLOAD_SUCCESS) {
+ Device.PendingGlobalsMtx.unlock();
+ return rc;
+ }
+
+ /*
+ * Run ctors for static objects
+ */
+ if (!Device.PendingCtorsDtors.empty()) {
+ // Call all ctors for all libraries registered so far
+ for (auto &lib : Device.PendingCtorsDtors) {
+ if (!lib.second.PendingCtors.empty()) {
+ DP("Has pending ctors... call now\n");
+ for (auto &entry : lib.second.PendingCtors) {
+ void *ctor = entry;
+ int rc = target(device_id, ctor, 0, NULL, NULL, NULL,
+ NULL, 1, 1, true /*team*/);
+ if (rc != OFFLOAD_SUCCESS) {
+ DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
+ Device.PendingGlobalsMtx.unlock();
+ return OFFLOAD_FAIL;
+ }
+ }
+ // Clear the list to indicate that this device has been used
+ lib.second.PendingCtors.clear();
+ DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
+ }
+ }
+ }
+ Device.HasPendingGlobals = false;
+ Device.PendingGlobalsMtx.unlock();
+
+ return OFFLOAD_SUCCESS;
+}
+
+// Check whether a device has been initialized, global ctors have been
+// executed and global data has been mapped; do so if not already done.
+int CheckDeviceAndCtors(int64_t device_id) {
+ // Is device ready?
+ if (!device_is_ready(device_id)) {
+ DP("Device %" PRId64 " is not ready.\n", device_id);
+ return OFFLOAD_FAIL;
+ }
+
+ // Get device info.
+ DeviceTy &Device = Devices[device_id];
+
+ // Check whether global data has been mapped for this device
+ Device.PendingGlobalsMtx.lock();
+ bool hasPendingGlobals = Device.HasPendingGlobals;
+ Device.PendingGlobalsMtx.unlock();
+ if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
+ DP("Failed to init globals on device %" PRId64 "\n", device_id);
+ return OFFLOAD_FAIL;
+ }
+
+ return OFFLOAD_SUCCESS;
+}
+
+static int32_t member_of(int64_t type) {
+ return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
+}
+
+/// Internal function to do the mapping and transfer the data to the device
+int target_data_begin(DeviceTy &Device, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
+ // process each input.
+ int rc = OFFLOAD_SUCCESS;
+ for (int32_t i = 0; i < arg_num; ++i) {
+ // Ignore private variables and arrays - there is no mapping for them.
+ if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
+ (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
+ continue;
+
+ void *HstPtrBegin = args[i];
+ void *HstPtrBase = args_base[i];
+ int64_t data_size = arg_sizes[i];
+
+ // Adjust for proper alignment if this is a combined entry (for structs).
+ // Look at the next argument - if that is MEMBER_OF this one, then this one
+ // is a combined entry.
+ int64_t padding = 0;
+ const int next_i = i+1;
+ if (member_of(arg_types[i]) < 0 && next_i < arg_num &&
+ member_of(arg_types[next_i]) == i) {
+ padding = (int64_t)HstPtrBegin % alignment;
+ if (padding) {
+ DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
+ "\n", padding, DPxPTR(HstPtrBegin));
+ HstPtrBegin = (char *) HstPtrBegin - padding;
+ data_size += padding;
+ }
+ }
+
+ // Address of pointer on the host and device, respectively.
+ void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin;
+ bool IsNew, Pointer_IsNew;
+ bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
+ // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
+ // have reached this point via __tgt_target_data_begin and not __tgt_target
+ // then no argument is marked as TARGET_PARAM ("omp target data map" is not
+ // associated with a target region, so there are no target parameters). This
+ // may be considered a hack, we could revise the scheme in the future.
+ bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF);
+ if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
+ DP("Has a pointer entry: \n");
+ // base is address of pointer.
+ Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase,
+ sizeof(void *), Pointer_IsNew, IsImplicit, UpdateRef);
+ if (!Pointer_TgtPtrBegin) {
+ DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
+ "illegal mapping).\n");
+ }
+ DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
+ "\n", sizeof(void *), DPxPTR(Pointer_TgtPtrBegin),
+ (Pointer_IsNew ? "" : " not"));
+ Pointer_HstPtrBegin = HstPtrBase;
+ // modify current entry.
+ HstPtrBase = *(void **)HstPtrBase;
+ UpdateRef = true; // subsequently update ref count of pointee
+ }
+
+ void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase,
+ data_size, IsNew, IsImplicit, UpdateRef);
+ if (!TgtPtrBegin && data_size) {
+ // If data_size==0, then the argument could be a zero-length pointer to
+ // NULL, so getOrAlloc() returning NULL is not an error.
+ DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
+ "illegal mapping).\n");
+ }
+ DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
+ " - is%s new\n", data_size, DPxPTR(TgtPtrBegin),
+ (IsNew ? "" : " not"));
+
+ if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
+ uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
+ void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
+ DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
+ args_base[i] = TgtPtrBase;
+ }
+
+ if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
+ bool copy = false;
+ if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
+ copy = true;
+ } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) {
+ // Copy data only if the "parent" struct has RefCount==1.
+ int32_t parent_idx = member_of(arg_types[i]);
+ long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
+ assert(parent_rc > 0 && "parent struct not found");
+ if (parent_rc == 1) {
+ copy = true;
+ }
+ }
+
+ if (copy) {
+ DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
+ data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
+ int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size);
+ if (rt != OFFLOAD_SUCCESS) {
+ DP("Copying data to device failed.\n");
+ rc = OFFLOAD_FAIL;
+ }
+ }
+ }
+
+ if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
+ DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
+ DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
+ uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
+ void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
+ int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase,
+ sizeof(void *));
+ if (rt != OFFLOAD_SUCCESS) {
+ DP("Copying data to device failed.\n");
+ rc = OFFLOAD_FAIL;
+ }
+ // create shadow pointers for this entry
+ Device.ShadowMtx.lock();
+ Device.ShadowPtrMap[Pointer_HstPtrBegin] = {HstPtrBase,
+ Pointer_TgtPtrBegin, TgtPtrBase};
+ Device.ShadowMtx.unlock();
+ }
+ }
+
+ return rc;
+}
+
+/// Internal function to undo the mapping and retrieve the data from the device.
+int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
+ void **args, int64_t *arg_sizes, int64_t *arg_types) {
+ int rc = OFFLOAD_SUCCESS;
+ // process each input.
+ for (int32_t i = arg_num - 1; i >= 0; --i) {
+ // Ignore private variables and arrays - there is no mapping for them.
+ // Also, ignore the use_device_ptr directive, it has no effect here.
+ if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
+ (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
+ continue;
+
+ void *HstPtrBegin = args[i];
+ int64_t data_size = arg_sizes[i];
+ // Adjust for proper alignment if this is a combined entry (for structs).
+ // Look at the next argument - if that is MEMBER_OF this one, then this one
+ // is a combined entry.
+ int64_t padding = 0;
+ const int next_i = i+1;
+ if (member_of(arg_types[i]) < 0 && next_i < arg_num &&
+ member_of(arg_types[next_i]) == i) {
+ padding = (int64_t)HstPtrBegin % alignment;
+ if (padding) {
+ DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
+ "\n", padding, DPxPTR(HstPtrBegin));
+ HstPtrBegin = (char *) HstPtrBegin - padding;
+ data_size += padding;
+ }
+ }
+
+ bool IsLast;
+ bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
+ (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
+ bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE;
+
+ // If PTR_AND_OBJ, HstPtrBegin is address of pointee
+ void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast,
+ UpdateRef);
+ DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
+ " - is%s last\n", data_size, DPxPTR(TgtPtrBegin),
+ (IsLast ? "" : " not"));
+
+ bool DelEntry = IsLast || ForceDelete;
+
+ if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
+ !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
+ DelEntry = false; // protect parent struct from being deallocated
+ }
+
+ if ((arg_types[i] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
+ // Move data back to the host
+ if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
+ bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
+ bool CopyMember = false;
+ if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
+ !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
+ // Copy data only if the "parent" struct has RefCount==1.
+ int32_t parent_idx = member_of(arg_types[i]);
+ long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
+ assert(parent_rc > 0 && "parent struct not found");
+ if (parent_rc == 1) {
+ CopyMember = true;
+ }
+ }
+
+ if (DelEntry || Always || CopyMember) {
+ DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
+ data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
+ int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size);
+ if (rt != OFFLOAD_SUCCESS) {
+ DP("Copying data from device failed.\n");
+ rc = OFFLOAD_FAIL;
+ }
+ }
+ }
+
+ // If we copied back to the host a struct/array containing pointers, we
+ // need to restore the original host pointer values from their shadow
+ // copies. If the struct is going to be deallocated, remove any remaining
+ // shadow pointer entries for this struct.
+ uintptr_t lb = (uintptr_t) HstPtrBegin;
+ uintptr_t ub = (uintptr_t) HstPtrBegin + data_size;
+ Device.ShadowMtx.lock();
+ for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
+ it != Device.ShadowPtrMap.end(); ++it) {
+ void **ShadowHstPtrAddr = (void**) it->first;
+
+ // An STL map is sorted on its keys; use this property
+ // to quickly determine when to break out of the loop.
+ if ((uintptr_t) ShadowHstPtrAddr < lb)
+ continue;
+ if ((uintptr_t) ShadowHstPtrAddr >= ub)
+ break;
+
+ // If we copied the struct to the host, we need to restore the pointer.
+ if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
+ DP("Restoring original host pointer value " DPxMOD " for host "
+ "pointer " DPxMOD "\n", DPxPTR(it->second.HstPtrVal),
+ DPxPTR(ShadowHstPtrAddr));
+ *ShadowHstPtrAddr = it->second.HstPtrVal;
+ }
+ // If the struct is to be deallocated, remove the shadow entry.
+ if (DelEntry) {
+ DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
+ Device.ShadowPtrMap.erase(it);
+ }
+ }
+ Device.ShadowMtx.unlock();
+
+ // Deallocate map
+ if (DelEntry) {
+ int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete);
+ if (rt != OFFLOAD_SUCCESS) {
+ DP("Deallocating data from device failed.\n");
+ rc = OFFLOAD_FAIL;
+ }
+ }
+ }
+ }
+
+ return rc;
+}
+
+/// Internal function to pass data to/from the target.
+void target_data_update(DeviceTy &Device, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
+ // process each input.
+ for (int32_t i = 0; i < arg_num; ++i) {
+ if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
+ (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
+ continue;
+
+ void *HstPtrBegin = args[i];
+ int64_t MapSize = arg_sizes[i];
+ bool IsLast;
+ void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast,
+ false);
+
+ if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
+ DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
+ arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
+ Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize);
+
+ uintptr_t lb = (uintptr_t) HstPtrBegin;
+ uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
+ Device.ShadowMtx.lock();
+ for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
+ it != Device.ShadowPtrMap.end(); ++it) {
+ void **ShadowHstPtrAddr = (void**) it->first;
+ if ((uintptr_t) ShadowHstPtrAddr < lb)
+ continue;
+ if ((uintptr_t) ShadowHstPtrAddr >= ub)
+ break;
+ DP("Restoring original host pointer value " DPxMOD " for host pointer "
+ DPxMOD "\n", DPxPTR(it->second.HstPtrVal),
+ DPxPTR(ShadowHstPtrAddr));
+ *ShadowHstPtrAddr = it->second.HstPtrVal;
+ }
+ Device.ShadowMtx.unlock();
+ }
+
+ if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
+ DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
+ arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
+ Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize);
+
+ uintptr_t lb = (uintptr_t) HstPtrBegin;
+ uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
+ Device.ShadowMtx.lock();
+ for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
+ it != Device.ShadowPtrMap.end(); ++it) {
+ void **ShadowHstPtrAddr = (void**) it->first;
+ if ((uintptr_t) ShadowHstPtrAddr < lb)
+ continue;
+ if ((uintptr_t) ShadowHstPtrAddr >= ub)
+ break;
+ DP("Restoring original target pointer value " DPxMOD " for target "
+ "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal),
+ DPxPTR(it->second.TgtPtrAddr));
+ Device.data_submit(it->second.TgtPtrAddr,
+ &it->second.TgtPtrVal, sizeof(void *));
+ }
+ Device.ShadowMtx.unlock();
+ }
+ }
+}
+
+/// performs the same actions as data_begin in case arg_num is
+/// non-zero and initiates run of the offloaded region on the target platform;
+/// if arg_num is non-zero after the region execution is done it also
+/// performs the same action as data_update and data_end above. This function
+/// returns 0 if it was able to transfer the execution to a target and an
+/// integer different from zero otherwise.
+int target(int64_t device_id, void *host_ptr, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
+ int32_t team_num, int32_t thread_limit, int IsTeamConstruct) {
+ DeviceTy &Device = Devices[device_id];
+
+ // Find the table information in the map or look it up in the translation
+ // tables.
+ TableMap *TM = 0;
+ TblMapMtx.lock();
+ HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap.find(host_ptr);
+ if (TableMapIt == HostPtrToTableMap.end()) {
+ // We don't have a map. So search all the registered libraries.
+ TrlTblMtx.lock();
+ for (HostEntriesBeginToTransTableTy::iterator
+ ii = HostEntriesBeginToTransTable.begin(),
+ ie = HostEntriesBeginToTransTable.end();
+ !TM && ii != ie; ++ii) {
+ // get the translation table (which contains all the good info).
+ TranslationTable *TransTable = &ii->second;
+ // iterate over all the host table entries to see if we can locate the
+ // host_ptr.
+ __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin;
+ __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd;
+ __tgt_offload_entry *cur = begin;
+ for (uint32_t i = 0; cur < end; ++cur, ++i) {
+ if (cur->addr != host_ptr)
+ continue;
+ // we got a match, now fill the HostPtrToTableMap so that we
+ // may avoid this search next time.
+ TM = &HostPtrToTableMap[host_ptr];
+ TM->Table = TransTable;
+ TM->Index = i;
+ break;
+ }
+ }
+ TrlTblMtx.unlock();
+ } else {
+ TM = &TableMapIt->second;
+ }
+ TblMapMtx.unlock();
+
+ // No map for this host pointer found!
+ if (!TM) {
+ DP("Host ptr " DPxMOD " does not have a matching target pointer.\n",
+ DPxPTR(host_ptr));
+ return OFFLOAD_FAIL;
+ }
+
+ // get target table.
+ TrlTblMtx.lock();
+ assert(TM->Table->TargetsTable.size() > (size_t)device_id &&
+ "Not expecting a device ID outside the table's bounds!");
+ __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id];
+ TrlTblMtx.unlock();
+ assert(TargetTable && "Global data has not been mapped\n");
+
+ // Move data to device.
+ int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes,
+ arg_types);
+
+ if (rc != OFFLOAD_SUCCESS) {
+ DP("Call to target_data_begin failed, skipping target execution.\n");
+ // Call target_data_end to dealloc whatever target_data_begin allocated
+ // and return OFFLOAD_FAIL.
+ target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types);
+ return OFFLOAD_FAIL;
+ }
+
+ std::vector<void *> tgt_args;
+ std::vector<ptrdiff_t> tgt_offsets;
+
+ // List of (first-)private arrays allocated for this target region
+ std::vector<void *> fpArrays;
+
+ for (int32_t i = 0; i < arg_num; ++i) {
+ if (!(arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
+ // This is not a target parameter, do not push it into tgt_args.
+ continue;
+ }
+ void *HstPtrBegin = args[i];
+ void *HstPtrBase = args_base[i];
+ void *TgtPtrBegin;
+ ptrdiff_t TgtBaseOffset;
+ bool IsLast; // unused.
+ if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) {
+ DP("Forwarding first-private value " DPxMOD " to the target construct\n",
+ DPxPTR(HstPtrBase));
+ TgtPtrBegin = HstPtrBase;
+ TgtBaseOffset = 0;
+ } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) {
+ // Allocate memory for (first-)private array
+ TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID,
+ arg_sizes[i], HstPtrBegin);
+ if (!TgtPtrBegin) {
+ DP ("Data allocation for %sprivate array " DPxMOD " failed\n",
+ (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
+ DPxPTR(HstPtrBegin));
+ rc = OFFLOAD_FAIL;
+ break;
+ } else {
+ fpArrays.push_back(TgtPtrBegin);
+ TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
+#ifdef OMPTARGET_DEBUG
+ void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
+ DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for "
+ "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n",
+ arg_sizes[i], DPxPTR(TgtPtrBegin),
+ (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
+ DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase));
+#endif
+ // If first-private, copy data from host
+ if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
+ int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]);
+ if (rt != OFFLOAD_SUCCESS) {
+ DP ("Copying data to device failed.\n");
+ rc = OFFLOAD_FAIL;
+ break;
+ }
+ }
+ }
+ } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
+ TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast,
+ false);
+ TgtBaseOffset = 0; // no offset for ptrs.
+ DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to "
+ "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase),
+ DPxPTR(HstPtrBase));
+ } else {
+ TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast,
+ false);
+ TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
+#ifdef OMPTARGET_DEBUG
+ void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
+ DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
+ DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
+#endif
+ }
+ tgt_args.push_back(TgtPtrBegin);
+ tgt_offsets.push_back(TgtBaseOffset);
+ }
+
+ assert(tgt_args.size() == tgt_offsets.size() &&
+ "Size mismatch in arguments and offsets");
+
+ // Pop loop trip count
+ uint64_t ltc = Device.loopTripCnt;
+ Device.loopTripCnt = 0;
+
+ // Launch device execution.
+ if (rc == OFFLOAD_SUCCESS) {
+ DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
+ TargetTable->EntriesBegin[TM->Index].name,
+ DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index);
+ if (IsTeamConstruct) {
+ rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr,
+ &tgt_args[0], &tgt_offsets[0], tgt_args.size(), team_num,
+ thread_limit, ltc);
+ } else {
+ rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr,
+ &tgt_args[0], &tgt_offsets[0], tgt_args.size());
+ }
+ } else {
+ DP("Errors occurred while obtaining target arguments, skipping kernel "
+ "execution\n");
+ }
+
+ // Deallocate (first-)private arrays
+ for (auto it : fpArrays) {
+ int rt = Device.RTL->data_delete(Device.RTLDeviceID, it);
+ if (rt != OFFLOAD_SUCCESS) {
+ DP("Deallocation of (first-)private arrays failed.\n");
+ rc = OFFLOAD_FAIL;
+ }
+ }
+
+ // Move data from device.
+ int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes,
+ arg_types);
+
+ if (rt != OFFLOAD_SUCCESS) {
+ DP("Call to target_data_end failed.\n");
+ rc = OFFLOAD_FAIL;
+ }
+
+ return rc;
+}
diff --git a/final/libomptarget/src/private.h b/final/libomptarget/src/private.h
new file mode 100644
index 0000000..7bdadc0
--- /dev/null
+++ b/final/libomptarget/src/private.h
@@ -0,0 +1,59 @@
+//===---------- private.h - Target independent OpenMP target RTL ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Private function declarations and helper macros for debugging output.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_PRIVATE_H
+#define _OMPTARGET_PRIVATE_H
+
+#include <omptarget.h>
+
+#include <cstdint>
+
+extern int target_data_begin(DeviceTy &Device, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types);
+
+extern int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
+ void **args, int64_t *arg_sizes, int64_t *arg_types);
+
+extern void target_data_update(DeviceTy &Device, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types);
+
+extern int target(int64_t device_id, void *host_ptr, int32_t arg_num,
+ void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
+ int32_t team_num, int32_t thread_limit, int IsTeamConstruct);
+
+extern int CheckDeviceAndCtors(int64_t device_id);
+
+// Implemented in libomp, they are called from within __tgt_* functions.
+#ifdef __cplusplus
+extern "C" {
+#endif
+int omp_get_default_device(void) __attribute__((weak));
+int32_t __kmpc_omp_taskwait(void *loc_ref, int32_t gtid) __attribute__((weak));
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef OMPTARGET_DEBUG
+extern int DebugLevel;
+
+#define DP(...) \
+ do { \
+ if (DebugLevel > 0) { \
+ DEBUGP("Libomptarget", __VA_ARGS__); \
+ } \
+ } while (false)
+#else // OMPTARGET_DEBUG
+#define DP(...) {}
+#endif // OMPTARGET_DEBUG
+
+#endif
diff --git a/final/libomptarget/src/rtl.cpp b/final/libomptarget/src/rtl.cpp
new file mode 100644
index 0000000..3152528
--- /dev/null
+++ b/final/libomptarget/src/rtl.cpp
@@ -0,0 +1,368 @@
+//===----------- rtl.cpp - Target independent OpenMP target RTL -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Functionality for handling RTL plugins.
+//
+//===----------------------------------------------------------------------===//
+
+#include "device.h"
+#include "private.h"
+#include "rtl.h"
+
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <dlfcn.h>
+#include <mutex>
+#include <string>
+
+// List of all plugins that can support offloading.
+static const char *RTLNames[] = {
+    /* PowerPC target */ "libomptarget.rtl.ppc64.so",
+    /* x86_64 target  */ "libomptarget.rtl.x86_64.so",
+    /* CUDA target    */ "libomptarget.rtl.cuda.so",
+    /* AArch64 target */ "libomptarget.rtl.aarch64.so"};
+
+// Global plugin registry; accesses are serialized with RTLsMtx.
+RTLsTy RTLs;
+std::mutex RTLsMtx;
+
+// Map from a library's first host entry to its translation table;
+// guarded by TrlTblMtx.
+HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable;
+std::mutex TrlTblMtx;
+
+// Map from a host entry address to its table/index pair; guarded by TblMapMtx.
+HostPtrToTableMapTy HostPtrToTableMap;
+std::mutex TblMapMtx;
+
+// Detect and dlopen() all known plugins. Not thread-safe by itself; it is
+// executed exactly once via std::call_once in RTLsTy::RegisterLib.
+void RTLsTy::LoadRTLs() {
+#ifdef OMPTARGET_DEBUG
+  // LIBOMPTARGET_DEBUG selects the verbosity used by the DP() macro.
+  if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
+    DebugLevel = std::stoi(envStr);
+  }
+#endif // OMPTARGET_DEBUG
+
+  // Parse environment variable OMP_TARGET_OFFLOAD (if set).
+  // NOTE(review): only the exact value "DISABLED" is recognized here; any
+  // other value falls through to the default behavior.
+  char *envStr = getenv("OMP_TARGET_OFFLOAD");
+  if (envStr && !strcmp(envStr, "DISABLED")) {
+    DP("Target offloading disabled by environment\n");
+    return;
+  }
+
+  DP("Loading RTLs...\n");
+
+  // Attempt to open all the plugins and, if they exist, check if the interface
+  // is correct and if they are supporting any devices.
+  for (auto *Name : RTLNames) {
+    DP("Loading library '%s'...\n", Name);
+    void *dynlib_handle = dlopen(Name, RTLD_NOW);
+
+    if (!dynlib_handle) {
+      // Library does not exist or cannot be found.
+      DP("Unable to load library '%s': %s!\n", Name, dlerror());
+      continue;
+    }
+
+    DP("Successfully loaded library '%s'!\n", Name);
+
+    // Retrieve the RTL information from the runtime library.
+    RTLInfoTy R;
+
+    R.LibraryHandler = dynlib_handle;
+    R.isUsed = false;
+
+#ifdef OMPTARGET_DEBUG
+    R.RTLName = Name;
+#endif
+
+    // Resolve the mandatory plugin interface; a plugin missing any one of
+    // these symbols is rejected.
+    // NOTE(review): rejected handles are never dlclose()d, so an
+    // incompatible plugin stays mapped for the life of the process.
+    if (!(*((void**) &R.is_valid_binary) = dlsym(
+        dynlib_handle, "__tgt_rtl_is_valid_binary")))
+      continue;
+    if (!(*((void**) &R.number_of_devices) = dlsym(
+        dynlib_handle, "__tgt_rtl_number_of_devices")))
+      continue;
+    if (!(*((void**) &R.init_device) = dlsym(
+        dynlib_handle, "__tgt_rtl_init_device")))
+      continue;
+    if (!(*((void**) &R.load_binary) = dlsym(
+        dynlib_handle, "__tgt_rtl_load_binary")))
+      continue;
+    if (!(*((void**) &R.data_alloc) = dlsym(
+        dynlib_handle, "__tgt_rtl_data_alloc")))
+      continue;
+    if (!(*((void**) &R.data_submit) = dlsym(
+        dynlib_handle, "__tgt_rtl_data_submit")))
+      continue;
+    if (!(*((void**) &R.data_retrieve) = dlsym(
+        dynlib_handle, "__tgt_rtl_data_retrieve")))
+      continue;
+    if (!(*((void**) &R.data_delete) = dlsym(
+        dynlib_handle, "__tgt_rtl_data_delete")))
+      continue;
+    if (!(*((void**) &R.run_region) = dlsym(
+        dynlib_handle, "__tgt_rtl_run_target_region")))
+      continue;
+    if (!(*((void**) &R.run_team_region) = dlsym(
+        dynlib_handle, "__tgt_rtl_run_target_team_region")))
+      continue;
+
+    // No devices are supported by this RTL?
+    if (!(R.NumberOfDevices = R.number_of_devices())) {
+      DP("No devices supported in this RTL\n");
+      continue;
+    }
+
+    DP("Registering RTL %s supporting %d devices!\n",
+       R.RTLName.c_str(), R.NumberOfDevices);
+
+    // The RTL is valid! Will save the information in the RTLs list.
+    // (Uses RTLInfoTy's copy constructor, which gives the stored copy a
+    // freshly constructed mutex.)
+    AllRTLs.push_back(R);
+  }
+
+  DP("RTLs loaded!\n");
+
+  return;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Functionality for registering libs
+
+// Record `image` as the device image for each of RTL's devices in table TT,
+// growing TT's per-device vectors if the RTL's global device range does not
+// fit yet.
+static void RegisterImageIntoTranslationTable(TranslationTable &TT,
+    RTLInfoTy &RTL, __tgt_device_image *image) {
+
+  // same size, as when we increase one, we also increase the other.
+  assert(TT.TargetsTable.size() == TT.TargetsImages.size() &&
+         "We should have as many images as we have tables!");
+
+  // Resize the Targets Table and Images to accommodate the new targets if
+  // required. The minimum size is one past the last global device ID this
+  // RTL owns (RTL.Idx is the global ID of its first device).
+  unsigned TargetsTableMinimumSize = RTL.Idx + RTL.NumberOfDevices;
+
+  if (TT.TargetsTable.size() < TargetsTableMinimumSize) {
+    TT.TargetsImages.resize(TargetsTableMinimumSize, 0);
+    TT.TargetsTable.resize(TargetsTableMinimumSize, 0);
+  }
+
+  // Register the image in all devices for this target type.
+  for (int32_t i = 0; i < RTL.NumberOfDevices; ++i) {
+    // If we are changing the image we are also invalidating the target table.
+    if (TT.TargetsImages[RTL.Idx + i] != image) {
+      TT.TargetsImages[RTL.Idx + i] = image;
+      TT.TargetsTable[RTL.Idx + i] = 0; // lazy initialization of target table.
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Functionality for registering Ctors/Dtors
+
+// Queue the ctor/dtor entries of `img` on every device owned by `RTL` and
+// mark each device as having pending globals.
+static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc,
+    __tgt_device_image *img, RTLInfoTy *RTL) {
+
+  for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) {
+    DeviceTy &Device = Devices[RTL->Idx + i];
+    Device.PendingGlobalsMtx.lock();
+    Device.HasPendingGlobals = true;
+    for (__tgt_offload_entry *entry = img->EntriesBegin;
+         entry != img->EntriesEnd; ++entry) {
+      if (entry->flags & OMP_DECLARE_TARGET_CTOR) {
+        DP("Adding ctor " DPxMOD " to the pending list.\n",
+           DPxPTR(entry->addr));
+        Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr);
+      } else if (entry->flags & OMP_DECLARE_TARGET_DTOR) {
+        // Dtors are pushed in reverse order so they are executed from end
+        // to beginning when unregistering the library!
+        DP("Adding dtor " DPxMOD " to the pending list.\n",
+           DPxPTR(entry->addr));
+        Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr);
+      }
+
+      // "declare target link" entries are recognized but not handled yet.
+      if (entry->flags & OMP_DECLARE_TARGET_LINK) {
+        DP("The \"link\" attribute is not yet supported!\n");
+      }
+    }
+    Device.PendingGlobalsMtx.unlock();
+  }
+}
+
+// Register all device images of `desc` with the first RTL that accepts each
+// of them, initializing that RTL's devices on first use.
+void RTLsTy::RegisterLib(__tgt_bin_desc *desc) {
+  // Attempt to load all plugins available in the system. call_once makes
+  // LoadRTLs run exactly once even under concurrent registrations.
+  std::call_once(initFlag, &RTLsTy::LoadRTLs, this);
+
+  RTLsMtx.lock();
+  // Register the images with the RTLs that understand them, if any.
+  for (int32_t i = 0; i < desc->NumDeviceImages; ++i) {
+    // Obtain the image.
+    __tgt_device_image *img = &desc->DeviceImages[i];
+
+    RTLInfoTy *FoundRTL = NULL;
+
+    // Scan the RTLs that have associated images until we find one that
+    // supports the current image.
+    for (auto &R : RTLs.AllRTLs) {
+      if (!R.is_valid_binary(img)) {
+        // (RTLName only exists in debug builds; DP() compiles away otherwise.)
+        DP("Image " DPxMOD " is NOT compatible with RTL %s!\n",
+           DPxPTR(img->ImageStart), R.RTLName.c_str());
+        continue;
+      }
+
+      DP("Image " DPxMOD " is compatible with RTL %s!\n",
+         DPxPTR(img->ImageStart), R.RTLName.c_str());
+
+      // If this RTL is not already in use, initialize it.
+      if (!R.isUsed) {
+        // Initialize the device information for the RTL we are about to use.
+        DeviceTy device(&R);
+
+        size_t start = Devices.size();
+        // Every new device slot starts as a copy of `device`.
+        Devices.resize(start + R.NumberOfDevices, device);
+        for (int32_t device_id = 0; device_id < R.NumberOfDevices;
+             device_id++) {
+          // global device ID
+          Devices[start + device_id].DeviceID = start + device_id;
+          // RTL local device ID
+          Devices[start + device_id].RTLDeviceID = device_id;
+
+          // Save pointer to device in RTL in case we want to unregister the RTL
+          R.Devices.push_back(&Devices[start + device_id]);
+        }
+
+        // Initialize the index of this RTL and save it in the used RTLs.
+        // Idx is the global ID of this RTL's first device: the sum of the
+        // device counts of all previously used RTLs.
+        R.Idx = (RTLs.UsedRTLs.empty())
+                    ? 0
+                    : RTLs.UsedRTLs.back()->Idx +
+                          RTLs.UsedRTLs.back()->NumberOfDevices;
+        assert((size_t) R.Idx == start &&
+               "RTL index should equal the number of devices used so far.");
+        R.isUsed = true;
+        RTLs.UsedRTLs.push_back(&R);
+
+        DP("RTL " DPxMOD " has index %d!\n", DPxPTR(R.LibraryHandler), R.Idx);
+      }
+
+      // Initialize (if necessary) translation table for this library.
+      // operator[] default-constructs the entry on first access.
+      TrlTblMtx.lock();
+      if (!HostEntriesBeginToTransTable.count(desc->HostEntriesBegin)) {
+        TranslationTable &tt =
+            HostEntriesBeginToTransTable[desc->HostEntriesBegin];
+        tt.HostTable.EntriesBegin = desc->HostEntriesBegin;
+        tt.HostTable.EntriesEnd = desc->HostEntriesEnd;
+      }
+
+      // Retrieve translation table for this library.
+      TranslationTable &TransTable =
+          HostEntriesBeginToTransTable[desc->HostEntriesBegin];
+
+      DP("Registering image " DPxMOD " with RTL %s!\n",
+         DPxPTR(img->ImageStart), R.RTLName.c_str());
+      RegisterImageIntoTranslationTable(TransTable, R, img);
+      TrlTblMtx.unlock();
+      FoundRTL = &R;
+
+      // Load ctors/dtors for static objects
+      RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL);
+
+      // if an RTL was found we are done - proceed to register the next image
+      break;
+    }
+
+    if (!FoundRTL) {
+      DP("No RTL found for image " DPxMOD "!\n", DPxPTR(img->ImageStart));
+    }
+  }
+  RTLsMtx.unlock();
+
+  DP("Done registering entries!\n");
+}
+
+// Unregister the device images of `desc`: run pending dtors on used devices
+// and drop the library's bookkeeping entries.
+void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) {
+  DP("Unloading target library!\n");
+
+  RTLsMtx.lock();
+  // Find which RTL understands each image, if any.
+  for (int32_t i = 0; i < desc->NumDeviceImages; ++i) {
+    // Obtain the image.
+    __tgt_device_image *img = &desc->DeviceImages[i];
+
+    RTLInfoTy *FoundRTL = NULL;
+
+    // Scan the RTLs that have associated images until we find one that
+    // supports the current image. We only need to scan RTLs that are already
+    // being used.
+    for (auto *R : RTLs.UsedRTLs) {
+
+      assert(R->isUsed && "Expecting used RTLs.");
+
+      if (!R->is_valid_binary(img)) {
+        DP("Image " DPxMOD " is NOT compatible with RTL " DPxMOD "!\n",
+           DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));
+        continue;
+      }
+
+      DP("Image " DPxMOD " is compatible with RTL " DPxMOD "!\n",
+         DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));
+
+      FoundRTL = R;
+
+      // Execute dtors for static objects if the device has been used, i.e.
+      // if its PendingCtors list has been emptied.
+      // NOTE(review): this inner `i` shadows the outer image index `i`.
+      for (int32_t i = 0; i < FoundRTL->NumberOfDevices; ++i) {
+        DeviceTy &Device = Devices[FoundRTL->Idx + i];
+        Device.PendingGlobalsMtx.lock();
+        if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) {
+          // PendingDtors was built with push_front, so iterating front-to-back
+          // runs dtors in reverse registration order.
+          for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) {
+            int rc = target(Device.DeviceID, dtor, 0, NULL, NULL, NULL, NULL, 1,
+                            1, true /*team*/);
+            if (rc != OFFLOAD_SUCCESS) {
+              DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor));
+            }
+          }
+          // Remove this library's entry from PendingCtorsDtors
+          Device.PendingCtorsDtors.erase(desc);
+        }
+        Device.PendingGlobalsMtx.unlock();
+      }
+
+      DP("Unregistered image " DPxMOD " from RTL " DPxMOD "!\n",
+         DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));
+
+      break;
+    }
+
+    // if no RTL was found proceed to unregister the next image
+    if (!FoundRTL) {
+      DP("No RTLs in use support the image " DPxMOD "!\n",
+         DPxPTR(img->ImageStart));
+    }
+  }
+  RTLsMtx.unlock();
+  DP("Done unregistering images!\n");
+
+  // Remove entries from HostPtrToTableMap
+  TblMapMtx.lock();
+  for (__tgt_offload_entry *cur = desc->HostEntriesBegin;
+       cur < desc->HostEntriesEnd; ++cur) {
+    HostPtrToTableMap.erase(cur->addr);
+  }
+
+  // Remove translation table for this descriptor.
+  // NOTE(review): HostEntriesBeginToTransTable is guarded by TrlTblMtx
+  // elsewhere, but only TblMapMtx is held here -- confirm locking intent.
+  auto tt = HostEntriesBeginToTransTable.find(desc->HostEntriesBegin);
+  if (tt != HostEntriesBeginToTransTable.end()) {
+    DP("Removing translation table for descriptor " DPxMOD "\n",
+       DPxPTR(desc->HostEntriesBegin));
+    HostEntriesBeginToTransTable.erase(tt);
+  } else {
+    DP("Translation table for descriptor " DPxMOD " cannot be found, probably "
+       "it has been already removed.\n", DPxPTR(desc->HostEntriesBegin));
+  }
+
+  TblMapMtx.unlock();
+
+  // TODO: Remove RTL and the devices it manages if it's not used anymore?
+  // TODO: Write some RTL->unload_image(...) function?
+
+  DP("Done unregistering library!\n");
+}
diff --git a/final/libomptarget/src/rtl.h b/final/libomptarget/src/rtl.h
new file mode 100644
index 0000000..dc3cd6d
--- /dev/null
+++ b/final/libomptarget/src/rtl.h
@@ -0,0 +1,166 @@
+//===------------ rtl.h - Target independent OpenMP target RTL ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declarations for handling RTL plugins.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_RTL_H
+#define _OMPTARGET_RTL_H
+
+#include <list>
+#include <map>
+#include <mutex>
+#include <string>
+#include <vector>
+
+// Forward declarations.
+struct DeviceTy;
+struct __tgt_bin_desc;
+
+// Descriptor of one offloading plugin (RTL) and the function pointers
+// resolved from it.
+struct RTLInfoTy {
+  // Signatures of the __tgt_rtl_* entry points that every plugin must
+  // export; they are resolved with dlsym() in RTLsTy::LoadRTLs (rtl.cpp).
+  typedef int32_t(is_valid_binary_ty)(void *);
+  typedef int32_t(number_of_devices_ty)();
+  typedef int32_t(init_device_ty)(int32_t);
+  typedef __tgt_target_table *(load_binary_ty)(int32_t, void *);
+  typedef void *(data_alloc_ty)(int32_t, int64_t, void *);
+  typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t);
+  typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t);
+  typedef int32_t(data_delete_ty)(int32_t, void *);
+  typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *,
+                                 int32_t);
+  typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *,
+                                      int32_t, int32_t, int32_t, uint64_t);
+
+  int32_t Idx;             // RTL index, index is the number of devices
+                           // of other RTLs that were registered before,
+                           // i.e. the OpenMP index of the first device
+                           // to be registered with this RTL.
+  int32_t NumberOfDevices; // Number of devices this RTL deals with.
+  std::vector<DeviceTy *> Devices; // one per device (NumberOfDevices in total).
+
+  // Handle returned by dlopen() for the plugin shared object.
+  void *LibraryHandler;
+
+#ifdef OMPTARGET_DEBUG
+  // Plugin file name, kept only in debug builds for diagnostics.
+  std::string RTLName;
+#endif
+
+  // Functions implemented in the RTL.
+  is_valid_binary_ty *is_valid_binary;
+  number_of_devices_ty *number_of_devices;
+  init_device_ty *init_device;
+  load_binary_ty *load_binary;
+  data_alloc_ty *data_alloc;
+  data_submit_ty *data_submit;
+  data_retrieve_ty *data_retrieve;
+  data_delete_ty *data_delete;
+  run_region_ty *run_region;
+  run_team_region_ty *run_team_region;
+
+  // Are there images associated with this RTL.
+  bool isUsed;
+
+  // Mutex for thread-safety when calling RTL interface functions.
+  // It is easier to enforce thread-safety at the libomptarget level,
+  // so that developers of new RTLs do not have to worry about it.
+  std::mutex Mtx;
+
+  // The existence of the mutex above makes RTLInfoTy non-copyable.
+  // We need to provide a copy constructor explicitly.
+  RTLInfoTy()
+      : Idx(-1), NumberOfDevices(-1), Devices(), LibraryHandler(0),
+#ifdef OMPTARGET_DEBUG
+        RTLName(),
+#endif
+        is_valid_binary(0), number_of_devices(0), init_device(0),
+        load_binary(0), data_alloc(0), data_submit(0), data_retrieve(0),
+        data_delete(0), run_region(0), run_team_region(0), isUsed(false),
+        Mtx() {}
+
+  // Copies every member except Mtx, which is freshly default-constructed
+  // (and therefore unlocked) in the copy.
+  RTLInfoTy(const RTLInfoTy &r) : Mtx() {
+    Idx = r.Idx;
+    NumberOfDevices = r.NumberOfDevices;
+    Devices = r.Devices;
+    LibraryHandler = r.LibraryHandler;
+#ifdef OMPTARGET_DEBUG
+    RTLName = r.RTLName;
+#endif
+    is_valid_binary = r.is_valid_binary;
+    number_of_devices = r.number_of_devices;
+    init_device = r.init_device;
+    load_binary = r.load_binary;
+    data_alloc = r.data_alloc;
+    data_submit = r.data_submit;
+    data_retrieve = r.data_retrieve;
+    data_delete = r.data_delete;
+    run_region = r.run_region;
+    run_team_region = r.run_team_region;
+    isUsed = r.isUsed;
+  }
+};
+
+/// RTLs identified in the system.
+class RTLsTy {
+private:
+  // Mutex-like object to guarantee thread-safety and unique initialization
+  // (i.e. the library attempts to load the RTLs (plugins) only once).
+  std::once_flag initFlag;
+  void LoadRTLs(); // not thread-safe; serialized via initFlag in RegisterLib
+
+public:
+  // List of the detected runtime libraries.
+  // (std::list elements never move, so the raw pointers stored in UsedRTLs
+  // remain valid as AllRTLs grows.)
+  std::list<RTLInfoTy> AllRTLs;
+
+  // Array of pointers to the detected runtime libraries that have compatible
+  // binaries.
+  std::vector<RTLInfoTy *> UsedRTLs;
+
+  explicit RTLsTy() {}
+
+  // Register a shared library with all (compatible) RTLs.
+  void RegisterLib(__tgt_bin_desc *desc);
+
+  // Unregister a shared library from all RTLs.
+  void UnregisterLib(__tgt_bin_desc *desc);
+};
+// Global plugin registry (defined in rtl.cpp) and the mutex guarding it.
+extern RTLsTy RTLs;
+extern std::mutex RTLsMtx;
+
+
+/// Map between the host entry begin and the translation table. Each
+/// registered library gets one TranslationTable. Use the map from
+/// __tgt_offload_entry so that we may quickly determine whether we
+/// are trying to (re)register an existing lib or really have a new one.
+struct TranslationTable {
+  __tgt_target_table HostTable;
+
+  // Image assigned to a given device.
+  std::vector<__tgt_device_image *> TargetsImages; // One image per device ID.
+
+  // Table of entry points or NULL if it was not already computed.
+  std::vector<__tgt_target_table *> TargetsTable; // One table per device ID.
+};
+typedef std::map<__tgt_offload_entry *, TranslationTable>
+    HostEntriesBeginToTransTableTy;
+// Global instance (defined in rtl.cpp); guarded by TrlTblMtx.
+extern HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable;
+extern std::mutex TrlTblMtx;
+
+/// Map between the host ptr and a table index
+struct TableMap {
+  TranslationTable *Table; // table associated with the host ptr.
+  uint32_t Index; // index in which the host ptr translated entry is found.
+  TableMap() : Table(0), Index(0) {}
+  TableMap(TranslationTable *table, uint32_t index)
+      : Table(table), Index(index) {}
+};
+typedef std::map<void *, TableMap> HostPtrToTableMapTy;
+// Global instance (defined in rtl.cpp); guarded by TblMapMtx.
+extern HostPtrToTableMapTy HostPtrToTableMap;
+extern std::mutex TblMapMtx;
+
+#endif
diff --git a/final/libomptarget/test/CMakeLists.txt b/final/libomptarget/test/CMakeLists.txt
new file mode 100644
index 0000000..4e9c7ab
--- /dev/null
+++ b/final/libomptarget/test/CMakeLists.txt
@@ -0,0 +1,28 @@
+# CMakeLists.txt file for unit testing OpenMP offloading runtime library.
+# Offloading tests require a recent Clang as the test compiler.
+if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang" OR
+   OPENMP_TEST_COMPILER_VERSION VERSION_LESS 6.0.0)
+  libomptarget_say("Can only test with Clang compiler in version 6.0.0 or later.")
+  libomptarget_warning_say("The check-libomptarget target will not be available!")
+  return()
+endif()
+
+# LIBOMPTARGET_DEBUG is substituted verbatim into lit.site.cfg (see below),
+# so it must be a Python boolean literal: True/False.
+if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug)
+  set(LIBOMPTARGET_DEBUG True)
+else()
+  set(LIBOMPTARGET_DEBUG False)
+endif()
+
+add_openmp_testsuite(check-libomptarget "Running libomptarget tests" ${CMAKE_CURRENT_BINARY_DIR} DEPENDS omptarget omp)
+
+# In a standalone build the OpenMP host runtime lives in the sibling
+# runtime/ build tree; in an in-LLVM build only the header path is set and
+# LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER stays undefined (lit.cfg copes).
+if(${OPENMP_STANDALONE_BUILD})
+  set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../../runtime/src" CACHE STRING
+    "Path to folder containing omp.h")
+  set(LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../../runtime/src" CACHE STRING
+    "Path to folder containing libomp.so")
+else()
+  set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${LIBOMPTARGET_BINARY_DIR}/../runtime/src")
+endif()
+
+# Configure the lit.site.cfg.in file
+set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget configuration.\n# Do not edit!")
+configure_file(lit.site.cfg.in lit.site.cfg @ONLY)
diff --git a/final/libomptarget/test/env/omp_target_debug.c b/final/libomptarget/test/env/omp_target_debug.c
new file mode 100644
index 0000000..ce84c98
--- /dev/null
+++ b/final/libomptarget/test/env/omp_target_debug.c
@@ -0,0 +1,20 @@
+// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG
+// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=NDEBUG
+// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG
+// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=NDEBUG
+// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG
+// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=NDEBUG
+// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG
+// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=NDEBUG
+// REQUIRES: libomptarget-debug
+
+int main(void) {
+  // Empty target region: in debug builds libomptarget traces its own
+  // bookkeeping, which is what the DEBUG/NDEBUG FileCheck prefixes above
+  // look for on stderr.
+#pragma omp target
+  {}
+  return 0;
+}
+
+// DEBUG: Libomptarget
+// NDEBUG-NOT: Libomptarget
+// NDEBUG-NOT: Target
+
diff --git a/final/libomptarget/test/lit.cfg b/final/libomptarget/test/lit.cfg
new file mode 100644
index 0000000..cc085d6
--- /dev/null
+++ b/final/libomptarget/test/lit.cfg
@@ -0,0 +1,137 @@
+# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79:
+# Configuration file for the 'lit' test runner.
+
+import os
+import lit.formats
+
+# Tell pylint that we know config and lit_config exist somewhere.
+if 'PYLINT_IMPORT' in os.environ:
+ config = object()
+ lit_config = object()
+
+def append_dynamic_library_path(name, value, sep):
+    """Prepend `value` to the list-valued environment variable `name`
+    (creating it if absent), using `sep` as the list separator."""
+    if name in config.environment:
+        config.environment[name] = value + sep + config.environment[name]
+    else:
+        config.environment[name] = value
+
+# name: The name of this test suite.
+config.name = 'libomptarget'
+
+# suffixes: A list of file extensions to treat as test files.
+config.suffixes = ['.c', '.cpp', '.cc']
+
+# test_source_root: The root path where tests are located.
+config.test_source_root = os.path.dirname(__file__)
+
+# test_exec_root: The root object directory where output is placed
+config.test_exec_root = config.libomptarget_obj_root
+
+# test format
+config.test_format = lit.formats.ShTest()
+
+# compiler flags
+config.test_flags = " -I " + config.test_source_root + \
+    " -I " + config.omp_header_directory + \
+    " -L " + config.library_dir;
+
+# In-LLVM builds may leave omp_host_rtl_directory empty (see lit.site.cfg.in
+# and test/CMakeLists.txt), so only add the -L when it is set.
+if config.omp_host_rtl_directory:
+    config.test_flags = config.test_flags + " -L " + \
+        config.omp_host_rtl_directory
+
+config.test_flags = config.test_flags + " " + config.test_extra_flags
+
+# Gates tests marked "REQUIRES: libomptarget-debug".
+if config.libomptarget_debug:
+    config.available_features.add('libomptarget-debug')
+
+# Setup environment to find dynamic library at runtime
+if config.operating_system == 'Windows':
+ append_dynamic_library_path('PATH', config.library_dir, ";")
+ append_dynamic_library_path('PATH', config.omp_host_rtl_directory, ";")
+elif config.operating_system == 'Darwin':
+ append_dynamic_library_path('DYLD_LIBRARY_PATH', config.library_dir, ":")
+ append_dynamic_library_path('DYLD_LIBRARY_PATH', \
+ config.omp_host_rtl_directory, ";")
+ config.test_flags += " -Wl,-rpath," + config.library_dir
+ config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory
+else: # Unices
+ append_dynamic_library_path('LD_LIBRARY_PATH', config.library_dir, ":")
+ append_dynamic_library_path('LD_LIBRARY_PATH', \
+ config.omp_host_rtl_directory, ":")
+
+# substitutions
+# - for targets that exist in the system create the actual command.
+# - for valid targets that do not exist in the system, return false, so that the
+#   same test can be used for different targets.
+# Within each branch, longer substitution names are registered before their
+# shorter prefixes, because lit applies substitutions in registration order.
+
+# Scan all the valid targets.
+for libomptarget_target in config.libomptarget_all_targets:
+    # Is this target in the current system? If so create a compile, run and test
+    # command. Otherwise create command that return false.
+    if libomptarget_target in config.libomptarget_system_targets:
+        config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \
+            libomptarget_target, \
+            "%libomptarget-compilexx-and-run-" + libomptarget_target + \
+            " | " + config.libomptarget_filecheck + " %s"))
+        config.substitutions.append(("%libomptarget-compile-run-and-check-" + \
+            libomptarget_target, \
+            "%libomptarget-compile-and-run-" + libomptarget_target + \
+            " | " + config.libomptarget_filecheck + " %s"))
+        config.substitutions.append(("%libomptarget-compilexx-and-run-" + \
+            libomptarget_target, \
+            "%libomptarget-compilexx-" + libomptarget_target + " && " + \
+            "%libomptarget-run-" + libomptarget_target))
+        config.substitutions.append(("%libomptarget-compile-and-run-" + \
+            libomptarget_target, \
+            "%libomptarget-compile-" + libomptarget_target + " && " + \
+            "%libomptarget-run-" + libomptarget_target))
+        config.substitutions.append(("%libomptarget-compilexx-" + \
+            libomptarget_target, \
+            "%clangxx-" + libomptarget_target + " %s -o %t-" + \
+            libomptarget_target))
+        config.substitutions.append(("%libomptarget-compile-" + \
+            libomptarget_target, \
+            "%clang-" + libomptarget_target + " %s -o %t-" + \
+            libomptarget_target))
+        config.substitutions.append(("%libomptarget-run-" + \
+            libomptarget_target, \
+            "%t-" + libomptarget_target))
+        config.substitutions.append(("%clangxx-" + libomptarget_target, \
+            "%clangxx %openmp_flags %flags -fopenmp-targets=" + libomptarget_target))
+        config.substitutions.append(("%clang-" + libomptarget_target, \
+            "%clang %openmp_flags %flags -fopenmp-targets=" + libomptarget_target))
+        config.substitutions.append(("%fcheck-" + libomptarget_target, \
+            config.libomptarget_filecheck + " %s"))
+    else:
+        # Unsupported target: every substitution degrades to a harmless echo.
+        config.substitutions.append(("%libomptarget-compile-run-and-check-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%libomptarget-compile-and-run-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%libomptarget-compilexx-and-run-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%libomptarget-compilexx-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%libomptarget-compile-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%libomptarget-run-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%clang-" + libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%clangxx-" + libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%fcheck-" + libomptarget_target, \
+            "echo ignored-command"))
+
+# %clangxx must be registered before %clang: "%clang" is a prefix of
+# "%clangxx" and lit substitutes in order.
+config.substitutions.append(("%clangxx", config.test_cxx_compiler))
+config.substitutions.append(("%clang", config.test_c_compiler))
+config.substitutions.append(("%openmp_flags", config.test_openmp_flags))
+config.substitutions.append(("%flags", config.test_flags))
diff --git a/final/libomptarget/test/lit.site.cfg.in b/final/libomptarget/test/lit.site.cfg.in
new file mode 100644
index 0000000..6b27c4b
--- /dev/null
+++ b/final/libomptarget/test/lit.site.cfg.in
@@ -0,0 +1,18 @@
+@AUTO_GEN_COMMENT@
+
+# Values below are substituted by CMake (configure_file @ONLY); see
+# test/CMakeLists.txt for where each variable is computed.
+config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@"
+config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@"
+config.test_openmp_flags = "@OPENMP_TEST_OPENMP_FLAGS@"
+config.test_extra_flags = "@OPENMP_TEST_FLAGS@"
+config.libomptarget_obj_root = "@CMAKE_CURRENT_BINARY_DIR@"
+config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@"
+config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@"
+config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@"
+config.operating_system = "@CMAKE_SYSTEM_NAME@"
+# Space-separated CMake lists become Python lists here.
+config.libomptarget_all_targets = "@LIBOMPTARGET_ALL_TARGETS@".split()
+config.libomptarget_system_targets = "@LIBOMPTARGET_SYSTEM_TARGETS@".split()
+config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@"
+# Substituted unquoted: must expand to a Python boolean literal (True/False).
+config.libomptarget_debug = @LIBOMPTARGET_DEBUG@
+
+# Let the main config do the real work.
+lit_config.load_config(config, "@LIBOMPTARGET_BASE_DIR@/test/lit.cfg")
diff --git a/final/libomptarget/test/offloading/offloading_success.c b/final/libomptarget/test/offloading/offloading_success.c
new file mode 100644
index 0000000..12e78fa
--- /dev/null
+++ b/final/libomptarget/test/offloading/offloading_success.c
@@ -0,0 +1,23 @@
+// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
+
+#include <stdio.h>
+#include <omp.h>
+
+int main(void) {
+  // Sentinel stays negative if the target region never executes, which the
+  // check below reports as a runtime error.
+  int isHost = -1;
+
+#pragma omp target map(from: isHost)
+  { isHost = omp_is_initial_device(); }
+
+  if (isHost < 0) {
+    printf("Runtime error, isHost=%d\n", isHost);
+  }
+
+  // CHECK: Target region executed on the device
+  printf("Target region executed on the %s\n", isHost ? "host" : "device");
+
+  // Exit status is 0 only when the region actually ran on the device
+  // (omp_is_initial_device() returned 0 there).
+  return isHost;
+}
diff --git a/final/libomptarget/test/offloading/offloading_success.cpp b/final/libomptarget/test/offloading/offloading_success.cpp
new file mode 100644
index 0000000..eecd97a
--- /dev/null
+++ b/final/libomptarget/test/offloading/offloading_success.cpp
@@ -0,0 +1,23 @@
+// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
+
+#include <stdio.h>
+#include <omp.h>
+
+int main(void) {
+  // Sentinel must start negative so the `isHost < 0` check below can detect
+  // a target region that never executed; initializing to 0 made that check
+  // unreachable and mis-reported a skipped region as "device". This mirrors
+  // the sibling C test (offloading_success.c).
+  int isHost = -1;
+
+#pragma omp target map(from: isHost)
+  { isHost = omp_is_initial_device(); }
+
+  if (isHost < 0) {
+    printf("Runtime error, isHost=%d\n", isHost);
+  }
+
+  // CHECK: Target region executed on the device
+  printf("Target region executed on the %s\n", isHost ? "host" : "device");
+
+  // Exit status is 0 only when the region ran on the device.
+  return isHost;
+}
diff --git a/final/runtime/.clang-format b/final/runtime/.clang-format
new file mode 100644
index 0000000..590e1e2
--- /dev/null
+++ b/final/runtime/.clang-format
@@ -0,0 +1,5 @@
+---
+BasedOnStyle: LLVM
+AlignTrailingComments: false
+SortIncludes: false
+...
diff --git a/final/runtime/CMakeLists.txt b/final/runtime/CMakeLists.txt
new file mode 100644
index 0000000..447b3cd
--- /dev/null
+++ b/final/runtime/CMakeLists.txt
@@ -0,0 +1,397 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
+ message(FATAL_ERROR "Direct configuration not supported, please use parent directory!")
+endif()
+
+# Add cmake directory to search for custom cmake functions
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
+
+# Set libomp version
+set(LIBOMP_VERSION_MAJOR 5)
+set(LIBOMP_VERSION_MINOR 0)
+
+# These include files are in the cmake/ subdirectory
+include(LibompUtils)
+include(LibompGetArchitecture)
+include(LibompHandleFlags)
+include(LibompDefinitions)
+
+# Determine the target architecture
+if(${OPENMP_STANDALONE_BUILD})
+ # If adding a new architecture, take a look at cmake/LibompGetArchitecture.cmake
+ libomp_get_architecture(LIBOMP_DETECTED_ARCH)
+ set(LIBOMP_ARCH ${LIBOMP_DETECTED_ARCH} CACHE STRING
+ "The architecture to build for (x86_64/i386/arm/ppc64/ppc64le/aarch64/mic/mips/mips64).")
+ # Should assertions be enabled? They are on by default.
+ set(LIBOMP_ENABLE_ASSERTIONS TRUE CACHE BOOL
+ "enable assertions?")
+else() # Part of LLVM build
+ # Determine the native architecture from LLVM.
+ string(TOLOWER "${LLVM_TARGET_ARCH}" LIBOMP_NATIVE_ARCH)
+ if( LIBOMP_NATIVE_ARCH STREQUAL "host" )
+ string(REGEX MATCH "^[^-]*" LIBOMP_NATIVE_ARCH ${LLVM_HOST_TRIPLE})
+ endif ()
+ if(LIBOMP_NATIVE_ARCH MATCHES "i[2-6]86")
+ set(LIBOMP_ARCH i386)
+ elseif(LIBOMP_NATIVE_ARCH STREQUAL "x86")
+ set(LIBOMP_ARCH i386)
+ elseif(LIBOMP_NATIVE_ARCH STREQUAL "amd64")
+ set(LIBOMP_ARCH x86_64)
+ elseif(LIBOMP_NATIVE_ARCH STREQUAL "x86_64")
+ set(LIBOMP_ARCH x86_64)
+ elseif(LIBOMP_NATIVE_ARCH MATCHES "powerpc64le")
+ set(LIBOMP_ARCH ppc64le)
+ elseif(LIBOMP_NATIVE_ARCH MATCHES "powerpc")
+ set(LIBOMP_ARCH ppc64)
+ elseif(LIBOMP_NATIVE_ARCH MATCHES "aarch64")
+ set(LIBOMP_ARCH aarch64)
+ elseif(LIBOMP_NATIVE_ARCH MATCHES "arm64")
+ set(LIBOMP_ARCH aarch64)
+ elseif(LIBOMP_NATIVE_ARCH MATCHES "arm")
+ set(LIBOMP_ARCH arm)
+ else()
+ # last ditch effort
+ libomp_get_architecture(LIBOMP_ARCH)
+ endif ()
+ set(LIBOMP_ENABLE_ASSERTIONS ${LLVM_ENABLE_ASSERTIONS})
+endif()
+libomp_check_variable(LIBOMP_ARCH 32e x86_64 32 i386 arm ppc64 ppc64le aarch64 mic mips mips64)
+
+set(LIBOMP_LIB_TYPE normal CACHE STRING
+ "Performance,Profiling,Stubs library (normal/profile/stubs)")
+libomp_check_variable(LIBOMP_LIB_TYPE normal profile stubs)
+set(LIBOMP_OMP_VERSION 50 CACHE STRING
+ "The OpenMP version (50/45/40/30)")
+libomp_check_variable(LIBOMP_OMP_VERSION 50 45 40 30)
+# Set the OpenMP year and month associated with the version
+if(${LIBOMP_OMP_VERSION} GREATER 50 OR ${LIBOMP_OMP_VERSION} EQUAL 50)
+ set(LIBOMP_OMP_YEAR_MONTH 201611)
+elseif(${LIBOMP_OMP_VERSION} GREATER 45 OR ${LIBOMP_OMP_VERSION} EQUAL 45)
+ set(LIBOMP_OMP_YEAR_MONTH 201511)
+elseif(${LIBOMP_OMP_VERSION} GREATER 40 OR ${LIBOMP_OMP_VERSION} EQUAL 40)
+ set(LIBOMP_OMP_YEAR_MONTH 201307)
+elseif(${LIBOMP_OMP_VERSION} GREATER 30 OR ${LIBOMP_OMP_VERSION} EQUAL 30)
+ set(LIBOMP_OMP_YEAR_MONTH 201107)
+else()
+ set(LIBOMP_OMP_YEAR_MONTH 200505)
+endif()
+set(LIBOMP_MIC_ARCH knc CACHE STRING
+ "Intel(R) Many Integrated Core Architecture (Intel(R) MIC Architecture) (knf/knc). Ignored if not Intel(R) MIC Architecture build.")
+if("${LIBOMP_ARCH}" STREQUAL "mic")
+ libomp_check_variable(LIBOMP_MIC_ARCH knf knc)
+endif()
+set(LIBOMP_FORTRAN_MODULES FALSE CACHE BOOL
+ "Create Fortran module files? (requires fortran compiler)")
+
+# - Support for universal fat binary builds on Mac
+# - Having this extra variable allows people to build this library as a universal library
+# without forcing a universal build of the llvm/clang compiler.
+set(LIBOMP_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}" CACHE STRING
+ "For Mac builds, semicolon separated list of architectures to build for universal fat binary.")
+set(CMAKE_OSX_ARCHITECTURES ${LIBOMP_OSX_ARCHITECTURES})
+
+# Should @rpath be used for dynamic libraries on Mac?
+# The if(NOT DEFINED) is there to guard a cached value of the variable if one
+# exists so there is no interference with what the user wants. Also, no cache entry
+# is created so there are no inadvertent effects on other parts of LLVM.
+if(NOT DEFINED CMAKE_MACOSX_RPATH)
+ set(CMAKE_MACOSX_RPATH TRUE)
+endif()
+
+# User specified flags. These are appended to the configured flags.
+set(LIBOMP_CFLAGS "" CACHE STRING
+ "Appended user specified C compiler flags.")
+set(LIBOMP_CXXFLAGS "" CACHE STRING
+ "Appended user specified C++ compiler flags.")
+set(LIBOMP_CPPFLAGS "" CACHE STRING
+ "Appended user specified C preprocessor flags.")
+set(LIBOMP_ASMFLAGS "" CACHE STRING
+ "Appended user specified assembler flags.")
+set(LIBOMP_LDFLAGS "" CACHE STRING
+ "Appended user specified linker flags.")
+set(LIBOMP_LIBFLAGS "" CACHE STRING
+ "Appended user specified linked libs flags. (e.g., -lm)")
+set(LIBOMP_FFLAGS "" CACHE STRING
+ "Appended user specified Fortran compiler flags. These are only used if LIBOMP_FORTRAN_MODULES==TRUE.")
+
+# Should the libomp library and generated headers be copied into the original source exports/ directory
+# Setting this to FALSE helps parallel builds avoid interfering with each other.
+# Currently, the testsuite module expects the just built OpenMP library to be located inside the exports/
+# directory. TODO: have testsuite run under llvm-lit directly. We can then get rid of copying to exports/
+set(LIBOMP_COPY_EXPORTS TRUE CACHE STRING
+ "Should exports be copied into source exports/ directory?")
+
+# HWLOC-support
+set(LIBOMP_USE_HWLOC FALSE CACHE BOOL
+ "Use Hwloc (http://www.open-mpi.org/projects/hwloc/) library for affinity?")
+set(LIBOMP_HWLOC_INSTALL_DIR /usr/local CACHE PATH
+ "Install path for hwloc library")
+
+# Get the build number from kmp_version.cpp
+libomp_get_build_number("${CMAKE_CURRENT_SOURCE_DIR}" LIBOMP_VERSION_BUILD)
+math(EXPR LIBOMP_VERSION_BUILD_YEAR "${LIBOMP_VERSION_BUILD}/10000")
+math(EXPR LIBOMP_VERSION_BUILD_MONTH_DAY "${LIBOMP_VERSION_BUILD}%10000")
+
+# Currently don't record any timestamps
+set(LIBOMP_BUILD_DATE "No_Timestamp")
+
+# Architecture
+set(IA32 FALSE)
+set(INTEL64 FALSE)
+set(ARM FALSE)
+set(AARCH64 FALSE)
+set(PPC64BE FALSE)
+set(PPC64LE FALSE)
+set(PPC64 FALSE)
+set(MIC FALSE)
+set(MIPS64 FALSE)
+set(MIPS FALSE)
+if("${LIBOMP_ARCH}" STREQUAL "i386" OR "${LIBOMP_ARCH}" STREQUAL "32") # IA-32 architecture
+ set(IA32 TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "x86_64" OR "${LIBOMP_ARCH}" STREQUAL "32e") # Intel(R) 64 architecture
+ set(INTEL64 TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "arm") # ARM architecture
+ set(ARM TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "ppc64") # PPC64BE architecture
+ set(PPC64BE TRUE)
+ set(PPC64 TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "ppc64le") # PPC64LE architecture
+ set(PPC64LE TRUE)
+ set(PPC64 TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "aarch64") # AARCH64 architecture
+ set(AARCH64 TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "mic") # Intel(R) Many Integrated Core Architecture
+ set(MIC TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "mips") # MIPS architecture
+ set(MIPS TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "mips64") # MIPS64 architecture
+ set(MIPS64 TRUE)
+endif()
+
+# Set some flags based on build_type
+set(RELEASE_BUILD FALSE)
+set(DEBUG_BUILD FALSE)
+set(RELWITHDEBINFO_BUILD FALSE)
+set(MINSIZEREL_BUILD FALSE)
+string(TOLOWER "${CMAKE_BUILD_TYPE}" libomp_build_type_lowercase)
+if("${libomp_build_type_lowercase}" STREQUAL "release")
+ set(RELEASE_BUILD TRUE)
+elseif("${libomp_build_type_lowercase}" STREQUAL "debug")
+ set(DEBUG_BUILD TRUE)
+elseif("${libomp_build_type_lowercase}" STREQUAL "relwithdebinfo")
+ set(RELWITHDEBINFO_BUILD TRUE)
+elseif("${libomp_build_type_lowercase}" STREQUAL "minsizerel")
+ set(MINSIZEREL_BUILD TRUE)
+endif()
+
+# Include itt notify interface?
+set(LIBOMP_USE_ITT_NOTIFY TRUE CACHE BOOL
+ "Enable ITT notify?")
+
+# normal, profile, stubs library.
+set(NORMAL_LIBRARY FALSE)
+set(STUBS_LIBRARY FALSE)
+set(PROFILE_LIBRARY FALSE)
+if("${LIBOMP_LIB_TYPE}" STREQUAL "normal")
+ set(NORMAL_LIBRARY TRUE)
+elseif("${LIBOMP_LIB_TYPE}" STREQUAL "profile")
+ set(PROFILE_LIBRARY TRUE)
+elseif("${LIBOMP_LIB_TYPE}" STREQUAL "stubs")
+ set(STUBS_LIBRARY TRUE)
+endif()
+
+# Setting directory names
+set(LIBOMP_BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+set(LIBOMP_SRC_DIR ${LIBOMP_BASE_DIR}/src)
+set(LIBOMP_TOOLS_DIR ${LIBOMP_BASE_DIR}/tools)
+set(LIBOMP_INC_DIR ${LIBOMP_SRC_DIR}/include/${LIBOMP_OMP_VERSION})
+set(LIBOMP_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+
+# Enabling Fortran if it is needed
+if(${LIBOMP_FORTRAN_MODULES})
+ enable_language(Fortran)
+endif()
+# Enable MASM Compiler if it is needed (Windows only)
+if(WIN32)
+ enable_language(ASM_MASM)
+endif()
+
+# Getting legal type/arch
+libomp_get_legal_type(LIBOMP_LEGAL_TYPE)
+libomp_get_legal_arch(LIBOMP_LEGAL_ARCH)
+
+# Compiler flag checks, library checks, threading check, etc.
+include(config-ix)
+
+# Is there a quad precision data type available?
+# TODO: Make this a real feature check
+set(LIBOMP_USE_QUAD_PRECISION "${LIBOMP_HAVE_QUAD_PRECISION}" CACHE BOOL
+ "Should 128-bit precision entry points be built?")
+if(LIBOMP_USE_QUAD_PRECISION AND (NOT LIBOMP_HAVE_QUAD_PRECISION))
+ libomp_error_say("128-bit quad precision functionality requested but not available")
+endif()
+
+# libgomp drop-in compatibility requires versioned symbols
+set(LIBOMP_USE_VERSION_SYMBOLS "${LIBOMP_HAVE_VERSION_SYMBOLS}" CACHE BOOL
+ "Should version symbols be used? These provide binary compatibility with libgomp.")
+if(LIBOMP_USE_VERSION_SYMBOLS AND (NOT LIBOMP_HAVE_VERSION_SYMBOLS))
+ libomp_error_say("Version symbols functionality requested but not available")
+endif()
+
+# On multinode systems, larger alignment is desired to avoid false sharing
+set(LIBOMP_USE_INTERNODE_ALIGNMENT FALSE CACHE BOOL
+ "Should larger alignment (4096 bytes) be used for some locks and data structures?")
+
+# Build code that allows the OpenMP library to conveniently interface with debuggers
+set(LIBOMP_USE_DEBUGGER FALSE CACHE BOOL
+ "Enable debugger interface code?")
+
+# Should we link to C++ library?
+set(LIBOMP_USE_STDCPPLIB FALSE CACHE BOOL
+ "Should we link to C++ library?")
+
+# Intel(R) Transactional Synchronization Extensions (Intel(R) TSX) based locks have
+# __asm code which can be troublesome for some compilers. This feature is also x86 specific.
+# TODO: Make this a real feature check
+set(LIBOMP_USE_ADAPTIVE_LOCKS "${LIBOMP_HAVE_ADAPTIVE_LOCKS}" CACHE BOOL
+ "Should Intel(R) TSX lock be compiled (adaptive lock in kmp_lock.cpp). These are x86 specific.")
+if(LIBOMP_USE_ADAPTIVE_LOCKS AND (NOT LIBOMP_HAVE_ADAPTIVE_LOCKS))
+ libomp_error_say("Adaptive locks (Intel(R) TSX) functionality is only supported on x86 Architecture")
+endif()
+
+# - stats-gathering enables OpenMP stats where things like the number of
+# parallel regions, clock ticks spent in particular openmp regions are recorded.
+set(LIBOMP_STATS FALSE CACHE BOOL
+ "Stats-Gathering functionality?")
+if(LIBOMP_STATS AND (NOT LIBOMP_HAVE_STATS))
+ libomp_error_say("Stats-gathering functionality requested but not available")
+endif()
+# The stats functionality requires the std c++ library
+if(LIBOMP_STATS)
+ set(LIBOMP_USE_STDCPPLIB TRUE)
+endif()
+
+# Shared library can be switched to a static library
+set(LIBOMP_ENABLE_SHARED TRUE CACHE BOOL
+ "Shared library instead of static library?")
+
+if(WIN32 AND NOT LIBOMP_ENABLE_SHARED)
+ libomp_error_say("Static libraries requested but not available on Windows")
+endif()
+
+if(LIBOMP_USE_ITT_NOTIFY AND NOT LIBOMP_ENABLE_SHARED)
+ message(STATUS "ITT Notify not supported for static libraries - forcing ITT Notify off")
+ set(LIBOMP_USE_ITT_NOTIFY FALSE)
+endif()
+
+if(LIBOMP_USE_VERSION_SYMBOLS AND (NOT LIBOMP_ENABLE_SHARED) )
+ message(STATUS "Version symbols not supported for static libraries - forcing Version symbols functionality off")
+ set (LIBOMP_USE_VERSION_SYMBOLS FALSE)
+endif()
+
+# OMPT-support defaults to ON for OpenMP 5.0+ and if the requirements in
+# cmake/config-ix.cmake are fulfilled.
+set(OMPT_DEFAULT FALSE)
+if ((${LIBOMP_OMP_VERSION} GREATER 49) AND (LIBOMP_HAVE_OMPT_SUPPORT) AND (NOT WIN32))
+ set(OMPT_DEFAULT TRUE)
+endif()
+set(LIBOMP_OMPT_SUPPORT ${OMPT_DEFAULT} CACHE BOOL
+ "OMPT-support?")
+
+set(LIBOMP_OMPT_DEBUG FALSE CACHE BOOL
+ "Trace OMPT initialization?")
+set(LIBOMP_OMPT_OPTIONAL TRUE CACHE BOOL
+ "OMPT-optional?")
+if(LIBOMP_OMPT_SUPPORT AND (NOT LIBOMP_HAVE_OMPT_SUPPORT))
+ libomp_error_say("OpenMP Tools Interface requested but not available in this implementation")
+endif()
+if(LIBOMP_OMPT_SUPPORT AND (${LIBOMP_OMP_VERSION} LESS 50))
+ libomp_error_say("OpenMP Tools Interface only available with OpenMP 5.0, LIBOMP_OMP_VERSION is ${LIBOMP_OMP_VERSION}")
+endif()
+
+# TSAN-support
+set(LIBOMP_TSAN_SUPPORT FALSE CACHE BOOL
+ "TSAN-support?")
+if(LIBOMP_TSAN_SUPPORT AND (NOT LIBOMP_HAVE_TSAN_SUPPORT))
+ libomp_error_say("TSAN functionality requested but not available")
+endif()
+
+# Error check hwloc support after config-ix has run
+if(LIBOMP_USE_HWLOC AND (NOT LIBOMP_HAVE_HWLOC))
+ libomp_error_say("Hwloc requested but not available")
+endif()
+
+# Hierarchical scheduling support
+set(LIBOMP_USE_HIER_SCHED FALSE CACHE BOOL
+ "Hierarchical scheduling support?")
+
+# Setting final library name
+set(LIBOMP_DEFAULT_LIB_NAME libomp)
+if(${PROFILE_LIBRARY})
+ set(LIBOMP_DEFAULT_LIB_NAME ${LIBOMP_DEFAULT_LIB_NAME}prof)
+endif()
+if(${STUBS_LIBRARY})
+ set(LIBOMP_DEFAULT_LIB_NAME ${LIBOMP_DEFAULT_LIB_NAME}stubs)
+endif()
+set(LIBOMP_LIB_NAME ${LIBOMP_DEFAULT_LIB_NAME} CACHE STRING "Base OMP library name")
+
+if(${LIBOMP_ENABLE_SHARED})
+ set(LIBOMP_LIBRARY_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX})
+ set(LIBOMP_LIBRARY_KIND SHARED)
+ set(LIBOMP_INSTALL_KIND LIBRARY)
+else()
+ set(LIBOMP_LIBRARY_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX})
+ set(LIBOMP_LIBRARY_KIND STATIC)
+ set(LIBOMP_INSTALL_KIND ARCHIVE)
+endif()
+
+set(LIBOMP_LIB_FILE ${LIBOMP_LIB_NAME}${LIBOMP_LIBRARY_SUFFIX})
+
+# Optional backwards compatibility aliases.
+set(LIBOMP_INSTALL_ALIASES TRUE CACHE BOOL
+ "Install libgomp and libiomp5 library aliases for backwards compatibility")
+
+# Print configuration after all variables are set.
+if(${OPENMP_STANDALONE_BUILD})
+ libomp_say("Operating System -- ${CMAKE_SYSTEM_NAME}")
+ libomp_say("Target Architecture -- ${LIBOMP_ARCH}")
+ if(${MIC})
+ libomp_say("Intel(R) MIC Architecture -- ${LIBOMP_MIC_ARCH}")
+ endif()
+ libomp_say("Build Type -- ${CMAKE_BUILD_TYPE}")
+ libomp_say("OpenMP Version -- ${LIBOMP_OMP_VERSION}")
+ libomp_say("Library Kind -- ${LIBOMP_LIBRARY_KIND}")
+ libomp_say("Library Type -- ${LIBOMP_LIB_TYPE}")
+ libomp_say("Fortran Modules -- ${LIBOMP_FORTRAN_MODULES}")
+ # will say development if all zeros
+ if(${LIBOMP_VERSION_BUILD} STREQUAL 00000000)
+ set(LIBOMP_BUILD Development)
+ else()
+ set(LIBOMP_BUILD ${LIBOMP_VERSION_BUILD})
+ endif()
+ libomp_say("Build -- ${LIBOMP_BUILD}")
+ libomp_say("Use Stats-gathering -- ${LIBOMP_STATS}")
+ libomp_say("Use Debugger-support -- ${LIBOMP_USE_DEBUGGER}")
+ libomp_say("Use ITT notify -- ${LIBOMP_USE_ITT_NOTIFY}")
+ libomp_say("Use OMPT-support -- ${LIBOMP_OMPT_SUPPORT}")
+ if(${LIBOMP_OMPT_SUPPORT})
+ libomp_say("Use OMPT-optional -- ${LIBOMP_OMPT_OPTIONAL}")
+ endif()
+ libomp_say("Use Adaptive locks -- ${LIBOMP_USE_ADAPTIVE_LOCKS}")
+ libomp_say("Use quad precision -- ${LIBOMP_USE_QUAD_PRECISION}")
+ libomp_say("Use TSAN-support -- ${LIBOMP_TSAN_SUPPORT}")
+ libomp_say("Use Hwloc library -- ${LIBOMP_USE_HWLOC}")
+endif()
+
+add_subdirectory(src)
+add_subdirectory(test)
diff --git a/final/runtime/README.txt b/final/runtime/README.txt
new file mode 100644
index 0000000..ab19634
--- /dev/null
+++ b/final/runtime/README.txt
@@ -0,0 +1,116 @@
+
+ README for the LLVM* OpenMP* Runtime Library
+ ============================================
+
+How to Build Documentation
+==========================
+
+The main documentation is in Doxygen* format, and this distribution
+should come with pre-built PDF documentation in doc/Reference.pdf.
+However, an HTML version can be built by executing:
+
+% doxygen doc/doxygen/config
+
+in the runtime directory.
+
+That will produce HTML documentation in the doc/doxygen/generated
+directory, which can be accessed by pointing a web browser at the
+index.html file there.
+
+If you don't have Doxygen installed, you can download it from
+www.doxygen.org.
+
+
+How to Build the LLVM* OpenMP* Runtime Library
+==============================================
+In-tree build:
+
+$ cd where-you-want-to-live
+Check out openmp into llvm/projects
+$ cd where-you-want-to-build
+$ mkdir build && cd build
+$ cmake path/to/llvm -DCMAKE_C_COMPILER=<C compiler> -DCMAKE_CXX_COMPILER=<C++ compiler>
+$ make omp
+
+Out-of-tree build:
+
+$ cd where-you-want-to-live
+Check out openmp
+$ cd where-you-want-to-live/openmp/runtime
+$ mkdir build && cd build
+$ cmake path/to/openmp -DCMAKE_C_COMPILER=<C compiler> -DCMAKE_CXX_COMPILER=<C++ compiler>
+$ make
+
+For details about building, please look at README.rst in the parent directory.
+
+Architectures Supported
+=======================
+* IA-32 architecture
+* Intel(R) 64 architecture
+* Intel(R) Many Integrated Core Architecture
+* ARM* architecture
+* Aarch64 (64-bit ARM) architecture
+* IBM(R) Power architecture (big endian)
+* IBM(R) Power architecture (little endian)
+* MIPS and MIPS64 architecture
+
+Supported RTL Build Configurations
+==================================
+
+Supported Architectures: IA-32 architecture, Intel(R) 64, and
+Intel(R) Many Integrated Core Architecture
+
+ ----------------------------------------------
+ | icc/icl | gcc | clang |
+--------------|---------------|----------------------------|
+| Linux* OS | Yes(1,5) | Yes(2,4) | Yes(4,6,7) |
+| FreeBSD* | No | No | Yes(4,6,7,8) |
+| OS X* | Yes(1,3,4) | No | Yes(4,6,7) |
+| Windows* OS | Yes(1,4) | No | No |
+------------------------------------------------------------
+
+(1) On IA-32 architecture and Intel(R) 64, icc/icl versions 12.x are
+ supported (12.1 is recommended).
+(2) GCC* version 4.7 is supported.
+(3) For icc on OS X*, OS X* version 10.5.8 is supported.
+(4) Intel(R) Many Integrated Core Architecture not supported.
+(5) On Intel(R) Many Integrated Core Architecture, icc/icl versions 13.0
+ or later are required.
+(6) Clang* version 3.3 is supported.
+(7) Clang* currently does not offer a software-implemented 128 bit extended
+ precision type. Thus, all entry points reliant on this type are removed
+ from the library and cannot be called in the user program. The following
+ functions are not available:
+ __kmpc_atomic_cmplx16_*
+ __kmpc_atomic_float16_*
+ __kmpc_atomic_*_fp
+(8) Community contribution provided AS IS, not tested by Intel.
+
+Supported Architectures: IBM(R) Power 7 and Power 8
+
+ -----------------------------
+ | gcc | clang |
+--------------|------------|--------------|
+| Linux* OS | Yes(1,2) | Yes(3,4) |
+-------------------------------------------
+
+(1) On Power 7, gcc version 4.8.2 is supported.
+(2) On Power 8, gcc version 4.8.2 is supported.
+(3) On Power 7, clang version 3.7 is supported.
+(4) On Power 8, clang version 3.7 is supported.
+
+
+Front-end Compilers that work with this RTL
+===========================================
+
+The following compilers are known to do compatible code generation for
+this RTL: clang (from the OpenMP development branch at
+http://clang-omp.github.io/ ), Intel compilers, GCC. See the documentation
+for more details.
+
+-----------------------------------------------------------------------
+
+Notices
+=======
+
+*Other names and brands may be claimed as the property of others.
diff --git a/final/runtime/cmake/LibompCheckFortranFlag.cmake b/final/runtime/cmake/LibompCheckFortranFlag.cmake
new file mode 100644
index 0000000..c37b3ad
--- /dev/null
+++ b/final/runtime/cmake/LibompCheckFortranFlag.cmake
@@ -0,0 +1,73 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Check whether the Fortran compiler accepts a given flag.
+# There is no real trivial way to do this in CMake, so we implement it here;
+# this will have ${boolean} = TRUE if the flag succeeds, otherwise FALSE.
+function(libomp_check_fortran_flag flag boolean)
+ if(NOT DEFINED "${boolean}")
+ set(retval TRUE)
+ set(fortran_source
+" program hello
+ print *, \"Hello World!\"
+ end program hello")
+
+ set(failed_regexes "[Ee]rror;[Uu]nknown;[Ss]kipping")
+ if(CMAKE_VERSION VERSION_GREATER 3.1 OR CMAKE_VERSION VERSION_EQUAL 3.1)
+ include(CheckFortranSourceCompiles)
+ check_fortran_source_compiles("${fortran_source}" ${boolean} FAIL_REGEX "${failed_regexes}")
+ set(${boolean} ${${boolean}} PARENT_SCOPE)
+ return()
+ else()
+ # Our manual check for cmake versions that don't have CheckFortranSourceCompiles
+ set(base_dir ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/fortran_flag_check)
+ file(MAKE_DIRECTORY ${base_dir})
+ file(WRITE ${base_dir}/fortran_source.f "${fortran_source}")
+
+ message(STATUS "Performing Test ${boolean}")
+ execute_process(
+ COMMAND ${CMAKE_Fortran_COMPILER} "${flag}" ${base_dir}/fortran_source.f
+ WORKING_DIRECTORY ${base_dir}
+ RESULT_VARIABLE exit_code
+ OUTPUT_VARIABLE OUTPUT
+ ERROR_VARIABLE OUTPUT
+ )
+
+ if(${exit_code} EQUAL 0)
+ foreach(regex IN LISTS failed_regexes)
+ if("${OUTPUT}" MATCHES ${regex})
+ set(retval FALSE)
+ endif()
+ endforeach()
+ else()
+ set(retval FALSE)
+ endif()
+
+ if(${retval})
+ set(${boolean} 1 CACHE INTERNAL "Test ${boolean}")
+ message(STATUS "Performing Test ${boolean} - Success")
+ file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
+ "Performing Fortran Compiler Flag test ${boolean} succeeded with the following output:\n"
+ "${OUTPUT}\n"
+ "Source file was:\n${fortran_source}\n")
+ else()
+ set(${boolean} "" CACHE INTERNAL "Test ${boolean}")
+ message(STATUS "Performing Test ${boolean} - Failed")
+ file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
+ "Performing Fortran Compiler Flag test ${boolean} failed with the following output:\n"
+ "${OUTPUT}\n"
+ "Source file was:\n${fortran_source}\n")
+ endif()
+ endif()
+
+ set(${boolean} ${retval} PARENT_SCOPE)
+ endif()
+endfunction()
diff --git a/final/runtime/cmake/LibompCheckLinkerFlag.cmake b/final/runtime/cmake/LibompCheckLinkerFlag.cmake
new file mode 100644
index 0000000..75a38e3
--- /dev/null
+++ b/final/runtime/cmake/LibompCheckLinkerFlag.cmake
@@ -0,0 +1,68 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Check whether the linker accepts a given flag when building a shared library.
+# There is no real trivial way to do this in CMake, so we implement it here;
+# this will have ${boolean} = TRUE if the flag succeeds, otherwise FALSE.
+function(libomp_check_linker_flag flag boolean)
+ if(NOT DEFINED "${boolean}")
+ set(retval TRUE)
+ set(library_source
+ "int foo(int a) { return a*a; }")
+ set(cmake_source
+ "cmake_minimum_required(VERSION 2.8)
+ project(foo C)
+ set(CMAKE_SHARED_LINKER_FLAGS \"${flag}\")
+ add_library(foo SHARED src_to_link.c)")
+ set(failed_regexes "[Ee]rror;[Uu]nknown;[Ss]kipping;LINK : warning")
+ set(base_dir ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/link_flag_check_${boolean})
+ file(MAKE_DIRECTORY ${base_dir})
+ file(MAKE_DIRECTORY ${base_dir}/build)
+ file(WRITE ${base_dir}/src_to_link.c "${library_source}")
+ file(WRITE ${base_dir}/CMakeLists.txt "${cmake_source}")
+
+ message(STATUS "Performing Test ${boolean}")
+ try_compile(
+ try_compile_result
+ ${base_dir}/build
+ ${base_dir}
+ foo
+ OUTPUT_VARIABLE OUTPUT)
+
+ if(try_compile_result)
+ foreach(regex IN LISTS failed_regexes)
+ if("${OUTPUT}" MATCHES ${regex})
+ set(retval FALSE)
+ endif()
+ endforeach()
+ else()
+ set(retval FALSE)
+ endif()
+
+ if(${retval})
+ set(${boolean} 1 CACHE INTERNAL "Test ${boolean}")
+ message(STATUS "Performing Test ${boolean} - Success")
+ file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
+ "Performing C Linker Flag test ${boolean} succeeded with the following output:\n"
+ "${OUTPUT}\n"
+ "Source file was:\n${library_source}\n")
+ else()
+ set(${boolean} "" CACHE INTERNAL "Test ${boolean}")
+ message(STATUS "Performing Test ${boolean} - Failed")
+ file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
+ "Performing C Linker Flag test ${boolean} failed with the following output:\n"
+ "${OUTPUT}\n"
+ "Source file was:\n${library_source}\n")
+ endif()
+
+ set(${boolean} ${retval} PARENT_SCOPE)
+ endif()
+endfunction()
diff --git a/final/runtime/cmake/LibompDefinitions.cmake b/final/runtime/cmake/LibompDefinitions.cmake
new file mode 100644
index 0000000..c4cfbb9
--- /dev/null
+++ b/final/runtime/cmake/LibompDefinitions.cmake
@@ -0,0 +1,32 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+function(libomp_get_definitions_flags cppflags)
+ set(cppflags_local)
+
+ if(WIN32)
+ libomp_append(cppflags_local "-D _CRT_SECURE_NO_WARNINGS")
+ libomp_append(cppflags_local "-D _CRT_SECURE_NO_DEPRECATE")
+ libomp_append(cppflags_local "-D _WINDOWS")
+ libomp_append(cppflags_local "-D _WINNT")
+ libomp_append(cppflags_local "-D _WIN32_WINNT=0x0501")
+ libomp_append(cppflags_local "-D _USRDLL")
+ libomp_append(cppflags_local "-D _ITERATOR_DEBUG_LEVEL=0" IF_TRUE DEBUG_BUILD)
+ libomp_append(cppflags_local "-D _DEBUG" IF_TRUE DEBUG_BUILD)
+ else()
+ libomp_append(cppflags_local "-D _GNU_SOURCE")
+ libomp_append(cppflags_local "-D _REENTRANT")
+ endif()
+
+ # CMake doesn't include CPPFLAGS from environment, but we will.
+ set(${cppflags} ${cppflags_local} ${LIBOMP_CPPFLAGS} $ENV{CPPFLAGS} PARENT_SCOPE)
+endfunction()
+
diff --git a/final/runtime/cmake/LibompExports.cmake b/final/runtime/cmake/LibompExports.cmake
new file mode 100644
index 0000000..1893b9f
--- /dev/null
+++ b/final/runtime/cmake/LibompExports.cmake
@@ -0,0 +1,99 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# LibompExports.cmake
+# Copy library and header files into the exports/ subdirectory after library build
+
+# Create the suffix for the export directory
+# - Only add to suffix when not a default value
+# - Example suffix: .deb.30.s1
+# final export directory: exports/lin_32e.deb.30.s1/lib
+# - These suffixes imply the build is a Debug, OpenMP 3.0, Stats-Gathering version of the library
+set(libomp_suffix)
+libomp_append(libomp_suffix .deb DEBUG_BUILD)
+libomp_append(libomp_suffix .dia RELWITHDEBINFO_BUILD)
+libomp_append(libomp_suffix .min MINSIZEREL_BUILD)
+if(NOT "${LIBOMP_OMP_VERSION}" STREQUAL "45")
+ libomp_append(libomp_suffix .${LIBOMP_OMP_VERSION})
+endif()
+libomp_append(libomp_suffix .s1 LIBOMP_STATS)
+libomp_append(libomp_suffix .ompt LIBOMP_OMPT_SUPPORT)
+if(${LIBOMP_OMPT_SUPPORT})
+ libomp_append(libomp_suffix .optional LIBOMP_OMPT_OPTIONAL)
+endif()
+string(REPLACE ";" "" libomp_suffix "${libomp_suffix}")
+
+# Set exports locations
+if(${MIC})
+ set(libomp_platform "${LIBOMP_PERL_SCRIPT_OS}_${LIBOMP_MIC_ARCH}") # e.g., lin_knf, lin_knc
+else()
+ if(${IA32})
+ set(libomp_platform "${LIBOMP_PERL_SCRIPT_OS}_32")
+ elseif(${INTEL64})
+ set(libomp_platform "${LIBOMP_PERL_SCRIPT_OS}_32e")
+ else()
+ set(libomp_platform "${LIBOMP_PERL_SCRIPT_OS}_${LIBOMP_ARCH}") # e.g., lin_arm, lin_ppc64
+ endif()
+endif()
+set(LIBOMP_EXPORTS_DIR "${LIBOMP_BASE_DIR}/exports")
+set(LIBOMP_EXPORTS_PLATFORM_DIR "${LIBOMP_EXPORTS_DIR}/${libomp_platform}${libomp_suffix}")
+set(LIBOMP_EXPORTS_CMN_DIR "${LIBOMP_EXPORTS_DIR}/common${libomp_suffix}/include")
+set(LIBOMP_EXPORTS_INC_DIR "${LIBOMP_EXPORTS_PLATFORM_DIR}/include")
+set(LIBOMP_EXPORTS_MOD_DIR "${LIBOMP_EXPORTS_PLATFORM_DIR}/include_compat")
+set(LIBOMP_EXPORTS_LIB_DIR "${LIBOMP_EXPORTS_DIR}/${libomp_platform}${libomp_suffix}/lib")
+
+# Put headers in exports/ directory post build
+add_custom_command(TARGET omp POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_CMN_DIR}
+ COMMAND ${CMAKE_COMMAND} -E copy omp.h ${LIBOMP_EXPORTS_CMN_DIR}
+)
+if(${LIBOMP_OMPT_SUPPORT})
+ add_custom_command(TARGET omp POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy ompt.h ${LIBOMP_EXPORTS_CMN_DIR}
+ )
+endif()
+if(${LIBOMP_FORTRAN_MODULES})
+ add_custom_command(TARGET libomp-mod POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_MOD_DIR}
+ COMMAND ${CMAKE_COMMAND} -E copy omp_lib.mod ${LIBOMP_EXPORTS_MOD_DIR}
+ COMMAND ${CMAKE_COMMAND} -E copy omp_lib_kinds.mod ${LIBOMP_EXPORTS_MOD_DIR}
+ )
+ add_custom_command(TARGET omp POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy omp_lib.h ${LIBOMP_EXPORTS_CMN_DIR}
+ )
+endif()
+
+# Copy OpenMP library into exports/ directory post build
+if(WIN32)
+ get_target_property(LIBOMP_OUTPUT_DIRECTORY omp RUNTIME_OUTPUT_DIRECTORY)
+else()
+ get_target_property(LIBOMP_OUTPUT_DIRECTORY omp LIBRARY_OUTPUT_DIRECTORY)
+endif()
+if(NOT LIBOMP_OUTPUT_DIRECTORY)
+ set(LIBOMP_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+endif()
+add_custom_command(TARGET omp POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_LIB_DIR}
+ COMMAND ${CMAKE_COMMAND} -E copy ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE} ${LIBOMP_EXPORTS_LIB_DIR}
+)
+
+# Copy Windows import library into exports/ directory post build
+if(WIN32)
+ get_target_property(LIBOMPIMP_OUTPUT_DIRECTORY ompimp ARCHIVE_OUTPUT_DIRECTORY)
+ if(NOT LIBOMPIMP_OUTPUT_DIRECTORY)
+ set(LIBOMPIMP_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+ endif()
+ add_custom_command(TARGET ompimp POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_LIB_DIR}
+ COMMAND ${CMAKE_COMMAND} -E copy ${LIBOMPIMP_OUTPUT_DIRECTORY}/${LIBOMP_IMP_LIB_FILE} ${LIBOMP_EXPORTS_LIB_DIR}
+ )
+endif()
+
diff --git a/final/runtime/cmake/LibompGetArchitecture.cmake b/final/runtime/cmake/LibompGetArchitecture.cmake
new file mode 100644
index 0000000..536b488
--- /dev/null
+++ b/final/runtime/cmake/LibompGetArchitecture.cmake
@@ -0,0 +1,70 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Determine the architecture from predefined compiler macros
+# The architecture name can only contain alphanumeric characters and underscores (i.e., C identifier)
+
+# void libomp_get_architecture(string* return_arch)
+# - Returns the architecture in return_arch
+function(libomp_get_architecture return_arch)
+ # C source that deliberately fails to compile with an #error message
+ # naming the architecture the compiler targets
+ set(detect_arch_src_txt "
+ #if defined(__KNC__)
+ #error ARCHITECTURE=mic
+ #elif defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
+ #error ARCHITECTURE=x86_64
+ #elif defined(__i386) || defined(__i386__) || defined(__IA32__) || defined(_M_I86) || defined(_M_IX86) || defined(__X86__) || defined(_X86_)
+ #error ARCHITECTURE=i386
+ #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__)
+ #error ARCHITECTURE=arm
+ #elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6ZK__)
+ #error ARCHITECTURE=arm
+ #elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)
+ #error ARCHITECTURE=arm
+ #elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
+ #error ARCHITECTURE=arm
+ #elif defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_3M__)
+ #error ARCHITECTURE=arm
+ #elif defined(__ARM_ARCH_2__)
+ #error ARCHITECTURE=arm
+ #elif defined(__arm__) || defined(_M_ARM) || defined(_ARM)
+ #error ARCHITECTURE=arm
+ #elif defined(__aarch64__)
+ #error ARCHITECTURE=aarch64
+ #elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__)
+ #error ARCHITECTURE=ppc64le
+ #elif defined(__powerpc64__)
+ #error ARCHITECTURE=ppc64
+ #elif defined(__mips__) && defined(__mips64)
+ #error ARCHITECTURE=mips64
+ #elif defined(__mips__) && !defined(__mips64)
+ #error ARCHITECTURE=mips
+ #else
+ #error ARCHITECTURE=UnknownArchitecture
+ #endif
+ ")
+ # Write out ${detect_arch_src_txt} to a file within the cmake/ subdirectory
+ file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/libomp_detect_arch.c" ${detect_arch_src_txt})
+
+ # Try to compile using the C Compiler. It will always error out with an #error directive, so store error output to ${local_architecture}
+ try_run(run_dummy compile_dummy "${CMAKE_CURRENT_BINARY_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/libomp_detect_arch.c" COMPILE_OUTPUT_VARIABLE local_architecture)
+
+ # Match the important architecture line and store only that matching string in ${local_architecture}
+ string(REGEX MATCH "ARCHITECTURE=([a-zA-Z0-9_]+)" local_architecture "${local_architecture}")
+
+ # Get rid of the ARCHITECTURE= part of the string
+ string(REPLACE "ARCHITECTURE=" "" local_architecture "${local_architecture}")
+
+ # set the return value to the architecture detected (e.g., x86_64, i386, arm, aarch64, ppc64, mips, etc.)
+ set(${return_arch} "${local_architecture}" PARENT_SCOPE)
+
+ # Remove the temporary detection source from the binary directory
+ file(REMOVE "${CMAKE_CURRENT_BINARY_DIR}/libomp_detect_arch.c")
+endfunction()
diff --git a/final/runtime/cmake/LibompHandleFlags.cmake b/final/runtime/cmake/LibompHandleFlags.cmake
new file mode 100644
index 0000000..c1ceb52
--- /dev/null
+++ b/final/runtime/cmake/LibompHandleFlags.cmake
@@ -0,0 +1,208 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Setup the flags correctly for cmake (convert the list to a string)
+# Pretty them up (STRIP any beginning and trailing whitespace,
+# remove duplicates, remove empty entries)
+macro(libomp_setup_flags flags)
+ if(NOT "${${flags}}" STREQUAL "") # if flags are empty, don't do anything
+ set(flags_local)
+ list(REMOVE_DUPLICATES ${flags}) # remove duplicates
+ list(REMOVE_ITEM ${flags} "") # remove empty items
+ libomp_list_to_string("${${flags}}" flags_local)
+ string(STRIP "${flags_local}" flags_local)
+ set(${flags} "${flags_local}") # macro scope: overwrites the caller's variable in place
+ endif()
+endmacro()
+
+# Gets flags common to both the C and C++ compiler
+# Each flag is only appended when the corresponding LIBOMP_HAVE_* feature
+# test (performed in config-ix.cmake) succeeded for this compiler.
+function(libomp_get_c_and_cxxflags_common flags)
+ set(flags_local)
+ libomp_append(flags_local -fno-exceptions LIBOMP_HAVE_FNO_EXCEPTIONS_FLAG)
+ libomp_append(flags_local -fno-rtti LIBOMP_HAVE_FNO_RTTI_FLAG)
+ libomp_append(flags_local -Wno-sign-compare LIBOMP_HAVE_WNO_SIGN_COMPARE_FLAG)
+ libomp_append(flags_local -Wno-unused-function LIBOMP_HAVE_WNO_UNUSED_FUNCTION_FLAG)
+ libomp_append(flags_local -Wno-unused-local-typedef LIBOMP_HAVE_WNO_UNUSED_LOCAL_TYPEDEF_FLAG)
+ libomp_append(flags_local -Wno-unused-value LIBOMP_HAVE_WNO_UNUSED_VALUE_FLAG)
+ libomp_append(flags_local -Wno-unused-variable LIBOMP_HAVE_WNO_UNUSED_VARIABLE_FLAG)
+ libomp_append(flags_local -Wno-switch LIBOMP_HAVE_WNO_SWITCH_FLAG)
+ libomp_append(flags_local -Wno-covered-switch-default LIBOMP_HAVE_WNO_COVERED_SWITCH_DEFAULT_FLAG)
+ libomp_append(flags_local -Wno-deprecated-register LIBOMP_HAVE_WNO_DEPRECATED_REGISTER_FLAG)
+ libomp_append(flags_local -Wno-gnu-anonymous-struct LIBOMP_HAVE_WNO_GNU_ANONYMOUS_STRUCT_FLAG)
+ libomp_append(flags_local -Wno-unknown-pragmas LIBOMP_HAVE_WNO_UNKNOWN_PRAGMAS_FLAG)
+ libomp_append(flags_local -Wno-missing-field-initializers LIBOMP_HAVE_WNO_MISSING_FIELD_INITIALIZERS_FLAG)
+ libomp_append(flags_local -Wno-missing-braces LIBOMP_HAVE_WNO_MISSING_BRACES_FLAG)
+ libomp_append(flags_local -Wno-comment LIBOMP_HAVE_WNO_COMMENT_FLAG)
+ libomp_append(flags_local -Wno-self-assign LIBOMP_HAVE_WNO_SELF_ASSIGN_FLAG)
+ libomp_append(flags_local -Wno-vla-extension LIBOMP_HAVE_WNO_VLA_EXTENSION_FLAG)
+ libomp_append(flags_local -Wno-format-pedantic LIBOMP_HAVE_WNO_FORMAT_PEDANTIC_FLAG)
+ libomp_append(flags_local -Wstringop-overflow=0 LIBOMP_HAVE_WSTRINGOP_OVERFLOW_FLAG)
+ # MSVC-style flags
+ libomp_append(flags_local /GS LIBOMP_HAVE_GS_FLAG)
+ libomp_append(flags_local /EHsc LIBOMP_HAVE_EHSC_FLAG)
+ libomp_append(flags_local /Oy- LIBOMP_HAVE_OY__FLAG)
+ # Intel(R) C Compiler flags
+ libomp_append(flags_local /Qsafeseh LIBOMP_HAVE_QSAFESEH_FLAG)
+ libomp_append(flags_local -Qoption,cpp,--extended_float_types LIBOMP_HAVE_EXTENDED_FLOAT_TYPES_FLAG)
+ libomp_append(flags_local -Qlong_double LIBOMP_HAVE_LONG_DOUBLE_FLAG)
+ libomp_append(flags_local -Qdiag-disable:177 LIBOMP_HAVE_DIAG_DISABLE_177_FLAG)
+ if(${RELEASE_BUILD} OR ${RELWITHDEBINFO_BUILD})
+ libomp_append(flags_local -Qinline-min-size=1 LIBOMP_HAVE_INLINE_MIN_SIZE_FLAG)
+ endif()
+ # Architectural C and C++ flags
+ if(${IA32})
+ if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+ libomp_append(flags_local -m32 LIBOMP_HAVE_M32_FLAG)
+ endif()
+ libomp_append(flags_local /arch:SSE2 LIBOMP_HAVE_ARCH_SSE2_FLAG)
+ libomp_append(flags_local -msse2 LIBOMP_HAVE_MSSE2_FLAG)
+ libomp_append(flags_local -falign-stack=maintain-16-byte LIBOMP_HAVE_FALIGN_STACK_FLAG)
+ elseif(${MIC})
+ libomp_append(flags_local -mmic LIBOMP_HAVE_MMIC_FLAG)
+ libomp_append(flags_local -ftls-model=initial-exec LIBOMP_HAVE_FTLS_MODEL_FLAG)
+ libomp_append(flags_local "-opt-streaming-stores never" LIBOMP_HAVE_OPT_STREAMING_STORES_FLAG)
+ endif()
+ set(${flags} ${flags_local} PARENT_SCOPE)
+endfunction()
+
+# C compiler flags
+# - returns (through ${cflags}) the prettified list of flags for the C compiler:
+#   the common C/C++ flags, any C-only flags, then user-supplied LIBOMP_CFLAGS
+function(libomp_get_cflags cflags)
+  set(local_cflags)
+  libomp_get_c_and_cxxflags_common(local_cflags)
+  # flags that apply to the C compiler alone
+  libomp_append(local_cflags /TP LIBOMP_HAVE_TP_FLAG)
+  libomp_append(local_cflags "-x c++" LIBOMP_HAVE_X_CPP_FLAG)
+  # user-specified flags come last, then the list is cleaned up
+  set(local_cflags ${local_cflags} ${LIBOMP_CFLAGS})
+  libomp_setup_flags(local_cflags)
+  set(${cflags} ${local_cflags} PARENT_SCOPE)
+endfunction()
+
+# C++ compiler flags
+# - returns (through ${cxxflags}) the prettified list of flags for the C++
+#   compiler: the common C/C++ flags plus user-supplied LIBOMP_CXXFLAGS
+function(libomp_get_cxxflags cxxflags)
+  set(local_cxxflags)
+  libomp_get_c_and_cxxflags_common(local_cxxflags)
+  # user-specified flags come last, then the list is cleaned up
+  set(local_cxxflags ${local_cxxflags} ${LIBOMP_CXXFLAGS})
+  libomp_setup_flags(local_cxxflags)
+  set(${cxxflags} ${local_cxxflags} PARENT_SCOPE)
+endfunction()
+
+# Assembler flags
+# Returns (through ${asmflags}) the flag list for assembling the runtime's
+# .asm/.S sources, including architecture-specific flags and LIBOMP_ASMFLAGS.
+function(libomp_get_asmflags asmflags)
+ set(asmflags_local)
+ libomp_append(asmflags_local "-x assembler-with-cpp" LIBOMP_HAVE_X_ASSEMBLER_WITH_CPP_FLAG)
+ # Architectural assembler flags
+ if(${IA32})
+ if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+ libomp_append(asmflags_local -m32 LIBOMP_HAVE_M32_FLAG)
+ endif()
+ libomp_append(asmflags_local /safeseh LIBOMP_HAVE_SAFESEH_MASM_FLAG)
+ libomp_append(asmflags_local /coff LIBOMP_HAVE_COFF_MASM_FLAG)
+ elseif(${MIC})
+ libomp_append(asmflags_local -mmic LIBOMP_HAVE_MMIC_FLAG)
+ endif()
+ # User-specified flags come last, then the list is cleaned up
+ set(asmflags_local ${asmflags_local} ${LIBOMP_ASMFLAGS})
+ libomp_setup_flags(asmflags_local)
+ set(${asmflags} ${asmflags_local} PARENT_SCOPE)
+endfunction()
+
+# Linker flags
+# Returns (through ${ldflags}) the linker flags for the omp target, including
+# the .def file / version script, OSX version flags, architecture-specific
+# flags, and user-supplied LIBOMP_LDFLAGS.
+function(libomp_get_ldflags ldflags)
+ set(ldflags_local)
+ libomp_append(ldflags_local "${CMAKE_LINK_DEF_FILE_FLAG}${CMAKE_CURRENT_BINARY_DIR}/${LIBOMP_LIB_NAME}.def"
+ IF_DEFINED CMAKE_LINK_DEF_FILE_FLAG)
+ libomp_append(ldflags_local "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}${LIBOMP_VERSION_MAJOR}.${LIBOMP_VERSION_MINOR}"
+ IF_DEFINED CMAKE_C_OSX_CURRENT_VERSION_FLAG)
+ libomp_append(ldflags_local "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}${LIBOMP_VERSION_MAJOR}.${LIBOMP_VERSION_MINOR}"
+ IF_DEFINED CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG)
+ libomp_append(ldflags_local -Wl,--warn-shared-textrel LIBOMP_HAVE_WARN_SHARED_TEXTREL_FLAG)
+ libomp_append(ldflags_local -Wl,--as-needed LIBOMP_HAVE_AS_NEEDED_FLAG)
+ libomp_append(ldflags_local "-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
+ libomp_append(ldflags_local -static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG)
+ libomp_append(ldflags_local -Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG)
+ libomp_append(ldflags_local -Wl,-fini=__kmp_internal_end_fini LIBOMP_HAVE_FINI_FLAG)
+ libomp_append(ldflags_local -no-intel-extensions LIBOMP_HAVE_NO_INTEL_EXTENSIONS_FLAG)
+ libomp_append(ldflags_local -static-intel LIBOMP_HAVE_STATIC_INTEL_FLAG)
+ libomp_append(ldflags_local /SAFESEH LIBOMP_HAVE_SAFESEH_FLAG)
+ # Architectural linker flags
+ if(${IA32})
+ if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+ libomp_append(ldflags_local -m32 LIBOMP_HAVE_M32_FLAG)
+ endif()
+ libomp_append(ldflags_local -msse2 LIBOMP_HAVE_MSSE2_FLAG)
+ elseif(${MIC})
+ libomp_append(ldflags_local -mmic LIBOMP_HAVE_MMIC_FLAG)
+ libomp_append(ldflags_local -Wl,-x LIBOMP_HAVE_X_FLAG)
+ endif()
+ # User-specified flags come last, then the list is cleaned up
+ set(ldflags_local ${ldflags_local} ${LIBOMP_LDFLAGS})
+ libomp_setup_flags(ldflags_local)
+ set(${ldflags} ${ldflags_local} PARENT_SCOPE)
+endfunction()
+
+# Library flags
+# Returns (through ${libflags}) the libraries to link against: the threading
+# library, optionally hwloc, platform math/irc libraries, and any
+# user-supplied LIBOMP_LIBFLAGS.
+function(libomp_get_libflags libflags)
+  set(libflags_local)
+  libomp_append(libflags_local "${CMAKE_THREAD_LIBS_INIT}")
+  libomp_append(libflags_local "${LIBOMP_HWLOC_LIBRARY}" LIBOMP_USE_HWLOC)
+  if(${IA32})
+    libomp_append(libflags_local -lirc_pic LIBOMP_HAVE_IRC_PIC_LIBRARY)
+  endif()
+  # NetBSD needs libm explicitly (lowercase if()/bare endif(); the old
+  # uppercase IF()/ENDIF(<condition>) form is legacy CMake noise)
+  if(CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+    libomp_append(libflags_local -lm)
+  endif()
+  # User-specified flags come last, then the list is cleaned up
+  set(libflags_local ${libflags_local} ${LIBOMP_LIBFLAGS})
+  libomp_setup_flags(libflags_local)
+  set(${libflags} ${libflags_local} PARENT_SCOPE)
+endfunction()
+
+# Fortran flags
+# - returns (through ${fflags}) the flags for the Fortran compiler used to
+#   build the omp_lib modules: architecture flags plus LIBOMP_FFLAGS
+function(libomp_get_fflags fflags)
+  set(local_fflags)
+  # architectural Fortran flags
+  if(${IA32})
+    libomp_append(local_fflags -m32 LIBOMP_HAVE_M32_FORTRAN_FLAG)
+  endif()
+  # user-specified flags come last, then the list is cleaned up
+  set(local_fflags ${local_fflags} ${LIBOMP_FFLAGS})
+  libomp_setup_flags(local_fflags)
+  set(${fflags} ${local_fflags} PARENT_SCOPE)
+endfunction()
+
+# Perl generate-defs.pl flags (For Windows only)
+# Returns (through ${gdflags}) the -D definitions passed to the Perl script
+# that generates the .def export file: target arch, library type, feature
+# macros, and the highest supported OMP_* version macros.
+function(libomp_get_gdflags gdflags)
+ set(gdflags_local)
+ if(${IA32})
+ set(libomp_gdflag_arch arch_32)
+ elseif(${INTEL64})
+ set(libomp_gdflag_arch arch_32e)
+ else()
+ set(libomp_gdflag_arch arch_${LIBOMP_ARCH})
+ endif()
+ libomp_append(gdflags_local "-D ${libomp_gdflag_arch}")
+ libomp_append(gdflags_local "-D msvc_compat")
+ libomp_append(gdflags_local "-D norm" NORMAL_LIBRARY)
+ libomp_append(gdflags_local "-D prof" PROFILE_LIBRARY)
+ libomp_append(gdflags_local "-D stub" STUBS_LIBRARY)
+ libomp_append(gdflags_local "-D HAVE_QUAD" LIBOMP_USE_QUAD_PRECISION)
+ libomp_append(gdflags_local "-D USE_DEBUGGER" LIBOMP_USE_DEBUGGER)
+ # OMP_NN macros are cumulative: a 5.0 build also defines OMP_45/40/30
+ if(${LIBOMP_OMP_VERSION} GREATER 50 OR ${LIBOMP_OMP_VERSION} EQUAL 50)
+ libomp_append(gdflags_local "-D OMP_50")
+ endif()
+ if(${LIBOMP_OMP_VERSION} GREATER 45 OR ${LIBOMP_OMP_VERSION} EQUAL 45)
+ libomp_append(gdflags_local "-D OMP_45")
+ endif()
+ if(${LIBOMP_OMP_VERSION} GREATER 40 OR ${LIBOMP_OMP_VERSION} EQUAL 40)
+ libomp_append(gdflags_local "-D OMP_40")
+ endif()
+ if(${LIBOMP_OMP_VERSION} GREATER 30 OR ${LIBOMP_OMP_VERSION} EQUAL 30)
+ libomp_append(gdflags_local "-D OMP_30")
+ endif()
+ if(${DEBUG_BUILD} OR ${RELWITHDEBINFO_BUILD})
+ libomp_append(gdflags_local "-D KMP_DEBUG")
+ endif()
+ set(${gdflags} ${gdflags_local} PARENT_SCOPE)
+endfunction()
diff --git a/final/runtime/cmake/LibompMicroTests.cmake b/final/runtime/cmake/LibompMicroTests.cmake
new file mode 100644
index 0000000..0918fdd
--- /dev/null
+++ b/final/runtime/cmake/LibompMicroTests.cmake
@@ -0,0 +1,228 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# The following micro-tests are small tests to perform on the library just created.
+# There are currently five micro-tests:
+# (1) test-touch
+# - Compile and run a small program using newly created libomp library
+# - Fails if test-touch.c does not compile or if test-touch.c does not run after compilation
+# - Program dependencies: gcc or g++, grep, bourne shell
+# - Available for all Unix,Mac,Windows builds. Not available on Intel(R) MIC Architecture builds.
+# (2) test-relo
+# - Tests dynamic libraries for position-dependent code (can not have any position dependent code)
+# - Fails if TEXTREL is in output of readelf -d libomp.so command
+# - Program dependencies: readelf, grep, bourne shell
+# - Available for Unix, Intel(R) MIC Architecture dynamic library builds. Not available otherwise.
+# (3) test-execstack
+# - Tests if stack is executable
+# - Fails if stack is executable. Should only be readable and writable. Not executable.
+# - Program dependencies: perl, readelf
+# - Available for Unix dynamic library builds. Not available otherwise.
+# (4) test-instr (Intel(R) MIC Architecture only)
+# - Tests Intel(R) MIC Architecture libraries for valid instruction set
+# - Fails if finds invalid instruction for Intel(R) MIC Architecture (wasn't compiled with correct flags)
+# - Program dependencies: perl, objdump
+# - Available for Intel(R) MIC Architecture and i386 builds. Not available otherwise.
+# (5) test-deps
+# - Tests newly created libomp for library dependencies
+# - Fails if sees a dependence not listed in td_exp variable below
+# - Program dependencies: perl, (unix)readelf, (mac)otool[64], (windows)link.exe
+# - Available for Unix,Mac,Windows, Intel(R) MIC Architecture dynamic builds and Windows
+# static builds. Not available otherwise.
+
+# get library location
+# Windows: the .dll comes from the runtime output dir and the import library
+# from the archive output dir; elsewhere the library output dir is used.
+if(WIN32)
+ get_target_property(LIBOMP_OUTPUT_DIRECTORY omp RUNTIME_OUTPUT_DIRECTORY)
+ get_target_property(LIBOMPIMP_OUTPUT_DIRECTORY ompimp ARCHIVE_OUTPUT_DIRECTORY)
+ if(NOT LIBOMPIMP_OUTPUT_DIRECTORY)
+ set(LIBOMPIMP_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+ endif()
+else()
+ get_target_property(LIBOMP_OUTPUT_DIRECTORY omp LIBRARY_OUTPUT_DIRECTORY)
+endif()
+# Unset properties fall back to the current binary dir
+if(NOT LIBOMP_OUTPUT_DIRECTORY)
+ set(LIBOMP_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+endif()
+
+# test-touch
+# A bourne shell is required to run the compiled program; without one the
+# test-touch targets list stays empty and nothing is registered.
+find_program(LIBOMP_SHELL sh)
+if(WIN32)
+ if(LIBOMP_SHELL)
+ set(libomp_test_touch_targets test-touch-md/.success test-touch-mt/.success)
+ endif()
+ # pick test-touch compiler
+ set(libomp_test_touch_compiler ${CMAKE_C_COMPILER})
+ # test-touch compilation flags
+ libomp_append(libomp_test_touch_cflags /nologo)
+ libomp_append(libomp_test_touch_libs ${LIBOMPIMP_OUTPUT_DIRECTORY}/${LIBOMP_IMP_LIB_FILE})
+ if(${IA32})
+ libomp_append(libomp_test_touch_ldflags /safeseh)
+ endif()
+else() # (Unix based systems, Intel(R) MIC Architecture, and Mac)
+ if(LIBOMP_SHELL)
+ set(libomp_test_touch_targets test-touch-rt/.success)
+ endif()
+ # pick test-touch compiler
+ if(${LIBOMP_USE_STDCPPLIB})
+ set(libomp_test_touch_compiler ${CMAKE_CXX_COMPILER})
+ else()
+ set(libomp_test_touch_compiler ${CMAKE_C_COMPILER})
+ endif()
+ # test-touch compilation flags
+ libomp_append(libomp_test_touch_libs "${CMAKE_THREAD_LIBS_INIT}")
+ if(${IA32})
+ libomp_append(libomp_test_touch_cflags -m32 LIBOMP_HAVE_M32_FLAG)
+ endif()
+ libomp_append(libomp_test_touch_libs ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE})
+ libomp_append(libomp_test_touch_libs "${LIBOMP_HWLOC_LIBRARY}" LIBOMP_USE_HWLOC)
+ # the freshly built library must be on the loader path when the test runs
+ if(APPLE)
+ set(libomp_test_touch_env "DYLD_LIBRARY_PATH=.:${LIBOMP_OUTPUT_DIRECTORY}:$ENV{DYLD_LIBRARY_PATH}")
+ libomp_append(libomp_test_touch_ldflags "-Wl,-rpath,${LIBOMP_HWLOC_LIBRARY_DIR}" LIBOMP_USE_HWLOC)
+ else()
+ set(libomp_test_touch_env "LD_LIBRARY_PATH=.:${LIBOMP_OUTPUT_DIRECTORY}:$ENV{LD_LIBRARY_PATH}")
+ libomp_append(libomp_test_touch_ldflags "-Wl,-rpath=${LIBOMP_HWLOC_LIBRARY_DIR}" LIBOMP_USE_HWLOC)
+ endif()
+endif()
+# Defines the build rule for one test-touch variant (directory name given in
+# test_touch_dir): compile test-touch.c against the new library, run it under
++# the shell, and touch <dir>/.success on success.
+macro(libomp_test_touch_recipe test_touch_dir)
+ set(libomp_test_touch_dependencies ${LIBOMP_SRC_DIR}/test-touch.c omp)
+ set(libomp_test_touch_exe ${test_touch_dir}/test-touch${CMAKE_EXECUTABLE_SUFFIX})
+ set(libomp_test_touch_obj ${test_touch_dir}/test-touch${CMAKE_C_OUTPUT_EXTENSION})
+ if(WIN32)
+ # choose the matching MSVC runtime library flag (/MT[d] vs /MD[d])
+ if(${RELEASE_BUILD} OR ${RELWITHDEBINFO_BUILD})
+ if(${test_touch_dir} MATCHES "test-touch-mt")
+ libomp_append(libomp_test_touch_cflags /MT)
+ else()
+ libomp_append(libomp_test_touch_cflags /MD)
+ endif()
+ else()
+ if(${test_touch_dir} MATCHES "test-touch-mt")
+ libomp_append(libomp_test_touch_cflags /MTd)
+ else()
+ libomp_append(libomp_test_touch_cflags /MDd)
+ endif()
+ endif()
+ set(libomp_test_touch_out_flags -Fe${libomp_test_touch_exe} -Fo${libomp_test_touch_obj})
+ list(APPEND libomp_test_touch_dependencies ompimp)
+ else()
+ set(libomp_test_touch_out_flags -o ${libomp_test_touch_exe})
+ endif()
+ add_custom_command(
+ OUTPUT ${test_touch_dir}/.success ${libomp_test_touch_exe} ${libomp_test_touch_obj}
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/${test_touch_dir}
+ COMMAND ${CMAKE_COMMAND} -E remove -f ${test_touch_dir}/*
+ COMMAND ${libomp_test_touch_compiler} ${libomp_test_touch_out_flags} ${libomp_test_touch_cflags}
+ ${LIBOMP_SRC_DIR}/test-touch.c ${libomp_test_touch_ldflags} ${libomp_test_touch_libs}
+ COMMAND ${LIBOMP_SHELL} -c \"${libomp_test_touch_env} ${libomp_test_touch_exe}\"
+ COMMAND ${CMAKE_COMMAND} -E touch ${test_touch_dir}/.success
+ DEPENDS ${libomp_test_touch_dependencies}
+ )
+endmacro()
+libomp_append(libomp_test_touch_env "KMP_VERSION=1")
+add_custom_target(libomp-test-touch DEPENDS ${libomp_test_touch_targets})
+# Windows builds both static (/MT) and dynamic (/MD) runtime variants
+if(WIN32)
+ libomp_test_touch_recipe(test-touch-mt)
+ libomp_test_touch_recipe(test-touch-md)
+else()
+ libomp_test_touch_recipe(test-touch-rt)
+endif()
+
+# test-relo
+# Fails when readelf reports a TEXTREL entry (grep must NOT match, hence
+# the inverted exit-status test)
+add_custom_target(libomp-test-relo DEPENDS test-relo/.success)
+add_custom_command(
+ OUTPUT test-relo/.success test-relo/readelf.log
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/test-relo
+ COMMAND readelf -d ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE} > test-relo/readelf.log
+ COMMAND grep -e TEXTREL test-relo/readelf.log \; test $$? -eq 1
+ COMMAND ${CMAKE_COMMAND} -E touch test-relo/.success
+ DEPENDS omp
+)
+
+# test-execstack
+# check-execstack.pl fails if the built library's stack is marked executable
+add_custom_target(libomp-test-execstack DEPENDS test-execstack/.success)
+add_custom_command(
+ OUTPUT test-execstack/.success
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/test-execstack
+ COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/check-execstack.pl
+ --arch=${LIBOMP_PERL_SCRIPT_ARCH} ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE}
+ COMMAND ${CMAKE_COMMAND} -E touch test-execstack/.success
+ DEPENDS omp
+)
+
+# test-instr
+# check-instruction-set.pl verifies the library contains only instructions
+# valid for the target (MIC) architecture
+add_custom_target(libomp-test-instr DEPENDS test-instr/.success)
+add_custom_command(
+ OUTPUT test-instr/.success
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/test-instr
+ COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/check-instruction-set.pl --os=${LIBOMP_PERL_SCRIPT_OS}
+ --arch=${LIBOMP_PERL_SCRIPT_ARCH} --show --mic-arch=${LIBOMP_MIC_ARCH} ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE}
+ COMMAND ${CMAKE_COMMAND} -E touch test-instr/.success
+ DEPENDS omp ${LIBOMP_TOOLS_DIR}/check-instruction-set.pl
+)
+
+# test-deps
+# Builds the per-platform whitelist of shared libraries the new libomp may
+# depend on, then has check-depends.pl fail on any dependency not listed.
+add_custom_target(libomp-test-deps DEPENDS test-deps/.success)
+set(libomp_expected_library_deps)
+if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+ set(libomp_expected_library_deps libc.so.7 libthr.so.3)
+ libomp_append(libomp_expected_library_deps libhwloc.so.5 LIBOMP_USE_HWLOC)
+elseif(CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+ set(libomp_expected_library_deps libc.so.12 libpthread.so.1 libm.so.0)
+ libomp_append(libomp_expected_library_deps libhwloc.so.5 LIBOMP_USE_HWLOC)
+elseif(APPLE)
+ set(libomp_expected_library_deps /usr/lib/libSystem.B.dylib)
+elseif(WIN32)
+ set(libomp_expected_library_deps kernel32.dll)
+ libomp_append(libomp_expected_library_deps psapi.dll LIBOMP_OMPT_SUPPORT)
+else()
+ if(${MIC})
+ set(libomp_expected_library_deps libc.so.6 libpthread.so.0 libdl.so.2)
+ # the dynamic loader name differs between the two MIC generations
+ if("${LIBOMP_MIC_ARCH}" STREQUAL "knf")
+ libomp_append(libomp_expected_library_deps ld-linux-l1om.so.2)
+ libomp_append(libomp_expected_library_deps libgcc_s.so.1)
+ elseif("${LIBOMP_MIC_ARCH}" STREQUAL "knc")
+ libomp_append(libomp_expected_library_deps ld-linux-k1om.so.2)
+ endif()
+ else()
+ set(libomp_expected_library_deps libdl.so.2 libgcc_s.so.1)
+ # per-architecture libc / dynamic loader names
+ if(${IA32})
+ libomp_append(libomp_expected_library_deps libc.so.6)
+ libomp_append(libomp_expected_library_deps ld-linux.so.2)
+ elseif(${INTEL64})
+ libomp_append(libomp_expected_library_deps libc.so.6)
+ libomp_append(libomp_expected_library_deps ld-linux-x86-64.so.2)
+ elseif(${ARM})
+ libomp_append(libomp_expected_library_deps libc.so.6)
+ libomp_append(libomp_expected_library_deps libffi.so.6)
+ libomp_append(libomp_expected_library_deps libffi.so.5)
+ libomp_append(libomp_expected_library_deps ld-linux-armhf.so.3)
+ elseif(${PPC64})
+ libomp_append(libomp_expected_library_deps libc.so.6)
+ libomp_append(libomp_expected_library_deps ld64.so.1)
+ elseif(${MIPS} OR ${MIPS64})
+ libomp_append(libomp_expected_library_deps libc.so.6)
+ libomp_append(libomp_expected_library_deps ld.so.1)
+ endif()
+ libomp_append(libomp_expected_library_deps libpthread.so.0 IF_FALSE STUBS_LIBRARY)
+ libomp_append(libomp_expected_library_deps libhwloc.so.5 LIBOMP_USE_HWLOC)
+ endif()
+ libomp_append(libomp_expected_library_deps libstdc++.so.6 LIBOMP_USE_STDCPPLIB)
+ libomp_append(libomp_expected_library_deps libm.so.6 LIBOMP_STATS)
+endif()
+# Perl script expects comma separated list
+string(REPLACE ";" "," libomp_expected_library_deps "${libomp_expected_library_deps}")
+add_custom_command(
+ OUTPUT test-deps/.success
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/test-deps
+ COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/check-depends.pl --os=${LIBOMP_PERL_SCRIPT_OS}
+ --arch=${LIBOMP_PERL_SCRIPT_ARCH} --expected="${libomp_expected_library_deps}" ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE}
+ COMMAND ${CMAKE_COMMAND} -E touch test-deps/.success
+ DEPENDS omp ${LIBOMP_TOOLS_DIR}/check-depends.pl
+)
diff --git a/final/runtime/cmake/LibompUtils.cmake b/final/runtime/cmake/LibompUtils.cmake
new file mode 100644
index 0000000..f4cfa8c
--- /dev/null
+++ b/final/runtime/cmake/LibompUtils.cmake
@@ -0,0 +1,195 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# void libomp_say(string message_to_user);
+# - prints message_to_user as a STATUS message, prefixed with "LIBOMP: "
+macro(libomp_say message_to_user)
+ message(STATUS "LIBOMP: ${message_to_user}")
+endmacro()
+
+# void libomp_warning_say(string message_to_user);
+# - prints message_to_user as a cmake WARNING, prefixed with "LIBOMP: "
+macro(libomp_warning_say message_to_user)
+ message(WARNING "LIBOMP: ${message_to_user}")
+endmacro()
+
+# void libomp_error_say(string message_to_user);
+# - prints message_to_user as a FATAL_ERROR (stops cmake), prefixed with "LIBOMP: "
+macro(libomp_error_say message_to_user)
+ message(FATAL_ERROR "LIBOMP: ${message_to_user}")
+endmacro()
+
+# libomp_append(<flags_list> <flag> [(IF_TRUE | IF_FALSE | IF_DEFINED | IF_TRUE_1_0) BOOLEAN])
+# NOTE: the list variable comes FIRST and the flag SECOND, matching
+# macro(libomp_append flags flag) below and every call site in these files.
+#
+# libomp_append(<flags_list> <flag>)
+# - unconditionally appends <flag> to the list of definitions
+#
+# libomp_append(<flags_list> <flag> <BOOLEAN>)
+# - appends <flag> to the list of definitions if BOOLEAN is true
+#
+# libomp_append(<flags_list> <flag> IF_TRUE <BOOLEAN>)
+# - appends <flag> to the list of definitions if BOOLEAN is true
+#
+# libomp_append(<flags_list> <flag> IF_FALSE <BOOLEAN>)
+# - appends <flag> to the list of definitions if BOOLEAN is false
+#
+# libomp_append(<flags_list> <flag> IF_DEFINED <VARIABLE>)
+# - appends <flag> to the list of definitions if VARIABLE is defined
+#
+# libomp_append(<flags_list> <flag> IF_TRUE_1_0 <BOOLEAN>)
+# - appends <flag>=1 to the list of definitions if <BOOLEAN> is true, <flag>=0 otherwise
+# e.g., libomp_append(defs_list "-D USE_FEATURE" IF_TRUE_1_0 HAVE_FEATURE)
+# appends "-D USE_FEATURE=1" if HAVE_FEATURE is true
+# or "-D USE_FEATURE=0" if HAVE_FEATURE is false
+macro(libomp_append flags flag)
+ if(NOT (${ARGC} EQUAL 2 OR ${ARGC} EQUAL 3 OR ${ARGC} EQUAL 4))
+ libomp_error_say("libomp_append: takes 2, 3, or 4 arguments")
+ endif()
+ if(${ARGC} EQUAL 2)
+ list(APPEND ${flags} "${flag}")
+ elseif(${ARGC} EQUAL 3)
+ if(${ARGV2})
+ list(APPEND ${flags} "${flag}")
+ endif()
+ else()
+ if(${ARGV2} STREQUAL "IF_TRUE")
+ if(${ARGV3})
+ list(APPEND ${flags} "${flag}")
+ endif()
+ elseif(${ARGV2} STREQUAL "IF_FALSE")
+ if(NOT ${ARGV3})
+ list(APPEND ${flags} "${flag}")
+ endif()
+ elseif(${ARGV2} STREQUAL "IF_DEFINED")
+ if(DEFINED ${ARGV3})
+ list(APPEND ${flags} "${flag}")
+ endif()
+ elseif(${ARGV2} STREQUAL "IF_TRUE_1_0")
+ if(${ARGV3})
+ list(APPEND ${flags} "${flag}=1")
+ else()
+ list(APPEND ${flags} "${flag}=0")
+ endif()
+ else()
+ libomp_error_say("libomp_append: third argument must be one of IF_TRUE, IF_FALSE, IF_DEFINED, IF_TRUE_1_0")
+ endif()
+ endif()
+endmacro()
+
+# void libomp_get_legal_arch(string* return_arch_string);
+# - returns (through return_arch_string) the formal architecture
+# string or warns user of unknown architecture
+function(libomp_get_legal_arch return_arch_string)
+ if(${IA32})
+ set(${return_arch_string} "IA-32" PARENT_SCOPE)
+ elseif(${INTEL64})
+ set(${return_arch_string} "Intel(R) 64" PARENT_SCOPE)
+ elseif(${MIC})
+ set(${return_arch_string} "Intel(R) Many Integrated Core Architecture" PARENT_SCOPE)
+ elseif(${ARM})
+ set(${return_arch_string} "ARM" PARENT_SCOPE)
+ elseif(${PPC64BE})
+ set(${return_arch_string} "PPC64BE" PARENT_SCOPE)
+ elseif(${PPC64LE})
+ set(${return_arch_string} "PPC64LE" PARENT_SCOPE)
+ elseif(${AARCH64})
+ set(${return_arch_string} "AARCH64" PARENT_SCOPE)
+ elseif(${MIPS})
+ set(${return_arch_string} "MIPS" PARENT_SCOPE)
+ elseif(${MIPS64})
+ set(${return_arch_string} "MIPS64" PARENT_SCOPE)
+ else()
+ # fall back to the raw LIBOMP_ARCH value and warn
+ set(${return_arch_string} "${LIBOMP_ARCH}" PARENT_SCOPE)
+ libomp_warning_say("libomp_get_legal_arch(): Warning: Unknown architecture: Using ${LIBOMP_ARCH}")
+ endif()
+endfunction()
+
+# void libomp_check_variable(string var, ...);
+# - checks that ${var} matches (case-insensitively) one of the remaining
+#   arguments; errors out via libomp_error_say() if no value matches
+function(libomp_check_variable var)
+  set(valid_flag 0)
+  string(TOLOWER "${${var}}" var_lower)
+  foreach(value IN LISTS ARGN)
+    string(TOLOWER "${value}" value_lower)
+    if("${var_lower}" STREQUAL "${value_lower}")
+      set(valid_flag 1)
+      break() # first match is enough
+    endif()
+  endforeach()
+  if(NOT valid_flag)
+    libomp_error_say("libomp_check_variable(): ${var} = ${${var}} is unknown")
+  endif()
+endfunction()
+
+# void libomp_get_build_number(string src_dir, string* return_build_number);
+# - grab the eight digit build number (or 00000000) from kmp_version.cpp
+# NOTE(review): if no "#define KMP_VERSION_BUILD" line matches,
+# ${build_number} is never set and an empty string is returned -- confirm
+# kmp_version.cpp always defines it.
+function(libomp_get_build_number src_dir return_build_number)
+ # sets file_lines_list to a list of all lines in kmp_version.cpp
+ file(STRINGS "${src_dir}/src/kmp_version.cpp" file_lines_list)
+
+ # runs through each line in kmp_version.cpp
+ foreach(line IN LISTS file_lines_list)
+ # if the line begins with "#define KMP_VERSION_BUILD" then we take note of the build number
+ string(REGEX MATCH "^[ \t]*#define[ \t]+KMP_VERSION_BUILD" valid "${line}")
+ if(NOT "${valid}" STREQUAL "") # if we matched "#define KMP_VERSION_BUILD", then grab the build number
+ string(REGEX REPLACE "^[ \t]*#define[ \t]+KMP_VERSION_BUILD[ \t]+([0-9]+)" "\\1"
+ build_number "${line}"
+ )
+ endif()
+ endforeach()
+ set(${return_build_number} "${build_number}" PARENT_SCOPE) # return build number
+endfunction()
+
+# void libomp_get_legal_type(string* return_legal_type);
+# - set the legal type name Performance/Profiling/Stub
+# NOTE(review): assumes exactly one of NORMAL_LIBRARY / PROFILE_LIBRARY /
+# STUBS_LIBRARY is set; if none is, the output variable is left untouched.
+function(libomp_get_legal_type return_legal_type)
+ if(${NORMAL_LIBRARY})
+ set(${return_legal_type} "Performance" PARENT_SCOPE)
+ elseif(${PROFILE_LIBRARY})
+ set(${return_legal_type} "Profiling" PARENT_SCOPE)
+ elseif(${STUBS_LIBRARY})
+ set(${return_legal_type} "Stub" PARENT_SCOPE)
+ endif()
+endfunction()
+
+# void libomp_add_suffix(string suffix, list<string>* list_of_items);
+# - appends ${suffix} to every non-empty item and stores the result back
+#   into ${list_of_items} (the original list is modified; empty items are dropped)
+function(libomp_add_suffix suffix list_of_items)
+  set(suffixed "")
+  foreach(entry IN LISTS "${list_of_items}")
+    if(NOT "${entry}" STREQUAL "")
+      list(APPEND suffixed "${entry}${suffix}")
+    endif()
+  endforeach()
+  set(${list_of_items} "${suffixed}" PARENT_SCOPE)
+endfunction()
+
+# void libomp_list_to_string(list<string> list_of_things, string* return_string);
+# - flattens a cmake list into a single space-separated string by replacing
+#   the semicolon list separators with spaces
+function(libomp_list_to_string list_of_things return_string)
+  set(joined "${list_of_things}")
+  string(REPLACE ";" " " joined "${joined}")
+  set(${return_string} "${joined}" PARENT_SCOPE)
+endfunction()
+
+# void libomp_string_to_list(string str, list<string>* return_list);
+# - splits a whitespace-separated string into a cmake list: every run of
+#   spaces/tabs becomes a single semicolon (cmake's list separator)
+function(libomp_string_to_list str return_list)
+  set(converted)
+  string(REGEX REPLACE "[ \t]+" ";" converted "${str}")
+  set(${return_list} "${converted}" PARENT_SCOPE)
+endfunction()
+
diff --git a/final/runtime/cmake/config-ix.cmake b/final/runtime/cmake/config-ix.cmake
new file mode 100644
index 0000000..b0e70d4
--- /dev/null
+++ b/final/runtime/cmake/config-ix.cmake
@@ -0,0 +1,281 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// The LLVM Compiler Infrastructure
+#//
+#// This file is dual licensed under the MIT and the University of Illinois Open
+#// Source Licenses. See LICENSE.txt for details.
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+include(CheckCCompilerFlag)
+include(CheckCSourceCompiles)
+include(CheckCXXCompilerFlag)
+include(CheckIncludeFile)
+include(CheckLibraryExists)
+include(CheckIncludeFiles)
+include(LibompCheckLinkerFlag)
+include(LibompCheckFortranFlag)
+
+# Check for versioned symbols
+# libomp_check_version_symbols(retval):
+# - probes whether the toolchain supports GNU symbol versioning by
+#   compiling a test program that binds two functions to versions VER1
+#   and VER2 via .symver, linked with a generated version script.
+# - the pass/fail result is stored (and cached by check_c_source_compiles
+#   under the name ${retval}) and propagated to the caller.
+function(libomp_check_version_symbols retval)
+  set(source_code
+    "#include <stdio.h>
+    void func1() { printf(\"Hello\"); }
+    void func2() { printf(\"World\"); }
+    __asm__(\".symver func1, func@VER1\");
+    __asm__(\".symver func2, func@VER2\");
+    int main() {
+      func1();
+      func2();
+      return 0;
+    }")
+  set(version_script_source "VER1 { }; VER2 { } VER1;")
+  # the script must exist on disk before the test link below references it
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/__version_script.txt "${version_script_source}")
+  # CMAKE_REQUIRED_FLAGS is consumed by check_c_source_compiles; setting it
+  # inside function scope keeps it from leaking into later checks
+  set(CMAKE_REQUIRED_FLAGS -Wl,--version-script=${CMAKE_CURRENT_BINARY_DIR}/__version_script.txt)
+  check_c_source_compiles("${source_code}" ${retval})
+  set(${retval} ${${retval}} PARENT_SCOPE)
+  # clean up the temporary version script once the check has run
+  file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/__version_script.txt)
+endfunction()
+
+# Includes the architecture flag in both compile and link phase
+# libomp_check_architecture_flag(flag retval):
+# - like check_c_compiler_flag(), but adding the flag to
+#   CMAKE_REQUIRED_FLAGS also puts it on the link line of the test
+#   program, so flags the compiler accepts but the linker rejects are
+#   detected; result is cached under the name ${retval}.
+function(libomp_check_architecture_flag flag retval)
+  # function scope keeps CMAKE_REQUIRED_FLAGS from leaking to later checks
+  set(CMAKE_REQUIRED_FLAGS "${flag}")
+  check_c_compiler_flag("${flag}" ${retval})
+  set(${retval} ${${retval}} PARENT_SCOPE)
+endfunction()
+
+# Checking C, CXX, Linker Flags
+check_cxx_compiler_flag(-fno-exceptions LIBOMP_HAVE_FNO_EXCEPTIONS_FLAG)
+check_cxx_compiler_flag(-fno-rtti LIBOMP_HAVE_FNO_RTTI_FLAG)
+check_c_compiler_flag("-x c++" LIBOMP_HAVE_X_CPP_FLAG)
+check_c_compiler_flag(-Wunused-function LIBOMP_HAVE_WNO_UNUSED_FUNCTION_FLAG)
+check_c_compiler_flag(-Wunused-local-typedef LIBOMP_HAVE_WNO_UNUSED_LOCAL_TYPEDEF_FLAG)
+check_c_compiler_flag(-Wunused-value LIBOMP_HAVE_WNO_UNUSED_VALUE_FLAG)
+check_c_compiler