aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Gil Pitney <gil.pitney@linaro.org> 2014-10-28 18:00:42 -0700
committer Gil Pitney <gil.pitney@linaro.org> 2014-10-28 18:00:42 -0700
commit 61b2c94d9e64758e55730be6a3fc9006c171db85 (patch)
tree f564f09ebf93ba293dfa225bd374df6f1f37aa01 /src
Initial Commit: Based on TI OpenCL v0.8, originally based on clover.shamrock_v0.8
This is a continuation of the clover OpenCL project: http://people.freedesktop.org/~steckdenis/clover based on the contributions from Texas Instruments for Keystone II DSP device: git.ti.com/opencl and adding contributions from Linaro for ARM CPU-only support. See README.txt for more info, and build instructions. Signed-off-by: Gil Pitney <gil.pitney@linaro.org>
Diffstat (limited to 'src')
-rw-r--r--src/.gitignore3
-rw-r--r--src/CMakeLists.txt241
-rw-r--r--src/api/api_command.cpp130
-rw-r--r--src/api/api_context.cpp149
-rw-r--r--src/api/api_device.cpp78
-rw-r--r--src/api/api_enqueue.cpp823
-rw-r--r--src/api/api_event.cpp190
-rw-r--r--src/api/api_flush.cpp57
-rw-r--r--src/api/api_gl.cpp118
-rw-r--r--src/api/api_kernel.cpp219
-rw-r--r--src/api/api_memory.cpp418
-rw-r--r--src/api/api_platform.cpp89
-rw-r--r--src/api/api_profiling.cpp50
-rw-r--r--src/api/api_program.cpp307
-rw-r--r--src/api/api_sampler.cpp109
-rw-r--r--src/builtins/CMakeLists.txt33
-rw-r--r--src/builtins/Makefile24
-rw-r--r--src/builtins/README.txt13
-rw-r--r--src/builtins/abs.cl33
-rw-r--r--src/builtins/abs_diff.cl72
-rw-r--r--src/builtins/add_sat.cl37
-rw-r--r--src/builtins/all.cl43
-rw-r--r--src/builtins/any.cl43
-rw-r--r--src/builtins/atomics.cl.broken558
-rw-r--r--src/builtins/bitselect.cl92
-rw-r--r--src/builtins/clamp.cl43
-rw-r--r--src/builtins/clz.cl37
-rw-r--r--src/builtins/convert.cl36122
-rw-r--r--src/builtins/cross.cl59
-rw-r--r--src/builtins/degrees.cl41
-rw-r--r--src/builtins/dot.cl41
-rw-r--r--src/builtins/fract.cl93
-rw-r--r--src/builtins/frexp.cl76
-rw-r--r--src/builtins/hadd.cl44
-rw-r--r--src/builtins/length.cl109
-rw-r--r--src/builtins/lgamma_r.cl80
-rw-r--r--src/builtins/mad_sat.cl37
-rw-r--r--src/builtins/math.cl151
-rw-r--r--src/builtins/max.cl46
-rw-r--r--src/builtins/misc.cl36
-rw-r--r--src/builtins/mix.cl42
-rw-r--r--src/builtins/modf.cl81
-rw-r--r--src/builtins/mul_hi.cl102
-rw-r--r--src/builtins/relationals.cl64
-rw-r--r--src/builtins/remquo.cl127
-rw-r--r--src/builtins/rotate.cl58
-rw-r--r--src/builtins/select.cl53
-rw-r--r--src/builtins/shuffle.cl215
-rw-r--r--src/builtins/sign.cl43
-rw-r--r--src/builtins/sincos.cl128
-rw-r--r--src/builtins/smoothstep.cl77
-rw-r--r--src/builtins/step.cl43
-rw-r--r--src/builtins/sub_sat.cl37
-rw-r--r--src/builtins/upsample.cl56
-rw-r--r--src/builtins/vload.cl127
-rw-r--r--src/core/commandqueue.cpp1018
-rw-r--r--src/core/commandqueue.h494
-rw-r--r--src/core/compiler.cpp342
-rw-r--r--src/core/compiler.h138
-rw-r--r--src/core/config.h9
-rw-r--r--src/core/config.h.cmake9
-rw-r--r--src/core/context.cpp236
-rw-r--r--src/core/context.h104
-rw-r--r--src/core/cpu/buffer.cpp128
-rw-r--r--src/core/cpu/buffer.h77
-rw-r--r--src/core/cpu/builtins.cpp503
-rw-r--r--src/core/cpu/builtins.h144
-rw-r--r--src/core/cpu/device.cpp675
-rw-r--r--src/core/cpu/device.h113
-rw-r--r--src/core/cpu/kernel.cpp734
-rw-r--r--src/core/cpu/kernel.h325
-rw-r--r--src/core/cpu/program.cpp174
-rw-r--r--src/core/cpu/program.h102
-rw-r--r--src/core/cpu/sampler.cpp769
-rw-r--r--src/core/cpu/worker.cpp274
-rw-r--r--src/core/cpu/worker.h45
-rw-r--r--src/core/deviceinterface.h352
-rw-r--r--src/core/dsp/buffer.cpp149
-rw-r--r--src/core/dsp/buffer.h61
-rw-r--r--src/core/dsp/cmem.cpp271
-rw-r--r--src/core/dsp/cmem.h64
-rw-r--r--src/core/dsp/core_scheduler.h62
-rw-r--r--src/core/dsp/database.h112
-rw-r--r--src/core/dsp/device.cpp1135
-rw-r--r--src/core/dsp/device.h151
-rw-r--r--src/core/dsp/driver.cpp34
-rw-r--r--src/core/dsp/driver.h100
-rw-r--r--src/core/dsp/driver_hawking.cpp451
-rw-r--r--src/core/dsp/driver_shannon.cpp313
-rw-r--r--src/core/dsp/dspheap.h200
-rw-r--r--src/core/dsp/dspmem.h59
-rw-r--r--src/core/dsp/genfile_cache.cpp94
-rw-r--r--src/core/dsp/genfile_cache.h101
-rw-r--r--src/core/dsp/kernel.cpp718
-rw-r--r--src/core/dsp/kernel.h119
-rw-r--r--src/core/dsp/mailbox.h114
-rw-r--r--src/core/dsp/memmap.h120
-rw-r--r--src/core/dsp/message.h115
-rw-r--r--src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.c200
-rw-r--r--src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.h53
-rw-r--r--src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_elf32.h160
-rw-r--r--src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.c1101
-rw-r--r--src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.h30
-rw-r--r--src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.cpp825
-rw-r--r--src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.h101
-rw-r--r--src/core/dsp/ocl_load/CMakeLists.txt26
-rw-r--r--src/core/dsp/ocl_load/DLOAD/ArrayList.c122
-rw-r--r--src/core/dsp/ocl_load/DLOAD/ArrayList.h92
-rw-r--r--src/core/dsp/ocl_load/DLOAD/Queue.h194
-rw-r--r--src/core/dsp/ocl_load/DLOAD/Stack.h155
-rw-r--r--src/core/dsp/ocl_load/DLOAD/dload.c3534
-rw-r--r--src/core/dsp/ocl_load/DLOAD/dload.h334
-rw-r--r--src/core/dsp/ocl_load/DLOAD/dload_endian.c151
-rw-r--r--src/core/dsp/ocl_load/DLOAD/dload_endian.h58
-rw-r--r--src/core/dsp/ocl_load/DLOAD/elf32.c652
-rw-r--r--src/core/dsp/ocl_load/DLOAD/elf32.h756
-rw-r--r--src/core/dsp/ocl_load/DLOAD/relocate.h64
-rw-r--r--src/core/dsp/ocl_load/DLOAD/symtab.h72
-rw-r--r--src/core/dsp/ocl_load/DLOAD/util.h89
-rw-r--r--src/core/dsp/ocl_load/DLOAD/version.h63
-rw-r--r--src/core/dsp/ocl_load/DLOAD/virtual_targets.h90
-rw-r--r--src/core/dsp/ocl_load/DLOAD_API/api_version_change.log33
-rw-r--r--src/core/dsp/ocl_load/DLOAD_API/dload_api.h700
-rw-r--r--src/core/dsp/ocl_load/DLOAD_SYM/symtab.c417
-rw-r--r--src/core/dsp/ocl_load/README8
-rw-r--r--src/core/dsp/ocl_load/Stack.h182
-rw-r--r--src/core/dsp/ocl_load/ocl_load.c139
-rw-r--r--src/core/dsp/program.cpp633
-rw-r--r--src/core/dsp/program.h92
-rw-r--r--src/core/dsp/shmem.cpp539
-rw-r--r--src/core/dsp/shmem.h134
-rw-r--r--src/core/dsp/source_cache.h114
-rw-r--r--src/core/dsp/u_concurrent_map.h137
-rw-r--r--src/core/dsp/u_concurrent_stack.h124
-rw-r--r--src/core/dsp/u_lockable.h109
-rw-r--r--src/core/dsp/u_locks_pthread.h137
-rw-r--r--src/core/dsp/utils.h85
-rw-r--r--src/core/dsp/wga.cpp464
-rw-r--r--src/core/dsp/wga.h72
-rw-r--r--src/core/dsp/worker.cpp519
-rw-r--r--src/core/events.cpp1519
-rw-r--r--src/core/events.h718
-rw-r--r--src/core/icd.cpp145
-rw-r--r--src/core/icd.h44
-rw-r--r--src/core/kernel.cpp637
-rw-r--r--src/core/kernel.h326
-rw-r--r--src/core/memobject.cpp960
-rw-r--r--src/core/memobject.h302
-rw-r--r--src/core/object.cpp115
-rw-r--r--src/core/object.h133
-rw-r--r--src/core/platform.cpp227
-rw-r--r--src/core/platform.h65
-rw-r--r--src/core/program.cpp846
-rw-r--r--src/core/program.h250
-rw-r--r--src/core/propertylist.h119
-rw-r--r--src/core/sampler.cpp247
-rw-r--r--src/core/sampler.h115
-rw-r--r--src/core/util.cpp68
-rw-r--r--src/core/util.h41
-rw-r--r--src/llvmopencl/AllocasToEntry.cc74
-rw-r--r--src/llvmopencl/AllocasToEntry.h49
-rw-r--r--src/llvmopencl/Barrier.h121
-rw-r--r--src/llvmopencl/BarrierBlock.cc73
-rw-r--r--src/llvmopencl/BarrierBlock.h44
-rw-r--r--src/llvmopencl/BarrierTailReplication.cc421
-rw-r--r--src/llvmopencl/BarrierTailReplication.h85
-rw-r--r--src/llvmopencl/BreakConstantGEPs.cpp326
-rw-r--r--src/llvmopencl/BreakConstantGEPs.h57
-rw-r--r--src/llvmopencl/CanonicalizeBarriers.cc214
-rw-r--r--src/llvmopencl/CanonicalizeBarriers.h56
-rw-r--r--src/llvmopencl/Flatten.cc158
-rw-r--r--src/llvmopencl/Flatten.h51
-rw-r--r--src/llvmopencl/GenerateHeader.cc336
-rw-r--r--src/llvmopencl/ImplicitLoopBarriers.cc178
-rw-r--r--src/llvmopencl/ImplicitLoopBarriers.h44
-rw-r--r--src/llvmopencl/IsolateRegions.cc175
-rw-r--r--src/llvmopencl/IsolateRegions.h44
-rw-r--r--src/llvmopencl/Kernel.cc297
-rw-r--r--src/llvmopencl/Kernel.h54
-rw-r--r--src/llvmopencl/LLVMUtils.cc90
-rw-r--r--src/llvmopencl/LLVMUtils.h38
-rw-r--r--src/llvmopencl/LoopBarriers.cc194
-rw-r--r--src/llvmopencl/LoopBarriers.h47
-rw-r--r--src/llvmopencl/Makefile.am53
-rw-r--r--src/llvmopencl/Makefile.in822
-rw-r--r--src/llvmopencl/PHIsToAllocas.cc144
-rw-r--r--src/llvmopencl/PHIsToAllocas.h56
-rw-r--r--src/llvmopencl/ParallelRegion.cc809
-rw-r--r--src/llvmopencl/ParallelRegion.h127
-rw-r--r--src/llvmopencl/TargetAddressSpaces.cc220
-rw-r--r--src/llvmopencl/TargetAddressSpaces.h54
-rw-r--r--src/llvmopencl/VariableUniformityAnalysis.cc382
-rw-r--r--src/llvmopencl/VariableUniformityAnalysis.h70
-rw-r--r--src/llvmopencl/WIVectorize.cc3252
-rw-r--r--src/llvmopencl/WorkItemAliasAnalysis.cc119
-rw-r--r--src/llvmopencl/WorkItemAliasAnalysis.h75
-rw-r--r--src/llvmopencl/Workgroup.cc619
-rw-r--r--src/llvmopencl/Workgroup.h48
-rw-r--r--src/llvmopencl/WorkitemHandler.cc278
-rw-r--r--src/llvmopencl/WorkitemHandler.h73
-rw-r--r--src/llvmopencl/WorkitemHandlerChooser.cc111
-rw-r--r--src/llvmopencl/WorkitemHandlerChooser.h52
-rw-r--r--src/llvmopencl/WorkitemLoops.cc1061
-rw-r--r--src/llvmopencl/WorkitemLoops.h112
-rw-r--r--src/llvmopencl/WorkitemReplication.cc308
-rw-r--r--src/llvmopencl/WorkitemReplication.h62
-rw-r--r--src/llvmopencl/config.h1
-rw-r--r--src/llvmopencl/pocl.h49
-rw-r--r--src/runtime/CMakeLists.txt59
-rw-r--r--src/runtime/builtins.def301
-rwxr-xr-xsrc/runtime/builtins.py380
-rwxr-xr-xsrc/runtime/embed.py76
-rw-r--r--src/runtime/stdlib.c40
213 files changed, 86981 insertions, 0 deletions
diff --git a/src/.gitignore b/src/.gitignore
new file mode 100644
index 0000000..ec5d309
--- /dev/null
+++ b/src/.gitignore
@@ -0,0 +1,3 @@
+*.o
+CMakeFiles/
+cmake_install.cmake
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..7b60902
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,241 @@
+
+# Extra header search paths that depend on the kind of build.
+if (SHANNON_BUILD)
+    # SDK component headers plus the project's init/ directory.
+    set(TARGET_INCLUDES
+        ${PROJECT_SOURCE_DIR}/init
+        ${SDK}/sdk
+        ${SDK}/sdk/config
+        ${SDK}/sdk/pciedrv
+        ${SDK}/sdk/cmem
+        ${SDK}/sdk/bufmgr
+        ${SDK}/sdk/mailBox
+        ${SDK}/sdk/dnldmgr)
+elseif (HAWKING_CROSS_COMPILE)
+    # Cross-compiling needs additional paths to find target OS headers
+    # and non-system headers found on the host (BOOST, GL).
+    set(TARGET_INCLUDES
+        ${CMAKE_FIND_ROOT_PATH}
+        ${HOST_USR_INCLUDE_PATH})
+endif()
+
+# Header search path for everything built in this directory.
+# TARGET_INCLUDES expands to nothing when neither branch above was taken.
+include_directories(
+    ${PROJECT_SOURCE_DIR}/include
+    ${PROJECT_SOURCE_DIR}/src
+    ${LLVM_INCLUDE_DIR}
+    ${CLANG_INCLUDE_DIRS}
+    ${CMAKE_CURRENT_BINARY_DIR}
+    ${PROJECT_SOURCE_DIR}/src/core/dsp/ocl_load/DLOAD_API
+    ${PROJECT_SOURCE_DIR}/src/core/dsp/ocl_load/DLOAD
+    ${PROJECT_SOURCE_DIR}/src/llvmopencl
+    ${TARGET_INCLUDES}
+)
+
+# Flags applied to every C++ source built from this directory.
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpermissive")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_FILE_OFFSET_BITS=64")
+
+# bfd.h has a check to ensure that config.h is included
+# We don't require config.h (autotools) so we bypass this check by defining
+# PACKAGE, and PACKAGE_VERSION
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DPACKAGE=${PROJECT_NAME} -DPACKAGE_VERSION=${${PROJECT_NAME}_VERSION}")
+
+# Toggle below if wanting to build with debug
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-inline -g")
+#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -fno-inline -g")
+#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+
+# Temporary to work around hyperlink problem.
+# Append to CMAKE_C_FLAGS itself: expanding the misspelled CMAKE_CFLAGS
+# (not a variable CMake defines) silently dropped any C flags already set
+# by the user or a toolchain file.
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__ARMv7 -DDEVICE_K2H")
+
+if (SHANNON_BUILD)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDSPC868X")
+endif()
+
+# Generate core/config.h in the build tree from the core/config.h.cmake template.
+configure_file(core/config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/core/config.h)
+
+# Sources of the OpenCL shared library (consumed by add_library(OpenCL ...) below).
+set(COAL_SRC_FILES
+# api/ sources
+ api/api_command.cpp
+ api/api_device.cpp
+ api/api_event.cpp
+ api/api_kernel.cpp
+ api/api_platform.cpp
+ api/api_program.cpp
+ api/api_context.cpp
+ api/api_enqueue.cpp
+ api/api_flush.cpp
+ api/api_memory.cpp
+ api/api_profiling.cpp
+ api/api_sampler.cpp
+ api/api_gl.cpp
+
+# core/ sources
+ core/context.cpp
+ core/commandqueue.cpp
+ core/memobject.cpp
+ core/events.cpp
+ core/program.cpp
+ core/compiler.cpp
+ core/kernel.cpp
+ core/sampler.cpp
+ core/object.cpp
+ core/platform.cpp
+ core/icd.cpp
+ core/util.cpp
+
+# core/cpu/ sources
+ core/cpu/buffer.cpp
+ core/cpu/device.cpp
+ core/cpu/kernel.cpp
+ core/cpu/program.cpp
+ core/cpu/worker.cpp
+ core/cpu/builtins.cpp
+ core/cpu/sampler.cpp
+
+# Headers under the build tree's runtime/ directory; they are marked
+# GENERATED further down in this file.
+ ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib.h.embed.h
+ ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib.c.bc.embed.h
+ ${CMAKE_CURRENT_BINARY_DIR}/runtime/builtins_impl.h
+ ${CMAKE_CURRENT_BINARY_DIR}/runtime/builtins_def.h
+ ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib_impl.h
+ ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib_def.h
+)
+
+# Unless this is a SHAMROCK_BUILD, also compile the core/dsp/ and
+# llvmopencl/ sources into the library.
+if (NOT SHAMROCK_BUILD)
+list (APPEND COAL_SRC_FILES
+ core/dsp/genfile_cache.cpp
+ core/dsp/program.cpp
+ core/dsp/wga.cpp
+ core/dsp/driver.cpp
+ core/dsp/buffer.cpp
+ core/dsp/device.cpp
+ core/dsp/kernel.cpp
+ core/dsp/worker.cpp
+
+ llvmopencl/AllocasToEntry.cc
+ llvmopencl/BarrierBlock.cc
+ llvmopencl/BarrierTailReplication.cc
+ llvmopencl/BreakConstantGEPs.cpp
+ llvmopencl/CanonicalizeBarriers.cc
+ llvmopencl/Flatten.cc
+ llvmopencl/GenerateHeader.cc
+ llvmopencl/ImplicitLoopBarriers.cc
+ llvmopencl/IsolateRegions.cc
+ llvmopencl/Kernel.cc
+ llvmopencl/LLVMUtils.cc
+ llvmopencl/LoopBarriers.cc
+ llvmopencl/ParallelRegion.cc
+ llvmopencl/PHIsToAllocas.cc
+ llvmopencl/TargetAddressSpaces.cc
+ llvmopencl/VariableUniformityAnalysis.cc
+ llvmopencl/WIVectorize.cc
+ llvmopencl/Workgroup.cc
+ llvmopencl/WorkItemAliasAnalysis.cc
+ llvmopencl/WorkitemHandler.cc
+ llvmopencl/WorkitemHandlerChooser.cc
+ llvmopencl/WorkitemLoops.cc
+ llvmopencl/WorkitemReplication.cc
+)
+endif(NOT SHAMROCK_BUILD)
+
+# builtins/ is only entered for a SHAMROCK_BUILD; runtime/ is always entered.
+if (SHAMROCK_BUILD)
+    add_subdirectory(builtins)
+endif()
+add_subdirectory(runtime)
+
+# The OpenCL shared library itself, linked with -Bsymbolic.
+set(CMAKE_SHARED_LINKER_FLAGS "-Wl,-Bsymbolic")
+add_library(OpenCL SHARED ${COAL_SRC_FILES})
+
+# Flag the runtime/ headers listed in COAL_SRC_FILES as GENERATED.
+set_source_files_properties(
+    ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib.h.embed.h
+    ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib.c.bc.embed.h
+    ${CMAKE_CURRENT_BINARY_DIR}/runtime/builtins_impl.h
+    ${CMAKE_CURRENT_BINARY_DIR}/runtime/builtins_def.h
+    ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib_impl.h
+    ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib_def.h
+    PROPERTIES GENERATED 1)
+
+add_dependencies(OpenCL generate_stdlib_c)
+
+# Build-order dependencies; which ones apply depends on the build flavour.
+if (SHAMROCK_BUILD)
+    add_dependencies(generate_builtin_lib generate_bc_files)
+    add_dependencies(generate_stdlib_c generate_builtin_lib)
+else()
+    add_dependencies(OpenCL generate_builtins)
+    add_dependencies(OpenCL oclload)
+    add_dependencies(OpenCL generate_dsp_builtins)
+    add_subdirectory(core/dsp/ocl_load)
+endif()
+
+if (HAWKING_BUILD)
+    add_dependencies(OpenCL arm_clocl)
+endif()
+
+if (HAWKING_CROSS_COMPILE OR SHANNON_BUILD)
+    add_dependencies(OpenCL x86_clocl)
+endif()
+
+# Place the built library under <build>/lib.
+set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
+
+# Library version and soname version come from the project's variables.
+set_target_properties(OpenCL PROPERTIES
+    VERSION ${${PROJECT_NAME}_VERSION}
+    SOVERSION ${${PROJECT_NAME}_SOVERSION}
+)
+
+# Quote the value so that an empty LLVM_COMPILE_FLAGS still forms a valid
+# <property> <value> pair instead of aborting the configure step
+# (LINK_FLAGS below is quoted for the same reason).
+set_source_files_properties(${COAL_SRC_FILES}
+    PROPERTIES COMPILE_FLAGS "${LLVM_COMPILE_FLAGS}")
+
+set_target_properties(OpenCL PROPERTIES
+    LINK_FLAGS "${LLVM_LDFLAGS}"
+    LINK_INTERFACE_LIBRARIES "")
+
+# Libraries every build flavour links the OpenCL library against.
+set(LIBS
+    ${CLANG_LIBS}
+    ${LLVM_LIBS_CORE}
+    ${LLVM_LIBS_JIT}
+    pthread
+    rt
+    dl
+    z
+    tinfo
+    m
+)
+
+# Flavour-specific libraries.
+if (SHANNON_BUILD)
+    # A stray backtick used to follow pciedrv.a here, which made the path
+    # name a file that does not exist.
+    list(APPEND LIBS
+        ${PROJECT_BINARY_DIR}/lib/liboclload.a
+        ${SDK}/sdk/pciedrv/lib/pciedrv.a
+        ${SDK}/sdk/dnldmgr/lib/dnldmgr.a
+        ${SDK}/sdk/cmem/lib/cmem_drv.a
+        ${SDK}/sdk/bufmgr/lib/bufmgr.a
+        ${SDK}/sdk/mailBox/host/lib/mailBox.a
+        pciaccess
+    )
+elseif(HAWKING_BUILD)
+    list(APPEND LIBS
+        ${PROJECT_BINARY_DIR}/lib/liboclload.a
+        ${LINUX_DEVKIT_ROOT}/usr/lib/libmpmmailbox.so
+        ${LINUX_DEVKIT_ROOT}/usr/lib/libmpmtransport.so
+        ${LINUX_DEVKIT_ROOT}/usr/lib/libmpmclient.so
+        ${LINUX_DEVKIT_ROOT}/usr/lib/libticmem.so
+        ${LINUX_DEVKIT_ROOT}/usr/lib/libkeystonemmap.so
+        # We don't really depend on libhyplnk but link against it
+        # to work around an mscsk issue.
+        ${LINUX_DEVKIT_ROOT}/usr/lib/libhyplnk_k2h.so)
+endif()
+
+# ffi, bfd and sqlite3 are only linked when this is not a SHAMROCK_BUILD.
+if (NOT SHAMROCK_BUILD)
+    if (HAWKING_CROSS_COMPILE)
+        # Cross builds name the libraries inside the devkit tree directly.
+        set(FFI_LIB ${LINUX_DEVKIT_ROOT}/usr/lib/libffi.so.6)
+        set(BFD_LIB ${LINUX_DEVKIT_ROOT}/usr/lib/libbfd.so)
+        set(SQLITE3_LIB ${LINUX_DEVKIT_ROOT}/usr/lib/libsqlite3.so.0)
+    else()
+        # Otherwise let CMake locate them.
+        find_library(FFI_LIB ffi)
+        find_library(BFD_LIB bfd)
+        find_library(SQLITE3_LIB sqlite3)
+    endif()
+
+    list(APPEND LIBS ${FFI_LIB} ${BFD_LIB} ${SQLITE3_LIB})
+endif()
+
+target_link_libraries(OpenCL ${LIBS})
+install(TARGETS OpenCL LIBRARY DESTINATION lib ${OCL_FPERMS})
diff --git a/src/api/api_command.cpp b/src/api/api_command.cpp
new file mode 100644
index 0000000..e9972c6
--- /dev/null
+++ b/src/api/api_command.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_command.cpp
+ * \brief Command queues
+ */
+
+#include <core/commandqueue.h>
+#include <core/deviceinterface.h>
+#include <core/context.h>
+
+#include <CL/cl.h>
+
+// Command Queue APIs
+/**
+ * \brief Create a command queue for \p device inside \p context
+ *
+ * \param context context the queue is created in
+ * \param device device the queue is created for
+ * \param properties properties handed to the Coal::CommandQueue constructor
+ * \param errcode_ret optional; receives CL_SUCCESS or the error code
+ * \return the new queue, or 0 when an error was reported
+ */
+cl_command_queue
+clCreateCommandQueue(cl_context context,
+                     cl_device_id device,
+                     cl_command_queue_properties properties,
+                     cl_int * errcode_ret)
+{
+    cl_int default_errcode_ret;
+
+    // No errcode_ret ? Use a local so the status can be written unconditionally
+    if (!errcode_ret)
+        errcode_ret = &default_errcode_ret;
+
+    // A NULL handle must be rejected before isA() is invoked through it
+    if (!device || !device->isA(Coal::Object::T_Device))
+    {
+        *errcode_ret = CL_INVALID_DEVICE;
+        return 0;
+    }
+
+    if (!context || !context->isA(Coal::Object::T_Context))
+    {
+        *errcode_ret = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    *errcode_ret = CL_SUCCESS;
+    Coal::CommandQueue *queue = new Coal::CommandQueue(
+                                    (Coal::Context *)context,
+                                    (Coal::DeviceInterface *)device,
+                                    properties,
+                                    errcode_ret);
+
+    if (*errcode_ret != CL_SUCCESS)
+    {
+        // Initialization failed, destroy the queue
+        delete queue;
+        return 0;
+    }
+
+    return (_cl_command_queue *)queue;
+}
+
+/**
+ * \brief Take an additional reference on a command queue
+ *
+ * \param command_queue queue to retain
+ * \return CL_SUCCESS, or CL_INVALID_COMMAND_QUEUE if \p command_queue is
+ *         NULL or is not a command queue
+ */
+cl_int
+clRetainCommandQueue(cl_command_queue command_queue)
+{
+    // A NULL handle must be rejected before isA() is invoked through it
+    if (!command_queue || !command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    command_queue->reference();
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Drop one reference on a command queue
+ *
+ * The queue is flushed first; it is deleted when dereference() reports true.
+ *
+ * \param command_queue queue to release
+ * \return CL_SUCCESS, or CL_INVALID_COMMAND_QUEUE if \p command_queue is
+ *         NULL or is not a command queue
+ */
+cl_int
+clReleaseCommandQueue(cl_command_queue command_queue)
+{
+    // A NULL handle must be rejected before isA() is invoked through it
+    if (!command_queue || !command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    command_queue->flush();
+
+    if (command_queue->dereference())
+        delete command_queue;
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Query information about a command queue
+ *
+ * All query arguments are forwarded unchanged to the queue's info() method.
+ *
+ * \param command_queue queue to query
+ * \param param_name information requested
+ * \param param_value_size size of the buffer behind \p param_value
+ * \param param_value buffer receiving the value
+ * \param param_value_size_ret receives the size of the value
+ * \return the result of info(), or CL_INVALID_COMMAND_QUEUE if
+ *         \p command_queue is NULL or is not a command queue
+ */
+cl_int
+clGetCommandQueueInfo(cl_command_queue command_queue,
+                      cl_command_queue_info param_name,
+                      size_t param_value_size,
+                      void * param_value,
+                      size_t * param_value_size_ret)
+{
+    // A NULL handle must be rejected before isA() is invoked through it
+    if (!command_queue || !command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    return command_queue->info(param_name, param_value_size, param_value,
+                               param_value_size_ret);
+}
+
+/**
+ * \brief Change properties of a command queue
+ *
+ * The arguments are forwarded unchanged to the queue's setProperty() method.
+ *
+ * \param command_queue queue to modify
+ * \param properties properties concerned
+ * \param enable passed through to setProperty()
+ * \param old_properties passed through to setProperty()
+ * \return the result of setProperty(), or CL_INVALID_COMMAND_QUEUE if
+ *         \p command_queue is NULL or is not a command queue
+ */
+cl_int
+clSetCommandQueueProperty(cl_command_queue command_queue,
+                          cl_command_queue_properties properties,
+                          cl_bool enable,
+                          cl_command_queue_properties * old_properties)
+{
+    // A NULL handle must be rejected before isA() is invoked through it
+    if (!command_queue || !command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    return command_queue->setProperty(properties, enable, old_properties);
+}
diff --git a/src/api/api_context.cpp b/src/api/api_context.cpp
new file mode 100644
index 0000000..abe7be6
--- /dev/null
+++ b/src/api/api_context.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_context.cpp
+ * \brief Contexts
+ */
+
+#include <CL/cl.h>
+#include <core/context.h>
+#include <core/platform.h>
+#include <stdlib.h>
+
+// Context APIs
+
+/**
+ * \brief Create a context for an explicit list of devices
+ *
+ * \param properties passed to the Coal::Context constructor
+ * \param num_devices number of entries in \p devices, must not be 0
+ * \param devices device list, must not be NULL
+ * \param pfn_notify optional callback passed to the constructor
+ * \param user_data passed to the constructor; must be NULL when
+ *        \p pfn_notify is NULL
+ * \param errcode_ret optional; receives CL_SUCCESS or the error code
+ * \return the new context, or 0 when an error was reported
+ */
+cl_context
+clCreateContext(const cl_context_properties *properties,
+                cl_uint num_devices,
+                const cl_device_id * devices,
+                void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
+                void * user_data,
+                cl_int * errcode_ret)
+{
+    // Status always lands somewhere, even when the caller passed no errcode_ret
+    cl_int scratch_errcode;
+    cl_int *status = errcode_ret ? errcode_ret : &scratch_errcode;
+
+    // A device list is mandatory, and user_data needs a callback to go with it
+    const bool bad_args = !devices || !num_devices || (!pfn_notify && user_data);
+
+    if (bad_args)
+    {
+        *status = CL_INVALID_VALUE;
+        return 0;
+    }
+
+    *status = CL_SUCCESS;
+    Coal::Context *ctx = new Coal::Context(properties, num_devices, devices,
+                                           pfn_notify, user_data, status);
+
+    if (*status == CL_SUCCESS)
+        return (_cl_context *)ctx;
+
+    // The constructor reported a failure: discard the context again
+    delete ctx;
+    return 0;
+}
+
+/**
+ * \brief Create a context holding every device of a given type
+ *
+ * Queries the_platform for the devices matching \p device_type and hands
+ * them to clCreateContext().
+ *
+ * \param properties forwarded to clCreateContext()
+ * \param device_type type of devices to put in the context
+ * \param pfn_notify forwarded to clCreateContext()
+ * \param user_data forwarded to clCreateContext()
+ * \param errcode_ret optional; receives CL_SUCCESS or the error code
+ * \return the new context, or NULL when an error was reported
+ */
+cl_context
+clCreateContextFromType(const cl_context_properties *properties,
+                        cl_device_type device_type,
+                        void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
+                        void * user_data,
+                        cl_int * errcode_ret)
+{
+    cl_device_id *devices = NULL;
+    cl_uint num_devices = 0;
+    cl_int local_error;
+    cl_context result = NULL;
+
+    // First pass: only count the matching devices. Report the query's own
+    // status (e.g. CL_DEVICE_NOT_FOUND) rather than replacing it.
+    local_error = clGetDeviceIDs(&the_platform, device_type, 0, NULL,
+                                 &num_devices);
+    if (local_error != CL_SUCCESS) goto bail;
+    if (!num_devices) { local_error = CL_DEVICE_NOT_FOUND; goto bail; }
+
+    devices = (cl_device_id *) malloc(num_devices * sizeof(cl_device_id));
+    if (!devices) { local_error = CL_OUT_OF_HOST_MEMORY; goto bail; }
+
+    // Second pass: fetch the device handles themselves
+    local_error = clGetDeviceIDs(&the_platform, device_type, num_devices,
+                                 devices, NULL);
+    if (local_error != CL_SUCCESS) goto bail;
+
+    result = clCreateContext(properties, num_devices, devices, pfn_notify,
+                             user_data, &local_error);
+
+bail:
+    // Single cleanup point; free(NULL) is a no-op on the early-exit paths
+    free(devices);
+
+    if (errcode_ret)
+        *errcode_ret = local_error;
+
+    return result;
+}
+
+/**
+ * \brief Take an additional reference on a context
+ *
+ * \param context context to retain
+ * \return CL_SUCCESS, or CL_INVALID_CONTEXT if \p context is NULL or is
+ *         not a context
+ */
+cl_int
+clRetainContext(cl_context context)
+{
+    // A NULL handle must be rejected before isA() is invoked through it
+    if (!context || !context->isA(Coal::Object::T_Context))
+        return CL_INVALID_CONTEXT;
+
+    context->reference();
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Drop one reference on a context
+ *
+ * The context is deleted when dereference() reports true.
+ *
+ * \param context context to release
+ * \return CL_SUCCESS, or CL_INVALID_CONTEXT if \p context is NULL or is
+ *         not a context
+ */
+cl_int
+clReleaseContext(cl_context context)
+{
+    // A NULL handle must be rejected before isA() is invoked through it
+    if (!context || !context->isA(Coal::Object::T_Context))
+        return CL_INVALID_CONTEXT;
+
+    if (context->dereference())
+        delete context;
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Query information about a context
+ *
+ * All query arguments are forwarded unchanged to the context's info() method.
+ *
+ * \param context context to query
+ * \param param_name information requested
+ * \param param_value_size size of the buffer behind \p param_value
+ * \param param_value buffer receiving the value
+ * \param param_value_size_ret receives the size of the value
+ * \return the result of info(), or CL_INVALID_CONTEXT if \p context is
+ *         NULL or is not a context
+ */
+cl_int
+clGetContextInfo(cl_context context,
+                 cl_context_info param_name,
+                 size_t param_value_size,
+                 void * param_value,
+                 size_t * param_value_size_ret)
+{
+    // A NULL handle must be rejected before isA() is invoked through it
+    if (!context || !context->isA(Coal::Object::T_Context))
+        return CL_INVALID_CONTEXT;
+
+    return context->info(param_name, param_value_size, param_value,
+                         param_value_size_ret);
+}
diff --git a/src/api/api_device.cpp b/src/api/api_device.cpp
new file mode 100644
index 0000000..052f0b4
--- /dev/null
+++ b/src/api/api_device.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_device.cpp
+ * \brief Devices
+ */
+
+#include "CL/cl.h"
+#include <core/platform.h>
+#include <core/deviceinterface.h>
+
+/**
+ * \brief List the devices of a platform that match a device type
+ *
+ * \param platform platform to query; NULL selects the_platform
+ * \param device_type type of devices wanted
+ * \param num_entries number of slots available in \p devices
+ * \param devices optional array receiving the device handles
+ * \param num_devices optional; receives the count reported by getDevices()
+ * \return CL_SUCCESS, CL_INVALID_PLATFORM, CL_INVALID_VALUE or
+ *         CL_DEVICE_NOT_FOUND
+ */
+cl_int
+clGetDeviceIDs(cl_platform_id platform,
+               cl_device_type device_type,
+               cl_uint num_entries,
+               cl_device_id * devices,
+               cl_uint * num_devices)
+{
+    // Only one platform is implemented: NULL means that platform, and any
+    // other handle is rejected.
+    if (platform == 0)
+        platform = &the_platform;
+    else if (platform != &the_platform)
+        return CL_INVALID_PLATFORM;
+
+    // An output array needs room for entries, and at least one of the two
+    // outputs has to be requested.
+    if (devices != 0 && num_entries == 0) return CL_INVALID_VALUE;
+    if (devices == 0 && num_devices == 0) return CL_INVALID_VALUE;
+
+    const int found = platform->getDevices(device_type, num_entries, devices);
+
+    if (num_devices) *num_devices = found;
+
+    return (found == 0) ? CL_DEVICE_NOT_FOUND : CL_SUCCESS;
+}
+
+/**
+ * \brief Query information about a device
+ *
+ * All query arguments are forwarded unchanged to the info() method of the
+ * device, accessed as a Coal::DeviceInterface.
+ *
+ * \param device device to query
+ * \param param_name information requested
+ * \param param_value_size size of the buffer behind \p param_value
+ * \param param_value buffer receiving the value
+ * \param param_value_size_ret receives the size of the value
+ * \return the result of info(), or CL_INVALID_DEVICE if \p device is NULL
+ *         or is not a device
+ */
+cl_int
+clGetDeviceInfo(cl_device_id device,
+                cl_device_info param_name,
+                size_t param_value_size,
+                void * param_value,
+                size_t * param_value_size_ret)
+{
+    // A NULL handle must be rejected before isA() is invoked through it
+    if (!device || !device->isA(Coal::Object::T_Device))
+        return CL_INVALID_DEVICE;
+
+    Coal::DeviceInterface *iface = (Coal::DeviceInterface *)device;
+    return iface->info(param_name, param_value_size, param_value,
+                       param_value_size_ret);
+}
diff --git a/src/api/api_enqueue.cpp b/src/api/api_enqueue.cpp
new file mode 100644
index 0000000..5ed3b1a
--- /dev/null
+++ b/src/api/api_enqueue.cpp
@@ -0,0 +1,823 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_enqueue.cpp
+ * \brief Events
+ */
+
+#include <CL/cl.h>
+
+#include <core/events.h>
+#include <core/memobject.h>
+
+#include <cstdlib>
+#include <stdio.h>
+
+/**
+ * \brief Submit \p command to \p queue and, if requested, wait for it.
+ *
+ * \param queue    queue the command is pushed on
+ * \param command  freshly created command event
+ * \param event    optional location receiving a retained handle to \p command
+ * \param blocking when true, wait with clWaitForEvents() before returning
+ *
+ * Ownership contract: on ANY failure \p command has already been deleted when
+ * this function returns, so callers must neither use nor delete it again.
+ * In that case \p *event (if requested) is reset to NULL so the caller is
+ * never left holding a handle to the deleted object.
+ *
+ * The previous value of \p *event is not released here; releasing old events
+ * is left to the application.
+ *
+ * \return CL_SUCCESS, or the error reported by the queue or by the wait.
+ */
+static inline cl_int queueEvent(Coal::CommandQueue *queue,
+                                Coal::Event *command,
+                                cl_event *event,
+                                cl_bool blocking)
+{
+    cl_int rs;
+
+    if (event)
+    {
+        /*---------------------------------------------------------------------
+         * We need to increase reference count before queue->queueEvent(command)
+         * because a user_app_event is interested in the status of command.
+         * Otherwise, if worker thread runs too fast, command becomes COMPLETE
+         * before we get here, command would have been cleaned from queue and
+         * deleted!!! Thus we will be left with a dangling pointer.
+         *--------------------------------------------------------------------*/
+        *event = (cl_event)command;
+        command->reference();
+    }
+
+    /*------------------------------------------------------------------------
+     * Same reason as above. We need to retain command for clWaitForEvents().
+     *-----------------------------------------------------------------------*/
+    if (blocking) command->reference();
+
+    rs = queue->queueEvent(command);
+
+    if (rs == CL_SUCCESS && blocking)
+        rs = clWaitForEvents(1, (cl_event *)&command);
+
+    if (rs != CL_SUCCESS)
+    {
+        // The command is destroyed below: do not hand a dangling handle back.
+        if (event) *event = NULL;
+
+        delete command;
+        return rs;
+    }
+
+    // Drop the reference taken for the blocking wait.
+    if (blocking) command->dereference();
+
+    return CL_SUCCESS;
+}
+
+// Enqueued Commands APIs
+/**
+ * \brief Create a Coal::ReadBufferEvent and submit it through queueEvent().
+ *
+ * \return CL_INVALID_COMMAND_QUEUE for a bad queue handle, the error set by
+ *         the event constructor, or the result of queueEvent() (which waits
+ *         when \p blocking_read is set).
+ */
+cl_int
+clEnqueueReadBuffer(cl_command_queue     command_queue,
+                    cl_mem               buffer,
+                    cl_bool              blocking_read,
+                    size_t               offset,
+                    size_t               cb,
+                    void *               ptr,
+                    cl_uint              num_events_in_wait_list,
+                    const cl_event *     event_wait_list,
+                    cl_event *           event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    const Coal::Event **wait_list = (const Coal::Event **)event_wait_list;
+
+    Coal::ReadBufferEvent *cmd = new Coal::ReadBufferEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::MemObject *)buffer,
+        offset, cb, ptr,
+        num_events_in_wait_list, wait_list, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, cmd, event, blocking_read);
+
+    // Construction failed: the half-built command is ours to destroy.
+    delete cmd;
+    return status;
+}
+
+/**
+ * \brief Create a Coal::WriteBufferEvent and submit it through queueEvent().
+ *
+ * \return CL_INVALID_COMMAND_QUEUE for a bad queue handle, the error set by
+ *         the event constructor, or the result of queueEvent() (which waits
+ *         when \p blocking_write is set).
+ */
+cl_int
+clEnqueueWriteBuffer(cl_command_queue    command_queue,
+                     cl_mem              buffer,
+                     cl_bool             blocking_write,
+                     size_t              offset,
+                     size_t              cb,
+                     const void *        ptr,
+                     cl_uint             num_events_in_wait_list,
+                     const cl_event *    event_wait_list,
+                     cl_event *          event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    const Coal::Event **wait_list = (const Coal::Event **)event_wait_list;
+
+    // The event constructor takes a non-const pointer, hence the cast.
+    Coal::WriteBufferEvent *cmd = new Coal::WriteBufferEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::MemObject *)buffer,
+        offset, cb, (void *)ptr,
+        num_events_in_wait_list, wait_list, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, cmd, event, blocking_write);
+
+    delete cmd;
+    return status;
+}
+
+/**
+ * \brief Create a Coal::ReadBufferRectEvent and submit it through
+ *        queueEvent().
+ *
+ * \return CL_INVALID_COMMAND_QUEUE for a bad queue handle, the error set by
+ *         the event constructor, or the result of queueEvent() (which waits
+ *         when \p blocking_read is set).
+ */
+cl_int
+clEnqueueReadBufferRect(cl_command_queue     command_queue,
+                        cl_mem               buffer,
+                        cl_bool              blocking_read,
+                        const size_t *       buffer_origin,
+                        const size_t *       host_origin,
+                        const size_t *       region,
+                        size_t               buffer_row_pitch,
+                        size_t               buffer_slice_pitch,
+                        size_t               host_row_pitch,
+                        size_t               host_slice_pitch,
+                        void *               ptr,
+                        cl_uint              num_events_in_wait_list,
+                        const cl_event *     event_wait_list,
+                        cl_event *           event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    const Coal::Event **wait_list = (const Coal::Event **)event_wait_list;
+
+    Coal::ReadBufferRectEvent *cmd = new Coal::ReadBufferRectEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::MemObject *)buffer,
+        buffer_origin, host_origin, region,
+        buffer_row_pitch, buffer_slice_pitch,
+        host_row_pitch, host_slice_pitch, ptr,
+        num_events_in_wait_list, wait_list, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, cmd, event, blocking_read);
+
+    delete cmd;
+    return status;
+}
+
+/**
+ * \brief Create a Coal::WriteBufferRectEvent and submit it through
+ *        queueEvent().
+ *
+ * \return CL_INVALID_COMMAND_QUEUE for a bad queue handle, the error set by
+ *         the event constructor, or the result of queueEvent() (which waits
+ *         when \p blocking_write is set).
+ */
+cl_int
+clEnqueueWriteBufferRect(cl_command_queue    command_queue,
+                         cl_mem              buffer,
+                         cl_bool             blocking_write,
+                         const size_t *      buffer_origin,
+                         const size_t *      host_origin,
+                         const size_t *      region,
+                         size_t              buffer_row_pitch,
+                         size_t              buffer_slice_pitch,
+                         size_t              host_row_pitch,
+                         size_t              host_slice_pitch,
+                         const void *        ptr,
+                         cl_uint             num_events_in_wait_list,
+                         const cl_event *    event_wait_list,
+                         cl_event *          event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    const Coal::Event **wait_list = (const Coal::Event **)event_wait_list;
+
+    // The event constructor takes a non-const pointer, hence the cast.
+    Coal::WriteBufferRectEvent *cmd = new Coal::WriteBufferRectEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::MemObject *)buffer,
+        buffer_origin, host_origin, region,
+        buffer_row_pitch, buffer_slice_pitch,
+        host_row_pitch, host_slice_pitch, (void *)ptr,
+        num_events_in_wait_list, wait_list, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, cmd, event, blocking_write);
+
+    delete cmd;
+    return status;
+}
+
+/**
+ * \brief Create a Coal::CopyBufferRectEvent and submit it through
+ *        queueEvent() without waiting for it.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE for a bad queue handle, the error set by
+ *         the event constructor, or the result of queueEvent().
+ */
+cl_int
+clEnqueueCopyBufferRect(cl_command_queue     command_queue,
+                        cl_mem               src_buffer,
+                        cl_mem               dst_buffer,
+                        const size_t *       src_origin,
+                        const size_t *       dst_origin,
+                        const size_t *       region,
+                        size_t               src_row_pitch,
+                        size_t               src_slice_pitch,
+                        size_t               dst_row_pitch,
+                        size_t               dst_slice_pitch,
+                        cl_uint              num_events_in_wait_list,
+                        const cl_event *     event_wait_list,
+                        cl_event *           event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    const Coal::Event **wait_list = (const Coal::Event **)event_wait_list;
+
+    // The literal 1 after the pitches is passed on unchanged to the
+    // constructor (its meaning is defined by Coal::CopyBufferRectEvent).
+    Coal::CopyBufferRectEvent *cmd = new Coal::CopyBufferRectEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::MemObject *)src_buffer,
+        (Coal::MemObject *)dst_buffer,
+        src_origin, dst_origin, region,
+        src_row_pitch, src_slice_pitch,
+        dst_row_pitch, dst_slice_pitch, 1,
+        num_events_in_wait_list, wait_list, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, cmd, event, false);
+
+    delete cmd;
+    return status;
+}
+
+/**
+ * \brief Create a Coal::CopyBufferEvent and submit it through queueEvent()
+ *        without waiting for it.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE for a bad queue handle, the error set by
+ *         the event constructor, or the result of queueEvent().
+ */
+cl_int
+clEnqueueCopyBuffer(cl_command_queue     command_queue,
+                    cl_mem               src_buffer,
+                    cl_mem               dst_buffer,
+                    size_t               src_offset,
+                    size_t               dst_offset,
+                    size_t               cb,
+                    cl_uint              num_events_in_wait_list,
+                    const cl_event *     event_wait_list,
+                    cl_event *           event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    const Coal::Event **wait_list = (const Coal::Event **)event_wait_list;
+
+    Coal::CopyBufferEvent *cmd = new Coal::CopyBufferEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::MemObject *)src_buffer,
+        (Coal::MemObject *)dst_buffer,
+        src_offset, dst_offset, cb,
+        num_events_in_wait_list, wait_list, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, cmd, event, false);
+
+    delete cmd;
+    return status;
+}
+
+/**
+ * \brief Create a Coal::ReadImageEvent and submit it through queueEvent().
+ *
+ * \return CL_INVALID_COMMAND_QUEUE for a bad queue handle,
+ *         CL_INVALID_MEM_OBJECT when \p image is null or is neither a 2D nor
+ *         a 3D image, the error set by the event constructor, or the result
+ *         of queueEvent() (which waits when \p blocking_read is set).
+ */
+cl_int
+clEnqueueReadImage(cl_command_queue      command_queue,
+                   cl_mem                image,
+                   cl_bool               blocking_read,
+                   const size_t *        origin,
+                   const size_t *        region,
+                   size_t                row_pitch,
+                   size_t                slice_pitch,
+                   void *                ptr,
+                   cl_uint               num_events_in_wait_list,
+                   const cl_event *      event_wait_list,
+                   cl_event *            event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    // The object is cast to an image below, so it has to be one.
+    if (!image)
+        return CL_INVALID_MEM_OBJECT;
+
+    if (image->type() != Coal::MemObject::Image2D &&
+        image->type() != Coal::MemObject::Image3D)
+        return CL_INVALID_MEM_OBJECT;
+
+    cl_int status = CL_SUCCESS;
+    const Coal::Event **wait_list = (const Coal::Event **)event_wait_list;
+
+    Coal::ReadImageEvent *cmd = new Coal::ReadImageEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Image2D *)image,
+        origin, region, row_pitch, slice_pitch, (void *)ptr,
+        num_events_in_wait_list, wait_list, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, cmd, event, blocking_read);
+
+    delete cmd;
+    return status;
+}
+
+/**
+ * \brief Create a Coal::WriteImageEvent and submit it through queueEvent().
+ *
+ * The memory object is validated the same way clEnqueueReadImage() does it,
+ * because it is cast to Coal::Image2D before being handed to the event.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE for a bad queue handle,
+ *         CL_INVALID_MEM_OBJECT when \p image is null or is neither a 2D nor
+ *         a 3D image, the error set by the event constructor, or the result
+ *         of queueEvent() (which waits when \p blocking_write is set).
+ */
+cl_int
+clEnqueueWriteImage(cl_command_queue     command_queue,
+                    cl_mem               image,
+                    cl_bool              blocking_write,
+                    const size_t *       origin,
+                    const size_t *       region,
+                    size_t               row_pitch,
+                    size_t               slice_pitch,
+                    const void *         ptr,
+                    cl_uint              num_events_in_wait_list,
+                    const cl_event *     event_wait_list,
+                    cl_event *           event)
+{
+    cl_int rs = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    // Reject anything that is not an image before the Image2D cast below.
+    if (!image || (image->type() != Coal::MemObject::Image2D &&
+                   image->type() != Coal::MemObject::Image3D))
+        return CL_INVALID_MEM_OBJECT;
+
+    // The event constructor takes a non-const pointer, hence the cast.
+    Coal::WriteImageEvent *command = new Coal::WriteImageEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Image2D *)image,
+        origin, region, row_pitch, slice_pitch, (void *)ptr,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &rs
+    );
+
+    if (rs != CL_SUCCESS)
+    {
+        delete command;
+        return rs;
+    }
+
+    return queueEvent(command_queue, command, event, blocking_write);
+}
+
+/**
+ * \brief Create a Coal::CopyImageEvent and submit it through queueEvent()
+ *        without waiting for it.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE for a bad queue handle, the error set by
+ *         the event constructor, or the result of queueEvent().
+ */
+cl_int
+clEnqueueCopyImage(cl_command_queue      command_queue,
+                   cl_mem                src_image,
+                   cl_mem                dst_image,
+                   const size_t *        src_origin,
+                   const size_t *        dst_origin,
+                   const size_t *        region,
+                   cl_uint               num_events_in_wait_list,
+                   const cl_event *      event_wait_list,
+                   cl_event *            event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    const Coal::Event **wait_list = (const Coal::Event **)event_wait_list;
+
+    Coal::CopyImageEvent *cmd = new Coal::CopyImageEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Image2D *)src_image,
+        (Coal::Image2D *)dst_image,
+        src_origin, dst_origin, region,
+        num_events_in_wait_list, wait_list, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, cmd, event, false);
+
+    delete cmd;
+    return status;
+}
+
+/**
+ * \brief Create a Coal::CopyImageToBufferEvent and submit it through
+ *        queueEvent() without waiting for it.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE for a bad queue handle, the error set by
+ *         the event constructor, or the result of queueEvent().
+ */
+cl_int
+clEnqueueCopyImageToBuffer(cl_command_queue  command_queue,
+                           cl_mem            src_image,
+                           cl_mem            dst_buffer,
+                           const size_t *    src_origin,
+                           const size_t *    region,
+                           size_t            dst_offset,
+                           cl_uint           num_events_in_wait_list,
+                           const cl_event *  event_wait_list,
+                           cl_event *        event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    const Coal::Event **wait_list = (const Coal::Event **)event_wait_list;
+
+    Coal::CopyImageToBufferEvent *cmd = new Coal::CopyImageToBufferEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Image2D *)src_image,
+        (Coal::MemObject *)dst_buffer,
+        src_origin, region, dst_offset,
+        num_events_in_wait_list, wait_list, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, cmd, event, false);
+
+    delete cmd;
+    return status;
+}
+
+/**
+ * \brief Create a Coal::CopyBufferToImageEvent and submit it through
+ *        queueEvent() without waiting for it.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE for a bad queue handle, the error set by
+ *         the event constructor, or the result of queueEvent().
+ */
+cl_int
+clEnqueueCopyBufferToImage(cl_command_queue  command_queue,
+                           cl_mem            src_buffer,
+                           cl_mem            dst_image,
+                           size_t            src_offset,
+                           const size_t *    dst_origin,
+                           const size_t *    region,
+                           cl_uint           num_events_in_wait_list,
+                           const cl_event *  event_wait_list,
+                           cl_event *        event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    const Coal::Event **wait_list = (const Coal::Event **)event_wait_list;
+
+    Coal::CopyBufferToImageEvent *cmd = new Coal::CopyBufferToImageEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::MemObject *)src_buffer,
+        (Coal::Image2D *)dst_image,
+        src_offset, dst_origin, region,
+        num_events_in_wait_list, wait_list, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, cmd, event, false);
+
+    delete cmd;
+    return status;
+}
+
+/**
+ * \brief Create a Coal::MapBufferEvent, submit it through queueEvent() and
+ *        return the pointer reported by the event.
+ *
+ * \param errcode_ret optional; receives CL_SUCCESS or the error code
+ * \return the value of MapBufferEvent::ptr() on success, 0 on any error
+ */
+void *
+clEnqueueMapBuffer(cl_command_queue  command_queue,
+                   cl_mem            buffer,
+                   cl_bool           blocking_map,
+                   cl_map_flags      map_flags,
+                   size_t            offset,
+                   size_t            cb,
+                   cl_uint           num_events_in_wait_list,
+                   const cl_event *  event_wait_list,
+                   cl_event *        event,
+                   cl_int *          errcode_ret)
+{
+    cl_int dummy_errcode;
+
+    // Let the rest of the function write the error code unconditionally.
+    if (!errcode_ret)
+        errcode_ret = &dummy_errcode;
+
+    *errcode_ret = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+    {
+        *errcode_ret = CL_INVALID_COMMAND_QUEUE;
+        return 0;
+    }
+
+    Coal::MapBufferEvent *command = new Coal::MapBufferEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::MemObject *)buffer,
+        offset, cb, map_flags,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, errcode_ret
+    );
+
+    if (*errcode_ret != CL_SUCCESS)
+    {
+        delete command;
+        return 0;
+    }
+
+    // We need command to be valid after queueEvent, so don't let the command
+    // queue handle it like a fire-and-forget event. Fixes a crash when event
+    // is NULL : the event gets deleted by clReleaseEvent called from
+    // CPUDevice's worker() and we then try to read it in command->ptr();
+    command->reference();
+
+    *errcode_ret = queueEvent(command_queue, command, event, blocking_map);
+
+    if (*errcode_ret != CL_SUCCESS)
+    {
+        // queueEvent() deletes the command on every failure path, so it must
+        // not be deleted a second time here.
+        return 0;
+    }
+
+    void *mapped = command->ptr();
+
+    // Drop the extra reference taken above.
+    clReleaseEvent((cl_event)command);
+
+    return mapped;
+}
+
+/**
+ * \brief Create a Coal::MapImageEvent, submit it through queueEvent() and
+ *        return the pointer and pitches reported by the event.
+ *
+ * \param image_row_pitch   mandatory; receives MapImageEvent::row_pitch()
+ * \param image_slice_pitch mandatory for 3D images; receives
+ *                          MapImageEvent::slice_pitch() when given
+ * \param errcode_ret       optional; receives CL_SUCCESS or the error code
+ * \return the value of MapImageEvent::ptr() on success, 0 on any error
+ */
+void *
+clEnqueueMapImage(cl_command_queue   command_queue,
+                  cl_mem             image,
+                  cl_bool            blocking_map,
+                  cl_map_flags       map_flags,
+                  const size_t *     origin,
+                  const size_t *     region,
+                  size_t *           image_row_pitch,
+                  size_t *           image_slice_pitch,
+                  cl_uint            num_events_in_wait_list,
+                  const cl_event *   event_wait_list,
+                  cl_event *         event,
+                  cl_int *           errcode_ret)
+{
+    cl_int dummy_errcode;
+
+    // Let the rest of the function write the error code unconditionally.
+    if (!errcode_ret)
+        errcode_ret = &dummy_errcode;
+
+    *errcode_ret = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+    {
+        *errcode_ret = CL_INVALID_COMMAND_QUEUE;
+        return 0;
+    }
+
+    Coal::MapImageEvent *command = new Coal::MapImageEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Image2D *)image,
+        map_flags, origin, region,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, errcode_ret
+    );
+
+    if (*errcode_ret != CL_SUCCESS)
+    {
+        delete command;
+        return 0;
+    }
+
+    if (!image_row_pitch ||
+        (image->type() == Coal::MemObject::Image3D && !image_slice_pitch))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        delete command;
+        return 0;
+    }
+
+    // Keep the command alive across queueEvent(); see clEnqueueMapBuffer for
+    // the explanation.
+    command->reference();
+    *errcode_ret = queueEvent(command_queue, command, event, blocking_map);
+
+    if (*errcode_ret != CL_SUCCESS)
+    {
+        // queueEvent() deletes the command on every failure path, so it must
+        // not be deleted a second time here.
+        return 0;
+    }
+
+    *image_row_pitch = command->row_pitch();
+
+    if (image_slice_pitch)
+        *image_slice_pitch = command->slice_pitch();
+
+    void *mapped = command->ptr();
+
+    // Drop the extra reference taken above.
+    clReleaseEvent((cl_event)command);
+
+    return mapped;
+}
+
+/**
+ * \brief Create a Coal::UnmapBufferEvent and submit it through queueEvent()
+ *        without waiting for it.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE for a bad queue handle, the error set by
+ *         the event constructor, or the result of queueEvent().
+ */
+cl_int
+clEnqueueUnmapMemObject(cl_command_queue     command_queue,
+                        cl_mem               memobj,
+                        void *               mapped_ptr,
+                        cl_uint              num_events_in_wait_list,
+                        const cl_event *     event_wait_list,
+                        cl_event *           event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    const Coal::Event **wait_list = (const Coal::Event **)event_wait_list;
+
+    Coal::UnmapBufferEvent *cmd = new Coal::UnmapBufferEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::MemObject *)memobj,
+        mapped_ptr,
+        num_events_in_wait_list, wait_list, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, cmd, event, false);
+
+    delete cmd;
+    return status;
+}
+
+/**
+ * \brief Create a Coal::KernelEvent and submit it through queueEvent()
+ *        without waiting for it.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE for a bad queue handle, the error set by
+ *         the event constructor, or the result of queueEvent().
+ */
+cl_int
+clEnqueueNDRangeKernel(cl_command_queue  command_queue,
+                       cl_kernel         kernel,
+                       cl_uint           work_dim,
+                       const size_t *    global_work_offset,
+                       const size_t *    global_work_size,
+                       const size_t *    local_work_size,
+                       cl_uint           num_events_in_wait_list,
+                       const cl_event *  event_wait_list,
+                       cl_event *        event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    const Coal::Event **wait_list = (const Coal::Event **)event_wait_list;
+
+    Coal::KernelEvent *cmd = new Coal::KernelEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Kernel *)kernel,
+        work_dim, global_work_offset, global_work_size, local_work_size,
+        num_events_in_wait_list, wait_list, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, cmd, event, false);
+
+    delete cmd;
+    return status;
+}
+
+/**
+ * \brief Create a Coal::TaskEvent and submit it through queueEvent() without
+ *        waiting for it.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE for a bad queue handle, the error set by
+ *         the event constructor, or the result of queueEvent().
+ */
+cl_int
+clEnqueueTask(cl_command_queue   command_queue,
+              cl_kernel          kernel,
+              cl_uint            num_events_in_wait_list,
+              const cl_event *   event_wait_list,
+              cl_event *         event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    const Coal::Event **wait_list = (const Coal::Event **)event_wait_list;
+
+    Coal::TaskEvent *cmd = new Coal::TaskEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Kernel *)kernel,
+        num_events_in_wait_list, wait_list, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, cmd, event, false);
+
+    delete cmd;
+    return status;
+}
+
+/**
+ * \brief Create a Coal::NativeKernelEvent and submit it through queueEvent()
+ *        without waiting for it.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE for a bad queue handle, the error set by
+ *         the event constructor, or the result of queueEvent().
+ */
+cl_int
+clEnqueueNativeKernel(cl_command_queue   command_queue,
+                      void (*user_func)(void *),
+                      void *             args,
+                      size_t             cb_args,
+                      cl_uint            num_mem_objects,
+                      const cl_mem *     mem_list,
+                      const void **      args_mem_loc,
+                      cl_uint            num_events_in_wait_list,
+                      const cl_event *   event_wait_list,
+                      cl_event *         event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    const Coal::Event **wait_list = (const Coal::Event **)event_wait_list;
+    const Coal::MemObject **mem_objects = (const Coal::MemObject **)mem_list;
+
+    Coal::NativeKernelEvent *cmd = new Coal::NativeKernelEvent(
+        (Coal::CommandQueue *)command_queue,
+        user_func, args, cb_args, num_mem_objects,
+        mem_objects, args_mem_loc,
+        num_events_in_wait_list, wait_list, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, cmd, event, false);
+
+    delete cmd;
+    return status;
+}
+
+/**
+ * \brief Enqueue a Coal::MarkerEvent that waits on the events currently
+ *        reported by the queue.
+ *
+ * The event list obtained from the queue is released on every path,
+ * including the one where the marker could not be constructed.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE for a bad queue handle, CL_INVALID_VALUE
+ *         when \p event is null, the error set by the event constructor, or
+ *         the result of queueEvent().
+ */
+cl_int
+clEnqueueMarker(cl_command_queue command_queue,
+                cl_event *       event)
+{
+    cl_int rs = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    if (!event)
+        return CL_INVALID_VALUE;
+
+    // Get the events in command_queue
+    unsigned int count;
+    Coal::Event **events = command_queue->events(count, false);
+
+    Coal::MarkerEvent *command = new Coal::MarkerEvent(
+        (Coal::CommandQueue *)command_queue,
+        count, count == 0 ? NULL : (const Coal::Event **)events, &rs);
+
+    // Release the list whether or not construction succeeded; the array was
+    // memcpyed by Coal::Event. NOTE(review): this assumes events() hands out
+    // one reference per entry and a malloc'ed array, as the original
+    // success-path cleanup implies - confirm against CommandQueue::events().
+    for (unsigned int i = 0; i < count; ++i)
+    {
+        events[i]->dereference();
+    }
+
+    std::free(events);    // free(NULL) is a no-op
+
+    if (rs != CL_SUCCESS)
+    {
+        delete command;
+        return rs;
+    }
+
+    return queueEvent(command_queue, command, event, false);
+}
+
+/**
+ * \brief Create a Coal::WaitForEventsEvent and submit it through
+ *        queueEvent(); no event handle is returned and the call never waits.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE for a bad queue handle, the error set by
+ *         the event constructor, or the result of queueEvent().
+ */
+cl_int
+clEnqueueWaitForEvents(cl_command_queue  command_queue,
+                       cl_uint           num_events,
+                       const cl_event *  event_list)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+
+    Coal::WaitForEventsEvent *cmd = new Coal::WaitForEventsEvent(
+        (Coal::CommandQueue *)command_queue,
+        num_events, (const Coal::Event **)event_list, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, cmd, 0, false);
+
+    delete cmd;
+    return status;
+}
+
+/**
+ * \brief Create a Coal::BarrierEvent and submit it through queueEvent(); no
+ *        event handle is returned and the call never waits.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE for a bad queue handle, the error set by
+ *         the event constructor, or the result of queueEvent().
+ */
+cl_int
+clEnqueueBarrier(cl_command_queue command_queue)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+
+    Coal::BarrierEvent *cmd = new Coal::BarrierEvent(
+        (Coal::CommandQueue *)command_queue, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, cmd, 0, false);
+
+    delete cmd;
+    return status;
+}
diff --git a/src/api/api_event.cpp b/src/api/api_event.cpp
new file mode 100644
index 0000000..1e882bf
--- /dev/null
+++ b/src/api/api_event.cpp
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_event.cpp
+ * \brief Special events and event management
+ */
+
+#include <CL/cl.h>
+
+#include <core/commandqueue.h>
+#include <core/events.h>
+#include <core/context.h>
+#include <stdio.h>
+
+// Event Object APIs
+/**
+ * \brief Block until every event of \p event_list has reached
+ *        Coal::Event::Complete.
+ *
+ * All events are validated first; nothing is waited on if any check fails.
+ * The owning queues are deliberately not flushed here.
+ *
+ * \return CL_INVALID_VALUE for an empty or null list, CL_INVALID_EVENT for an
+ *         entry that is not an event,
+ *         CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST for an entry whose
+ *         status is negative, CL_INVALID_CONTEXT when the entries do not all
+ *         resolve to the same context, CL_SUCCESS otherwise.
+ */
+cl_int
+clWaitForEvents(cl_uint              num_events,
+                const cl_event *     event_list)
+{
+    if (num_events == 0 || event_list == 0)
+        return CL_INVALID_VALUE;
+
+    // First pass: validate each event and make sure they share one context.
+    cl_context shared_ctx = 0;
+
+    for (cl_uint idx = 0; idx < num_events; ++idx)
+    {
+        cl_event evt = event_list[idx];
+
+        if (!evt->isA(Coal::Object::T_Event))
+            return CL_INVALID_EVENT;
+
+        if (evt->status() < 0)
+            return CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+
+        // The context is taken as the parent of the event's parent.
+        cl_context ctx = (cl_context)evt->parent()->parent();
+
+        if (shared_ctx == 0)
+            shared_ctx = ctx;
+        else if (shared_ctx != ctx)
+            return CL_INVALID_CONTEXT;
+    }
+
+    // Second pass: wait for completion, in list order.
+    for (cl_uint idx = 0; idx < num_events; ++idx)
+        event_list[idx]->waitForStatus(Coal::Event::Complete);
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Forward an event info query to the event object.
+ *
+ * \return CL_INVALID_EVENT if \p event is not an event object, otherwise
+ *         whatever the event's info() returns.
+ */
+cl_int
+clGetEventInfo(cl_event          event,
+               cl_event_info     param_name,
+               size_t            param_value_size,
+               void *            param_value,
+               size_t *          param_value_size_ret)
+{
+    if (event->isA(Coal::Object::T_Event))
+    {
+        return event->info(param_name, param_value_size, param_value,
+                           param_value_size_ret);
+    }
+
+    return CL_INVALID_EVENT;
+}
+
+/**
+ * \brief Register \p pfn_event_notify on \p event.
+ *
+ * Only the CL_COMPLETE callback type is accepted.
+ *
+ * \return CL_INVALID_EVENT if \p event is not an event object,
+ *         CL_INVALID_VALUE for a null callback or another callback type,
+ *         CL_SUCCESS otherwise.
+ */
+cl_int
+clSetEventCallback(cl_event  event,
+                   cl_int    command_exec_callback_type,
+                   void (CL_CALLBACK *pfn_event_notify)(cl_event event,
+                                                        cl_int exec_status,
+                                                        void *user_data),
+                   void *    user_data)
+{
+    if (!event->isA(Coal::Object::T_Event))
+        return CL_INVALID_EVENT;
+
+    if (!pfn_event_notify)
+        return CL_INVALID_VALUE;
+
+    if (command_exec_callback_type != CL_COMPLETE)
+        return CL_INVALID_VALUE;
+
+    event->setCallback(command_exec_callback_type, pfn_event_notify, user_data);
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Take one more reference on \p event.
+ *
+ * \return CL_INVALID_EVENT if \p event is not an event object, CL_SUCCESS
+ *         otherwise.
+ */
+cl_int
+clRetainEvent(cl_event event)
+{
+    if (event->isA(Coal::Object::T_Event))
+    {
+        event->reference();
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_EVENT;
+}
+
+/**
+ * \brief Drop one reference on \p event and destroy it when dereference()
+ *        reports that it should go away.
+ *
+ * \return CL_INVALID_EVENT if \p event is not an event object, CL_SUCCESS
+ *         otherwise.
+ */
+cl_int
+clReleaseEvent(cl_event event)
+{
+    if (!event->isA(Coal::Object::T_Event))
+        return CL_INVALID_EVENT;
+
+    // Nothing more to do while dereference() does not ask for destruction.
+    if (!event->dereference())
+        return CL_SUCCESS;
+
+    // Device-side data is released before the event object itself.
+    event->freeDeviceData();
+    delete event;
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Create a Coal::UserEvent attached to \p context.
+ *
+ * \param errcode_ret optional; receives CL_SUCCESS or the error code
+ * \return the new event, or 0 when \p context is not a context object
+ *         (CL_INVALID_CONTEXT) or the event constructor reports an error.
+ */
+cl_event
+clCreateUserEvent(cl_context     context,
+                  cl_int *       errcode_ret)
+{
+    cl_int local_errcode;
+    cl_int *err = errcode_ret ? errcode_ret : &local_errcode;
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *err = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    *err = CL_SUCCESS;
+
+    Coal::UserEvent *user_event =
+        new Coal::UserEvent((Coal::Context *)context, err);
+
+    if (*err == CL_SUCCESS)
+        return (cl_event)user_event;
+
+    // The constructor reported a failure: discard the object.
+    delete user_event;
+    return 0;
+}
+
+/**
+ * \brief Set the status of a user event.
+ *
+ * Only CL_COMPLETE is accepted as \p execution_status, and only while the
+ * event is still in the CL_SUBMITTED state.
+ *
+ * \return CL_INVALID_EVENT if \p event is not a user event, CL_INVALID_VALUE
+ *         for any status other than CL_COMPLETE, CL_INVALID_OPERATION when
+ *         the event is no longer CL_SUBMITTED, CL_SUCCESS otherwise.
+ */
+cl_int
+clSetUserEventStatus(cl_event    event,
+                     cl_int      execution_status)
+{
+    Coal::Event *user_event = (Coal::Event *)event;
+
+    const bool is_user_event =
+        user_event->isA(Coal::Object::T_Event) &&
+        user_event->type() == Coal::Event::User;
+
+    if (!is_user_event)
+        return CL_INVALID_EVENT;
+
+    if (execution_status != CL_COMPLETE)
+        return CL_INVALID_VALUE;
+
+    if (user_event->status() != CL_SUBMITTED)
+        return CL_INVALID_OPERATION;
+
+    user_event->setStatus((Coal::Event::Status)execution_status);
+
+    return CL_SUCCESS;
+}
diff --git a/src/api/api_flush.cpp b/src/api/api_flush.cpp
new file mode 100644
index 0000000..c0e93a7
--- /dev/null
+++ b/src/api/api_flush.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_flush.cpp
+ * \brief clFlush and clFinish
+ */
+
+#include "CL/cl.h"
+#include "core/commandqueue.h"
+
+// Flush and Finish APIs
+/**
+ * \brief Call flush() on \p command_queue.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE if the handle is not a command queue,
+ *         CL_SUCCESS otherwise.
+ */
+cl_int
+clFlush(cl_command_queue command_queue)
+{
+    if (command_queue->isA(Coal::Object::T_CommandQueue))
+    {
+        command_queue->flush();
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_COMMAND_QUEUE;
+}
+
+/**
+ * \brief Call finish() on \p command_queue.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE if the handle is not a command queue,
+ *         CL_SUCCESS otherwise.
+ */
+cl_int
+clFinish(cl_command_queue command_queue)
+{
+    if (command_queue->isA(Coal::Object::T_CommandQueue))
+    {
+        command_queue->finish();
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_COMMAND_QUEUE;
+}
diff --git a/src/api/api_gl.cpp b/src/api/api_gl.cpp
new file mode 100644
index 0000000..0f06499
--- /dev/null
+++ b/src/api/api_gl.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_gl.cpp
+ * \brief OpenGL bindings (unimplemented)
+ */
+
+#define GL_GLEXT_PROTOTYPES
+#include "GL/gl.h"
+#include "GL/glext.h"
+
+#include "CL/cl.h"
+#include "CL/cl_gl.h"
+
+/**
+ * \brief Unimplemented OpenGL interop entry point.
+ *
+ * No memory object is ever created. The failure is reported through
+ * \p errcode_ret (when given) so that callers do not read an uninitialised
+ * error code alongside the null handle.
+ *
+ * \return always 0
+ */
+cl_mem
+clCreateFromGLBuffer(cl_context      context,
+                     cl_mem_flags    flags,
+                     GLuint          bufobj,
+                     int *           errcode_ret)
+{
+    // GL sharing is not implemented, so no context can have been created
+    // from a GL context.
+    if (errcode_ret)
+        *errcode_ret = CL_INVALID_CONTEXT;
+
+    return 0;
+}
+
+/**
+ * \brief Unimplemented OpenGL interop entry point.
+ *
+ * No memory object is ever created. The failure is reported through
+ * \p errcode_ret (when given) so that callers do not read an uninitialised
+ * error code alongside the null handle.
+ *
+ * \return always 0
+ */
+cl_mem
+clCreateFromGLTexture2D(cl_context       context,
+                        cl_mem_flags     flags,
+                        GLenum           target,
+                        GLint            miplevel,
+                        GLuint           texture,
+                        int *            errcode_ret)
+{
+    // GL sharing is not implemented, so no context can have been created
+    // from a GL context.
+    if (errcode_ret)
+        *errcode_ret = CL_INVALID_CONTEXT;
+
+    return 0;
+}
+
+/**
+ * \brief Unimplemented OpenGL interop entry point.
+ *
+ * No memory object is ever created. The failure is reported through
+ * \p errcode_ret (when given) so that callers do not read an uninitialised
+ * error code alongside the null handle.
+ *
+ * \return always 0
+ */
+cl_mem
+clCreateFromGLTexture3D(cl_context       context,
+                        cl_mem_flags     flags,
+                        GLenum           target,
+                        GLint            miplevel,
+                        GLuint           texture,
+                        int *            errcode_ret)
+{
+    // GL sharing is not implemented, so no context can have been created
+    // from a GL context.
+    if (errcode_ret)
+        *errcode_ret = CL_INVALID_CONTEXT;
+
+    return 0;
+}
+
+/**
+ * \brief Unimplemented OpenGL interop entry point.
+ *
+ * No memory object is ever created. The failure is reported through
+ * \p errcode_ret (when given) so that callers do not read an uninitialised
+ * error code alongside the null handle.
+ *
+ * \return always 0
+ */
+cl_mem
+clCreateFromGLRenderbuffer(cl_context    context,
+                           cl_mem_flags  flags,
+                           GLuint        renderbuffer,
+                           int *         errcode_ret)
+{
+    // GL sharing is not implemented, so no context can have been created
+    // from a GL context.
+    if (errcode_ret)
+        *errcode_ret = CL_INVALID_CONTEXT;
+
+    return 0;
+}
+
+/**
+ * \brief Unimplemented OpenGL interop query.
+ *
+ * None of the clCreateFromGL* entry points in this file ever creates a
+ * memory object, so no memory object can be backed by a GL object. An error
+ * is returned instead of reporting success with the outputs left unwritten.
+ *
+ * \return always CL_INVALID_GL_OBJECT
+ */
+cl_int
+clGetGLObjectInfo(cl_mem                 memobj,
+                  cl_gl_object_type *    gl_object_type,
+                  GLuint *               gl_object_name)
+{
+    return CL_INVALID_GL_OBJECT;
+}
+
+/**
+ * \brief Unimplemented OpenGL interop query.
+ *
+ * None of the clCreateFromGL* entry points in this file ever creates a
+ * memory object, so no memory object can be backed by a GL texture. An error
+ * is returned instead of reporting success with the outputs left unwritten.
+ *
+ * \return always CL_INVALID_GL_OBJECT
+ */
+cl_int
+clGetGLTextureInfo(cl_mem                memobj,
+                   cl_gl_texture_info    param_name,
+                   size_t                param_value_size,
+                   void *                param_value,
+                   size_t *              param_value_size_ret)
+{
+    return CL_INVALID_GL_OBJECT;
+}
+
+/**
+ * \brief Unimplemented OpenGL interop stub.
+ *
+ * Does nothing with its arguments: no command is enqueued and \p event is
+ * not written.
+ *
+ * \return always CL_SUCCESS
+ */
+cl_int
+clEnqueueAcquireGLObjects(cl_command_queue   command_queue,
+                          cl_uint            num_objects,
+                          const cl_mem *     mem_objects,
+                          cl_uint            num_events_in_wait_list,
+                          const cl_event *   event_wait_list,
+                          cl_event *         event)
+{
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Unimplemented OpenGL interop stub.
+ *
+ * Does nothing with its arguments: no command is enqueued and \p event is
+ * not written.
+ *
+ * \return always CL_SUCCESS
+ */
+cl_int
+clEnqueueReleaseGLObjects(cl_command_queue   command_queue,
+                          cl_uint            num_objects,
+                          const cl_mem *     mem_objects,
+                          cl_uint            num_events_in_wait_list,
+                          const cl_event *   event_wait_list,
+                          cl_event *         event)
+{
+    return CL_SUCCESS;
+}
diff --git a/src/api/api_kernel.cpp b/src/api/api_kernel.cpp
new file mode 100644
index 0000000..abc492b
--- /dev/null
+++ b/src/api/api_kernel.cpp
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_kernel.cpp
+ * \brief Kernels
+ */
+
+#include "CL/cl.h"
+
+#include <core/program.h>
+#include <core/kernel.h>
+
+// Kernel Object APIs
+/**
+ * \brief Create a kernel object for a named function of a built program
+ *
+ * \param program program to look the kernel up in; must be in the
+ *        \c Coal::Program::Built state
+ * \param kernel_name name of the kernel function, must not be NULL
+ * \param errcode_ret optional; receives \c CL_SUCCESS, \c CL_INVALID_VALUE
+ *        (NULL name), \c CL_INVALID_PROGRAM, \c CL_INVALID_PROGRAM_EXECUTABLE
+ *        or the error reported by the program while creating the kernel
+ * \return the kernel, or 0 on failure
+ */
+cl_kernel
+clCreateKernel(cl_program       program,
+               const char *     kernel_name,
+               cl_int *         errcode_ret)
+{
+    cl_int dummy_errcode;
+
+    // Let the rest of the function write the error code unconditionally
+    if (!errcode_ret)
+        errcode_ret = &dummy_errcode;
+
+    if (!kernel_name)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return 0;
+    }
+
+    if (!program->isA(Coal::Object::T_Program))
+    {
+        *errcode_ret = CL_INVALID_PROGRAM;
+        return 0;
+    }
+
+    if (program->state() != Coal::Program::Built)
+    {
+        *errcode_ret = CL_INVALID_PROGRAM_EXECUTABLE;
+        return 0;
+    }
+
+    // The error code is tested below, so it must hold a defined value even
+    // if the callee only writes it on failure
+    *errcode_ret = CL_SUCCESS;
+
+    Coal::Kernel *kernel = program->createKernelsAndReturnKernel(kernel_name, errcode_ret);
+
+    if (*errcode_ret != CL_SUCCESS)
+    {
+        delete kernel;
+        return 0;
+    }
+
+    return (cl_kernel)kernel;
+}
+
+/**
+ * \brief Create kernel objects for all the kernels of a built program
+ *
+ * \param program program to create the kernels from; must be in the
+ *        \c Coal::Program::Built state
+ * \param num_kernels capacity of \p kernels
+ * \param kernels optional array receiving the created kernels
+ * \param num_kernels_ret optional; receives the number of kernels created
+ * \return \c CL_SUCCESS, \c CL_INVALID_PROGRAM,
+ *         \c CL_INVALID_PROGRAM_EXECUTABLE, the error reported by
+ *         \c createKernels(), or \c CL_INVALID_VALUE when \p kernels is
+ *         too small
+ */
+cl_int
+clCreateKernelsInProgram(cl_program     program,
+                         cl_uint        num_kernels,
+                         cl_kernel *    kernels,
+                         cl_uint *      num_kernels_ret)
+{
+    if (!program->isA(Coal::Object::T_Program))
+        return CL_INVALID_PROGRAM;
+
+    if (program->state() != Coal::Program::Built)
+        return CL_INVALID_PROGRAM_EXECUTABLE;
+
+    cl_int status = CL_SUCCESS;
+    std::vector<Coal::Kernel *> created = program->createKernels(&status);
+
+    if (status == CL_SUCCESS)
+    {
+        // The count is reported even when the array turns out too small
+        if (num_kernels_ret)
+            *num_kernels_ret = created.size();
+
+        if (kernels && num_kernels < created.size())
+            status = CL_INVALID_VALUE;
+    }
+
+    if (status != CL_SUCCESS)
+    {
+        // Nothing is handed to the caller: destroy what was created,
+        // last kernel first
+        while (!created.empty())
+        {
+            delete created.back();
+            created.pop_back();
+        }
+
+        return status;
+    }
+
+    // When no array is given the kernels are deliberately not deleted here
+    // (that cleanup was disabled by the original author).
+    // NOTE(review): presumably the program keeps track of them - confirm.
+    if (kernels)
+    {
+        for (size_t idx = 0; idx < created.size(); ++idx)
+            kernels[idx] = (cl_kernel)created[idx];
+    }
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Take an additional reference on a kernel
+ * \param kernel kernel to retain
+ * \return \c CL_SUCCESS, or \c CL_INVALID_KERNEL if \p kernel is not a kernel
+ */
+cl_int
+clRetainKernel(cl_kernel kernel)
+{
+    if (kernel->isA(Coal::Object::T_Kernel))
+    {
+        kernel->reference();
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_KERNEL;
+}
+
+/**
+ * \brief Drop a reference on a kernel
+ *
+ * When \c dereference() returns true, the entries of the parent program's
+ * \c kernelList whose name equals the kernel's name are moved to its
+ * \c kernelReleasedList. The kernel object itself is not deleted here (see
+ * the BUG note in the body).
+ *
+ * \param kernel kernel to release
+ * \return \c CL_SUCCESS, or \c CL_INVALID_KERNEL if \p kernel is not a kernel
+ */
+cl_int
+clReleaseKernel(cl_kernel kernel)
+{
+ if (!kernel->isA(Coal::Object::T_Kernel))
+ return CL_INVALID_KERNEL;
+
+ if (kernel->dereference())
+ {
+ // The parent of a kernel is treated as the program that created it
+ Coal::Program *p =(Coal::Program *) kernel->parent();
+
+ // NOTE(review): entries are matched by name, not by pointer, and the
+ // element following an erased one is skipped because i is still
+ // incremented after erase() - confirm that at most one entry per
+ // name can exist.
+ for (size_t i=0; i < p->kernelList.size(); i++)
+ {
+ if (p->kernelList[i]->p_name.compare(kernel->p_name) == 0)
+ {
+ p->kernelReleasedList.push_back(p->kernelList[i]);
+ p->kernelList.erase(p->kernelList.begin() + i);
+ // BUG: TAG
+ // For some odd reason, deleting the kernel here corrupts the inside of some kernel objects
+ //delete kernel;
+ }
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+/**
+ * \brief Set the value of a kernel argument
+ * \param kernel kernel whose argument is set
+ * \param arg_indx index of the argument
+ * \param arg_size size in bytes of the data pointed to by \p arg_value
+ * \param arg_value pointer to the argument data
+ * \return \c CL_INVALID_KERNEL if \p kernel is not a kernel, otherwise the
+ *         result of \c setArg()
+ */
+cl_int
+clSetKernelArg(cl_kernel    kernel,
+               cl_uint      arg_indx,
+               size_t       arg_size,
+               const void * arg_value)
+{
+    if (kernel->isA(Coal::Object::T_Kernel))
+        return kernel->setArg(arg_indx, arg_size, arg_value);
+
+    return CL_INVALID_KERNEL;
+}
+
+/**
+ * \brief Query information about a kernel
+ * \param kernel kernel to query
+ * \param param_name information requested
+ * \param param_value_size size of the buffer behind \p param_value
+ * \param param_value buffer receiving the information
+ * \param param_value_size_ret receives the size of the information
+ * \return \c CL_INVALID_KERNEL if \p kernel is not a kernel, otherwise the
+ *         result of \c info()
+ */
+cl_int
+clGetKernelInfo(cl_kernel       kernel,
+                cl_kernel_info  param_name,
+                size_t          param_value_size,
+                void *          param_value,
+                size_t *        param_value_size_ret)
+{
+    if (kernel->isA(Coal::Object::T_Kernel))
+    {
+        return kernel->info(param_name, param_value_size, param_value,
+                            param_value_size_ret);
+    }
+
+    return CL_INVALID_KERNEL;
+}
+
+/**
+ * \brief Query work-group information of a kernel for a device
+ * \param kernel kernel to query
+ * \param device device the information is requested for
+ * \param param_name information requested
+ * \param param_value_size size of the buffer behind \p param_value
+ * \param param_value buffer receiving the information
+ * \param param_value_size_ret receives the size of the information
+ * \return \c CL_INVALID_KERNEL if \p kernel is not a kernel, otherwise the
+ *         result of \c workGroupInfo()
+ */
+cl_int
+clGetKernelWorkGroupInfo(cl_kernel                  kernel,
+                         cl_device_id               device,
+                         cl_kernel_work_group_info  param_name,
+                         size_t                     param_value_size,
+                         void *                     param_value,
+                         size_t *                   param_value_size_ret)
+{
+    if (kernel->isA(Coal::Object::T_Kernel))
+    {
+        Coal::DeviceInterface *target = (Coal::DeviceInterface *)device;
+
+        return kernel->workGroupInfo(target, param_name, param_value_size,
+                                     param_value, param_value_size_ret);
+    }
+
+    return CL_INVALID_KERNEL;
+}
diff --git a/src/api/api_memory.cpp b/src/api/api_memory.cpp
new file mode 100644
index 0000000..18e6bab
--- /dev/null
+++ b/src/api/api_memory.cpp
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_memory.cpp
+ * \brief Memory objects
+ */
+
+#include "CL/cl.h"
+#include <core/memobject.h>
+#include <core/context.h>
+
+#include <cstring>
+
+// Memory Object APIs
+/**
+ * \brief Create a buffer object
+ * \param context context the buffer belongs to
+ * \param flags memory flags, forwarded to \c Coal::Buffer
+ * \param size size of the buffer, forwarded to \c Coal::Buffer
+ * \param host_ptr host pointer, forwarded to \c Coal::Buffer
+ * \param errcode_ret optional; receives \c CL_SUCCESS,
+ *        \c CL_INVALID_CONTEXT, or the error reported by the buffer's
+ *        constructor or \c init()
+ * \return the buffer, or 0 on failure
+ */
+cl_mem
+clCreateBuffer(cl_context   context,
+               cl_mem_flags flags,
+               size_t       size,
+               void *       host_ptr,
+               cl_int *     errcode_ret)
+{
+    // Write errors to a local when the caller does not want them
+    cl_int fallback_errcode;
+    cl_int *status = errcode_ret ? errcode_ret : &fallback_errcode;
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *status = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    *status = CL_SUCCESS;
+
+    Coal::Buffer *buf = new Coal::Buffer(context, size, host_ptr, flags,
+                                         status);
+
+    // init() is only attempted on a successfully constructed object
+    if (*status == CL_SUCCESS)
+        *status = buf->init();
+
+    if (*status != CL_SUCCESS)
+    {
+        delete buf;
+        return 0;
+    }
+
+    return (cl_mem)buf;
+}
+
+/**
+ * \brief Create a sub-buffer covering a region of an existing buffer
+ * \param buffer parent memory object; must be of type
+ *        \c Coal::MemObject::Buffer
+ * \param flags memory flags, forwarded to \c Coal::SubBuffer
+ * \param buffer_create_type must be \c CL_BUFFER_CREATE_TYPE_REGION
+ * \param buffer_create_info pointer to a \c cl_buffer_region, not NULL
+ * \param errcode_ret optional; receives \c CL_SUCCESS,
+ *        \c CL_INVALID_MEM_OBJECT, \c CL_INVALID_VALUE, or the error
+ *        reported by the sub-buffer's constructor or \c init()
+ * \return the sub-buffer, or 0 on failure
+ */
+cl_mem
+clCreateSubBuffer(cl_mem                buffer,
+                  cl_mem_flags          flags,
+                  cl_buffer_create_type buffer_create_type,
+                  const void *          buffer_create_info,
+                  cl_int *              errcode_ret)
+{
+    // Write errors to a local when the caller does not want them
+    cl_int fallback_errcode;
+    cl_int *status = errcode_ret ? errcode_ret : &fallback_errcode;
+
+    Coal::MemObject *parent = (Coal::MemObject *)buffer;
+
+    // NOTE: Is it right ? Couldn't we create SubBuffers of images ?
+    if (!buffer->isA(Coal::Object::T_MemObject) ||
+        parent->type() != Coal::MemObject::Buffer)
+    {
+        *status = CL_INVALID_MEM_OBJECT;
+        return 0;
+    }
+
+    if (buffer_create_type != CL_BUFFER_CREATE_TYPE_REGION ||
+        !buffer_create_info)
+    {
+        *status = CL_INVALID_VALUE;
+        return 0;
+    }
+
+    const cl_buffer_region *region =
+        (const cl_buffer_region *)buffer_create_info;
+
+    *status = CL_SUCCESS;
+
+    Coal::SubBuffer *buf = new Coal::SubBuffer((Coal::Buffer *)buffer,
+                                               region->origin, region->size,
+                                               flags, status);
+
+    // init() is only attempted on a successfully constructed object
+    if (*status == CL_SUCCESS)
+        *status = buf->init();
+
+    if (*status != CL_SUCCESS)
+    {
+        delete buf;
+        return 0;
+    }
+
+    return (cl_mem)buf;
+}
+
+/**
+ * \brief Create a 2D image object
+ * \param context context the image belongs to
+ * \param flags memory flags, forwarded to \c Coal::Image2D
+ * \param image_format format of the image
+ * \param image_width width of the image
+ * \param image_height height of the image
+ * \param image_row_pitch row pitch of the image
+ * \param host_ptr host pointer, forwarded to \c Coal::Image2D
+ * \param errcode_ret optional; receives \c CL_SUCCESS,
+ *        \c CL_INVALID_CONTEXT, or the error reported by the image's
+ *        constructor or \c init()
+ * \return the image, or 0 on failure
+ */
+cl_mem
+clCreateImage2D(cl_context              context,
+                cl_mem_flags            flags,
+                const cl_image_format * image_format,
+                size_t                  image_width,
+                size_t                  image_height,
+                size_t                  image_row_pitch,
+                void *                  host_ptr,
+                cl_int *                errcode_ret)
+{
+    // Write errors to a local when the caller does not want them
+    cl_int fallback_errcode;
+    cl_int *status = errcode_ret ? errcode_ret : &fallback_errcode;
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *status = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    *status = CL_SUCCESS;
+
+    Coal::Image2D *image = new Coal::Image2D(context, image_width, image_height,
+                                             image_row_pitch, image_format,
+                                             host_ptr, flags, status);
+
+    // init() is only attempted on a successfully constructed object
+    if (*status == CL_SUCCESS)
+        *status = image->init();
+
+    if (*status != CL_SUCCESS)
+    {
+        delete image;
+        return 0;
+    }
+
+    return (cl_mem)image;
+}
+
+/**
+ * \brief Create a 3D image object
+ * \param context context the image belongs to
+ * \param flags memory flags, forwarded to \c Coal::Image3D
+ * \param image_format format of the image
+ * \param image_width width of the image
+ * \param image_height height of the image
+ * \param image_depth depth of the image
+ * \param image_row_pitch row pitch of the image
+ * \param image_slice_pitch slice pitch of the image
+ * \param host_ptr host pointer, forwarded to \c Coal::Image3D
+ * \param errcode_ret optional; receives \c CL_SUCCESS,
+ *        \c CL_INVALID_CONTEXT, or the error reported by the image's
+ *        constructor or \c init()
+ * \return the image, or 0 on failure
+ */
+cl_mem
+clCreateImage3D(cl_context              context,
+                cl_mem_flags            flags,
+                const cl_image_format * image_format,
+                size_t                  image_width,
+                size_t                  image_height,
+                size_t                  image_depth,
+                size_t                  image_row_pitch,
+                size_t                  image_slice_pitch,
+                void *                  host_ptr,
+                cl_int *                errcode_ret)
+{
+    // Write errors to a local when the caller does not want them
+    cl_int fallback_errcode;
+    cl_int *status = errcode_ret ? errcode_ret : &fallback_errcode;
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *status = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    *status = CL_SUCCESS;
+
+    Coal::Image3D *image = new Coal::Image3D(context, image_width, image_height,
+                                             image_depth, image_row_pitch,
+                                             image_slice_pitch, image_format,
+                                             host_ptr, flags, status);
+
+    // init() is only attempted on a successfully constructed object
+    if (*status == CL_SUCCESS)
+        *status = image->init();
+
+    if (*status != CL_SUCCESS)
+    {
+        delete image;
+        return 0;
+    }
+
+    return (cl_mem)image;
+}
+
+/**
+ * \brief Take an additional reference on a memory object
+ * \param memobj memory object to retain
+ * \return \c CL_SUCCESS, or \c CL_INVALID_MEM_OBJECT if \p memobj is not a
+ *         memory object
+ */
+cl_int
+clRetainMemObject(cl_mem memobj)
+{
+    if (memobj->isA(Coal::Object::T_MemObject))
+    {
+        memobj->reference();
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_MEM_OBJECT;
+}
+
+/**
+ * \brief Drop a reference on a memory object
+ *
+ * The object is deleted when \c dereference() returns true.
+ *
+ * \param memobj memory object to release
+ * \return \c CL_SUCCESS, or \c CL_INVALID_MEM_OBJECT if \p memobj is not a
+ *         memory object
+ */
+cl_int
+clReleaseMemObject(cl_mem memobj)
+{
+    if (memobj->isA(Coal::Object::T_MemObject))
+    {
+        if (memobj->dereference())
+            delete memobj;
+
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_MEM_OBJECT;
+}
+
+/**
+ * \brief Image formats reported by \c clGetSupportedImageFormats()
+ *
+ * Each entry pairs a channel order with a channel data type. The entries
+ * are grouped by channel order; the table is copied to the caller as is,
+ * whatever the flags and image type asked for.
+ */
+static cl_image_format supported_formats[] = {
+ { CL_RGBA, CL_UNORM_INT8 },
+ { CL_RGBA, CL_UNORM_INT16 },
+ { CL_RGBA, CL_SNORM_INT8 },
+ { CL_RGBA, CL_SNORM_INT16 },
+ { CL_RGBA, CL_SIGNED_INT8 },
+ { CL_RGBA, CL_SIGNED_INT16 },
+ { CL_RGBA, CL_SIGNED_INT32 },
+ { CL_RGBA, CL_UNSIGNED_INT8 },
+ { CL_RGBA, CL_UNSIGNED_INT16 },
+ { CL_RGBA, CL_UNSIGNED_INT32 },
+ { CL_RGBA, CL_FLOAT },
+
+ { CL_ARGB, CL_UNORM_INT8 },
+ { CL_ARGB, CL_SNORM_INT8 },
+ { CL_ARGB, CL_SIGNED_INT8 },
+ { CL_ARGB, CL_UNSIGNED_INT8 },
+
+ { CL_BGRA, CL_UNORM_INT8 },
+ { CL_BGRA, CL_SNORM_INT8 },
+ { CL_BGRA, CL_SIGNED_INT8 },
+ { CL_BGRA, CL_UNSIGNED_INT8 },
+
+ // Packed channel types
+ { CL_RGB, CL_UNORM_SHORT_565 },
+ { CL_RGB, CL_UNORM_SHORT_555 },
+ { CL_RGB, CL_UNORM_INT_101010 },
+
+ { CL_RGBx, CL_UNORM_SHORT_565 },
+ { CL_RGBx, CL_UNORM_SHORT_555 },
+ { CL_RGBx, CL_UNORM_INT_101010 },
+
+ { CL_RG, CL_UNORM_INT8 },
+ { CL_RG, CL_UNORM_INT16 },
+ { CL_RG, CL_SNORM_INT8 },
+ { CL_RG, CL_SNORM_INT16 },
+ { CL_RG, CL_SIGNED_INT8 },
+ { CL_RG, CL_SIGNED_INT16 },
+ { CL_RG, CL_SIGNED_INT32 },
+ { CL_RG, CL_UNSIGNED_INT8 },
+ { CL_RG, CL_UNSIGNED_INT16 },
+ { CL_RG, CL_UNSIGNED_INT32 },
+ { CL_RG, CL_FLOAT },
+
+ { CL_RGx, CL_UNORM_INT8 },
+ { CL_RGx, CL_UNORM_INT16 },
+ { CL_RGx, CL_SNORM_INT8 },
+ { CL_RGx, CL_SNORM_INT16 },
+ { CL_RGx, CL_SIGNED_INT8 },
+ { CL_RGx, CL_SIGNED_INT16 },
+ { CL_RGx, CL_SIGNED_INT32 },
+ { CL_RGx, CL_UNSIGNED_INT8 },
+ { CL_RGx, CL_UNSIGNED_INT16 },
+ { CL_RGx, CL_UNSIGNED_INT32 },
+ { CL_RGx, CL_FLOAT },
+
+ { CL_RA, CL_UNORM_INT8 },
+ { CL_RA, CL_UNORM_INT16 },
+ { CL_RA, CL_SNORM_INT8 },
+ { CL_RA, CL_SNORM_INT16 },
+ { CL_RA, CL_SIGNED_INT8 },
+ { CL_RA, CL_SIGNED_INT16 },
+ { CL_RA, CL_SIGNED_INT32 },
+ { CL_RA, CL_UNSIGNED_INT8 },
+ { CL_RA, CL_UNSIGNED_INT16 },
+ { CL_RA, CL_UNSIGNED_INT32 },
+ { CL_RA, CL_FLOAT },
+
+ { CL_R, CL_UNORM_INT8 },
+ { CL_R, CL_UNORM_INT16 },
+ { CL_R, CL_SNORM_INT8 },
+ { CL_R, CL_SNORM_INT16 },
+ { CL_R, CL_SIGNED_INT8 },
+ { CL_R, CL_SIGNED_INT16 },
+ { CL_R, CL_SIGNED_INT32 },
+ { CL_R, CL_UNSIGNED_INT8 },
+ { CL_R, CL_UNSIGNED_INT16 },
+ { CL_R, CL_UNSIGNED_INT32 },
+ { CL_R, CL_FLOAT },
+
+ { CL_Rx, CL_UNORM_INT8 },
+ { CL_Rx, CL_UNORM_INT16 },
+ { CL_Rx, CL_SNORM_INT8 },
+ { CL_Rx, CL_SNORM_INT16 },
+ { CL_Rx, CL_SIGNED_INT8 },
+ { CL_Rx, CL_SIGNED_INT16 },
+ { CL_Rx, CL_SIGNED_INT32 },
+ { CL_Rx, CL_UNSIGNED_INT8 },
+ { CL_Rx, CL_UNSIGNED_INT16 },
+ { CL_Rx, CL_UNSIGNED_INT32 },
+ { CL_Rx, CL_FLOAT },
+
+ { CL_A, CL_UNORM_INT8 },
+ { CL_A, CL_UNORM_INT16 },
+ { CL_A, CL_SNORM_INT8 },
+ { CL_A, CL_SNORM_INT16 },
+ { CL_A, CL_SIGNED_INT8 },
+ { CL_A, CL_SIGNED_INT16 },
+ { CL_A, CL_SIGNED_INT32 },
+ { CL_A, CL_UNSIGNED_INT8 },
+ { CL_A, CL_UNSIGNED_INT16 },
+ { CL_A, CL_UNSIGNED_INT32 },
+ { CL_A, CL_FLOAT },
+
+ // Luminance and intensity are listed with normalized and float types only
+ { CL_LUMINANCE, CL_UNORM_INT8 },
+ { CL_LUMINANCE, CL_UNORM_INT16 },
+ { CL_LUMINANCE, CL_SNORM_INT8 },
+ { CL_LUMINANCE, CL_SNORM_INT16 },
+ { CL_LUMINANCE, CL_FLOAT },
+
+ { CL_INTENSITY, CL_UNORM_INT8 },
+ { CL_INTENSITY, CL_UNORM_INT16 },
+ { CL_INTENSITY, CL_SNORM_INT8 },
+ { CL_INTENSITY, CL_SNORM_INT16 },
+ { CL_INTENSITY, CL_FLOAT }
+};
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+/**
+ * \brief List the image formats supported by a context
+ *
+ * The same table (\c supported_formats) is reported whatever \p flags and
+ * \p image_type are.
+ *
+ * \param context context queried
+ * \param flags ignored
+ * \param image_type ignored
+ * \param num_entries capacity, in formats, of \p image_formats
+ * \param image_formats optional array receiving at most \p num_entries
+ *        formats
+ * \param num_image_formats optional; receives the total number of supported
+ *        formats
+ * \return \c CL_SUCCESS, \c CL_INVALID_CONTEXT, or \c CL_INVALID_VALUE when
+ *         \p image_formats is given with \p num_entries equal to 0
+ */
+cl_int
+clGetSupportedImageFormats(cl_context           context,
+                           cl_mem_flags         flags,
+                           cl_mem_object_type   image_type,
+                           cl_uint              num_entries,
+                           cl_image_format *    image_formats,
+                           cl_uint *            num_image_formats)
+{
+    if (!context->isA(Coal::Object::T_Context))
+        return CL_INVALID_CONTEXT;
+
+    (void) flags;
+    (void) image_type;
+
+    if (!num_entries && image_formats)
+        return CL_INVALID_VALUE;
+
+    const cl_uint total_formats =
+        sizeof(supported_formats) / sizeof(supported_formats[0]);
+
+    if (image_formats)
+    {
+        // Clamp the number of elements, not the number of bytes:
+        // num_entries * sizeof(cl_image_format) can wrap around where
+        // size_t is 32 bits wide, which made the copy shorter than needed.
+        const cl_uint copied =
+            num_entries < total_formats ? num_entries : total_formats;
+
+        std::memcpy(image_formats, supported_formats,
+                    copied * sizeof(cl_image_format));
+    }
+
+    if (num_image_formats)
+        *num_image_formats = total_formats;
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Query information about a memory object
+ * \param memobj memory object to query
+ * \param param_name information requested
+ * \param param_value_size size of the buffer behind \p param_value
+ * \param param_value buffer receiving the information
+ * \param param_value_size_ret receives the size of the information
+ * \return \c CL_INVALID_MEM_OBJECT if \p memobj is not a memory object,
+ *         otherwise the result of \c info()
+ */
+cl_int
+clGetMemObjectInfo(cl_mem       memobj,
+                   cl_mem_info  param_name,
+                   size_t       param_value_size,
+                   void *       param_value,
+                   size_t *     param_value_size_ret)
+{
+    if (memobj->isA(Coal::Object::T_MemObject))
+    {
+        return memobj->info(param_name, param_value_size, param_value,
+                            param_value_size_ret);
+    }
+
+    return CL_INVALID_MEM_OBJECT;
+}
+
+/**
+ * \brief Query image-specific information about a memory object
+ * \param image memory object to query; must be a 2D or 3D image
+ * \param param_name information requested
+ * \param param_value_size size of the buffer behind \p param_value
+ * \param param_value buffer receiving the information
+ * \param param_value_size_ret receives the size of the information
+ * \return \c CL_INVALID_MEM_OBJECT if \p image is not an image, otherwise
+ *         the result of \c imageInfo()
+ */
+cl_int
+clGetImageInfo(cl_mem           image,
+               cl_image_info    param_name,
+               size_t           param_value_size,
+               void *           param_value,
+               size_t *         param_value_size_ret)
+{
+    if (!image->isA(Coal::Object::T_MemObject))
+        return CL_INVALID_MEM_OBJECT;
+
+    if (image->type() != Coal::MemObject::Image2D &&
+        image->type() != Coal::MemObject::Image3D)
+        return CL_INVALID_MEM_OBJECT;
+
+    // Both image types are accessed through the Image2D interface
+    return ((Coal::Image2D *)image)->imageInfo(param_name, param_value_size,
+                                               param_value,
+                                               param_value_size_ret);
+}
+
+/**
+ * \brief Register a destructor callback on a memory object
+ * \param memobj memory object the callback is attached to
+ * \param pfn_notify callback, forwarded to \c setDestructorCallback()
+ * \param user_data opaque pointer forwarded along with \p pfn_notify
+ * \return \c CL_SUCCESS, or \c CL_INVALID_MEM_OBJECT if \p memobj is not a
+ *         memory object
+ */
+cl_int
+clSetMemObjectDestructorCallback(cl_mem memobj,
+                                 void   (CL_CALLBACK *pfn_notify)(cl_mem memobj,
+                                                                  void *user_data),
+                                 void * user_data)
+{
+    if (memobj->isA(Coal::Object::T_MemObject))
+    {
+        memobj->setDestructorCallback(pfn_notify, user_data);
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_MEM_OBJECT;
+}
+
diff --git a/src/api/api_platform.cpp b/src/api/api_platform.cpp
new file mode 100644
index 0000000..cf064ef
--- /dev/null
+++ b/src/api/api_platform.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_platform.cpp
+ * \brief Platform
+ */
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+#include <core/platform.h>
+#include <core/config.h>
+#include <cstring>
+
+// Platform API
+
+/**
+ * \brief List the available platforms
+ *
+ * There is a single platform, \c the_platform.
+ *
+ * \param num_entries capacity of \p platforms
+ * \param platforms optional array receiving the platform
+ * \param num_platforms optional; receives the number of platforms (1)
+ * \return \c CL_SUCCESS, or \c CL_INVALID_VALUE when both output arguments
+ *         are NULL or when \p platforms is given with \p num_entries
+ *         equal to 0
+ */
+cl_int CL_API_CALL
+clGetPlatformIDs(cl_uint            num_entries,
+                 cl_platform_id *   platforms,
+                 cl_uint *          num_platforms)
+{
+    if (!num_platforms && !platforms)
+        return CL_INVALID_VALUE;
+
+    if (num_platforms)
+        *num_platforms = 1;
+
+    if (platforms)
+    {
+        if (num_entries == 0)
+            return CL_INVALID_VALUE;
+
+        /*---------------------------------------------------------------------
+        * Only one "default" platform
+        *--------------------------------------------------------------------*/
+        *platforms = &the_platform;
+    }
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Query information about the platform
+ * \param platform platform to query; must be the one handed out by
+ *        \c clGetPlatformIDs()
+ * \param param_name information requested
+ * \param param_value_size size of the buffer behind \p param_value
+ * \param param_value buffer receiving the information
+ * \param param_value_size_ret receives the size of the information
+ * \return \c CL_INVALID_PLATFORM if \p platform is not \c the_platform,
+ *         otherwise the result of \c info()
+ */
+cl_int CL_API_CALL
+clGetPlatformInfo(cl_platform_id    platform,
+                  cl_platform_info  param_name,
+                  size_t            param_value_size,
+                  void *            param_value,
+                  size_t *          param_value_size_ret)
+{
+    /*-------------------------------------------------------------------------
+    * Only the platform returned by clGetPlatformIDs is accepted; any other
+    * value, NULL included, is rejected
+    *------------------------------------------------------------------------*/
+    if (platform != &the_platform) return CL_INVALID_PLATFORM;
+
+    return platform->info(param_name, param_value_size, param_value,
+                          param_value_size_ret);
+}
+
+/******************************************************************************
+* Return a pointer to any supported extension functions
+*
+* Only "clIcdGetPlatformIDsKHR" is known; it resolves to clGetPlatformIDs.
+* NULL is returned for any other name, and for a NULL name.
+******************************************************************************/
+void * clGetExtensionFunctionAddress(const char *funcname)
+{
+    // strcmp() on a null pointer is undefined behaviour
+    if (!funcname)
+        return NULL;
+
+    if (std::strcmp(funcname, "clIcdGetPlatformIDsKHR") == 0)
+        return (void*)clGetPlatformIDs;
+
+    return NULL;
+}
+
diff --git a/src/api/api_profiling.cpp b/src/api/api_profiling.cpp
new file mode 100644
index 0000000..0abec66
--- /dev/null
+++ b/src/api/api_profiling.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_profiling.cpp
+ * \brief Profiling of events
+ */
+
+#include "CL/cl.h"
+#include <core/commandqueue.h>
+
+// Profiling APIs
+/**
+ * \brief Query profiling information about an event
+ * \param event event to query
+ * \param param_name information requested
+ * \param param_value_size size of the buffer behind \p param_value
+ * \param param_value buffer receiving the information
+ * \param param_value_size_ret receives the size of the information
+ * \return \c CL_INVALID_EVENT if \p event is not an event, otherwise the
+ *         result of \c profilingInfo()
+ */
+cl_int
+clGetEventProfilingInfo(cl_event            event,
+                        cl_profiling_info   param_name,
+                        size_t              param_value_size,
+                        void *              param_value,
+                        size_t *            param_value_size_ret)
+{
+    if (event->isA(Coal::Object::T_Event))
+    {
+        return event->profilingInfo(param_name, param_value_size, param_value,
+                                    param_value_size_ret);
+    }
+
+    return CL_INVALID_EVENT;
+}
+
diff --git a/src/api/api_program.cpp b/src/api/api_program.cpp
new file mode 100644
index 0000000..af30510
--- /dev/null
+++ b/src/api/api_program.cpp
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_program.cpp
+ * \brief Programs
+ */
+
+#include "CL/cl.h"
+#include <core/program.h>
+#include <core/context.h>
+
+#include <cstdlib>
+
+// Program Object APIs
+/**
+ * \brief Create a program object from source strings
+ * \param context context the program belongs to
+ * \param count number of entries in \p strings, must not be 0
+ * \param strings source strings, must not be NULL
+ * \param lengths lengths of the strings, forwarded to \c loadSources()
+ * \param errcode_ret optional; receives \c CL_SUCCESS,
+ *        \c CL_INVALID_CONTEXT, \c CL_INVALID_VALUE, or the error reported
+ *        by \c loadSources()
+ * \return the program, or 0 on failure
+ */
+cl_program
+clCreateProgramWithSource(cl_context        context,
+                          cl_uint           count,
+                          const char **     strings,
+                          const size_t *    lengths,
+                          cl_int *          errcode_ret)
+{
+    // Write errors to a local when the caller does not want them
+    cl_int fallback_errcode;
+    cl_int *status = errcode_ret ? errcode_ret : &fallback_errcode;
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *status = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    if (count == 0 || strings == 0)
+    {
+        *status = CL_INVALID_VALUE;
+        return 0;
+    }
+
+    Coal::Program *program = new Coal::Program(context);
+
+    *status = program->loadSources(count, strings, lengths);
+
+    if (*status == CL_SUCCESS)
+        return (cl_program)program;
+
+    delete program;
+    return 0;
+}
+
+/**
+ * \brief Create a program object from per-device binaries
+ *
+ * Every device of \p device_list must belong to \p context and must come
+ * with a non-empty binary.
+ *
+ * \param context context the program belongs to
+ * \param num_devices number of entries in the arrays, must not be 0
+ * \param device_list devices the binaries are for, must not be NULL
+ * \param lengths sizes of the binaries, must not be NULL
+ * \param binaries the binaries, must not be NULL
+ * \param binary_status optional per-device status array
+ * \param errcode_ret optional; receives \c CL_SUCCESS,
+ *        \c CL_INVALID_CONTEXT, \c CL_INVALID_VALUE, \c CL_INVALID_DEVICE,
+ *        \c CL_OUT_OF_HOST_MEMORY, or the error reported by the context
+ *        query or by \c loadBinaries()
+ * \return the program, or 0 on failure
+ */
+cl_program
+clCreateProgramWithBinary(cl_context            context,
+                          cl_uint               num_devices,
+                          const cl_device_id *  device_list,
+                          const size_t *        lengths,
+                          const unsigned char **binaries,
+                          cl_int *              binary_status,
+                          cl_int *              errcode_ret)
+{
+    cl_int dummy_errcode;
+
+    if (!errcode_ret)
+        errcode_ret = &dummy_errcode;
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *errcode_ret = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    if (!num_devices || !device_list || !lengths || !binaries)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return 0;
+    }
+
+    // Fetch the devices of the context to check the given ones against
+    cl_uint context_num_devices = 0;
+
+    *errcode_ret = context->info(CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint),
+                                 &context_num_devices, 0);
+
+    if (*errcode_ret != CL_SUCCESS)
+        return 0;
+
+    cl_device_id *context_devices =
+        (cl_device_id *)std::malloc(context_num_devices * sizeof(cl_device_id));
+
+    // malloc(0) may legitimately return a null pointer
+    if (!context_devices && context_num_devices)
+    {
+        *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+        return 0;
+    }
+
+    *errcode_ret = context->info(CL_CONTEXT_DEVICES,
+                                 context_num_devices * sizeof(cl_device_id),
+                                 context_devices, 0);
+
+    // Check the devices for compliance; the loop stops at the first error
+    for (cl_uint i=0; *errcode_ret == CL_SUCCESS && i<num_devices; ++i)
+    {
+        if (!lengths[i] || !binaries[i])
+        {
+            if (binary_status)
+                binary_status[i] = CL_INVALID_VALUE;
+
+            *errcode_ret = CL_INVALID_VALUE;
+            break;
+        }
+
+        bool found = false;
+
+        for (cl_uint j=0; j<context_num_devices; ++j)
+        {
+            if (device_list[i] == context_devices[j])
+            {
+                found = true;
+                break;
+            }
+        }
+
+        if (!found)
+            *errcode_ret = CL_INVALID_DEVICE;
+    }
+
+    // The list was only needed for the validation above; release it on
+    // every path instead of leaking it
+    std::free(context_devices);
+
+    if (*errcode_ret != CL_SUCCESS)
+        return 0;
+
+    // Create a program
+    Coal::Program *program = new Coal::Program(context);
+
+    // Init program
+    *errcode_ret = program->loadBinaries(binaries,
+                                         lengths, binary_status, num_devices,
+                                         (Coal::DeviceInterface * const*)device_list);
+
+    if (*errcode_ret != CL_SUCCESS)
+    {
+        delete program;
+        return 0;
+    }
+
+    return (cl_program)program;
+}
+
+/**
+ * \brief Take an additional reference on a program
+ * \param program program to retain
+ * \return \c CL_SUCCESS, or \c CL_INVALID_PROGRAM if \p program is not a
+ *         program
+ */
+cl_int
+clRetainProgram(cl_program program)
+{
+    if (program->isA(Coal::Object::T_Program))
+    {
+        program->reference();
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_PROGRAM;
+}
+
+/**
+ * \brief Drop a reference on a program
+ *
+ * The program is deleted when \c dereference() returns true.
+ *
+ * \param program program to release
+ * \return \c CL_SUCCESS, or \c CL_INVALID_PROGRAM if \p program is not a
+ *         program
+ */
+cl_int
+clReleaseProgram(cl_program program)
+{
+    if (program->isA(Coal::Object::T_Program))
+    {
+        if (program->dereference())
+            delete program;
+
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_PROGRAM;
+}
+
+/**
+ * \brief Build a program for a list of devices
+ *
+ * When no device list is given, the program is built for all the devices
+ * of its context.
+ *
+ * \param program program to build; must be in the \c Loaded or \c Built
+ *        state
+ * \param num_devices number of entries in \p device_list
+ * \param device_list devices to build for; each must belong to the
+ *        program's context
+ * \param options build options, forwarded to \c build()
+ * \param pfn_notify optional callback, forwarded to \c build()
+ * \param user_data opaque pointer for \p pfn_notify; must be NULL when
+ *        \p pfn_notify is NULL
+ * \return \c CL_INVALID_PROGRAM, \c CL_INVALID_VALUE, \c CL_INVALID_DEVICE,
+ *         \c CL_INVALID_OPERATION, \c CL_OUT_OF_HOST_MEMORY, the error
+ *         reported by the context query, or the result of \c build()
+ */
+cl_int
+clBuildProgram(cl_program           program,
+               cl_uint              num_devices,
+               const cl_device_id * device_list,
+               const char *         options,
+               void (*pfn_notify)(cl_program program, void * user_data),
+               void *               user_data)
+{
+    if (!program->isA(Coal::Object::T_Program))
+        return CL_INVALID_PROGRAM;
+
+    if (!device_list && num_devices > 0)
+        return CL_INVALID_VALUE;
+
+    if (!num_devices && device_list)
+        return CL_INVALID_VALUE;
+
+    if (!pfn_notify && user_data)
+        return CL_INVALID_VALUE;
+
+    // Fetch the devices of the context the program belongs to
+    Coal::Context *context = (Coal::Context *)program->parent();
+    cl_uint context_num_devices = 0;
+
+    cl_int result = context->info(CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint),
+                                  &context_num_devices, 0);
+
+    if (result != CL_SUCCESS) return result;
+
+    cl_device_id *context_devices =
+        (cl_device_id *)std::malloc(context_num_devices * sizeof(cl_device_id));
+
+    // malloc(0) may legitimately return a null pointer
+    if (!context_devices && context_num_devices)
+        return CL_OUT_OF_HOST_MEMORY;
+
+    result = context->info(CL_CONTEXT_DEVICES,
+                           context_num_devices * sizeof(cl_device_id),
+                           context_devices, 0);
+
+    if (result == CL_SUCCESS)
+    {
+        if (num_devices)
+        {
+            // Check the devices for compliance; stop at the first error
+            for (cl_uint i=0; result == CL_SUCCESS && i<num_devices; ++i)
+            {
+                bool found = false;
+
+                for (cl_uint j=0; j<context_num_devices; ++j)
+                {
+                    if (device_list[i] == context_devices[j])
+                    {
+                        found = true;
+                        break;
+                    }
+                }
+
+                if (!found)
+                    result = CL_INVALID_DEVICE;
+            }
+        }
+        else
+        {
+            // No list given: build for every device of the context
+            num_devices = context_num_devices;
+            device_list = context_devices;
+        }
+    }
+
+    // We cannot try to build a previously-failed program
+    if (result == CL_SUCCESS &&
+        !(program->state() == Coal::Program::Loaded ||
+          program->state() == Coal::Program::Built))
+        result = CL_INVALID_OPERATION;
+
+    // Build program
+    if (result == CL_SUCCESS)
+    {
+        result = program->build(options, pfn_notify, user_data, num_devices,
+                                (Coal::DeviceInterface * const*)device_list);
+    }
+
+    // Release the device list on every path instead of leaking it. A
+    // caller-supplied list is not required to outlive this call either.
+    // NOTE(review): this assumes build() copies the devices it needs -
+    // confirm against Coal::Program::build().
+    std::free(context_devices);
+
+    return result;
+}
+
+/**
+ * \brief Hint that the compiler resources may be released
+ *
+ * Nothing is done here; the call always succeeds.
+ *
+ * \return \c CL_SUCCESS
+ */
+cl_int
+clUnloadCompiler(void)
+{
+ return CL_SUCCESS;
+}
+
+/**
+ * \brief Query information about a program
+ * \param program program to query
+ * \param param_name information requested
+ * \param param_value_size size of the buffer behind \p param_value
+ * \param param_value buffer receiving the information
+ * \param param_value_size_ret receives the size of the information
+ * \return \c CL_INVALID_PROGRAM if \p program is not a program, otherwise
+ *         the result of \c info()
+ */
+cl_int
+clGetProgramInfo(cl_program         program,
+                 cl_program_info    param_name,
+                 size_t             param_value_size,
+                 void *             param_value,
+                 size_t *           param_value_size_ret)
+{
+    if (program->isA(Coal::Object::T_Program))
+    {
+        return program->info(param_name, param_value_size, param_value,
+                             param_value_size_ret);
+    }
+
+    return CL_INVALID_PROGRAM;
+}
+
+/**
+ * \brief Query build information about a program for a device
+ * \param program program to query
+ * \param device device the information is requested for, must not be NULL
+ * \param param_name information requested
+ * \param param_value_size size of the buffer behind \p param_value
+ * \param param_value buffer receiving the information
+ * \param param_value_size_ret receives the size of the information
+ * \return \c CL_INVALID_PROGRAM if \p program is not a program,
+ *         \c CL_INVALID_DEVICE if \p device is NULL, otherwise the result
+ *         of \c buildInfo()
+ */
+cl_int
+clGetProgramBuildInfo(cl_program            program,
+                      cl_device_id          device,
+                      cl_program_build_info param_name,
+                      size_t                param_value_size,
+                      void *                param_value,
+                      size_t *              param_value_size_ret)
+{
+    if (!program->isA(Coal::Object::T_Program))
+        return CL_INVALID_PROGRAM;
+
+    if (device)
+    {
+        Coal::DeviceInterface *target = (Coal::DeviceInterface *)device;
+
+        return program->buildInfo(target, param_name, param_value_size,
+                                  param_value, param_value_size_ret);
+    }
+
+    return CL_INVALID_DEVICE;
+}
diff --git a/src/api/api_sampler.cpp b/src/api/api_sampler.cpp
new file mode 100644
index 0000000..9bd2dec
--- /dev/null
+++ b/src/api/api_sampler.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_sampler.cpp
+ * \brief Samplers
+ */
+
+#include "CL/cl.h"
+
+#include "core/sampler.h"
+#include "core/context.h"
+
+// Sampler APIs
+/**
+ * \brief Create a sampler object
+ * \param context context the sampler belongs to
+ * \param normalized_coords forwarded to \c Coal::Sampler
+ * \param addressing_mode forwarded to \c Coal::Sampler
+ * \param filter_mode forwarded to \c Coal::Sampler
+ * \param errcode_ret optional; receives \c CL_SUCCESS,
+ *        \c CL_INVALID_CONTEXT, or the error reported by the sampler's
+ *        constructor
+ * \return the sampler, or 0 on failure
+ */
+cl_sampler
+clCreateSampler(cl_context          context,
+                cl_bool             normalized_coords,
+                cl_addressing_mode  addressing_mode,
+                cl_filter_mode      filter_mode,
+                cl_int *            errcode_ret)
+{
+    // Write errors to a local when the caller does not want them
+    cl_int fallback_errcode;
+    cl_int *status = errcode_ret ? errcode_ret : &fallback_errcode;
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *status = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    *status = CL_SUCCESS;
+
+    Coal::Sampler *sampler = new Coal::Sampler((Coal::Context *)context,
+                                               normalized_coords,
+                                               addressing_mode,
+                                               filter_mode,
+                                               status);
+
+    if (*status == CL_SUCCESS)
+        return (cl_sampler)sampler;
+
+    delete sampler;
+    return 0;
+}
+
+/**
+ * \brief Take an additional reference on a sampler
+ * \param sampler sampler to retain
+ * \return \c CL_SUCCESS, or \c CL_INVALID_SAMPLER if \p sampler is not a
+ *         sampler
+ */
+cl_int
+clRetainSampler(cl_sampler sampler)
+{
+    if (sampler->isA(Coal::Object::T_Sampler))
+    {
+        sampler->reference();
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_SAMPLER;
+}
+
+/**
+ * \brief Drop a reference on a sampler
+ *
+ * The sampler is deleted when \c dereference() returns true.
+ *
+ * \param sampler sampler to release
+ * \return \c CL_SUCCESS, or \c CL_INVALID_SAMPLER if \p sampler is not a
+ *         sampler
+ */
+cl_int
+clReleaseSampler(cl_sampler sampler)
+{
+    if (sampler->isA(Coal::Object::T_Sampler))
+    {
+        if (sampler->dereference())
+            delete sampler;
+
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_SAMPLER;
+}
+
+/**
+ * \brief Query information about a sampler
+ * \param sampler sampler to query
+ * \param param_name information requested
+ * \param param_value_size size of the buffer behind \p param_value
+ * \param param_value buffer receiving the information
+ * \param param_value_size_ret receives the size of the information
+ * \return \c CL_INVALID_SAMPLER if \p sampler is not a sampler, otherwise
+ *         the result of \c info()
+ */
+cl_int
+clGetSamplerInfo(cl_sampler         sampler,
+                 cl_sampler_info    param_name,
+                 size_t             param_value_size,
+                 void *             param_value,
+                 size_t *           param_value_size_ret)
+{
+    if (sampler->isA(Coal::Object::T_Sampler))
+    {
+        return sampler->info(param_name, param_value_size, param_value,
+                             param_value_size_ret);
+    }
+
+    return CL_INVALID_SAMPLER;
+}
diff --git a/src/builtins/CMakeLists.txt b/src/builtins/CMakeLists.txt
new file mode 100644
index 0000000..a83dfdf
--- /dev/null
+++ b/src/builtins/CMakeLists.txt
@@ -0,0 +1,33 @@
+if (SHAMROCK_BUILD) # Everything below is configured only when SHAMROCK_BUILD is set.
+
+set(CUSTOM_COMMAND ${CLANG_EXECUTABLE} -cc1 -emit-llvm-bc -x cl -O2 -fno-builtin -nobuiltininc -Fvisibility=protected -cl-std=CL1.2 -ffp-contract=off ) # NOTE(review): "-Fvisibility=protected" has a capital F, which clang documents as the framework search path option; confirm whether "-fvisibility protected" was intended.
+
+FILE(GLOB CL_SOURCES ${CLC_BUILTINS_DIR}/*.cl) # Every .cl file found in CLC_BUILTINS_DIR becomes a library source.
+#MESSAGE(STATUS "CL_SOURCES: ${CL_SOURCES}" )
+
+set(BC_SOURCES) # Starts empty; the loop below appends one .bc path per source.
+foreach(f ${CL_SOURCES})
+ get_filename_component(fn ${f} NAME_WE)
+ #MESSAGE(STATUS "CL_SOURCE: ${f}" )
+ set(bc ${CMAKE_CURRENT_BINARY_DIR}/${fn}.bc)
+ add_custom_command(OUTPUT ${bc} # One rule per source: compile <name>.cl into <name>.bc in the build directory.
+ COMMAND ${CUSTOM_COMMAND}
+ -I${OCL_BUILTINS_DIR}/include # NOTE(review): sources are globbed from CLC_BUILTINS_DIR but headers come from OCL_BUILTINS_DIR; confirm both are defined by the parent project.
+ -o ${bc} ${f}
+ DEPENDS ${f}
+ COMMENT "Generating ${bc}")
+ list(APPEND BC_SOURCES ${bc})
+endforeach()
+#MESSAGE( STATUS "BC_SOURCES: ${BC_SOURCES}")
+
+add_custom_target(generate_bc_files DEPENDS ${BC_SOURCES}) # Target that depends on every bitcode file.
+
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/builtins.lib # llvm-link combines all bitcode files into builtins.lib.
+ COMMAND llvm-link
+ -o ${CMAKE_CURRENT_BINARY_DIR}/builtins.lib ${BC_SOURCES}
+ DEPENDS ${BC_SOURCES} )
+
+add_custom_target(generate_builtin_lib DEPENDS # Target that depends on the linked builtins.lib.
+ ${CMAKE_CURRENT_BINARY_DIR}/builtins.lib)
+
+endif(SHAMROCK_BUILD)
diff --git a/src/builtins/Makefile b/src/builtins/Makefile
new file mode 100644
index 0000000..1d3349b
--- /dev/null
+++ b/src/builtins/Makefile
@@ -0,0 +1,24 @@
+CLANG = clang
+CLANG_CFLAGS = -cc1 -emit-llvm-bc -x cl -O2 -fno-builtin -nobuiltininc
+CLANG_CFLAGS += -Fvisibility=protected -cl-std=CL1.2 -ffp-contract=off
+CLANG_CFLAGS += -I../../include
+
+CL_FILES = $(wildcard *.cl)
+BYTECODE := ${CL_FILES:.cl=.bc}
+
+all: builtins.lib
+
+builtins.lib: $(BYTECODE)
+ @echo $@ Linking bytecode modules
+ llvm-link -o $@ $^
+
+%.bc: %.cl
+ @echo $< Parsing
+ @$(CLANG) $(CLANG_CFLAGS) $< -o $@
+
+%.ll: %.bc
+ @echo $< Disassembling
+ llvm-dis $<
+
+clean: # Remove every generated file: bitcode, disassembly and the linked builtins.lib produced by "all".
+ @rm -rf *.bc *.ll builtins.lib
diff --git a/src/builtins/README.txt b/src/builtins/README.txt
new file mode 100644
index 0000000..5e16118
--- /dev/null
+++ b/src/builtins/README.txt
@@ -0,0 +1,13 @@
+This directory (builtins) is intended to supersede src/runtime as a means
+to provide a builtins library for OpenCL kernels.
+
+Note: some of the files here do not compile due to an address space casting
+error, and are suffixed *.cl.broken.
+
+Files here were imported from the TI opencl_builtins private repository and
+repurposed for CPU device (from DSP device).
+
+This library appears to have been adapted from libclc.llvm.org.
+
+The Makefile here is not used, but available for illustration purposes and
+to allow disassembly of the bc files for inspection.
diff --git a/src/builtins/abs.cl b/src/builtins/abs.cl
new file mode 100644
index 0000000..71dcf75
--- /dev/null
+++ b/src/builtins/abs.cl
@@ -0,0 +1,33 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+UNARY_VEC_DEF(char, uchar, abs, abs)
+UNARY_VEC_DEF(short, ushort, abs, abs)
+UNARY_VEC_DEF(int, uint, abs, abs)
+UNARY_VEC_DEF(long, ulong, abs, abs)
diff --git a/src/builtins/abs_diff.cl b/src/builtins/abs_diff.cl
new file mode 100644
index 0000000..ecc8e37
--- /dev/null
+++ b/src/builtins/abs_diff.cl
@@ -0,0 +1,72 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+#define EXPAND_SIZES(type, utype) \
+ TEMPLATE(_VEC_TYPE(type,2), _VEC_TYPE(utype,2)) \
+ TEMPLATE(_VEC_TYPE(type,3), _VEC_TYPE(utype,3)) \
+ TEMPLATE(_VEC_TYPE(type,4), _VEC_TYPE(utype,4)) \
+ TEMPLATE(_VEC_TYPE(type,8), _VEC_TYPE(utype,8)) \
+ TEMPLATE(_VEC_TYPE(type,16), _VEC_TYPE(utype,16)) \
+
+#define TEMPLATE(gentype, ugentype) /* Defines one abs_diff overload taking gentype and returning ugentype. */ \
+ _CLC_OVERLOAD _CLC_DEF ugentype abs_diff(gentype x, gentype y) \
+ { return __builtin_astype(x > y ? x-y : y-x, ugentype); } /* x-y where x > y, otherwise y-x; the bits are reinterpreted as ugentype. */
+
+EXPAND_SIZES(uchar, uchar)
+EXPAND_SIZES(char, uchar)
+EXPAND_SIZES(ushort, ushort)
+EXPAND_SIZES(short, ushort)
+EXPAND_SIZES(uint, uint)
+EXPAND_SIZES(ulong, ulong)
+
+#undef TEMPLATE
+
+#define TEMPLATE(gentype, ugentype, shiftval) /* abs_diff overload for the int and long families; shiftval is passed as 31 or 63 below, i.e. the sign-bit position. */ \
+_CLC_OVERLOAD _CLC_DEF ugentype abs_diff(gentype x, gentype y) \
+{ \
+ gentype signs_differ = (x^y) >> (gentype)shiftval; /* non-zero only where the sign bits of x and y differ */ \
+ return (signs_differ) ? abs(x) + abs(y) : /* opposite signs: add the magnitudes, presumably to avoid overflow in x-y */ \
+ __builtin_astype(x > y ? x-y : y-x, ugentype); /* same sign: x-y where x > y, otherwise y-x, reinterpreted as ugentype */ \
+}
+
+TEMPLATE(int, uint, 31)
+TEMPLATE(_VEC_TYPE(int,2), _VEC_TYPE(uint,2), 31)
+TEMPLATE(_VEC_TYPE(int,3), _VEC_TYPE(uint,3), 31)
+TEMPLATE(_VEC_TYPE(int,4), _VEC_TYPE(uint,4), 31)
+TEMPLATE(_VEC_TYPE(int,8), _VEC_TYPE(uint,8), 31)
+TEMPLATE(_VEC_TYPE(int,16), _VEC_TYPE(uint,16), 31)
+
+TEMPLATE(long, ulong, 63)
+TEMPLATE(_VEC_TYPE(long,2), _VEC_TYPE(ulong,2), 63)
+TEMPLATE(_VEC_TYPE(long,3), _VEC_TYPE(ulong,3), 63)
+TEMPLATE(_VEC_TYPE(long,4), _VEC_TYPE(ulong,4), 63)
+TEMPLATE(_VEC_TYPE(long,8), _VEC_TYPE(ulong,8), 63)
+TEMPLATE(_VEC_TYPE(long,16), _VEC_TYPE(ulong,16), 63)
+
+#undef TEMPLATE
diff --git a/src/builtins/add_sat.cl b/src/builtins/add_sat.cl
new file mode 100644
index 0000000..e70b3fb
--- /dev/null
+++ b/src/builtins/add_sat.cl
@@ -0,0 +1,37 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+BINARY_VEC_DEF(char, char, add_sat, add_sat)
+BINARY_VEC_DEF(uchar, uchar, add_sat, add_sat)
+BINARY_VEC_DEF(short, short, add_sat, add_sat)
+BINARY_VEC_DEF(ushort, ushort, add_sat, add_sat)
+BINARY_VEC_DEF(int, int, add_sat, add_sat)
+BINARY_VEC_DEF(uint, uint, add_sat, add_sat)
+BINARY_VEC_DEF(long, long, add_sat, add_sat)
+BINARY_VEC_DEF(ulong, ulong, add_sat, add_sat)
diff --git a/src/builtins/all.cl b/src/builtins/all.cl
new file mode 100644
index 0000000..96a9ee2
--- /dev/null
+++ b/src/builtins/all.cl
@@ -0,0 +1,43 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+#define TEMPLATE(type) /* all() for 3-, 4-, 8- and 16-component vectors of type: ANDs the components and returns 1 when the result is negative (every component has its sign bit set), else 0. */ \
+_CLC_OVERLOAD _CLC_DEF int all(type##3 x) { return (x.s0 & x.s1 & x.s2) < 0; } \
+_CLC_OVERLOAD _CLC_DEF int all(type##4 x) { return (x.s0 & x.s1 & x.s2 & x.s3) < 0; } \
+_CLC_OVERLOAD _CLC_DEF int all(type##8 x) { return (x.s0 & x.s1 & x.s2 & x.s3 & \
+ x.s4 & x.s5 & x.s6 & x.s7) < 0; } \
+_CLC_OVERLOAD _CLC_DEF int all(type##16 x) { return (x.s0 & x.s1 & x.s2 & x.s3 & \
+ x.s4 & x.s5 & x.s6 & x.s7 & \
+ x.s8 & x.s9 & x.sa & x.sb & \
+ x.sc & x.sd & x.se & x.sf) < 0; } \
+
+TEMPLATE(char)
+TEMPLATE(short)
+TEMPLATE(int)
+TEMPLATE(long)
diff --git a/src/builtins/any.cl b/src/builtins/any.cl
new file mode 100644
index 0000000..57c4419
--- /dev/null
+++ b/src/builtins/any.cl
@@ -0,0 +1,43 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+#define TEMPLATE(type) /* any() for 3-, 4-, 8- and 16-component vectors of type: ORs the components and returns 1 when the result is negative (at least one component has its sign bit set), else 0. */ \
+_CLC_OVERLOAD _CLC_DEF int any(type##3 x) { return (x.s0 | x.s1 | x.s2) < 0; } \
+_CLC_OVERLOAD _CLC_DEF int any(type##4 x) { return (x.s0 | x.s1 | x.s2 | x.s3) < 0; } \
+_CLC_OVERLOAD _CLC_DEF int any(type##8 x) { return (x.s0 | x.s1 | x.s2 | x.s3 | \
+ x.s4 | x.s5 | x.s6 | x.s7) < 0; } \
+_CLC_OVERLOAD _CLC_DEF int any(type##16 x) { return (x.s0 | x.s1 | x.s2 | x.s3 | \
+ x.s4 | x.s5 | x.s6 | x.s7 | \
+ x.s8 | x.s9 | x.sa | x.sb | \
+ x.sc | x.sd | x.se | x.sf) < 0; } \
+
+TEMPLATE(char)
+TEMPLATE(short)
+TEMPLATE(int)
+TEMPLATE(long)
diff --git a/src/builtins/atomics.cl.broken b/src/builtins/atomics.cl.broken
new file mode 100644
index 0000000..ed46888
--- /dev/null
+++ b/src/builtins/atomics.cl.broken
@@ -0,0 +1,558 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+void __sem_lock(int);
+void __sem_unlock(int);
+void __inv(char*, int);
+
+#define LOCK_GLOBAL __sem_lock(1)
+#define UNLOCK_GLOBAL __sem_unlock(1)
+#define INV_GLOBAL(p, sz) __inv((char*)(p), (sz))
+#define WB_GLOBAL(p, sz)
+
+#define LOCK_LOCAL
+#define UNLOCK_LOCAL
+#define INV_LOCAL(p, sz)
+#define WB_LOCAL(p, sz)
+
+_CLC_OVERLOAD _CLC_DEF int atomic_add(volatile global int* p, int val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ int old = *p;
+ *p = old + val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_add(volatile global uint* p, uint val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ *p = old + val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_add(volatile local int* p, int val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ int old = *p;
+ *p = old + val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_add(volatile local uint* p, uint val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ *p = old + val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+
+_CLC_OVERLOAD _CLC_DEF int atomic_sub(volatile global int* p, int val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ int old = *p;
+ *p = old - val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_sub(volatile global uint* p, uint val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ *p = old - val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_sub(volatile local int* p, int val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ int old = *p;
+ *p = old - val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_sub(volatile local uint* p, uint val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ *p = old - val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+
+_CLC_OVERLOAD _CLC_DEF int atomic_xchg(volatile global int* p, int val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ int old = *p;
+ *p = val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_xchg(volatile global uint* p, uint val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ *p = val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF float atomic_xchg(volatile global float* p, float val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ float old = *p;
+ *p = val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_xchg(volatile local int* p, int val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ int old = *p;
+ *p = val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_xchg(volatile local uint* p, uint val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ *p = val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF float atomic_xchg(volatile local float* p, float val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ float old = *p;
+ *p = val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+
+_CLC_OVERLOAD _CLC_DEF int atomic_inc(volatile global int* p)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ int old = *p;
+ *p = old + 1;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_inc(volatile global uint* p)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ *p = old + 1;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_inc(volatile local int* p)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ int old = *p;
+ *p = old + 1;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_inc(volatile local uint* p)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ *p = old + 1;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+
+_CLC_OVERLOAD _CLC_DEF int atomic_dec(volatile global int* p)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ int old = *p;
+ *p = old - 1;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_dec(volatile global uint* p)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ *p = old - 1;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_dec(volatile local int* p)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ int old = *p;
+ *p = old - 1;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_dec(volatile local uint* p)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ *p = old - 1;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+
+_CLC_OVERLOAD _CLC_DEF int atomic_cmpxchg(volatile global int* p, int cmp, int val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ int old = *p;
+ if (old == cmp) *p = val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_cmpxchg(volatile global uint* p, uint cmp, uint val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ if (old == cmp) *p = val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_cmpxchg(volatile local int* p, int cmp, int val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ int old = *p;
+ if (old == cmp) *p = val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_cmpxchg(volatile local uint* p, uint cmp, uint val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ if (old == cmp) *p = val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+
+_CLC_OVERLOAD _CLC_DEF int atomic_min(volatile global int* p, int val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ int old = *p;
+ if (val < old) *p = val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_min(volatile global uint* p, uint val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ if (val < old) *p = val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_min(volatile local int* p, int val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ int old = *p;
+ if (val < old) *p = val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_min(volatile local uint* p, uint val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ if (val < old) *p = val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_max(volatile global int* p, int val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ int old = *p;
+ if (val > old) *p = val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_max(volatile global uint* p, uint val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ if (val > old) *p = val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_max(volatile local int* p, int val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ int old = *p;
+ if (val > old) *p = val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_max(volatile local uint* p, uint val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ if (val > old) *p = val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+
+_CLC_OVERLOAD _CLC_DEF int atomic_and(volatile global int* p, int val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ int old = *p;
+ *p = old & val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_and(volatile global uint* p, uint val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ *p = old & val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_and(volatile local int* p, int val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ int old = *p;
+ *p = old & val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_and(volatile local uint* p, uint val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ *p = old & val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+
+_CLC_OVERLOAD _CLC_DEF int atomic_or(volatile global int* p, int val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ int old = *p;
+ *p = old | val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_or(volatile global uint* p, uint val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ *p = old | val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_or(volatile local int* p, int val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ int old = *p;
+ *p = old | val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_or(volatile local uint* p, uint val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ *p = old | val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+
+_CLC_OVERLOAD _CLC_DEF int atomic_xor(volatile global int* p, int val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ int old = *p;
+ *p = old ^ val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_xor(volatile global uint* p, uint val)
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ *p = old ^ val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_xor(volatile local int* p, int val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ int old = *p;
+ *p = old ^ val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_xor(volatile local uint* p, uint val)
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ *p = old ^ val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
diff --git a/src/builtins/bitselect.cl b/src/builtins/bitselect.cl
new file mode 100644
index 0000000..bf93a47
--- /dev/null
+++ b/src/builtins/bitselect.cl
@@ -0,0 +1,92 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+#define DEFN(tname) /* Defines bitselect for tname: each result bit is taken from b where c has a 1 bit and from a where c has a 0 bit. */ \
+_CLC_OVERLOAD _CLC_DEF tname bitselect(tname a, tname b, tname c) { return a^(c&(b^a)); }
+
+DEFN(char2)
+DEFN(uchar2)
+DEFN(long2)
+DEFN(ulong2)
+
+DEFN(char3)
+DEFN(uchar3)
+DEFN(short3)
+DEFN(ushort3)
+DEFN(int3)
+DEFN(uint3)
+DEFN(long3)
+DEFN(ulong3)
+
+DEFN(int4)
+DEFN(uint4)
+DEFN(long4)
+DEFN(ulong4)
+
+DEFN(short8)
+DEFN(ushort8)
+DEFN(int8)
+DEFN(uint8)
+DEFN(long8)
+DEFN(ulong8)
+
+DEFN(char16)
+DEFN(uchar16)
+DEFN(short16)
+DEFN(ushort16)
+DEFN(int16)
+DEFN(uint16)
+DEFN(long16)
+DEFN(ulong16)
+
+_CLC_OVERLOAD _CLC_DEF float bitselect (float a, float b, float c)
+{ return __builtin_astype(__builtin_astype(a,int)^(__builtin_astype(c,int)&(__builtin_astype(b,int)^__builtin_astype(a,int))), float); }
+_CLC_OVERLOAD _CLC_DEF float2 bitselect (float2 a, float2 b, float2 c)
+{ return __builtin_astype(__builtin_astype(a,int2)^(__builtin_astype(c,int2)&(__builtin_astype(b,int2)^__builtin_astype(a,int2))), float2); }
+_CLC_OVERLOAD _CLC_DEF float3 bitselect (float3 a, float3 b, float3 c)
+{ return __builtin_astype(__builtin_astype(a,int3)^(__builtin_astype(c,int3)&(__builtin_astype(b,int3)^__builtin_astype(a,int3))), float3); }
+_CLC_OVERLOAD _CLC_DEF float4 bitselect (float4 a, float4 b, float4 c)
+{ return __builtin_astype(__builtin_astype(a,int4)^(__builtin_astype(c,int4)&(__builtin_astype(b,int4)^__builtin_astype(a,int4))), float4); }
+_CLC_OVERLOAD _CLC_DEF float8 bitselect (float8 a, float8 b, float8 c)
+{ return __builtin_astype(__builtin_astype(a,int8)^(__builtin_astype(c,int8)&(__builtin_astype(b,int8)^__builtin_astype(a,int8))), float8); }
+_CLC_OVERLOAD _CLC_DEF float16 bitselect (float16 a, float16 b, float16 c)
+{ return __builtin_astype(__builtin_astype(a,int16)^(__builtin_astype(c,int16)&(__builtin_astype(b,int16)^__builtin_astype(a,int16))), float16); }
+
+_CLC_OVERLOAD _CLC_DEF double bitselect (double a, double b, double c)
+{ return __builtin_astype(__builtin_astype(a,long)^(__builtin_astype(c,long)&(__builtin_astype(b,long)^__builtin_astype(a,long))), double); }
+_CLC_OVERLOAD _CLC_DEF double2 bitselect (double2 a, double2 b, double2 c)
+{ return __builtin_astype(__builtin_astype(a,long2)^(__builtin_astype(c,long2)&(__builtin_astype(b,long2)^__builtin_astype(a,long2))), double2); }
+_CLC_OVERLOAD _CLC_DEF double3 bitselect (double3 a, double3 b, double3 c)
+{ return __builtin_astype(__builtin_astype(a,long3)^(__builtin_astype(c,long3)&(__builtin_astype(b,long3)^__builtin_astype(a,long3))), double3); }
+_CLC_OVERLOAD _CLC_DEF double4 bitselect (double4 a, double4 b, double4 c)
+{ return __builtin_astype(__builtin_astype(a,long4)^(__builtin_astype(c,long4)&(__builtin_astype(b,long4)^__builtin_astype(a,long4))), double4); }
+_CLC_OVERLOAD _CLC_DEF double8 bitselect (double8 a, double8 b, double8 c)
+{ return __builtin_astype(__builtin_astype(a,long8)^(__builtin_astype(c,long8)&(__builtin_astype(b,long8)^__builtin_astype(a,long8))), double8); }
+_CLC_OVERLOAD _CLC_DEF double16 bitselect (double16 a, double16 b, double16 c)
+{ return __builtin_astype(__builtin_astype(a,long16)^(__builtin_astype(c,long16)&(__builtin_astype(b,long16)^__builtin_astype(a,long16))), double16); }
diff --git a/src/builtins/clamp.cl b/src/builtins/clamp.cl
new file mode 100644
index 0000000..78a29fb
--- /dev/null
+++ b/src/builtins/clamp.cl
@@ -0,0 +1,43 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/* Emit both clamp overloads for each vector width (2, 3, 4, 8, 16) of `type`. */
+#define EXPAND_SIZES(type) \
+ IMPLEMENTATION (_VEC_TYPE(type,2), type) \
+ IMPLEMENTATION (_VEC_TYPE(type,3), type) \
+ IMPLEMENTATION (_VEC_TYPE(type,4), type) \
+ IMPLEMENTATION (_VEC_TYPE(type,8), type) \
+ IMPLEMENTATION (_VEC_TYPE(type,16), type)
+/* clamp(x, minval, maxval): maxval where x > maxval, else minval where x < minval, else x. */
+#define IMPLEMENTATION(gentype, sgentype) \
+_CLC_OVERLOAD _CLC_DEF gentype clamp(gentype x, gentype minval, gentype maxval) \
+ { return (x > maxval) ? maxval : ((x < minval) ? minval : x); } \
+_CLC_OVERLOAD _CLC_DEF gentype clamp(gentype x, sgentype minval, sgentype maxval) \
+ { const gentype vmin = (gentype)minval, vmax = (gentype)maxval; return (x > vmax) ? vmax : ((x < vmin) ? vmin : x); }
+_EXPAND_TYPES() /* NOTE(review): defined outside this file (presumably clc.h); assumed to invoke EXPAND_SIZES per element type - confirm */
diff --git a/src/builtins/clz.cl b/src/builtins/clz.cl
new file mode 100644
index 0000000..ac06119
--- /dev/null
+++ b/src/builtins/clz.cl
@@ -0,0 +1,37 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+/* clz for the eight integer element types. NOTE(review): UNARY_VEC_DEF comes from cpu.h (not visible here); presumably it builds the vector overloads from the scalar clz - confirm. */
+UNARY_VEC_DEF(char, char, clz, clz)
+UNARY_VEC_DEF(uchar, uchar, clz, clz)
+UNARY_VEC_DEF(short, short, clz, clz)
+UNARY_VEC_DEF(ushort, ushort,clz, clz)
+UNARY_VEC_DEF(int, int, clz, clz)
+UNARY_VEC_DEF(uint, uint, clz, clz)
+UNARY_VEC_DEF(long, long, clz, clz)
+UNARY_VEC_DEF(ulong, ulong, clz, clz)
diff --git a/src/builtins/convert.cl b/src/builtins/convert.cl
new file mode 100644
index 0000000..2f47c2d
--- /dev/null
+++ b/src/builtins/convert.cl
@@ -0,0 +1,36122 @@
+/* !!!! AUTOGENERATED FILE generated by convert_type.py !!!!!
+
+ DON'T CHANGE THIS FILE. MAKE YOUR CHANGES TO convert_type.py AND RUN:
+ $ ./generate-conversion-type-cl.sh
+
+ OpenCL type conversion functions
+
+ Copyright (c) 2013 Victor Oliveira <victormatheus@gmail.com>
+ Copyright (c) 2013 Jesse Towner <jessetowner@lavabit.com>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include "clc.h"
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define cles_khr_int64 /* defined unconditionally, so the `#ifdef cles_khr_int64` sections below (the long/ulong conversions) are always compiled */
+
+/* char -> char: same-type cast; the value is returned unchanged. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(char x)
+{
+  return (char)x;
+}
+
+/* char2 -> char2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(char2 x)
+{
+  return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+/* char4 -> char4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(char4 x)
+{
+  return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+/* char8 -> char8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(char8 x)
+{
+  return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+/* char16 -> char16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(char16 x)
+{
+  return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+/* char3 -> char3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(char3 x)
+{
+  return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* char -> uchar: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(char x)
+{
+  return (uchar)x;
+}
+
+/* char2 -> uchar2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(char2 x)
+{
+  return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+/* char4 -> uchar4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(char4 x)
+{
+  return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+/* char8 -> uchar8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(char8 x)
+{
+  return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+/* char16 -> uchar16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(char16 x)
+{
+  return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+/* char3 -> uchar3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(char3 x)
+{
+  return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* char -> short: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(char x)
+{
+  return (short)x;
+}
+
+/* char2 -> short2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(char2 x)
+{
+  return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+/* char4 -> short4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(char4 x)
+{
+  return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+/* char8 -> short8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(char8 x)
+{
+  return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+/* char16 -> short16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(char16 x)
+{
+  return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+/* char3 -> short3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(char3 x)
+{
+  return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* char -> ushort: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(char x)
+{
+  return (ushort)x;
+}
+
+/* char2 -> ushort2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(char2 x)
+{
+  return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+/* char4 -> ushort4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(char4 x)
+{
+  return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+/* char8 -> ushort8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(char8 x)
+{
+  return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+/* char16 -> ushort16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(char16 x)
+{
+  return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+/* char3 -> ushort3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(char3 x)
+{
+  return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* char -> int: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD int convert_int(char x)
+{
+  return (int)x;
+}
+
+/* char2 -> int2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2(char2 x)
+{
+  return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+/* char4 -> int4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4(char4 x)
+{
+  return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+/* char8 -> int8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8(char8 x)
+{
+  return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+/* char16 -> int16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16(char16 x)
+{
+  return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+/* char3 -> int3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3(char3 x)
+{
+  return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* char -> uint: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint(char x)
+{
+  return (uint)x;
+}
+
+/* char2 -> uint2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2(char2 x)
+{
+  return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+/* char4 -> uint4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4(char4 x)
+{
+  return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+/* char8 -> uint8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8(char8 x)
+{
+  return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+/* char16 -> uint16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16(char16 x)
+{
+  return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+/* char3 -> uint3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3(char3 x)
+{
+  return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#ifdef cles_khr_int64
+/* char -> long: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD long convert_long(char x)
+{
+  return (long)x;
+}
+
+/* char2 -> long2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2(char2 x)
+{
+  return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+/* char4 -> long4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4(char4 x)
+{
+  return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+/* char8 -> long8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8(char8 x)
+{
+  return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+/* char16 -> long16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16(char16 x)
+{
+  return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+/* char3 -> long3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3(char3 x)
+{
+  return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* char -> ulong: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong(char x)
+{
+  return (ulong)x;
+}
+
+/* char2 -> ulong2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2(char2 x)
+{
+  return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+/* char4 -> ulong4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4(char4 x)
+{
+  return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+/* char8 -> ulong8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8(char8 x)
+{
+  return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+/* char16 -> ulong16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16(char16 x)
+{
+  return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+/* char3 -> ulong3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3(char3 x)
+{
+  return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+/* char -> float: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD float convert_float(char x)
+{
+  return (float)x;
+}
+
+/* char2 -> float2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD float2 convert_float2(char2 x)
+{
+  return (float2)(convert_float(x.s0), convert_float(x.s1));
+}
+
+/* char4 -> float4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD float4 convert_float4(char4 x)
+{
+  return (float4)(convert_float2(x.s01), convert_float2(x.s23));
+}
+
+/* char8 -> float8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD float8 convert_float8(char8 x)
+{
+  return (float8)(convert_float4(x.s0123), convert_float4(x.s4567));
+}
+
+/* char16 -> float16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD float16 convert_float16(char16 x)
+{
+  return (float16)(convert_float8(x.s01234567), convert_float8(x.s89abcdef));
+}
+
+/* char3 -> float3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD float3 convert_float3(char3 x)
+{
+  return (float3)(convert_float2(x.xy), convert_float(x.z));
+}
+#ifdef cl_khr_fp64
+/* char -> double: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD double convert_double(char x)
+{
+  return (double)x;
+}
+
+/* char2 -> double2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD double2 convert_double2(char2 x)
+{
+  return (double2)(convert_double(x.s0), convert_double(x.s1));
+}
+
+/* char4 -> double4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD double4 convert_double4(char4 x)
+{
+  return (double4)(convert_double2(x.s01), convert_double2(x.s23));
+}
+
+/* char8 -> double8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD double8 convert_double8(char8 x)
+{
+  return (double8)(convert_double4(x.s0123), convert_double4(x.s4567));
+}
+
+/* char16 -> double16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD double16 convert_double16(char16 x)
+{
+  return (double16)(convert_double8(x.s01234567), convert_double8(x.s89abcdef));
+}
+
+/* char3 -> double3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD double3 convert_double3(char3 x)
+{
+  return (double3)(convert_double2(x.xy), convert_double(x.z));
+}
+#endif
+/* uchar -> char: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(uchar x)
+{
+  return (char)x;
+}
+
+/* uchar2 -> char2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(uchar2 x)
+{
+  return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+/* uchar4 -> char4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(uchar4 x)
+{
+  return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+/* uchar8 -> char8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(uchar8 x)
+{
+  return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+/* uchar16 -> char16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(uchar16 x)
+{
+  return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+/* uchar3 -> char3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(uchar3 x)
+{
+  return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* uchar -> uchar: same-type cast; the value is returned unchanged. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(uchar x)
+{
+  return (uchar)x;
+}
+
+/* uchar2 -> uchar2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(uchar2 x)
+{
+  return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+/* uchar4 -> uchar4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(uchar4 x)
+{
+  return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+/* uchar8 -> uchar8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(uchar8 x)
+{
+  return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+/* uchar16 -> uchar16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(uchar16 x)
+{
+  return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+/* uchar3 -> uchar3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(uchar3 x)
+{
+  return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* uchar -> short: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(uchar x)
+{
+  return (short)x;
+}
+
+/* uchar2 -> short2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(uchar2 x)
+{
+  return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+/* uchar4 -> short4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(uchar4 x)
+{
+  return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+/* uchar8 -> short8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(uchar8 x)
+{
+  return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+/* uchar16 -> short16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(uchar16 x)
+{
+  return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+/* uchar3 -> short3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(uchar3 x)
+{
+  return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* uchar -> ushort: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(uchar x)
+{
+  return (ushort)x;
+}
+
+/* uchar2 -> ushort2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(uchar2 x)
+{
+  return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+/* uchar4 -> ushort4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(uchar4 x)
+{
+  return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+/* uchar8 -> ushort8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(uchar8 x)
+{
+  return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+/* uchar16 -> ushort16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(uchar16 x)
+{
+  return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+/* uchar3 -> ushort3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(uchar3 x)
+{
+  return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* uchar -> int: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD int convert_int(uchar x)
+{
+  return (int)x;
+}
+
+/* uchar2 -> int2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2(uchar2 x)
+{
+  return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+/* uchar4 -> int4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4(uchar4 x)
+{
+  return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+/* uchar8 -> int8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8(uchar8 x)
+{
+  return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+/* uchar16 -> int16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16(uchar16 x)
+{
+  return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+/* uchar3 -> int3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3(uchar3 x)
+{
+  return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* uchar -> uint: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint(uchar x)
+{
+  return (uint)x;
+}
+
+/* uchar2 -> uint2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2(uchar2 x)
+{
+  return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+/* uchar4 -> uint4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4(uchar4 x)
+{
+  return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+/* uchar8 -> uint8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8(uchar8 x)
+{
+  return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+/* uchar16 -> uint16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16(uchar16 x)
+{
+  return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+/* uchar3 -> uint3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3(uchar3 x)
+{
+  return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#ifdef cles_khr_int64
+/* uchar -> long: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD long convert_long(uchar x)
+{
+  return (long)x;
+}
+
+/* uchar2 -> long2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2(uchar2 x)
+{
+  return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+/* uchar4 -> long4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4(uchar4 x)
+{
+  return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+/* uchar8 -> long8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8(uchar8 x)
+{
+  return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+/* uchar16 -> long16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16(uchar16 x)
+{
+  return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+/* uchar3 -> long3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3(uchar3 x)
+{
+  return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* uchar -> ulong: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong(uchar x)
+{
+  return (ulong)x;
+}
+
+/* uchar2 -> ulong2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2(uchar2 x)
+{
+  return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+/* uchar4 -> ulong4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4(uchar4 x)
+{
+  return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+/* uchar8 -> ulong8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8(uchar8 x)
+{
+  return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+/* uchar16 -> ulong16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16(uchar16 x)
+{
+  return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+/* uchar3 -> ulong3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3(uchar3 x)
+{
+  return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+/* uchar -> float: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD float convert_float(uchar x)
+{
+  return (float)x;
+}
+
+/* uchar2 -> float2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD float2 convert_float2(uchar2 x)
+{
+  return (float2)(convert_float(x.s0), convert_float(x.s1));
+}
+
+/* uchar4 -> float4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD float4 convert_float4(uchar4 x)
+{
+  return (float4)(convert_float2(x.s01), convert_float2(x.s23));
+}
+
+/* uchar8 -> float8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD float8 convert_float8(uchar8 x)
+{
+  return (float8)(convert_float4(x.s0123), convert_float4(x.s4567));
+}
+
+/* uchar16 -> float16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD float16 convert_float16(uchar16 x)
+{
+  return (float16)(convert_float8(x.s01234567), convert_float8(x.s89abcdef));
+}
+
+/* uchar3 -> float3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD float3 convert_float3(uchar3 x)
+{
+  return (float3)(convert_float2(x.xy), convert_float(x.z));
+}
+#ifdef cl_khr_fp64
+/* uchar -> double: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD double convert_double(uchar x)
+{
+  return (double)x;
+}
+
+/* uchar2 -> double2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD double2 convert_double2(uchar2 x)
+{
+  return (double2)(convert_double(x.s0), convert_double(x.s1));
+}
+
+/* uchar4 -> double4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD double4 convert_double4(uchar4 x)
+{
+  return (double4)(convert_double2(x.s01), convert_double2(x.s23));
+}
+
+/* uchar8 -> double8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD double8 convert_double8(uchar8 x)
+{
+  return (double8)(convert_double4(x.s0123), convert_double4(x.s4567));
+}
+
+/* uchar16 -> double16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD double16 convert_double16(uchar16 x)
+{
+  return (double16)(convert_double8(x.s01234567), convert_double8(x.s89abcdef));
+}
+
+/* uchar3 -> double3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD double3 convert_double3(uchar3 x)
+{
+  return (double3)(convert_double2(x.xy), convert_double(x.z));
+}
+#endif
+/* short -> char: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(short x)
+{
+  return (char)x;
+}
+
+/* short2 -> char2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(short2 x)
+{
+  return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+/* short4 -> char4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(short4 x)
+{
+  return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+/* short8 -> char8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(short8 x)
+{
+  return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+/* short16 -> char16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(short16 x)
+{
+  return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+/* short3 -> char3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(short3 x)
+{
+  return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* short -> uchar: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(short x)
+{
+  return (uchar)x;
+}
+
+/* short2 -> uchar2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(short2 x)
+{
+  return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+/* short4 -> uchar4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(short4 x)
+{
+  return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+/* short8 -> uchar8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(short8 x)
+{
+  return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+/* short16 -> uchar16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(short16 x)
+{
+  return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+/* short3 -> uchar3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(short3 x)
+{
+  return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* short -> short: same-type cast; the value is returned unchanged. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(short x)
+{
+  return (short)x;
+}
+
+/* short2 -> short2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(short2 x)
+{
+  return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+/* short4 -> short4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(short4 x)
+{
+  return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+/* short8 -> short8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(short8 x)
+{
+  return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+/* short16 -> short16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(short16 x)
+{
+  return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+/* short3 -> short3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(short3 x)
+{
+  return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* short -> ushort: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(short x)
+{
+  return (ushort)x;
+}
+
+/* short2 -> ushort2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(short2 x)
+{
+  return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+/* short4 -> ushort4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(short4 x)
+{
+  return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+/* short8 -> ushort8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(short8 x)
+{
+  return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+/* short16 -> ushort16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(short16 x)
+{
+  return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+/* short3 -> ushort3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(short3 x)
+{
+  return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* short -> int: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD int convert_int(short x)
+{
+  return (int)x;
+}
+
+/* short2 -> int2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2(short2 x)
+{
+  return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+/* short4 -> int4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4(short4 x)
+{
+  return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+/* short8 -> int8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8(short8 x)
+{
+  return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+/* short16 -> int16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16(short16 x)
+{
+  return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+/* short3 -> int3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3(short3 x)
+{
+  return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* short -> uint: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint(short x)
+{
+  return (uint)x;
+}
+
+/* short2 -> uint2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2(short2 x)
+{
+  return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+/* short4 -> uint4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4(short4 x)
+{
+  return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+/* short8 -> uint8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8(short8 x)
+{
+  return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+/* short16 -> uint16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16(short16 x)
+{
+  return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+/* short3 -> uint3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3(short3 x)
+{
+  return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#ifdef cles_khr_int64
+/* short -> long: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD long convert_long(short x)
+{
+  return (long)x;
+}
+
+/* short2 -> long2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2(short2 x)
+{
+  return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+/* short4 -> long4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4(short4 x)
+{
+  return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+/* short8 -> long8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8(short8 x)
+{
+  return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+/* short16 -> long16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16(short16 x)
+{
+  return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+/* short3 -> long3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3(short3 x)
+{
+  return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* short -> ulong: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong(short x)
+{
+  return (ulong)x;
+}
+
+/* short2 -> ulong2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2(short2 x)
+{
+  return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+/* short4 -> ulong4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4(short4 x)
+{
+  return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+/* short8 -> ulong8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8(short8 x)
+{
+  return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+/* short16 -> ulong16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16(short16 x)
+{
+  return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+/* short3 -> ulong3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3(short3 x)
+{
+  return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+/* short -> float: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD float convert_float(short x)
+{
+  return (float)x;
+}
+
+/* short2 -> float2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD float2 convert_float2(short2 x)
+{
+  return (float2)(convert_float(x.s0), convert_float(x.s1));
+}
+
+/* short4 -> float4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD float4 convert_float4(short4 x)
+{
+  return (float4)(convert_float2(x.s01), convert_float2(x.s23));
+}
+
+/* short8 -> float8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD float8 convert_float8(short8 x)
+{
+  return (float8)(convert_float4(x.s0123), convert_float4(x.s4567));
+}
+
+/* short16 -> float16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD float16 convert_float16(short16 x)
+{
+  return (float16)(convert_float8(x.s01234567), convert_float8(x.s89abcdef));
+}
+
+/* short3 -> float3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD float3 convert_float3(short3 x)
+{
+  return (float3)(convert_float2(x.xy), convert_float(x.z));
+}
+#ifdef cl_khr_fp64
+/* short -> double: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD double convert_double(short x)
+{
+  return (double)x;
+}
+
+/* short2 -> double2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD double2 convert_double2(short2 x)
+{
+  return (double2)(convert_double(x.s0), convert_double(x.s1));
+}
+
+/* short4 -> double4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD double4 convert_double4(short4 x)
+{
+  return (double4)(convert_double2(x.s01), convert_double2(x.s23));
+}
+
+/* short8 -> double8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD double8 convert_double8(short8 x)
+{
+  return (double8)(convert_double4(x.s0123), convert_double4(x.s4567));
+}
+
+/* short16 -> double16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD double16 convert_double16(short16 x)
+{
+  return (double16)(convert_double8(x.s01234567), convert_double8(x.s89abcdef));
+}
+
+/* short3 -> double3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD double3 convert_double3(short3 x)
+{
+  return (double3)(convert_double2(x.xy), convert_double(x.z));
+}
+#endif
+/* ushort -> char: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(ushort x)
+{
+  return (char)x;
+}
+
+/* ushort2 -> char2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(ushort2 x)
+{
+  return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+/* ushort4 -> char4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(ushort4 x)
+{
+  return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+/* ushort8 -> char8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(ushort8 x)
+{
+  return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+/* ushort16 -> char16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(ushort16 x)
+{
+  return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+/* ushort3 -> char3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(ushort3 x)
+{
+  return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* ushort -> uchar: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(ushort x)
+{
+  return (uchar)x;
+}
+
+/* ushort2 -> uchar2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(ushort2 x)
+{
+  return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+/* ushort4 -> uchar4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(ushort4 x)
+{
+  return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+/* ushort8 -> uchar8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(ushort8 x)
+{
+  return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+/* ushort16 -> uchar16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(ushort16 x)
+{
+  return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+/* ushort3 -> uchar3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(ushort3 x)
+{
+  return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* ushort -> short: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(ushort x)
+{
+  return (short)x;
+}
+
+/* ushort2 -> short2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(ushort2 x)
+{
+  return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+/* ushort4 -> short4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(ushort4 x)
+{
+  return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+/* ushort8 -> short8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(ushort8 x)
+{
+  return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+/* ushort16 -> short16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(ushort16 x)
+{
+  return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+/* ushort3 -> short3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(ushort3 x)
+{
+  return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* ushort -> ushort: same-type cast; the value is returned unchanged. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(ushort x)
+{
+  return (ushort)x;
+}
+
+/* ushort2 -> ushort2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(ushort2 x)
+{
+  return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+/* ushort4 -> ushort4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(ushort4 x)
+{
+  return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+/* ushort8 -> ushort8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(ushort8 x)
+{
+  return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+/* ushort16 -> ushort16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(ushort16 x)
+{
+  return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+/* ushort3 -> ushort3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(ushort3 x)
+{
+  return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* ushort -> int: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD int convert_int(ushort x)
+{
+  return (int)x;
+}
+
+/* ushort2 -> int2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2(ushort2 x)
+{
+  return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+/* ushort4 -> int4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4(ushort4 x)
+{
+  return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+/* ushort8 -> int8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8(ushort8 x)
+{
+  return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+/* ushort16 -> int16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16(ushort16 x)
+{
+  return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+/* ushort3 -> int3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3(ushort3 x)
+{
+  return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* ushort -> uint: plain C cast of the scalar value. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint(ushort x)
+{
+  return (uint)x;
+}
+
+/* ushort2 -> uint2: convert each component with the scalar overload. */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2(ushort2 x)
+{
+  return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+/* ushort4 -> uint4: convert components 0-1 and 2-3 as 2-vectors. */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4(ushort4 x)
+{
+  return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+/* ushort8 -> uint8: convert components 0-3 and 4-7 as 4-vectors. */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8(ushort8 x)
+{
+  return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+/* ushort16 -> uint16: convert components 0-7 and 8-15 as 8-vectors. */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16(ushort16 x)
+{
+  return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+/* ushort3 -> uint3: convert components 0-1 as a 2-vector and component 2 as a scalar. */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3(ushort3 x)
+{
+  return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#ifdef cles_khr_int64
+/* ushort -> long: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD long convert_long(ushort x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2(ushort2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4(ushort4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8(ushort8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16(ushort16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3(ushort3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ushort -> ulong: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong(ushort x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2(ushort2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4(ushort4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8(ushort8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16(ushort16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3(ushort3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif /* cles_khr_int64 */
+/* ushort -> float: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD float convert_float(ushort x)
+{
+ return (float)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD float2 convert_float2(ushort2 x)
+{
+ return (float2)(convert_float(x.s0), convert_float(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD float4 convert_float4(ushort4 x)
+{
+ return (float4)(convert_float2(x.s01), convert_float2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD float8 convert_float8(ushort8 x)
+{
+ return (float8)(convert_float4(x.s0123), convert_float4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD float16 convert_float16(ushort16 x)
+{
+ return (float16)(convert_float8(x.s01234567), convert_float8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD float3 convert_float3(ushort3 x)
+{
+ return (float3)(convert_float2(x.xy), convert_float(x.z));
+}
+#ifdef cl_khr_fp64
+/* ushort -> double: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD double convert_double(ushort x)
+{
+ return (double)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD double2 convert_double2(ushort2 x)
+{
+ return (double2)(convert_double(x.s0), convert_double(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD double4 convert_double4(ushort4 x)
+{
+ return (double4)(convert_double2(x.s01), convert_double2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD double8 convert_double8(ushort8 x)
+{
+ return (double8)(convert_double4(x.s0123), convert_double4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD double16 convert_double16(ushort16 x)
+{
+ return (double16)(convert_double8(x.s01234567), convert_double8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD double3 convert_double3(ushort3 x)
+{
+ return (double3)(convert_double2(x.xy), convert_double(x.z));
+}
+#endif /* cl_khr_fp64 */
+/* int -> char: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(int x)
+{
+ return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(int2 x)
+{
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(int4 x)
+{
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(int8 x)
+{
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(int16 x)
+{
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(int3 x)
+{
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* int -> uchar: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(int x)
+{
+ return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(int2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(int4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(int8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(int16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(int3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* int -> short: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(int x)
+{
+ return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(int2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(int4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(int8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(int16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(int3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* int -> ushort: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(int x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(int2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(int4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(int8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(int16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(int3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* int -> int: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD int convert_int(int x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2(int2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4(int4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8(int8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16(int16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3(int3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* int -> uint: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint(int x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2(int2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4(int4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8(int8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16(int16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3(int3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#ifdef cles_khr_int64
+/* int -> long: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD long convert_long(int x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2(int2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4(int4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8(int8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16(int16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3(int3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* int -> ulong: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong(int x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2(int2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4(int4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8(int8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16(int16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3(int3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif /* cles_khr_int64 */
+/* int -> float: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD float convert_float(int x)
+{
+ return (float)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD float2 convert_float2(int2 x)
+{
+ return (float2)(convert_float(x.s0), convert_float(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD float4 convert_float4(int4 x)
+{
+ return (float4)(convert_float2(x.s01), convert_float2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD float8 convert_float8(int8 x)
+{
+ return (float8)(convert_float4(x.s0123), convert_float4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD float16 convert_float16(int16 x)
+{
+ return (float16)(convert_float8(x.s01234567), convert_float8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD float3 convert_float3(int3 x)
+{
+ return (float3)(convert_float2(x.xy), convert_float(x.z));
+}
+#ifdef cl_khr_fp64
+/* int -> double: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD double convert_double(int x)
+{
+ return (double)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD double2 convert_double2(int2 x)
+{
+ return (double2)(convert_double(x.s0), convert_double(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD double4 convert_double4(int4 x)
+{
+ return (double4)(convert_double2(x.s01), convert_double2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD double8 convert_double8(int8 x)
+{
+ return (double8)(convert_double4(x.s0123), convert_double4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD double16 convert_double16(int16 x)
+{
+ return (double16)(convert_double8(x.s01234567), convert_double8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD double3 convert_double3(int3 x)
+{
+ return (double3)(convert_double2(x.xy), convert_double(x.z));
+}
+#endif /* cl_khr_fp64 */
+/* uint -> char: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(uint x)
+{
+ return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(uint2 x)
+{
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(uint4 x)
+{
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(uint8 x)
+{
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(uint16 x)
+{
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(uint3 x)
+{
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* uint -> uchar: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(uint x)
+{
+ return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(uint2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(uint4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(uint8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(uint16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(uint3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* uint -> short: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(uint x)
+{
+ return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(uint2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(uint4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(uint8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(uint16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(uint3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* uint -> ushort: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(uint x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(uint2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(uint4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(uint8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(uint16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(uint3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* uint -> int: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD int convert_int(uint x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2(uint2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4(uint4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8(uint8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16(uint16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3(uint3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* uint -> uint: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint(uint x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2(uint2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4(uint4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8(uint8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16(uint16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3(uint3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#ifdef cles_khr_int64
+/* uint -> long: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD long convert_long(uint x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2(uint2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4(uint4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8(uint8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16(uint16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3(uint3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* uint -> ulong: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong(uint x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2(uint2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4(uint4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8(uint8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16(uint16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3(uint3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif /* cles_khr_int64 */
+/* uint -> float: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD float convert_float(uint x)
+{
+ return (float)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD float2 convert_float2(uint2 x)
+{
+ return (float2)(convert_float(x.s0), convert_float(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD float4 convert_float4(uint4 x)
+{
+ return (float4)(convert_float2(x.s01), convert_float2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD float8 convert_float8(uint8 x)
+{
+ return (float8)(convert_float4(x.s0123), convert_float4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD float16 convert_float16(uint16 x)
+{
+ return (float16)(convert_float8(x.s01234567), convert_float8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD float3 convert_float3(uint3 x)
+{
+ return (float3)(convert_float2(x.xy), convert_float(x.z));
+}
+#ifdef cl_khr_fp64
+/* uint -> double: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD double convert_double(uint x)
+{
+ return (double)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD double2 convert_double2(uint2 x)
+{
+ return (double2)(convert_double(x.s0), convert_double(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD double4 convert_double4(uint4 x)
+{
+ return (double4)(convert_double2(x.s01), convert_double2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD double8 convert_double8(uint8 x)
+{
+ return (double8)(convert_double4(x.s0123), convert_double4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD double16 convert_double16(uint16 x)
+{
+ return (double16)(convert_double8(x.s01234567), convert_double8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD double3 convert_double3(uint3 x)
+{
+ return (double3)(convert_double2(x.xy), convert_double(x.z));
+}
+#endif /* cl_khr_fp64 */
+#ifdef cles_khr_int64
+/* long -> char: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(long x)
+{
+ return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(long2 x)
+{
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(long4 x)
+{
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(long8 x)
+{
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(long16 x)
+{
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(long3 x)
+{
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> uchar: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(long x)
+{
+ return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(long2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(long4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(long8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(long16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(long3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> short: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(long x)
+{
+ return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(long2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(long4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(long8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(long16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(long3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> ushort: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(long x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(long2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(long4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(long8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(long16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(long3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> int: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD int convert_int(long x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2(long2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4(long4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8(long8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16(long16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3(long3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> uint: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint(long x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2(long2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4(long4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8(long8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16(long16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3(long3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> long: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD long convert_long(long x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2(long2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4(long4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8(long8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16(long16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3(long3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> ulong: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong(long x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2(long2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4(long4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8(long8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16(long16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3(long3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> float: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD float convert_float(long x)
+{
+ return (float)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD float2 convert_float2(long2 x)
+{
+ return (float2)(convert_float(x.s0), convert_float(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD float4 convert_float4(long4 x)
+{
+ return (float4)(convert_float2(x.s01), convert_float2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD float8 convert_float8(long8 x)
+{
+ return (float8)(convert_float4(x.s0123), convert_float4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD float16 convert_float16(long16 x)
+{
+ return (float16)(convert_float8(x.s01234567), convert_float8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD float3 convert_float3(long3 x)
+{
+ return (float3)(convert_float2(x.xy), convert_float(x.z));
+}
+#endif /* cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long -> double: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD double convert_double(long x)
+{
+ return (double)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD double2 convert_double2(long2 x)
+{
+ return (double2)(convert_double(x.s0), convert_double(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD double4 convert_double4(long4 x)
+{
+ return (double4)(convert_double2(x.s01), convert_double2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD double8 convert_double8(long8 x)
+{
+ return (double8)(convert_double4(x.s0123), convert_double4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD double16 convert_double16(long16 x)
+{
+ return (double16)(convert_double8(x.s01234567), convert_double8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD double3 convert_double3(long3 x)
+{
+ return (double3)(convert_double2(x.xy), convert_double(x.z));
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> char: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(ulong x)
+{
+ return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(ulong2 x)
+{
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(ulong4 x)
+{
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(ulong8 x)
+{
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(ulong16 x)
+{
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(ulong3 x)
+{
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> uchar: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(ulong x)
+{
+ return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(ulong2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(ulong4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(ulong8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(ulong16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(ulong3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> short: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(ulong x)
+{
+ return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(ulong2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(ulong4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(ulong8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(ulong16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(ulong3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> ushort: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(ulong x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(ulong2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(ulong4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(ulong8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(ulong16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(ulong3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> int: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD int convert_int(ulong x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2(ulong2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4(ulong4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8(ulong8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16(ulong16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3(ulong3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> uint: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint(ulong x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2(ulong2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4(ulong4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8(ulong8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16(ulong16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3(ulong3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> long: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD long convert_long(ulong x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2(ulong2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4(ulong4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8(ulong8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16(ulong16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3(ulong3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> ulong: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong(ulong x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2(ulong2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4(ulong4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8(ulong8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16(ulong16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3(ulong3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> float: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD float convert_float(ulong x)
+{
+ return (float)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD float2 convert_float2(ulong2 x)
+{
+ return (float2)(convert_float(x.s0), convert_float(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD float4 convert_float4(ulong4 x)
+{
+ return (float4)(convert_float2(x.s01), convert_float2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD float8 convert_float8(ulong8 x)
+{
+ return (float8)(convert_float4(x.s0123), convert_float4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD float16 convert_float16(ulong16 x)
+{
+ return (float16)(convert_float8(x.s01234567), convert_float8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD float3 convert_float3(ulong3 x)
+{
+ return (float3)(convert_float2(x.xy), convert_float(x.z));
+}
+#endif /* cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong -> double: scalar overload is a plain cast; vector overloads convert
+ * their sub-vectors with the narrower overloads and reassemble the result. */
+_CLC_DEF _CLC_OVERLOAD double convert_double(ulong x)
+{
+ return (double)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD double2 convert_double2(ulong2 x)
+{
+ return (double2)(convert_double(x.s0), convert_double(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD double4 convert_double4(ulong4 x)
+{
+ return (double4)(convert_double2(x.s01), convert_double2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD double8 convert_double8(ulong8 x)
+{
+ return (double8)(convert_double4(x.s0123), convert_double4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD double16 convert_double16(ulong16 x)
+{
+ return (double16)(convert_double8(x.s01234567), convert_double8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD double3 convert_double3(ulong3 x)
+{
+ return (double3)(convert_double2(x.xy), convert_double(x.z));
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+/* float -> char conversions. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(float x) {
+ return (char)x;
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(float2 x) {
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(float4 x) {
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(float8 x) {
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(float16 x) {
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(float3 x) {
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* float -> uchar conversions. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(float x) {
+ return (uchar)x;
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(float2 x) {
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(float4 x) {
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(float8 x) {
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(float16 x) {
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(float3 x) {
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* float -> short conversions. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(float x) {
+ return (short)x;
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(float2 x) {
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(float4 x) {
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(float8 x) {
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(float16 x) {
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(float3 x) {
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* float -> ushort conversions. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(float x) {
+ return (ushort)x;
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(float2 x) {
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(float4 x) {
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(float8 x) {
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(float16 x) {
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(float3 x) {
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* float -> int conversions. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD int convert_int(float x) {
+ return (int)x;
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2(float2 x) {
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4(float4 x) {
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8(float8 x) {
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16(float16 x) {
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3(float3 x) {
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* float -> uint conversions. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint(float x) {
+ return (uint)x;
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2(float2 x) {
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4(float4 x) {
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8(float8 x) {
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16(float16 x) {
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3(float3 x) {
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#ifdef cles_khr_int64
+/* float -> long conversions (only built when cles_khr_int64 is defined).
+ * Scalar: plain cast. Vectors: split by swizzle, convert each piece with the
+ * narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD long convert_long(float x) {
+ return (long)x;
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2(float2 x) {
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4(float4 x) {
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8(float8 x) {
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16(float16 x) {
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3(float3 x) {
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* float -> ulong conversions (only built when cles_khr_int64 is defined).
+ * Scalar: plain cast. Vectors: split by swizzle, convert each piece with the
+ * narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong(float x) {
+ return (ulong)x;
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2(float2 x) {
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4(float4 x) {
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8(float8 x) {
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16(float16 x) {
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3(float3 x) {
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+/* float -> float conversions. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD float convert_float(float x) {
+ return (float)x;
+}
+_CLC_DEF _CLC_OVERLOAD float2 convert_float2(float2 x) {
+ return (float2)(convert_float(x.s0), convert_float(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD float4 convert_float4(float4 x) {
+ return (float4)(convert_float2(x.s01), convert_float2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD float8 convert_float8(float8 x) {
+ return (float8)(convert_float4(x.s0123), convert_float4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD float16 convert_float16(float16 x) {
+ return (float16)(convert_float8(x.s01234567), convert_float8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD float3 convert_float3(float3 x) {
+ return (float3)(convert_float2(x.xy), convert_float(x.z));
+}
+#ifdef cl_khr_fp64
+/* float -> double conversions (only built when cl_khr_fp64 is defined).
+ * Scalar: plain cast. Vectors: split by swizzle, convert each piece with the
+ * narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD double convert_double(float x) {
+ return (double)x;
+}
+_CLC_DEF _CLC_OVERLOAD double2 convert_double2(float2 x) {
+ return (double2)(convert_double(x.s0), convert_double(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD double4 convert_double4(float4 x) {
+ return (double4)(convert_double2(x.s01), convert_double2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD double8 convert_double8(float8 x) {
+ return (double8)(convert_double4(x.s0123), convert_double4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD double16 convert_double16(float16 x) {
+ return (double16)(convert_double8(x.s01234567), convert_double8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD double3 convert_double3(float3 x) {
+ return (double3)(convert_double2(x.xy), convert_double(x.z));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> char conversions (only built when cl_khr_fp64 is defined).
+ * Scalar: plain cast. Vectors: split by swizzle, convert each piece with the
+ * narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(double x) {
+ return (char)x;
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(double2 x) {
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(double4 x) {
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(double8 x) {
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(double16 x) {
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(double3 x) {
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> uchar conversions (only built when cl_khr_fp64 is defined).
+ * Scalar: plain cast. Vectors: split by swizzle, convert each piece with the
+ * narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(double x) {
+ return (uchar)x;
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(double2 x) {
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(double4 x) {
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(double8 x) {
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(double16 x) {
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(double3 x) {
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> short conversions (only built when cl_khr_fp64 is defined).
+ * Scalar: plain cast. Vectors: split by swizzle, convert each piece with the
+ * narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(double x) {
+ return (short)x;
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(double2 x) {
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(double4 x) {
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(double8 x) {
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(double16 x) {
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(double3 x) {
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> ushort conversions (only built when cl_khr_fp64 is defined).
+ * Scalar: plain cast. Vectors: split by swizzle, convert each piece with the
+ * narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(double x) {
+ return (ushort)x;
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(double2 x) {
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(double4 x) {
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(double8 x) {
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(double16 x) {
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(double3 x) {
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> int conversions (only built when cl_khr_fp64 is defined).
+ * Scalar: plain cast. Vectors: split by swizzle, convert each piece with the
+ * narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD int convert_int(double x) {
+ return (int)x;
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2(double2 x) {
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4(double4 x) {
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8(double8 x) {
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16(double16 x) {
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3(double3 x) {
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> uint conversions (only built when cl_khr_fp64 is defined).
+ * Scalar: plain cast. Vectors: split by swizzle, convert each piece with the
+ * narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint(double x) {
+ return (uint)x;
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2(double2 x) {
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4(double4 x) {
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8(double8 x) {
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16(double16 x) {
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3(double3 x) {
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double -> long conversions (need both cl_khr_fp64 and cles_khr_int64).
+ * Scalar: plain cast. Vectors: split by swizzle, convert each piece with the
+ * narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD long convert_long(double x) {
+ return (long)x;
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2(double2 x) {
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4(double4 x) {
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8(double8 x) {
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16(double16 x) {
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3(double3 x) {
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double -> ulong conversions (need both cl_khr_fp64 and cles_khr_int64).
+ * Scalar: plain cast. Vectors: split by swizzle, convert each piece with the
+ * narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong(double x) {
+ return (ulong)x;
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2(double2 x) {
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4(double4 x) {
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8(double8 x) {
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16(double16 x) {
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3(double3 x) {
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> float conversions (only built when cl_khr_fp64 is defined).
+ * Scalar: plain cast. Vectors: split by swizzle, convert each piece with the
+ * narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD float convert_float(double x) {
+ return (float)x;
+}
+_CLC_DEF _CLC_OVERLOAD float2 convert_float2(double2 x) {
+ return (float2)(convert_float(x.s0), convert_float(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD float4 convert_float4(double4 x) {
+ return (float4)(convert_float2(x.s01), convert_float2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD float8 convert_float8(double8 x) {
+ return (float8)(convert_float4(x.s0123), convert_float4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD float16 convert_float16(double16 x) {
+ return (float16)(convert_float8(x.s01234567), convert_float8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD float3 convert_float3(double3 x) {
+ return (float3)(convert_float2(x.xy), convert_float(x.z));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> double conversions (only built when cl_khr_fp64 is defined).
+ * Scalar: plain cast. Vectors: split by swizzle, convert each piece with the
+ * narrower overload, and reassemble. */
+_CLC_DEF _CLC_OVERLOAD double convert_double(double x) {
+ return (double)x;
+}
+_CLC_DEF _CLC_OVERLOAD double2 convert_double2(double2 x) {
+ return (double2)(convert_double(x.s0), convert_double(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD double4 convert_double4(double4 x) {
+ return (double4)(convert_double2(x.s01), convert_double2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD double8 convert_double8(double8 x) {
+ return (double8)(convert_double4(x.s0123), convert_double4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD double16 convert_double16(double16 x) {
+ return (double16)(convert_double8(x.s01234567), convert_double8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD double3 convert_double3(double3 x) {
+ return (double3)(convert_double2(x.xy), convert_double(x.z));
+}
+#endif
+
+
+#if 0 // ASW
+
+/* char -> char, "_rtz" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_char* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtz(char x) {
+ return (char)x;
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtz(char2 x) {
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtz(char4 x) {
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtz(char8 x) {
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtz(char16 x) {
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtz(char3 x) {
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* char -> char, "_rte" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_char* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rte(char x) {
+ return (char)x;
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rte(char2 x) {
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rte(char4 x) {
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rte(char8 x) {
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rte(char16 x) {
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rte(char3 x) {
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* char -> char, "_rtp" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_char* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtp(char x) {
+ return (char)x;
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtp(char2 x) {
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtp(char4 x) {
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtp(char8 x) {
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtp(char16 x) {
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtp(char3 x) {
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* char -> char, "_rtn" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_char* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtn(char x) {
+ return (char)x;
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtn(char2 x) {
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtn(char4 x) {
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtn(char8 x) {
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtn(char16 x) {
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtn(char3 x) {
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* char -> uchar, "_rtz" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_uchar* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtz(char x) {
+ return (uchar)x;
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtz(char2 x) {
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtz(char4 x) {
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtz(char8 x) {
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtz(char16 x) {
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtz(char3 x) {
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* char -> uchar, "_rte" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_uchar* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rte(char x) {
+ return (uchar)x;
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rte(char2 x) {
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rte(char4 x) {
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rte(char8 x) {
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rte(char16 x) {
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rte(char3 x) {
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* char -> uchar, "_rtp" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_uchar* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtp(char x) {
+ return (uchar)x;
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtp(char2 x) {
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtp(char4 x) {
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtp(char8 x) {
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtp(char16 x) {
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtp(char3 x) {
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* char -> uchar, "_rtn" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_uchar* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtn(char x) {
+ return (uchar)x;
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtn(char2 x) {
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtn(char4 x) {
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtn(char8 x) {
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtn(char16 x) {
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtn(char3 x) {
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* char -> short, "_rtz" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_short* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtz(char x) {
+ return (short)x;
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtz(char2 x) {
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtz(char4 x) {
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtz(char8 x) {
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtz(char16 x) {
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtz(char3 x) {
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* char -> short, "_rte" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_short* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rte(char x) {
+ return (short)x;
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rte(char2 x) {
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rte(char4 x) {
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rte(char8 x) {
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rte(char16 x) {
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rte(char3 x) {
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* char -> short, "_rtp" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_short* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtp(char x) {
+ return (short)x;
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtp(char2 x) {
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtp(char4 x) {
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtp(char8 x) {
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtp(char16 x) {
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtp(char3 x) {
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* char -> short, "_rtn" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_short* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtn(char x) {
+ return (short)x;
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtn(char2 x) {
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtn(char4 x) {
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtn(char8 x) {
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtn(char16 x) {
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtn(char3 x) {
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* char -> ushort, "_rtz" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_ushort* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtz(char x) {
+ return (ushort)x;
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtz(char2 x) {
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtz(char4 x) {
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtz(char8 x) {
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtz(char16 x) {
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtz(char3 x) {
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* char -> ushort, "_rte" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_ushort* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rte(char x) {
+ return (ushort)x;
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rte(char2 x) {
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rte(char4 x) {
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rte(char8 x) {
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rte(char16 x) {
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rte(char3 x) {
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* char -> ushort, "_rtp" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_ushort* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtp(char x) {
+ return (ushort)x;
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtp(char2 x) {
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtp(char4 x) {
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtp(char8 x) {
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtp(char16 x) {
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtp(char3 x) {
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* char -> ushort, "_rtn" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_ushort* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtn(char x) {
+ return (ushort)x;
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtn(char2 x) {
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtn(char4 x) {
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtn(char8 x) {
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtn(char16 x) {
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtn(char3 x) {
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* char -> int, "_rtz" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_int* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtz(char x) {
+ return (int)x;
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtz(char2 x) {
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtz(char4 x) {
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtz(char8 x) {
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtz(char16 x) {
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtz(char3 x) {
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* char -> int, "_rte" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_int* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rte(char x) {
+ return (int)x;
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rte(char2 x) {
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rte(char4 x) {
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rte(char8 x) {
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rte(char16 x) {
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rte(char3 x) {
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* char -> int, "_rtp" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_int* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtp(char x) {
+ return (int)x;
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtp(char2 x) {
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtp(char4 x) {
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtp(char8 x) {
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtp(char16 x) {
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtp(char3 x) {
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* char -> int, "_rtn" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_int* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtn(char x) {
+ return (int)x;
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtn(char2 x) {
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtn(char4 x) {
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtn(char8 x) {
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtn(char16 x) {
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtn(char3 x) {
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* char -> uint, "_rtz" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_uint* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtz(char x) {
+ return (uint)x;
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtz(char2 x) {
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtz(char4 x) {
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtz(char8 x) {
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtz(char16 x) {
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtz(char3 x) {
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/* char -> uint, "_rte" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_uint* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rte(char x) {
+ return (uint)x;
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rte(char2 x) {
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rte(char4 x) {
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rte(char8 x) {
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rte(char16 x) {
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rte(char3 x) {
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/* char -> uint, "_rtp" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_uint* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtp(char x) {
+ return (uint)x;
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtp(char2 x) {
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtp(char4 x) {
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtp(char8 x) {
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtp(char16 x) {
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtp(char3 x) {
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/* char -> uint, "_rtn" names. Scalar: plain cast. Vectors: split by swizzle,
+ * convert each piece with the unsuffixed convert_uint* overloads, reassemble.
+ * NOTE(review): presumably harmless for integer sources - confirm. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtn(char x) {
+ return (uint)x;
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtn(char2 x) {
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtn(char4 x) {
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtn(char8 x) {
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtn(char16 x) {
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtn(char3 x) {
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#ifdef cles_khr_int64
+/* char -> long conversions, _rtz suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_long* calls. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtz(char x)
+{
+ long r = (long)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtz(char2 x)
+{
+ long a = convert_long(x.s0);
+ long b = convert_long(x.s1);
+ return (long2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtz(char4 x)
+{
+ long2 a = convert_long2(x.s01);
+ long2 b = convert_long2(x.s23);
+ return (long4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtz(char8 x)
+{
+ long4 a = convert_long4(x.s0123);
+ long4 b = convert_long4(x.s4567);
+ return (long8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtz(char16 x)
+{
+ long8 a = convert_long8(x.s01234567);
+ long8 b = convert_long8(x.s89abcdef);
+ return (long16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtz(char3 x)
+{
+ long2 a = convert_long2(x.s01);
+ long b = convert_long(x.s2);
+ return (long3)(a, b);
+}
+#endif
+#ifdef cles_khr_int64
+/* char -> long conversions, _rte suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_long* calls. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rte(char x)
+{
+ long r = (long)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rte(char2 x)
+{
+ long a = convert_long(x.s0);
+ long b = convert_long(x.s1);
+ return (long2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rte(char4 x)
+{
+ long2 a = convert_long2(x.s01);
+ long2 b = convert_long2(x.s23);
+ return (long4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rte(char8 x)
+{
+ long4 a = convert_long4(x.s0123);
+ long4 b = convert_long4(x.s4567);
+ return (long8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rte(char16 x)
+{
+ long8 a = convert_long8(x.s01234567);
+ long8 b = convert_long8(x.s89abcdef);
+ return (long16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rte(char3 x)
+{
+ long2 a = convert_long2(x.s01);
+ long b = convert_long(x.s2);
+ return (long3)(a, b);
+}
+#endif
+#ifdef cles_khr_int64
+/* char -> long conversions, _rtp suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_long* calls. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtp(char x)
+{
+ long r = (long)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtp(char2 x)
+{
+ long a = convert_long(x.s0);
+ long b = convert_long(x.s1);
+ return (long2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtp(char4 x)
+{
+ long2 a = convert_long2(x.s01);
+ long2 b = convert_long2(x.s23);
+ return (long4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtp(char8 x)
+{
+ long4 a = convert_long4(x.s0123);
+ long4 b = convert_long4(x.s4567);
+ return (long8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtp(char16 x)
+{
+ long8 a = convert_long8(x.s01234567);
+ long8 b = convert_long8(x.s89abcdef);
+ return (long16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtp(char3 x)
+{
+ long2 a = convert_long2(x.s01);
+ long b = convert_long(x.s2);
+ return (long3)(a, b);
+}
+#endif
+#ifdef cles_khr_int64
+/* char -> long conversions, _rtn suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_long* calls. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtn(char x)
+{
+ long r = (long)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtn(char2 x)
+{
+ long a = convert_long(x.s0);
+ long b = convert_long(x.s1);
+ return (long2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtn(char4 x)
+{
+ long2 a = convert_long2(x.s01);
+ long2 b = convert_long2(x.s23);
+ return (long4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtn(char8 x)
+{
+ long4 a = convert_long4(x.s0123);
+ long4 b = convert_long4(x.s4567);
+ return (long8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtn(char16 x)
+{
+ long8 a = convert_long8(x.s01234567);
+ long8 b = convert_long8(x.s89abcdef);
+ return (long16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtn(char3 x)
+{
+ long2 a = convert_long2(x.s01);
+ long b = convert_long(x.s2);
+ return (long3)(a, b);
+}
+#endif
+#ifdef cles_khr_int64
+/* char -> ulong conversions, _rtz suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_ulong* calls. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtz(char x)
+{
+ ulong r = (ulong)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtz(char2 x)
+{
+ ulong a = convert_ulong(x.s0);
+ ulong b = convert_ulong(x.s1);
+ return (ulong2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtz(char4 x)
+{
+ ulong2 a = convert_ulong2(x.s01);
+ ulong2 b = convert_ulong2(x.s23);
+ return (ulong4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtz(char8 x)
+{
+ ulong4 a = convert_ulong4(x.s0123);
+ ulong4 b = convert_ulong4(x.s4567);
+ return (ulong8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtz(char16 x)
+{
+ ulong8 a = convert_ulong8(x.s01234567);
+ ulong8 b = convert_ulong8(x.s89abcdef);
+ return (ulong16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtz(char3 x)
+{
+ ulong2 a = convert_ulong2(x.s01);
+ ulong b = convert_ulong(x.s2);
+ return (ulong3)(a, b);
+}
+#endif
+#ifdef cles_khr_int64
+/* char -> ulong conversions, _rte suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_ulong* calls. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rte(char x)
+{
+ ulong r = (ulong)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rte(char2 x)
+{
+ ulong a = convert_ulong(x.s0);
+ ulong b = convert_ulong(x.s1);
+ return (ulong2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rte(char4 x)
+{
+ ulong2 a = convert_ulong2(x.s01);
+ ulong2 b = convert_ulong2(x.s23);
+ return (ulong4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rte(char8 x)
+{
+ ulong4 a = convert_ulong4(x.s0123);
+ ulong4 b = convert_ulong4(x.s4567);
+ return (ulong8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rte(char16 x)
+{
+ ulong8 a = convert_ulong8(x.s01234567);
+ ulong8 b = convert_ulong8(x.s89abcdef);
+ return (ulong16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rte(char3 x)
+{
+ ulong2 a = convert_ulong2(x.s01);
+ ulong b = convert_ulong(x.s2);
+ return (ulong3)(a, b);
+}
+#endif
+#ifdef cles_khr_int64
+/* char -> ulong conversions, _rtp suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_ulong* calls. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtp(char x)
+{
+ ulong r = (ulong)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtp(char2 x)
+{
+ ulong a = convert_ulong(x.s0);
+ ulong b = convert_ulong(x.s1);
+ return (ulong2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtp(char4 x)
+{
+ ulong2 a = convert_ulong2(x.s01);
+ ulong2 b = convert_ulong2(x.s23);
+ return (ulong4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtp(char8 x)
+{
+ ulong4 a = convert_ulong4(x.s0123);
+ ulong4 b = convert_ulong4(x.s4567);
+ return (ulong8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtp(char16 x)
+{
+ ulong8 a = convert_ulong8(x.s01234567);
+ ulong8 b = convert_ulong8(x.s89abcdef);
+ return (ulong16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtp(char3 x)
+{
+ ulong2 a = convert_ulong2(x.s01);
+ ulong b = convert_ulong(x.s2);
+ return (ulong3)(a, b);
+}
+#endif
+#ifdef cles_khr_int64
+/* char -> ulong conversions, _rtn suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_ulong* calls. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtn(char x)
+{
+ ulong r = (ulong)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtn(char2 x)
+{
+ ulong a = convert_ulong(x.s0);
+ ulong b = convert_ulong(x.s1);
+ return (ulong2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtn(char4 x)
+{
+ ulong2 a = convert_ulong2(x.s01);
+ ulong2 b = convert_ulong2(x.s23);
+ return (ulong4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtn(char8 x)
+{
+ ulong4 a = convert_ulong4(x.s0123);
+ ulong4 b = convert_ulong4(x.s4567);
+ return (ulong8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtn(char16 x)
+{
+ ulong8 a = convert_ulong8(x.s01234567);
+ ulong8 b = convert_ulong8(x.s89abcdef);
+ return (ulong16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtn(char3 x)
+{
+ ulong2 a = convert_ulong2(x.s01);
+ ulong b = convert_ulong(x.s2);
+ return (ulong3)(a, b);
+}
+#endif
+/* uchar -> char conversions, _rtz suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_char* calls. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtz(uchar x)
+{
+ char r = (char)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtz(uchar2 x)
+{
+ char a = convert_char(x.s0);
+ char b = convert_char(x.s1);
+ return (char2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtz(uchar4 x)
+{
+ char2 a = convert_char2(x.s01);
+ char2 b = convert_char2(x.s23);
+ return (char4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtz(uchar8 x)
+{
+ char4 a = convert_char4(x.s0123);
+ char4 b = convert_char4(x.s4567);
+ return (char8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtz(uchar16 x)
+{
+ char8 a = convert_char8(x.s01234567);
+ char8 b = convert_char8(x.s89abcdef);
+ return (char16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtz(uchar3 x)
+{
+ char2 a = convert_char2(x.s01);
+ char b = convert_char(x.s2);
+ return (char3)(a, b);
+}
+/* uchar -> char conversions, _rte suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_char* calls. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rte(uchar x)
+{
+ char r = (char)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rte(uchar2 x)
+{
+ char a = convert_char(x.s0);
+ char b = convert_char(x.s1);
+ return (char2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rte(uchar4 x)
+{
+ char2 a = convert_char2(x.s01);
+ char2 b = convert_char2(x.s23);
+ return (char4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rte(uchar8 x)
+{
+ char4 a = convert_char4(x.s0123);
+ char4 b = convert_char4(x.s4567);
+ return (char8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rte(uchar16 x)
+{
+ char8 a = convert_char8(x.s01234567);
+ char8 b = convert_char8(x.s89abcdef);
+ return (char16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rte(uchar3 x)
+{
+ char2 a = convert_char2(x.s01);
+ char b = convert_char(x.s2);
+ return (char3)(a, b);
+}
+/* uchar -> char conversions, _rtp suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_char* calls. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtp(uchar x)
+{
+ char r = (char)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtp(uchar2 x)
+{
+ char a = convert_char(x.s0);
+ char b = convert_char(x.s1);
+ return (char2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtp(uchar4 x)
+{
+ char2 a = convert_char2(x.s01);
+ char2 b = convert_char2(x.s23);
+ return (char4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtp(uchar8 x)
+{
+ char4 a = convert_char4(x.s0123);
+ char4 b = convert_char4(x.s4567);
+ return (char8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtp(uchar16 x)
+{
+ char8 a = convert_char8(x.s01234567);
+ char8 b = convert_char8(x.s89abcdef);
+ return (char16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtp(uchar3 x)
+{
+ char2 a = convert_char2(x.s01);
+ char b = convert_char(x.s2);
+ return (char3)(a, b);
+}
+/* uchar -> char conversions, _rtn suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_char* calls. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtn(uchar x)
+{
+ char r = (char)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtn(uchar2 x)
+{
+ char a = convert_char(x.s0);
+ char b = convert_char(x.s1);
+ return (char2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtn(uchar4 x)
+{
+ char2 a = convert_char2(x.s01);
+ char2 b = convert_char2(x.s23);
+ return (char4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtn(uchar8 x)
+{
+ char4 a = convert_char4(x.s0123);
+ char4 b = convert_char4(x.s4567);
+ return (char8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtn(uchar16 x)
+{
+ char8 a = convert_char8(x.s01234567);
+ char8 b = convert_char8(x.s89abcdef);
+ return (char16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtn(uchar3 x)
+{
+ char2 a = convert_char2(x.s01);
+ char b = convert_char(x.s2);
+ return (char3)(a, b);
+}
+/* uchar -> uchar conversions, _rtz suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_uchar* calls. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtz(uchar x)
+{
+ uchar r = (uchar)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtz(uchar2 x)
+{
+ uchar a = convert_uchar(x.s0);
+ uchar b = convert_uchar(x.s1);
+ return (uchar2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtz(uchar4 x)
+{
+ uchar2 a = convert_uchar2(x.s01);
+ uchar2 b = convert_uchar2(x.s23);
+ return (uchar4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtz(uchar8 x)
+{
+ uchar4 a = convert_uchar4(x.s0123);
+ uchar4 b = convert_uchar4(x.s4567);
+ return (uchar8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtz(uchar16 x)
+{
+ uchar8 a = convert_uchar8(x.s01234567);
+ uchar8 b = convert_uchar8(x.s89abcdef);
+ return (uchar16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtz(uchar3 x)
+{
+ uchar2 a = convert_uchar2(x.s01);
+ uchar b = convert_uchar(x.s2);
+ return (uchar3)(a, b);
+}
+/* uchar -> uchar conversions, _rte suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_uchar* calls. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rte(uchar x)
+{
+ uchar r = (uchar)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rte(uchar2 x)
+{
+ uchar a = convert_uchar(x.s0);
+ uchar b = convert_uchar(x.s1);
+ return (uchar2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rte(uchar4 x)
+{
+ uchar2 a = convert_uchar2(x.s01);
+ uchar2 b = convert_uchar2(x.s23);
+ return (uchar4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rte(uchar8 x)
+{
+ uchar4 a = convert_uchar4(x.s0123);
+ uchar4 b = convert_uchar4(x.s4567);
+ return (uchar8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rte(uchar16 x)
+{
+ uchar8 a = convert_uchar8(x.s01234567);
+ uchar8 b = convert_uchar8(x.s89abcdef);
+ return (uchar16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rte(uchar3 x)
+{
+ uchar2 a = convert_uchar2(x.s01);
+ uchar b = convert_uchar(x.s2);
+ return (uchar3)(a, b);
+}
+/* uchar -> uchar conversions, _rtp suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_uchar* calls. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtp(uchar x)
+{
+ uchar r = (uchar)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtp(uchar2 x)
+{
+ uchar a = convert_uchar(x.s0);
+ uchar b = convert_uchar(x.s1);
+ return (uchar2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtp(uchar4 x)
+{
+ uchar2 a = convert_uchar2(x.s01);
+ uchar2 b = convert_uchar2(x.s23);
+ return (uchar4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtp(uchar8 x)
+{
+ uchar4 a = convert_uchar4(x.s0123);
+ uchar4 b = convert_uchar4(x.s4567);
+ return (uchar8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtp(uchar16 x)
+{
+ uchar8 a = convert_uchar8(x.s01234567);
+ uchar8 b = convert_uchar8(x.s89abcdef);
+ return (uchar16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtp(uchar3 x)
+{
+ uchar2 a = convert_uchar2(x.s01);
+ uchar b = convert_uchar(x.s2);
+ return (uchar3)(a, b);
+}
+/* uchar -> uchar conversions, _rtn suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_uchar* calls. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtn(uchar x)
+{
+ uchar r = (uchar)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtn(uchar2 x)
+{
+ uchar a = convert_uchar(x.s0);
+ uchar b = convert_uchar(x.s1);
+ return (uchar2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtn(uchar4 x)
+{
+ uchar2 a = convert_uchar2(x.s01);
+ uchar2 b = convert_uchar2(x.s23);
+ return (uchar4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtn(uchar8 x)
+{
+ uchar4 a = convert_uchar4(x.s0123);
+ uchar4 b = convert_uchar4(x.s4567);
+ return (uchar8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtn(uchar16 x)
+{
+ uchar8 a = convert_uchar8(x.s01234567);
+ uchar8 b = convert_uchar8(x.s89abcdef);
+ return (uchar16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtn(uchar3 x)
+{
+ uchar2 a = convert_uchar2(x.s01);
+ uchar b = convert_uchar(x.s2);
+ return (uchar3)(a, b);
+}
+/* uchar -> short conversions, _rtz suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_short* calls. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtz(uchar x)
+{
+ short r = (short)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtz(uchar2 x)
+{
+ short a = convert_short(x.s0);
+ short b = convert_short(x.s1);
+ return (short2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtz(uchar4 x)
+{
+ short2 a = convert_short2(x.s01);
+ short2 b = convert_short2(x.s23);
+ return (short4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtz(uchar8 x)
+{
+ short4 a = convert_short4(x.s0123);
+ short4 b = convert_short4(x.s4567);
+ return (short8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtz(uchar16 x)
+{
+ short8 a = convert_short8(x.s01234567);
+ short8 b = convert_short8(x.s89abcdef);
+ return (short16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtz(uchar3 x)
+{
+ short2 a = convert_short2(x.s01);
+ short b = convert_short(x.s2);
+ return (short3)(a, b);
+}
+/* uchar -> short conversions, _rte suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_short* calls. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rte(uchar x)
+{
+ short r = (short)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rte(uchar2 x)
+{
+ short a = convert_short(x.s0);
+ short b = convert_short(x.s1);
+ return (short2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rte(uchar4 x)
+{
+ short2 a = convert_short2(x.s01);
+ short2 b = convert_short2(x.s23);
+ return (short4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rte(uchar8 x)
+{
+ short4 a = convert_short4(x.s0123);
+ short4 b = convert_short4(x.s4567);
+ return (short8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rte(uchar16 x)
+{
+ short8 a = convert_short8(x.s01234567);
+ short8 b = convert_short8(x.s89abcdef);
+ return (short16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rte(uchar3 x)
+{
+ short2 a = convert_short2(x.s01);
+ short b = convert_short(x.s2);
+ return (short3)(a, b);
+}
+/* uchar -> short conversions, _rtp suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_short* calls. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtp(uchar x)
+{
+ short r = (short)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtp(uchar2 x)
+{
+ short a = convert_short(x.s0);
+ short b = convert_short(x.s1);
+ return (short2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtp(uchar4 x)
+{
+ short2 a = convert_short2(x.s01);
+ short2 b = convert_short2(x.s23);
+ return (short4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtp(uchar8 x)
+{
+ short4 a = convert_short4(x.s0123);
+ short4 b = convert_short4(x.s4567);
+ return (short8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtp(uchar16 x)
+{
+ short8 a = convert_short8(x.s01234567);
+ short8 b = convert_short8(x.s89abcdef);
+ return (short16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtp(uchar3 x)
+{
+ short2 a = convert_short2(x.s01);
+ short b = convert_short(x.s2);
+ return (short3)(a, b);
+}
+/* uchar -> short conversions, _rtn suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_short* calls. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtn(uchar x)
+{
+ short r = (short)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtn(uchar2 x)
+{
+ short a = convert_short(x.s0);
+ short b = convert_short(x.s1);
+ return (short2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtn(uchar4 x)
+{
+ short2 a = convert_short2(x.s01);
+ short2 b = convert_short2(x.s23);
+ return (short4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtn(uchar8 x)
+{
+ short4 a = convert_short4(x.s0123);
+ short4 b = convert_short4(x.s4567);
+ return (short8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtn(uchar16 x)
+{
+ short8 a = convert_short8(x.s01234567);
+ short8 b = convert_short8(x.s89abcdef);
+ return (short16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtn(uchar3 x)
+{
+ short2 a = convert_short2(x.s01);
+ short b = convert_short(x.s2);
+ return (short3)(a, b);
+}
+/* uchar -> ushort conversions, _rtz suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_ushort* calls. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtz(uchar x)
+{
+ ushort r = (ushort)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtz(uchar2 x)
+{
+ ushort a = convert_ushort(x.s0);
+ ushort b = convert_ushort(x.s1);
+ return (ushort2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtz(uchar4 x)
+{
+ ushort2 a = convert_ushort2(x.s01);
+ ushort2 b = convert_ushort2(x.s23);
+ return (ushort4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtz(uchar8 x)
+{
+ ushort4 a = convert_ushort4(x.s0123);
+ ushort4 b = convert_ushort4(x.s4567);
+ return (ushort8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtz(uchar16 x)
+{
+ ushort8 a = convert_ushort8(x.s01234567);
+ ushort8 b = convert_ushort8(x.s89abcdef);
+ return (ushort16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtz(uchar3 x)
+{
+ ushort2 a = convert_ushort2(x.s01);
+ ushort b = convert_ushort(x.s2);
+ return (ushort3)(a, b);
+}
+/* uchar -> ushort conversions, _rte suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_ushort* calls. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rte(uchar x)
+{
+ ushort r = (ushort)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rte(uchar2 x)
+{
+ ushort a = convert_ushort(x.s0);
+ ushort b = convert_ushort(x.s1);
+ return (ushort2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rte(uchar4 x)
+{
+ ushort2 a = convert_ushort2(x.s01);
+ ushort2 b = convert_ushort2(x.s23);
+ return (ushort4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rte(uchar8 x)
+{
+ ushort4 a = convert_ushort4(x.s0123);
+ ushort4 b = convert_ushort4(x.s4567);
+ return (ushort8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rte(uchar16 x)
+{
+ ushort8 a = convert_ushort8(x.s01234567);
+ ushort8 b = convert_ushort8(x.s89abcdef);
+ return (ushort16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rte(uchar3 x)
+{
+ ushort2 a = convert_ushort2(x.s01);
+ ushort b = convert_ushort(x.s2);
+ return (ushort3)(a, b);
+}
+/* uchar -> ushort conversions, _rtp suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_ushort* calls. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtp(uchar x)
+{
+ ushort r = (ushort)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtp(uchar2 x)
+{
+ ushort a = convert_ushort(x.s0);
+ ushort b = convert_ushort(x.s1);
+ return (ushort2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtp(uchar4 x)
+{
+ ushort2 a = convert_ushort2(x.s01);
+ ushort2 b = convert_ushort2(x.s23);
+ return (ushort4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtp(uchar8 x)
+{
+ ushort4 a = convert_ushort4(x.s0123);
+ ushort4 b = convert_ushort4(x.s4567);
+ return (ushort8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtp(uchar16 x)
+{
+ ushort8 a = convert_ushort8(x.s01234567);
+ ushort8 b = convert_ushort8(x.s89abcdef);
+ return (ushort16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtp(uchar3 x)
+{
+ ushort2 a = convert_ushort2(x.s01);
+ ushort b = convert_ushort(x.s2);
+ return (ushort3)(a, b);
+}
+/* uchar -> ushort conversions, _rtn suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_ushort* calls. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtn(uchar x)
+{
+ ushort r = (ushort)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtn(uchar2 x)
+{
+ ushort a = convert_ushort(x.s0);
+ ushort b = convert_ushort(x.s1);
+ return (ushort2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtn(uchar4 x)
+{
+ ushort2 a = convert_ushort2(x.s01);
+ ushort2 b = convert_ushort2(x.s23);
+ return (ushort4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtn(uchar8 x)
+{
+ ushort4 a = convert_ushort4(x.s0123);
+ ushort4 b = convert_ushort4(x.s4567);
+ return (ushort8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtn(uchar16 x)
+{
+ ushort8 a = convert_ushort8(x.s01234567);
+ ushort8 b = convert_ushort8(x.s89abcdef);
+ return (ushort16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtn(uchar3 x)
+{
+ ushort2 a = convert_ushort2(x.s01);
+ ushort b = convert_ushort(x.s2);
+ return (ushort3)(a, b);
+}
+/* uchar -> int conversions, _rtz suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_int* calls. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtz(uchar x)
+{
+ int r = (int)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtz(uchar2 x)
+{
+ int a = convert_int(x.s0);
+ int b = convert_int(x.s1);
+ return (int2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtz(uchar4 x)
+{
+ int2 a = convert_int2(x.s01);
+ int2 b = convert_int2(x.s23);
+ return (int4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtz(uchar8 x)
+{
+ int4 a = convert_int4(x.s0123);
+ int4 b = convert_int4(x.s4567);
+ return (int8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtz(uchar16 x)
+{
+ int8 a = convert_int8(x.s01234567);
+ int8 b = convert_int8(x.s89abcdef);
+ return (int16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtz(uchar3 x)
+{
+ int2 a = convert_int2(x.s01);
+ int b = convert_int(x.s2);
+ return (int3)(a, b);
+}
+/* uchar -> int conversions, _rte suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_int* calls. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rte(uchar x)
+{
+ int r = (int)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rte(uchar2 x)
+{
+ int a = convert_int(x.s0);
+ int b = convert_int(x.s1);
+ return (int2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rte(uchar4 x)
+{
+ int2 a = convert_int2(x.s01);
+ int2 b = convert_int2(x.s23);
+ return (int4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rte(uchar8 x)
+{
+ int4 a = convert_int4(x.s0123);
+ int4 b = convert_int4(x.s4567);
+ return (int8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rte(uchar16 x)
+{
+ int8 a = convert_int8(x.s01234567);
+ int8 b = convert_int8(x.s89abcdef);
+ return (int16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rte(uchar3 x)
+{
+ int2 a = convert_int2(x.s01);
+ int b = convert_int(x.s2);
+ return (int3)(a, b);
+}
+/* uchar -> int conversions, _rtp suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_int* calls. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtp(uchar x)
+{
+ int r = (int)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtp(uchar2 x)
+{
+ int a = convert_int(x.s0);
+ int b = convert_int(x.s1);
+ return (int2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtp(uchar4 x)
+{
+ int2 a = convert_int2(x.s01);
+ int2 b = convert_int2(x.s23);
+ return (int4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtp(uchar8 x)
+{
+ int4 a = convert_int4(x.s0123);
+ int4 b = convert_int4(x.s4567);
+ return (int8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtp(uchar16 x)
+{
+ int8 a = convert_int8(x.s01234567);
+ int8 b = convert_int8(x.s89abcdef);
+ return (int16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtp(uchar3 x)
+{
+ int2 a = convert_int2(x.s01);
+ int b = convert_int(x.s2);
+ return (int3)(a, b);
+}
+/* uchar -> int conversions, _rtn suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_int* calls. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtn(uchar x)
+{
+ int r = (int)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtn(uchar2 x)
+{
+ int a = convert_int(x.s0);
+ int b = convert_int(x.s1);
+ return (int2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtn(uchar4 x)
+{
+ int2 a = convert_int2(x.s01);
+ int2 b = convert_int2(x.s23);
+ return (int4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtn(uchar8 x)
+{
+ int4 a = convert_int4(x.s0123);
+ int4 b = convert_int4(x.s4567);
+ return (int8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtn(uchar16 x)
+{
+ int8 a = convert_int8(x.s01234567);
+ int8 b = convert_int8(x.s89abcdef);
+ return (int16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtn(uchar3 x)
+{
+ int2 a = convert_int2(x.s01);
+ int b = convert_int(x.s2);
+ return (int3)(a, b);
+}
+/* uchar -> uint conversions, _rtz suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_uint* calls. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtz(uchar x)
+{
+ uint r = (uint)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtz(uchar2 x)
+{
+ uint a = convert_uint(x.s0);
+ uint b = convert_uint(x.s1);
+ return (uint2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtz(uchar4 x)
+{
+ uint2 a = convert_uint2(x.s01);
+ uint2 b = convert_uint2(x.s23);
+ return (uint4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtz(uchar8 x)
+{
+ uint4 a = convert_uint4(x.s0123);
+ uint4 b = convert_uint4(x.s4567);
+ return (uint8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtz(uchar16 x)
+{
+ uint8 a = convert_uint8(x.s01234567);
+ uint8 b = convert_uint8(x.s89abcdef);
+ return (uint16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtz(uchar3 x)
+{
+ uint2 a = convert_uint2(x.s01);
+ uint b = convert_uint(x.s2);
+ return (uint3)(a, b);
+}
+/* uchar -> uint conversions, _rte suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_uint* calls. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rte(uchar x)
+{
+ uint r = (uint)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rte(uchar2 x)
+{
+ uint a = convert_uint(x.s0);
+ uint b = convert_uint(x.s1);
+ return (uint2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rte(uchar4 x)
+{
+ uint2 a = convert_uint2(x.s01);
+ uint2 b = convert_uint2(x.s23);
+ return (uint4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rte(uchar8 x)
+{
+ uint4 a = convert_uint4(x.s0123);
+ uint4 b = convert_uint4(x.s4567);
+ return (uint8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rte(uchar16 x)
+{
+ uint8 a = convert_uint8(x.s01234567);
+ uint8 b = convert_uint8(x.s89abcdef);
+ return (uint16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rte(uchar3 x)
+{
+ uint2 a = convert_uint2(x.s01);
+ uint b = convert_uint(x.s2);
+ return (uint3)(a, b);
+}
+/* uchar -> uint conversions, _rtp suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_uint* calls. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtp(uchar x)
+{
+ uint r = (uint)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtp(uchar2 x)
+{
+ uint a = convert_uint(x.s0);
+ uint b = convert_uint(x.s1);
+ return (uint2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtp(uchar4 x)
+{
+ uint2 a = convert_uint2(x.s01);
+ uint2 b = convert_uint2(x.s23);
+ return (uint4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtp(uchar8 x)
+{
+ uint4 a = convert_uint4(x.s0123);
+ uint4 b = convert_uint4(x.s4567);
+ return (uint8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtp(uchar16 x)
+{
+ uint8 a = convert_uint8(x.s01234567);
+ uint8 b = convert_uint8(x.s89abcdef);
+ return (uint16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtp(uchar3 x)
+{
+ uint2 a = convert_uint2(x.s01);
+ uint b = convert_uint(x.s2);
+ return (uint3)(a, b);
+}
+/* uchar -> uint conversions, _rtn suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_uint* calls. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtn(uchar x)
+{
+ uint r = (uint)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtn(uchar2 x)
+{
+ uint a = convert_uint(x.s0);
+ uint b = convert_uint(x.s1);
+ return (uint2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtn(uchar4 x)
+{
+ uint2 a = convert_uint2(x.s01);
+ uint2 b = convert_uint2(x.s23);
+ return (uint4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtn(uchar8 x)
+{
+ uint4 a = convert_uint4(x.s0123);
+ uint4 b = convert_uint4(x.s4567);
+ return (uint8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtn(uchar16 x)
+{
+ uint8 a = convert_uint8(x.s01234567);
+ uint8 b = convert_uint8(x.s89abcdef);
+ return (uint16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtn(uchar3 x)
+{
+ uint2 a = convert_uint2(x.s01);
+ uint b = convert_uint(x.s2);
+ return (uint3)(a, b);
+}
+#ifdef cles_khr_int64
+/* uchar -> long conversions, _rtz suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_long* calls. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtz(uchar x)
+{
+ long r = (long)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtz(uchar2 x)
+{
+ long a = convert_long(x.s0);
+ long b = convert_long(x.s1);
+ return (long2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtz(uchar4 x)
+{
+ long2 a = convert_long2(x.s01);
+ long2 b = convert_long2(x.s23);
+ return (long4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtz(uchar8 x)
+{
+ long4 a = convert_long4(x.s0123);
+ long4 b = convert_long4(x.s4567);
+ return (long8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtz(uchar16 x)
+{
+ long8 a = convert_long8(x.s01234567);
+ long8 b = convert_long8(x.s89abcdef);
+ return (long16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtz(uchar3 x)
+{
+ long2 a = convert_long2(x.s01);
+ long b = convert_long(x.s2);
+ return (long3)(a, b);
+}
+#endif
+#ifdef cles_khr_int64
+/* uchar -> long conversions, _rte suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_long* calls. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rte(uchar x)
+{
+ long r = (long)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rte(uchar2 x)
+{
+ long a = convert_long(x.s0);
+ long b = convert_long(x.s1);
+ return (long2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rte(uchar4 x)
+{
+ long2 a = convert_long2(x.s01);
+ long2 b = convert_long2(x.s23);
+ return (long4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rte(uchar8 x)
+{
+ long4 a = convert_long4(x.s0123);
+ long4 b = convert_long4(x.s4567);
+ return (long8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rte(uchar16 x)
+{
+ long8 a = convert_long8(x.s01234567);
+ long8 b = convert_long8(x.s89abcdef);
+ return (long16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rte(uchar3 x)
+{
+ long2 a = convert_long2(x.s01);
+ long b = convert_long(x.s2);
+ return (long3)(a, b);
+}
+#endif
+#ifdef cles_khr_int64
+/* uchar -> long conversions, _rtp suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_long* calls. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtp(uchar x)
+{
+ long r = (long)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtp(uchar2 x)
+{
+ long a = convert_long(x.s0);
+ long b = convert_long(x.s1);
+ return (long2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtp(uchar4 x)
+{
+ long2 a = convert_long2(x.s01);
+ long2 b = convert_long2(x.s23);
+ return (long4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtp(uchar8 x)
+{
+ long4 a = convert_long4(x.s0123);
+ long4 b = convert_long4(x.s4567);
+ return (long8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtp(uchar16 x)
+{
+ long8 a = convert_long8(x.s01234567);
+ long8 b = convert_long8(x.s89abcdef);
+ return (long16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtp(uchar3 x)
+{
+ long2 a = convert_long2(x.s01);
+ long b = convert_long(x.s2);
+ return (long3)(a, b);
+}
+#endif
+#ifdef cles_khr_int64
+/* uchar -> long conversions, _rtn suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_long* calls. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtn(uchar x)
+{
+ long r = (long)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtn(uchar2 x)
+{
+ long a = convert_long(x.s0);
+ long b = convert_long(x.s1);
+ return (long2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtn(uchar4 x)
+{
+ long2 a = convert_long2(x.s01);
+ long2 b = convert_long2(x.s23);
+ return (long4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtn(uchar8 x)
+{
+ long4 a = convert_long4(x.s0123);
+ long4 b = convert_long4(x.s4567);
+ return (long8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtn(uchar16 x)
+{
+ long8 a = convert_long8(x.s01234567);
+ long8 b = convert_long8(x.s89abcdef);
+ return (long16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtn(uchar3 x)
+{
+ long2 a = convert_long2(x.s01);
+ long b = convert_long(x.s2);
+ return (long3)(a, b);
+}
+#endif
+#ifdef cles_khr_int64
+/* uchar -> ulong conversions, _rtz suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_ulong* calls. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtz(uchar x)
+{
+ ulong r = (ulong)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtz(uchar2 x)
+{
+ ulong a = convert_ulong(x.s0);
+ ulong b = convert_ulong(x.s1);
+ return (ulong2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtz(uchar4 x)
+{
+ ulong2 a = convert_ulong2(x.s01);
+ ulong2 b = convert_ulong2(x.s23);
+ return (ulong4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtz(uchar8 x)
+{
+ ulong4 a = convert_ulong4(x.s0123);
+ ulong4 b = convert_ulong4(x.s4567);
+ return (ulong8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtz(uchar16 x)
+{
+ ulong8 a = convert_ulong8(x.s01234567);
+ ulong8 b = convert_ulong8(x.s89abcdef);
+ return (ulong16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtz(uchar3 x)
+{
+ ulong2 a = convert_ulong2(x.s01);
+ ulong b = convert_ulong(x.s2);
+ return (ulong3)(a, b);
+}
+#endif
+#ifdef cles_khr_int64
+/* uchar -> ulong conversions, _rte suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_ulong* calls. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rte(uchar x)
+{
+ ulong r = (ulong)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rte(uchar2 x)
+{
+ ulong a = convert_ulong(x.s0);
+ ulong b = convert_ulong(x.s1);
+ return (ulong2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rte(uchar4 x)
+{
+ ulong2 a = convert_ulong2(x.s01);
+ ulong2 b = convert_ulong2(x.s23);
+ return (ulong4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rte(uchar8 x)
+{
+ ulong4 a = convert_ulong4(x.s0123);
+ ulong4 b = convert_ulong4(x.s4567);
+ return (ulong8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rte(uchar16 x)
+{
+ ulong8 a = convert_ulong8(x.s01234567);
+ ulong8 b = convert_ulong8(x.s89abcdef);
+ return (ulong16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rte(uchar3 x)
+{
+ ulong2 a = convert_ulong2(x.s01);
+ ulong b = convert_ulong(x.s2);
+ return (ulong3)(a, b);
+}
+#endif
+#ifdef cles_khr_int64
+/* uchar -> ulong conversions, _rtp suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_ulong* calls. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtp(uchar x)
+{
+ ulong r = (ulong)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtp(uchar2 x)
+{
+ ulong a = convert_ulong(x.s0);
+ ulong b = convert_ulong(x.s1);
+ return (ulong2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtp(uchar4 x)
+{
+ ulong2 a = convert_ulong2(x.s01);
+ ulong2 b = convert_ulong2(x.s23);
+ return (ulong4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtp(uchar8 x)
+{
+ ulong4 a = convert_ulong4(x.s0123);
+ ulong4 b = convert_ulong4(x.s4567);
+ return (ulong8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtp(uchar16 x)
+{
+ ulong8 a = convert_ulong8(x.s01234567);
+ ulong8 b = convert_ulong8(x.s89abcdef);
+ return (ulong16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtp(uchar3 x)
+{
+ ulong2 a = convert_ulong2(x.s01);
+ ulong b = convert_ulong(x.s2);
+ return (ulong3)(a, b);
+}
+#endif
+#ifdef cles_khr_int64
+/* uchar -> ulong conversions, _rtn suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_ulong* calls. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtn(uchar x)
+{
+ ulong r = (ulong)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtn(uchar2 x)
+{
+ ulong a = convert_ulong(x.s0);
+ ulong b = convert_ulong(x.s1);
+ return (ulong2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtn(uchar4 x)
+{
+ ulong2 a = convert_ulong2(x.s01);
+ ulong2 b = convert_ulong2(x.s23);
+ return (ulong4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtn(uchar8 x)
+{
+ ulong4 a = convert_ulong4(x.s0123);
+ ulong4 b = convert_ulong4(x.s4567);
+ return (ulong8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtn(uchar16 x)
+{
+ ulong8 a = convert_ulong8(x.s01234567);
+ ulong8 b = convert_ulong8(x.s89abcdef);
+ return (ulong16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtn(uchar3 x)
+{
+ ulong2 a = convert_ulong2(x.s01);
+ ulong b = convert_ulong(x.s2);
+ return (ulong3)(a, b);
+}
+#endif
+/* short -> char conversions, _rtz suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_char* calls. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtz(short x)
+{
+ char r = (char)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtz(short2 x)
+{
+ char a = convert_char(x.s0);
+ char b = convert_char(x.s1);
+ return (char2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtz(short4 x)
+{
+ char2 a = convert_char2(x.s01);
+ char2 b = convert_char2(x.s23);
+ return (char4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtz(short8 x)
+{
+ char4 a = convert_char4(x.s0123);
+ char4 b = convert_char4(x.s4567);
+ return (char8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtz(short16 x)
+{
+ char8 a = convert_char8(x.s01234567);
+ char8 b = convert_char8(x.s89abcdef);
+ return (char16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtz(short3 x)
+{
+ char2 a = convert_char2(x.s01);
+ char b = convert_char(x.s2);
+ return (char3)(a, b);
+}
+/* short -> char conversions, _rte suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_char* calls. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rte(short x)
+{
+ char r = (char)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rte(short2 x)
+{
+ char a = convert_char(x.s0);
+ char b = convert_char(x.s1);
+ return (char2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rte(short4 x)
+{
+ char2 a = convert_char2(x.s01);
+ char2 b = convert_char2(x.s23);
+ return (char4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rte(short8 x)
+{
+ char4 a = convert_char4(x.s0123);
+ char4 b = convert_char4(x.s4567);
+ return (char8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rte(short16 x)
+{
+ char8 a = convert_char8(x.s01234567);
+ char8 b = convert_char8(x.s89abcdef);
+ return (char16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rte(short3 x)
+{
+ char2 a = convert_char2(x.s01);
+ char b = convert_char(x.s2);
+ return (char3)(a, b);
+}
+/* short -> char conversions, _rtp suffix: the scalar overload is a plain cast and
+ * each vector overload is stitched together from unsuffixed convert_char* calls. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtp(short x)
+{
+ char r = (char)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtp(short2 x)
+{
+ char a = convert_char(x.s0);
+ char b = convert_char(x.s1);
+ return (char2)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtp(short4 x)
+{
+ char2 a = convert_char2(x.s01);
+ char2 b = convert_char2(x.s23);
+ return (char4)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtp(short8 x)
+{
+ char4 a = convert_char4(x.s0123);
+ char4 b = convert_char4(x.s4567);
+ return (char8)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtp(short16 x)
+{
+ char8 a = convert_char8(x.s01234567);
+ char8 b = convert_char8(x.s89abcdef);
+ return (char16)(a, b);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtp(short3 x)
+{
+ char2 a = convert_char2(x.s01);
+ char b = convert_char(x.s2);
+ return (char3)(a, b);
+}
+/* short -> char, _rtn suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_char*. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtn(short x)
+{
+ char r = (char)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtn(short2 x)
+{
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtn(short4 x)
+{
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtn(short8 x)
+{
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtn(short16 x)
+{
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtn(short3 x)
+{
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* short -> uchar, _rtz suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtz(short x)
+{
+ uchar r = (uchar)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtz(short2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtz(short4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtz(short8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtz(short16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtz(short3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* short -> uchar, _rte suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rte(short x)
+{
+ uchar r = (uchar)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rte(short2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rte(short4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rte(short8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rte(short16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rte(short3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* short -> uchar, _rtp suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtp(short x)
+{
+ uchar r = (uchar)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtp(short2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtp(short4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtp(short8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtp(short16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtp(short3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* short -> uchar, _rtn suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtn(short x)
+{
+ uchar r = (uchar)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtn(short2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtn(short4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtn(short8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtn(short16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtn(short3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* short -> short, _rtz suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtz(short x)
+{
+ short r = (short)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtz(short2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtz(short4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtz(short8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtz(short16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtz(short3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* short -> short, _rte suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rte(short x)
+{
+ short r = (short)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rte(short2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rte(short4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rte(short8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rte(short16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rte(short3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* short -> short, _rtp suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtp(short x)
+{
+ short r = (short)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtp(short2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtp(short4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtp(short8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtp(short16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtp(short3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* short -> short, _rtn suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtn(short x)
+{
+ short r = (short)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtn(short2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtn(short4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtn(short8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtn(short16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtn(short3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* short -> ushort, _rtz suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_ushort*. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtz(short x)
+{
+ ushort r = (ushort)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtz(short2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtz(short4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtz(short8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtz(short16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtz(short3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* short -> ushort, _rte suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_ushort*. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rte(short x)
+{
+ ushort r = (ushort)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rte(short2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rte(short4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rte(short8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rte(short16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rte(short3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* short -> ushort, _rtp suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_ushort*. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtp(short x)
+{
+ ushort r = (ushort)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtp(short2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtp(short4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtp(short8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtp(short16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtp(short3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* short -> ushort, _rtn suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_ushort*. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtn(short x)
+{
+ ushort r = (ushort)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtn(short2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtn(short4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtn(short8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtn(short16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtn(short3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* short -> int, _rtz suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_int*. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtz(short x)
+{
+ int r = (int)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtz(short2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtz(short4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtz(short8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtz(short16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtz(short3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* short -> int, _rte suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_int*. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rte(short x)
+{
+ int r = (int)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rte(short2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rte(short4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rte(short8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rte(short16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rte(short3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* short -> int, _rtp suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_int*. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtp(short x)
+{
+ int r = (int)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtp(short2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtp(short4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtp(short8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtp(short16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtp(short3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* short -> int, _rtn suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_int*. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtn(short x)
+{
+ int r = (int)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtn(short2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtn(short4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtn(short8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtn(short16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtn(short3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* short -> uint, _rtz suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_uint*. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtz(short x)
+{
+ uint r = (uint)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtz(short2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtz(short4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtz(short8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtz(short16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtz(short3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/* short -> uint, _rte suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_uint*. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rte(short x)
+{
+ uint r = (uint)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rte(short2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rte(short4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rte(short8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rte(short16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rte(short3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/* short -> uint, _rtp suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_uint*. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtp(short x)
+{
+ uint r = (uint)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtp(short2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtp(short4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtp(short8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtp(short16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtp(short3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/* short -> uint, _rtn suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_uint*. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtn(short x)
+{
+ uint r = (uint)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtn(short2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtn(short4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtn(short8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtn(short16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtn(short3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#ifdef cles_khr_int64
+/* short -> long, _rtz suffix; compiled only when cles_khr_int64 is defined.
+ * The scalar is a plain cast; each vector width splits its input and
+ * forwards the pieces to the unsuffixed convert_long* overloads. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtz(short x)
+{
+ long r = (long)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtz(short2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtz(short4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtz(short8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtz(short16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtz(short3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* short -> long, _rte suffix; compiled only when cles_khr_int64 is defined.
+ * The scalar is a plain cast; each vector width splits its input and
+ * forwards the pieces to the unsuffixed convert_long* overloads. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rte(short x)
+{
+ long r = (long)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rte(short2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rte(short4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rte(short8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rte(short16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rte(short3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* short -> long, _rtp suffix; compiled only when cles_khr_int64 is defined.
+ * The scalar is a plain cast; each vector width splits its input and
+ * forwards the pieces to the unsuffixed convert_long* overloads. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtp(short x)
+{
+ long r = (long)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtp(short2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtp(short4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtp(short8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtp(short16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtp(short3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* short -> long, _rtn suffix; compiled only when cles_khr_int64 is defined.
+ * The scalar is a plain cast; each vector width splits its input and
+ * forwards the pieces to the unsuffixed convert_long* overloads. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtn(short x)
+{
+ long r = (long)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtn(short2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtn(short4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtn(short8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtn(short16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtn(short3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* short -> ulong, _rtz suffix; compiled only when cles_khr_int64 is defined.
+ * The scalar is a plain cast; each vector width splits its input and
+ * forwards the pieces to the unsuffixed convert_ulong* overloads. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtz(short x)
+{
+ ulong r = (ulong)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtz(short2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtz(short4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtz(short8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtz(short16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtz(short3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* short -> ulong, _rte suffix; compiled only when cles_khr_int64 is defined.
+ * The scalar is a plain cast; each vector width splits its input and
+ * forwards the pieces to the unsuffixed convert_ulong* overloads. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rte(short x)
+{
+ ulong r = (ulong)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rte(short2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rte(short4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rte(short8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rte(short16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rte(short3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* short -> ulong, _rtp suffix; compiled only when cles_khr_int64 is defined.
+ * The scalar is a plain cast; each vector width splits its input and
+ * forwards the pieces to the unsuffixed convert_ulong* overloads. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtp(short x)
+{
+ ulong r = (ulong)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtp(short2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtp(short4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtp(short8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtp(short16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtp(short3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* short -> ulong, _rtn suffix; compiled only when cles_khr_int64 is defined.
+ * The scalar is a plain cast; each vector width splits its input and
+ * forwards the pieces to the unsuffixed convert_ulong* overloads. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtn(short x)
+{
+ ulong r = (ulong)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtn(short2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtn(short4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtn(short8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtn(short16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtn(short3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+/* ushort -> char, _rtz suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_char*. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtz(ushort x)
+{
+ char r = (char)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtz(ushort2 x)
+{
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtz(ushort4 x)
+{
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtz(ushort8 x)
+{
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtz(ushort16 x)
+{
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtz(ushort3 x)
+{
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* ushort -> char, _rte suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_char*. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rte(ushort x)
+{
+ char r = (char)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rte(ushort2 x)
+{
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rte(ushort4 x)
+{
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rte(ushort8 x)
+{
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rte(ushort16 x)
+{
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rte(ushort3 x)
+{
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* ushort -> char, _rtp suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_char*. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtp(ushort x)
+{
+ char r = (char)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtp(ushort2 x)
+{
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtp(ushort4 x)
+{
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtp(ushort8 x)
+{
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtp(ushort16 x)
+{
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtp(ushort3 x)
+{
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* ushort -> char, _rtn suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_char*. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtn(ushort x)
+{
+ char r = (char)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtn(ushort2 x)
+{
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtn(ushort4 x)
+{
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtn(ushort8 x)
+{
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtn(ushort16 x)
+{
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtn(ushort3 x)
+{
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* ushort -> uchar, _rtz suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtz(ushort x)
+{
+ uchar r = (uchar)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtz(ushort2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtz(ushort4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtz(ushort8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtz(ushort16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtz(ushort3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* ushort -> uchar, _rte suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rte(ushort x)
+{
+ uchar r = (uchar)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rte(ushort2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rte(ushort4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rte(ushort8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rte(ushort16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rte(ushort3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* ushort -> uchar, _rtp suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtp(ushort x)
+{
+ uchar r = (uchar)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtp(ushort2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtp(ushort4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtp(ushort8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtp(ushort16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtp(ushort3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* ushort -> uchar, _rtn suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtn(ushort x)
+{
+ uchar r = (uchar)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtn(ushort2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtn(ushort4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtn(ushort8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtn(ushort16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtn(ushort3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* ushort -> short, _rtz suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtz(ushort x)
+{
+ short r = (short)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtz(ushort2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtz(ushort4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtz(ushort8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtz(ushort16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtz(ushort3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* ushort -> short, _rte suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rte(ushort x)
+{
+ short r = (short)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rte(ushort2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rte(ushort4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rte(ushort8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rte(ushort16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rte(ushort3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* ushort -> short, _rtp suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtp(ushort x)
+{
+ short r = (short)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtp(ushort2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtp(ushort4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtp(ushort8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtp(ushort16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtp(ushort3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* ushort -> short, _rtn suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtn(ushort x)
+{
+ short r = (short)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtn(ushort2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtn(ushort4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtn(ushort8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtn(ushort16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtn(ushort3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* ushort -> ushort, _rtz suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_ushort*. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtz(ushort x)
+{
+ ushort r = (ushort)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtz(ushort2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtz(ushort4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtz(ushort8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtz(ushort16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtz(ushort3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* ushort -> ushort, _rte suffix: the scalar is a plain cast; each vector width
+ * splits its input and forwards the pieces to the unsuffixed convert_ushort*. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rte(ushort x)
+{
+ ushort r = (ushort)x;
+ return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rte(ushort2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rte(ushort4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rte(ushort8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rte(ushort16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rte(ushort3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtp(ushort x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtp(ushort2 x)
+{
+ return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtp(ushort4 x)
+{
+ return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtp(ushort8 x)
+{
+ return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtp(ushort16 x)
+{
+ return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtp(ushort3 x)
+{
+ return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtn(ushort x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtn(ushort2 x)
+{
+ return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtn(ushort4 x)
+{
+ return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtn(ushort8 x)
+{
+ return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtn(ushort16 x)
+{
+ return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtn(ushort3 x)
+{
+ return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtz(ushort x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtz(ushort2 x)
+{
+ return (int2)(convert_int(x.lo), convert_int(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtz(ushort4 x)
+{
+ return (int4)(convert_int2(x.lo), convert_int2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtz(ushort8 x)
+{
+ return (int8)(convert_int4(x.lo), convert_int4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtz(ushort16 x)
+{
+ return (int16)(convert_int8(x.lo), convert_int8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtz(ushort3 x)
+{
+ return (int3)(convert_int2(x.s01), convert_int(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rte(ushort x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rte(ushort2 x)
+{
+ return (int2)(convert_int(x.lo), convert_int(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rte(ushort4 x)
+{
+ return (int4)(convert_int2(x.lo), convert_int2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rte(ushort8 x)
+{
+ return (int8)(convert_int4(x.lo), convert_int4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rte(ushort16 x)
+{
+ return (int16)(convert_int8(x.lo), convert_int8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rte(ushort3 x)
+{
+ return (int3)(convert_int2(x.s01), convert_int(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtp(ushort x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtp(ushort2 x)
+{
+ return (int2)(convert_int(x.lo), convert_int(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtp(ushort4 x)
+{
+ return (int4)(convert_int2(x.lo), convert_int2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtp(ushort8 x)
+{
+ return (int8)(convert_int4(x.lo), convert_int4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtp(ushort16 x)
+{
+ return (int16)(convert_int8(x.lo), convert_int8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtp(ushort3 x)
+{
+ return (int3)(convert_int2(x.s01), convert_int(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtn(ushort x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtn(ushort2 x)
+{
+ return (int2)(convert_int(x.lo), convert_int(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtn(ushort4 x)
+{
+ return (int4)(convert_int2(x.lo), convert_int2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtn(ushort8 x)
+{
+ return (int8)(convert_int4(x.lo), convert_int4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtn(ushort16 x)
+{
+ return (int16)(convert_int8(x.lo), convert_int8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtn(ushort3 x)
+{
+ return (int3)(convert_int2(x.s01), convert_int(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtz(ushort x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtz(ushort2 x)
+{
+ return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtz(ushort4 x)
+{
+ return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtz(ushort8 x)
+{
+ return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtz(ushort16 x)
+{
+ return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtz(ushort3 x)
+{
+ return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rte(ushort x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rte(ushort2 x)
+{
+ return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rte(ushort4 x)
+{
+ return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rte(ushort8 x)
+{
+ return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rte(ushort16 x)
+{
+ return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rte(ushort3 x)
+{
+ return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtp(ushort x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtp(ushort2 x)
+{
+ return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtp(ushort4 x)
+{
+ return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtp(ushort8 x)
+{
+ return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtp(ushort16 x)
+{
+ return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtp(ushort3 x)
+{
+ return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtn(ushort x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtn(ushort2 x)
+{
+ return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtn(ushort4 x)
+{
+ return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtn(ushort8 x)
+{
+ return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtn(ushort16 x)
+{
+ return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtn(ushort3 x)
+{
+ return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
+}
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtz(ushort x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtz(ushort2 x)
+{
+ return (long2)(convert_long(x.lo), convert_long(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtz(ushort4 x)
+{
+ return (long4)(convert_long2(x.lo), convert_long2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtz(ushort8 x)
+{
+ return (long8)(convert_long4(x.lo), convert_long4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtz(ushort16 x)
+{
+ return (long16)(convert_long8(x.lo), convert_long8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtz(ushort3 x)
+{
+ return (long3)(convert_long2(x.s01), convert_long(x.s2));
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rte(ushort x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rte(ushort2 x)
+{
+ return (long2)(convert_long(x.lo), convert_long(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rte(ushort4 x)
+{
+ return (long4)(convert_long2(x.lo), convert_long2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rte(ushort8 x)
+{
+ return (long8)(convert_long4(x.lo), convert_long4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rte(ushort16 x)
+{
+ return (long16)(convert_long8(x.lo), convert_long8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rte(ushort3 x)
+{
+ return (long3)(convert_long2(x.s01), convert_long(x.s2));
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtp(ushort x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtp(ushort2 x)
+{
+ return (long2)(convert_long(x.lo), convert_long(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtp(ushort4 x)
+{
+ return (long4)(convert_long2(x.lo), convert_long2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtp(ushort8 x)
+{
+ return (long8)(convert_long4(x.lo), convert_long4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtp(ushort16 x)
+{
+ return (long16)(convert_long8(x.lo), convert_long8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtp(ushort3 x)
+{
+ return (long3)(convert_long2(x.s01), convert_long(x.s2));
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtn(ushort x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtn(ushort2 x)
+{
+ return (long2)(convert_long(x.lo), convert_long(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtn(ushort4 x)
+{
+ return (long4)(convert_long2(x.lo), convert_long2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtn(ushort8 x)
+{
+ return (long8)(convert_long4(x.lo), convert_long4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtn(ushort16 x)
+{
+ return (long16)(convert_long8(x.lo), convert_long8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtn(ushort3 x)
+{
+ return (long3)(convert_long2(x.s01), convert_long(x.s2));
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtz(ushort x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtz(ushort2 x)
+{
+ return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtz(ushort4 x)
+{
+ return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtz(ushort8 x)
+{
+ return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtz(ushort16 x)
+{
+ return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtz(ushort3 x)
+{
+ return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rte(ushort x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rte(ushort2 x)
+{
+ return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rte(ushort4 x)
+{
+ return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rte(ushort8 x)
+{
+ return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rte(ushort16 x)
+{
+ return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rte(ushort3 x)
+{
+ return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtp(ushort x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtp(ushort2 x)
+{
+ return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtp(ushort4 x)
+{
+ return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtp(ushort8 x)
+{
+ return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtp(ushort16 x)
+{
+ return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtp(ushort3 x)
+{
+ return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtn(ushort x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtn(ushort2 x)
+{
+ return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtn(ushort4 x)
+{
+ return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtn(ushort8 x)
+{
+ return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtn(ushort16 x)
+{
+ return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtn(ushort3 x)
+{
+ return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
+}
+#endif
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtz(int x)
+{
+ return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtz(int2 x)
+{
+ return (char2)(convert_char(x.lo), convert_char(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtz(int4 x)
+{
+ return (char4)(convert_char2(x.lo), convert_char2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtz(int8 x)
+{
+ return (char8)(convert_char4(x.lo), convert_char4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtz(int16 x)
+{
+ return (char16)(convert_char8(x.lo), convert_char8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtz(int3 x)
+{
+ return (char3)(convert_char2(x.s01), convert_char(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rte(int x)
+{
+ return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rte(int2 x)
+{
+ return (char2)(convert_char(x.lo), convert_char(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rte(int4 x)
+{
+ return (char4)(convert_char2(x.lo), convert_char2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rte(int8 x)
+{
+ return (char8)(convert_char4(x.lo), convert_char4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rte(int16 x)
+{
+ return (char16)(convert_char8(x.lo), convert_char8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rte(int3 x)
+{
+ return (char3)(convert_char2(x.s01), convert_char(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtp(int x)
+{
+ return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtp(int2 x)
+{
+ return (char2)(convert_char(x.lo), convert_char(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtp(int4 x)
+{
+ return (char4)(convert_char2(x.lo), convert_char2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtp(int8 x)
+{
+ return (char8)(convert_char4(x.lo), convert_char4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtp(int16 x)
+{
+ return (char16)(convert_char8(x.lo), convert_char8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtp(int3 x)
+{
+ return (char3)(convert_char2(x.s01), convert_char(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtn(int x)
+{
+ return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtn(int2 x)
+{
+ return (char2)(convert_char(x.lo), convert_char(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtn(int4 x)
+{
+ return (char4)(convert_char2(x.lo), convert_char2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtn(int8 x)
+{
+ return (char8)(convert_char4(x.lo), convert_char4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtn(int16 x)
+{
+ return (char16)(convert_char8(x.lo), convert_char8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtn(int3 x)
+{
+ return (char3)(convert_char2(x.s01), convert_char(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtz(int x)
+{
+ return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtz(int2 x)
+{
+ return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtz(int4 x)
+{
+ return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtz(int8 x)
+{
+ return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtz(int16 x)
+{
+ return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtz(int3 x)
+{
+ return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rte(int x)
+{
+ return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rte(int2 x)
+{
+ return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rte(int4 x)
+{
+ return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rte(int8 x)
+{
+ return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rte(int16 x)
+{
+ return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rte(int3 x)
+{
+ return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtp(int x)
+{
+ return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtp(int2 x)
+{
+ return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtp(int4 x)
+{
+ return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtp(int8 x)
+{
+ return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtp(int16 x)
+{
+ return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtp(int3 x)
+{
+ return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtn(int x)
+{
+ return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtn(int2 x)
+{
+ return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtn(int4 x)
+{
+ return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtn(int8 x)
+{
+ return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtn(int16 x)
+{
+ return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtn(int3 x)
+{
+ return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtz(int x)
+{
+ return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtz(int2 x)
+{
+ return (short2)(convert_short(x.lo), convert_short(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtz(int4 x)
+{
+ return (short4)(convert_short2(x.lo), convert_short2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtz(int8 x)
+{
+ return (short8)(convert_short4(x.lo), convert_short4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtz(int16 x)
+{
+ return (short16)(convert_short8(x.lo), convert_short8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtz(int3 x)
+{
+ return (short3)(convert_short2(x.s01), convert_short(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rte(int x)
+{
+ return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rte(int2 x)
+{
+ return (short2)(convert_short(x.lo), convert_short(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rte(int4 x)
+{
+ return (short4)(convert_short2(x.lo), convert_short2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rte(int8 x)
+{
+ return (short8)(convert_short4(x.lo), convert_short4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rte(int16 x)
+{
+ return (short16)(convert_short8(x.lo), convert_short8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rte(int3 x)
+{
+ return (short3)(convert_short2(x.s01), convert_short(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtp(int x)
+{
+ return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtp(int2 x)
+{
+ return (short2)(convert_short(x.lo), convert_short(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtp(int4 x)
+{
+ return (short4)(convert_short2(x.lo), convert_short2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtp(int8 x)
+{
+ return (short8)(convert_short4(x.lo), convert_short4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtp(int16 x)
+{
+ return (short16)(convert_short8(x.lo), convert_short8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtp(int3 x)
+{
+ return (short3)(convert_short2(x.s01), convert_short(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtn(int x)
+{
+ return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtn(int2 x)
+{
+ return (short2)(convert_short(x.lo), convert_short(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtn(int4 x)
+{
+ return (short4)(convert_short2(x.lo), convert_short2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtn(int8 x)
+{
+ return (short8)(convert_short4(x.lo), convert_short4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtn(int16 x)
+{
+ return (short16)(convert_short8(x.lo), convert_short8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtn(int3 x)
+{
+ return (short3)(convert_short2(x.s01), convert_short(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtz(int x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtz(int2 x)
+{
+ return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtz(int4 x)
+{
+ return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtz(int8 x)
+{
+ return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtz(int16 x)
+{
+ return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtz(int3 x)
+{
+ return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rte(int x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rte(int2 x)
+{
+ return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rte(int4 x)
+{
+ return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rte(int8 x)
+{
+ return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rte(int16 x)
+{
+ return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rte(int3 x)
+{
+ return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtp(int x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtp(int2 x)
+{
+ return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtp(int4 x)
+{
+ return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtp(int8 x)
+{
+ return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtp(int16 x)
+{
+ return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtp(int3 x)
+{
+ return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtn(int x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtn(int2 x)
+{
+ return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtn(int4 x)
+{
+ return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtn(int8 x)
+{
+ return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtn(int16 x)
+{
+ return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtn(int3 x)
+{
+ return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtz(int x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtz(int2 x)
+{
+ return (int2)(convert_int(x.lo), convert_int(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtz(int4 x)
+{
+ return (int4)(convert_int2(x.lo), convert_int2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtz(int8 x)
+{
+ return (int8)(convert_int4(x.lo), convert_int4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtz(int16 x)
+{
+ return (int16)(convert_int8(x.lo), convert_int8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtz(int3 x)
+{
+ return (int3)(convert_int2(x.s01), convert_int(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rte(int x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rte(int2 x)
+{
+ return (int2)(convert_int(x.lo), convert_int(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rte(int4 x)
+{
+ return (int4)(convert_int2(x.lo), convert_int2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rte(int8 x)
+{
+ return (int8)(convert_int4(x.lo), convert_int4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rte(int16 x)
+{
+ return (int16)(convert_int8(x.lo), convert_int8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rte(int3 x)
+{
+ return (int3)(convert_int2(x.s01), convert_int(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtp(int x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtp(int2 x)
+{
+ return (int2)(convert_int(x.lo), convert_int(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtp(int4 x)
+{
+ return (int4)(convert_int2(x.lo), convert_int2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtp(int8 x)
+{
+ return (int8)(convert_int4(x.lo), convert_int4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtp(int16 x)
+{
+ return (int16)(convert_int8(x.lo), convert_int8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtp(int3 x)
+{
+ return (int3)(convert_int2(x.s01), convert_int(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtn(int x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtn(int2 x)
+{
+ return (int2)(convert_int(x.lo), convert_int(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtn(int4 x)
+{
+ return (int4)(convert_int2(x.lo), convert_int2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtn(int8 x)
+{
+ return (int8)(convert_int4(x.lo), convert_int4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtn(int16 x)
+{
+ return (int16)(convert_int8(x.lo), convert_int8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtn(int3 x)
+{
+ return (int3)(convert_int2(x.s01), convert_int(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtz(int x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtz(int2 x)
+{
+ return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtz(int4 x)
+{
+ return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtz(int8 x)
+{
+ return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtz(int16 x)
+{
+ return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtz(int3 x)
+{
+ return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rte(int x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rte(int2 x)
+{
+ return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rte(int4 x)
+{
+ return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rte(int8 x)
+{
+ return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rte(int16 x)
+{
+ return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rte(int3 x)
+{
+ return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtp(int x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtp(int2 x)
+{
+ return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtp(int4 x)
+{
+ return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtp(int8 x)
+{
+ return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtp(int16 x)
+{
+ return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtp(int3 x)
+{
+ return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtn(int x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtn(int2 x)
+{
+ return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtn(int4 x)
+{
+ return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtn(int8 x)
+{
+ return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtn(int16 x)
+{
+ return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtn(int3 x)
+{
+ return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
+}
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtz(int x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtz(int2 x)
+{
+ return (long2)(convert_long(x.lo), convert_long(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtz(int4 x)
+{
+ return (long4)(convert_long2(x.lo), convert_long2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtz(int8 x)
+{
+ return (long8)(convert_long4(x.lo), convert_long4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtz(int16 x)
+{
+ return (long16)(convert_long8(x.lo), convert_long8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtz(int3 x)
+{
+ return (long3)(convert_long2(x.s01), convert_long(x.s2));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * int -> long conversions with the "_rte" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_long* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rte(int x)
+{
+    return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rte(int2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rte(int4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rte(int8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rte(int16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rte(int3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * int -> long conversions with the "_rtp" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_long* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtp(int x)
+{
+    return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtp(int2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtp(int4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtp(int8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtp(int16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtp(int3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * int -> long conversions with the "_rtn" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_long* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtn(int x)
+{
+    return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtn(int2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtn(int4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtn(int8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtn(int16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtn(int3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * int -> ulong conversions with the "_rtz" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_ulong* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtz(int x)
+{
+    return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtz(int2 x)
+{
+    return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtz(int4 x)
+{
+    return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtz(int8 x)
+{
+    return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtz(int16 x)
+{
+    return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtz(int3 x)
+{
+    return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * int -> ulong conversions with the "_rte" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_ulong* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rte(int x)
+{
+    return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rte(int2 x)
+{
+    return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rte(int4 x)
+{
+    return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rte(int8 x)
+{
+    return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rte(int16 x)
+{
+    return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rte(int3 x)
+{
+    return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * int -> ulong conversions with the "_rtp" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_ulong* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtp(int x)
+{
+    return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtp(int2 x)
+{
+    return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtp(int4 x)
+{
+    return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtp(int8 x)
+{
+    return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtp(int16 x)
+{
+    return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtp(int3 x)
+{
+    return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * int -> ulong conversions with the "_rtn" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_ulong* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtn(int x)
+{
+    return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtn(int2 x)
+{
+    return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtn(int4 x)
+{
+    return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtn(int8 x)
+{
+    return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtn(int16 x)
+{
+    return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtn(int3 x)
+{
+    return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+/*
+ * uint -> char conversions with the "_rtz" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_char* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtz(uint x)
+{
+    return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtz(uint2 x)
+{
+    return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtz(uint4 x)
+{
+    return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtz(uint8 x)
+{
+    return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtz(uint16 x)
+{
+    return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtz(uint3 x)
+{
+    return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/*
+ * uint -> char conversions with the "_rte" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_char* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rte(uint x)
+{
+    return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rte(uint2 x)
+{
+    return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rte(uint4 x)
+{
+    return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rte(uint8 x)
+{
+    return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rte(uint16 x)
+{
+    return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rte(uint3 x)
+{
+    return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/*
+ * uint -> char conversions with the "_rtp" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_char* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtp(uint x)
+{
+    return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtp(uint2 x)
+{
+    return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtp(uint4 x)
+{
+    return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtp(uint8 x)
+{
+    return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtp(uint16 x)
+{
+    return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtp(uint3 x)
+{
+    return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/*
+ * uint -> char conversions with the "_rtn" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_char* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtn(uint x)
+{
+    return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtn(uint2 x)
+{
+    return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtn(uint4 x)
+{
+    return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtn(uint8 x)
+{
+    return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtn(uint16 x)
+{
+    return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtn(uint3 x)
+{
+    return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/*
+ * uint -> uchar conversions with the "_rtz" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_uchar* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtz(uint x)
+{
+    return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtz(uint2 x)
+{
+    return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtz(uint4 x)
+{
+    return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtz(uint8 x)
+{
+    return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtz(uint16 x)
+{
+    return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtz(uint3 x)
+{
+    return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/*
+ * uint -> uchar conversions with the "_rte" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_uchar* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rte(uint x)
+{
+    return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rte(uint2 x)
+{
+    return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rte(uint4 x)
+{
+    return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rte(uint8 x)
+{
+    return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rte(uint16 x)
+{
+    return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rte(uint3 x)
+{
+    return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/*
+ * uint -> uchar conversions with the "_rtp" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_uchar* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtp(uint x)
+{
+    return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtp(uint2 x)
+{
+    return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtp(uint4 x)
+{
+    return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtp(uint8 x)
+{
+    return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtp(uint16 x)
+{
+    return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtp(uint3 x)
+{
+    return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/*
+ * uint -> uchar conversions with the "_rtn" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_uchar* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtn(uint x)
+{
+    return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtn(uint2 x)
+{
+    return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtn(uint4 x)
+{
+    return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtn(uint8 x)
+{
+    return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtn(uint16 x)
+{
+    return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtn(uint3 x)
+{
+    return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/*
+ * uint -> short conversions with the "_rtz" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_short* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtz(uint x)
+{
+    return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtz(uint2 x)
+{
+    return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtz(uint4 x)
+{
+    return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtz(uint8 x)
+{
+    return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtz(uint16 x)
+{
+    return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtz(uint3 x)
+{
+    return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/*
+ * uint -> short conversions with the "_rte" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_short* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rte(uint x)
+{
+    return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rte(uint2 x)
+{
+    return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rte(uint4 x)
+{
+    return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rte(uint8 x)
+{
+    return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rte(uint16 x)
+{
+    return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rte(uint3 x)
+{
+    return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/*
+ * uint -> short conversions with the "_rtp" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_short* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtp(uint x)
+{
+    return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtp(uint2 x)
+{
+    return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtp(uint4 x)
+{
+    return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtp(uint8 x)
+{
+    return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtp(uint16 x)
+{
+    return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtp(uint3 x)
+{
+    return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/*
+ * uint -> short conversions with the "_rtn" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_short* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtn(uint x)
+{
+    return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtn(uint2 x)
+{
+    return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtn(uint4 x)
+{
+    return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtn(uint8 x)
+{
+    return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtn(uint16 x)
+{
+    return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtn(uint3 x)
+{
+    return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/*
+ * uint -> ushort conversions with the "_rtz" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_ushort* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtz(uint x)
+{
+    return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtz(uint2 x)
+{
+    return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtz(uint4 x)
+{
+    return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtz(uint8 x)
+{
+    return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtz(uint16 x)
+{
+    return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtz(uint3 x)
+{
+    return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/*
+ * uint -> ushort conversions with the "_rte" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_ushort* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rte(uint x)
+{
+    return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rte(uint2 x)
+{
+    return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rte(uint4 x)
+{
+    return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rte(uint8 x)
+{
+    return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rte(uint16 x)
+{
+    return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rte(uint3 x)
+{
+    return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/*
+ * uint -> ushort conversions with the "_rtp" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_ushort* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtp(uint x)
+{
+    return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtp(uint2 x)
+{
+    return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtp(uint4 x)
+{
+    return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtp(uint8 x)
+{
+    return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtp(uint16 x)
+{
+    return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtp(uint3 x)
+{
+    return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/*
+ * uint -> ushort conversions with the "_rtn" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_ushort* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtn(uint x)
+{
+    return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtn(uint2 x)
+{
+    return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtn(uint4 x)
+{
+    return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtn(uint8 x)
+{
+    return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtn(uint16 x)
+{
+    return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtn(uint3 x)
+{
+    return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/*
+ * uint -> int conversions with the "_rtz" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_int* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtz(uint x)
+{
+    return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtz(uint2 x)
+{
+    return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtz(uint4 x)
+{
+    return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtz(uint8 x)
+{
+    return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtz(uint16 x)
+{
+    return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtz(uint3 x)
+{
+    return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/*
+ * uint -> int conversions with the "_rte" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_int* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rte(uint x)
+{
+    return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rte(uint2 x)
+{
+    return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rte(uint4 x)
+{
+    return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rte(uint8 x)
+{
+    return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rte(uint16 x)
+{
+    return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rte(uint3 x)
+{
+    return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/*
+ * uint -> int conversions with the "_rtp" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_int* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtp(uint x)
+{
+    return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtp(uint2 x)
+{
+    return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtp(uint4 x)
+{
+    return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtp(uint8 x)
+{
+    return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtp(uint16 x)
+{
+    return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtp(uint3 x)
+{
+    return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/*
+ * uint -> int conversions with the "_rtn" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a plain cast; each vector form splits its argument
+ * and forwards the pieces to the unsuffixed convert_int* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtn(uint x)
+{
+    return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtn(uint2 x)
+{
+    return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtn(uint4 x)
+{
+    return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtn(uint8 x)
+{
+    return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtn(uint16 x)
+{
+    return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtn(uint3 x)
+{
+    return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/*
+ * uint -> uint conversions with the "_rtz" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a cast to the type the argument already has; each
+ * vector form splits its argument and forwards the pieces to the unsuffixed
+ * convert_uint* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtz(uint x)
+{
+    return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtz(uint2 x)
+{
+    return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtz(uint4 x)
+{
+    return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtz(uint8 x)
+{
+    return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtz(uint16 x)
+{
+    return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtz(uint3 x)
+{
+    return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/*
+ * uint -> uint conversions with the "_rte" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a cast to the type the argument already has; each
+ * vector form splits its argument and forwards the pieces to the unsuffixed
+ * convert_uint* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rte(uint x)
+{
+    return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rte(uint2 x)
+{
+    return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rte(uint4 x)
+{
+    return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rte(uint8 x)
+{
+    return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rte(uint16 x)
+{
+    return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rte(uint3 x)
+{
+    return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/*
+ * uint -> uint conversions with the "_rtp" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a cast to the type the argument already has; each
+ * vector form splits its argument and forwards the pieces to the unsuffixed
+ * convert_uint* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtp(uint x)
+{
+    return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtp(uint2 x)
+{
+    return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtp(uint4 x)
+{
+    return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtp(uint8 x)
+{
+    return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtp(uint16 x)
+{
+    return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtp(uint3 x)
+{
+    return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/*
+ * uint -> uint conversions with the "_rtn" suffix (scalar plus 2/3/4/8/16-wide).
+ * The scalar form is a cast to the type the argument already has; each
+ * vector form splits its argument and forwards the pieces to the unsuffixed
+ * convert_uint* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtn(uint x)
+{
+    return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtn(uint2 x)
+{
+    return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtn(uint4 x)
+{
+    return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtn(uint8 x)
+{
+    return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtn(uint16 x)
+{
+    return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtn(uint3 x)
+{
+    return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#ifdef cles_khr_int64
+/*
+ * uint -> long conversions with the "_rtz" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_long* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtz(uint x)
+{
+    return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtz(uint2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtz(uint4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtz(uint8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtz(uint16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtz(uint3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * uint -> long conversions with the "_rte" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_long* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rte(uint x)
+{
+    return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rte(uint2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rte(uint4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rte(uint8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rte(uint16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rte(uint3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * uint -> long conversions with the "_rtp" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_long* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtp(uint x)
+{
+    return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtp(uint2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtp(uint4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtp(uint8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtp(uint16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtp(uint3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * uint -> long conversions with the "_rtn" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_long* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtn(uint x)
+{
+    return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtn(uint2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtn(uint4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtn(uint8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtn(uint16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtn(uint3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * uint -> ulong conversions with the "_rtz" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_ulong* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtz(uint x)
+{
+    return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtz(uint2 x)
+{
+    return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtz(uint4 x)
+{
+    return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtz(uint8 x)
+{
+    return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtz(uint16 x)
+{
+    return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtz(uint3 x)
+{
+    return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * uint -> ulong conversions with the "_rte" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_ulong* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rte(uint x)
+{
+    return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rte(uint2 x)
+{
+    return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rte(uint4 x)
+{
+    return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rte(uint8 x)
+{
+    return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rte(uint16 x)
+{
+    return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rte(uint3 x)
+{
+    return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * uint -> ulong conversions with the "_rtp" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_ulong* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtp(uint x)
+{
+    return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtp(uint2 x)
+{
+    return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtp(uint4 x)
+{
+    return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtp(uint8 x)
+{
+    return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtp(uint16 x)
+{
+    return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtp(uint3 x)
+{
+    return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * uint -> ulong conversions with the "_rtn" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_ulong* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtn(uint x)
+{
+    return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtn(uint2 x)
+{
+    return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtn(uint4 x)
+{
+    return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtn(uint8 x)
+{
+    return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtn(uint16 x)
+{
+    return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtn(uint3 x)
+{
+    return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * long -> char conversions with the "_rtz" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_char* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtz(long x)
+{
+    return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtz(long2 x)
+{
+    return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtz(long4 x)
+{
+    return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtz(long8 x)
+{
+    return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtz(long16 x)
+{
+    return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtz(long3 x)
+{
+    return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * long -> char conversions with the "_rte" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_char* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rte(long x)
+{
+    return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rte(long2 x)
+{
+    return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rte(long4 x)
+{
+    return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rte(long8 x)
+{
+    return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rte(long16 x)
+{
+    return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rte(long3 x)
+{
+    return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * long -> char conversions with the "_rtp" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_char* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtp(long x)
+{
+    return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtp(long2 x)
+{
+    return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtp(long4 x)
+{
+    return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtp(long8 x)
+{
+    return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtp(long16 x)
+{
+    return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtp(long3 x)
+{
+    return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * long -> char conversions with the "_rtn" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_char* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtn(long x)
+{
+    return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtn(long2 x)
+{
+    return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtn(long4 x)
+{
+    return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtn(long8 x)
+{
+    return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtn(long16 x)
+{
+    return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtn(long3 x)
+{
+    return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * long -> uchar conversions with the "_rtz" suffix (scalar plus 2/3/4/8/16-wide).
+ * Only compiled when cles_khr_int64 is defined.  The scalar form is a plain
+ * cast; each vector form splits its argument and forwards the pieces to the
+ * unsuffixed convert_uchar* overloads.
+ */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtz(long x)
+{
+    return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtz(long2 x)
+{
+    return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtz(long4 x)
+{
+    return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtz(long8 x)
+{
+    return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtz(long16 x)
+{
+    return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtz(long3 x)
+{
+    return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> uchar overloads for the _rte suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rte(long x)
+{
+ uchar scalar = (uchar)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rte(long2 x)
+{
+ uchar lo_part = convert_uchar(x.lo);
+ uchar hi_part = convert_uchar(x.hi);
+ return (uchar2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rte(long4 x)
+{
+ uchar2 lo_part = convert_uchar2(x.lo);
+ uchar2 hi_part = convert_uchar2(x.hi);
+ return (uchar4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rte(long8 x)
+{
+ uchar4 lo_part = convert_uchar4(x.lo);
+ uchar4 hi_part = convert_uchar4(x.hi);
+ return (uchar8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rte(long16 x)
+{
+ uchar8 lo_part = convert_uchar8(x.lo);
+ uchar8 hi_part = convert_uchar8(x.hi);
+ return (uchar16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rte(long3 x)
+{
+ uchar2 head = convert_uchar2(x.s01);
+ uchar tail = convert_uchar(x.s2);
+ return (uchar3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> uchar overloads for the _rtp suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtp(long x)
+{
+ uchar scalar = (uchar)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtp(long2 x)
+{
+ uchar lo_part = convert_uchar(x.lo);
+ uchar hi_part = convert_uchar(x.hi);
+ return (uchar2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtp(long4 x)
+{
+ uchar2 lo_part = convert_uchar2(x.lo);
+ uchar2 hi_part = convert_uchar2(x.hi);
+ return (uchar4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtp(long8 x)
+{
+ uchar4 lo_part = convert_uchar4(x.lo);
+ uchar4 hi_part = convert_uchar4(x.hi);
+ return (uchar8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtp(long16 x)
+{
+ uchar8 lo_part = convert_uchar8(x.lo);
+ uchar8 hi_part = convert_uchar8(x.hi);
+ return (uchar16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtp(long3 x)
+{
+ uchar2 head = convert_uchar2(x.s01);
+ uchar tail = convert_uchar(x.s2);
+ return (uchar3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> uchar overloads for the _rtn suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtn(long x)
+{
+ uchar scalar = (uchar)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtn(long2 x)
+{
+ uchar lo_part = convert_uchar(x.lo);
+ uchar hi_part = convert_uchar(x.hi);
+ return (uchar2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtn(long4 x)
+{
+ uchar2 lo_part = convert_uchar2(x.lo);
+ uchar2 hi_part = convert_uchar2(x.hi);
+ return (uchar4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtn(long8 x)
+{
+ uchar4 lo_part = convert_uchar4(x.lo);
+ uchar4 hi_part = convert_uchar4(x.hi);
+ return (uchar8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtn(long16 x)
+{
+ uchar8 lo_part = convert_uchar8(x.lo);
+ uchar8 hi_part = convert_uchar8(x.hi);
+ return (uchar16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtn(long3 x)
+{
+ uchar2 head = convert_uchar2(x.s01);
+ uchar tail = convert_uchar(x.s2);
+ return (uchar3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> short overloads for the _rtz suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtz(long x)
+{
+ short scalar = (short)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtz(long2 x)
+{
+ short lo_part = convert_short(x.lo);
+ short hi_part = convert_short(x.hi);
+ return (short2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtz(long4 x)
+{
+ short2 lo_part = convert_short2(x.lo);
+ short2 hi_part = convert_short2(x.hi);
+ return (short4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtz(long8 x)
+{
+ short4 lo_part = convert_short4(x.lo);
+ short4 hi_part = convert_short4(x.hi);
+ return (short8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtz(long16 x)
+{
+ short8 lo_part = convert_short8(x.lo);
+ short8 hi_part = convert_short8(x.hi);
+ return (short16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtz(long3 x)
+{
+ short2 head = convert_short2(x.s01);
+ short tail = convert_short(x.s2);
+ return (short3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> short overloads for the _rte suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rte(long x)
+{
+ short scalar = (short)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rte(long2 x)
+{
+ short lo_part = convert_short(x.lo);
+ short hi_part = convert_short(x.hi);
+ return (short2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rte(long4 x)
+{
+ short2 lo_part = convert_short2(x.lo);
+ short2 hi_part = convert_short2(x.hi);
+ return (short4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rte(long8 x)
+{
+ short4 lo_part = convert_short4(x.lo);
+ short4 hi_part = convert_short4(x.hi);
+ return (short8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rte(long16 x)
+{
+ short8 lo_part = convert_short8(x.lo);
+ short8 hi_part = convert_short8(x.hi);
+ return (short16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rte(long3 x)
+{
+ short2 head = convert_short2(x.s01);
+ short tail = convert_short(x.s2);
+ return (short3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> short overloads for the _rtp suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtp(long x)
+{
+ short scalar = (short)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtp(long2 x)
+{
+ short lo_part = convert_short(x.lo);
+ short hi_part = convert_short(x.hi);
+ return (short2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtp(long4 x)
+{
+ short2 lo_part = convert_short2(x.lo);
+ short2 hi_part = convert_short2(x.hi);
+ return (short4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtp(long8 x)
+{
+ short4 lo_part = convert_short4(x.lo);
+ short4 hi_part = convert_short4(x.hi);
+ return (short8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtp(long16 x)
+{
+ short8 lo_part = convert_short8(x.lo);
+ short8 hi_part = convert_short8(x.hi);
+ return (short16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtp(long3 x)
+{
+ short2 head = convert_short2(x.s01);
+ short tail = convert_short(x.s2);
+ return (short3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> short overloads for the _rtn suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtn(long x)
+{
+ short scalar = (short)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtn(long2 x)
+{
+ short lo_part = convert_short(x.lo);
+ short hi_part = convert_short(x.hi);
+ return (short2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtn(long4 x)
+{
+ short2 lo_part = convert_short2(x.lo);
+ short2 hi_part = convert_short2(x.hi);
+ return (short4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtn(long8 x)
+{
+ short4 lo_part = convert_short4(x.lo);
+ short4 hi_part = convert_short4(x.hi);
+ return (short8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtn(long16 x)
+{
+ short8 lo_part = convert_short8(x.lo);
+ short8 hi_part = convert_short8(x.hi);
+ return (short16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtn(long3 x)
+{
+ short2 head = convert_short2(x.s01);
+ short tail = convert_short(x.s2);
+ return (short3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> ushort overloads for the _rtz suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_ushort*. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtz(long x)
+{
+ ushort scalar = (ushort)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtz(long2 x)
+{
+ ushort lo_part = convert_ushort(x.lo);
+ ushort hi_part = convert_ushort(x.hi);
+ return (ushort2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtz(long4 x)
+{
+ ushort2 lo_part = convert_ushort2(x.lo);
+ ushort2 hi_part = convert_ushort2(x.hi);
+ return (ushort4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtz(long8 x)
+{
+ ushort4 lo_part = convert_ushort4(x.lo);
+ ushort4 hi_part = convert_ushort4(x.hi);
+ return (ushort8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtz(long16 x)
+{
+ ushort8 lo_part = convert_ushort8(x.lo);
+ ushort8 hi_part = convert_ushort8(x.hi);
+ return (ushort16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtz(long3 x)
+{
+ ushort2 head = convert_ushort2(x.s01);
+ ushort tail = convert_ushort(x.s2);
+ return (ushort3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> ushort overloads for the _rte suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_ushort*. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rte(long x)
+{
+ ushort scalar = (ushort)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rte(long2 x)
+{
+ ushort lo_part = convert_ushort(x.lo);
+ ushort hi_part = convert_ushort(x.hi);
+ return (ushort2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rte(long4 x)
+{
+ ushort2 lo_part = convert_ushort2(x.lo);
+ ushort2 hi_part = convert_ushort2(x.hi);
+ return (ushort4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rte(long8 x)
+{
+ ushort4 lo_part = convert_ushort4(x.lo);
+ ushort4 hi_part = convert_ushort4(x.hi);
+ return (ushort8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rte(long16 x)
+{
+ ushort8 lo_part = convert_ushort8(x.lo);
+ ushort8 hi_part = convert_ushort8(x.hi);
+ return (ushort16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rte(long3 x)
+{
+ ushort2 head = convert_ushort2(x.s01);
+ ushort tail = convert_ushort(x.s2);
+ return (ushort3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> ushort overloads for the _rtp suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_ushort*. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtp(long x)
+{
+ ushort scalar = (ushort)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtp(long2 x)
+{
+ ushort lo_part = convert_ushort(x.lo);
+ ushort hi_part = convert_ushort(x.hi);
+ return (ushort2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtp(long4 x)
+{
+ ushort2 lo_part = convert_ushort2(x.lo);
+ ushort2 hi_part = convert_ushort2(x.hi);
+ return (ushort4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtp(long8 x)
+{
+ ushort4 lo_part = convert_ushort4(x.lo);
+ ushort4 hi_part = convert_ushort4(x.hi);
+ return (ushort8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtp(long16 x)
+{
+ ushort8 lo_part = convert_ushort8(x.lo);
+ ushort8 hi_part = convert_ushort8(x.hi);
+ return (ushort16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtp(long3 x)
+{
+ ushort2 head = convert_ushort2(x.s01);
+ ushort tail = convert_ushort(x.s2);
+ return (ushort3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> ushort overloads for the _rtn suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_ushort*. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtn(long x)
+{
+ ushort scalar = (ushort)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtn(long2 x)
+{
+ ushort lo_part = convert_ushort(x.lo);
+ ushort hi_part = convert_ushort(x.hi);
+ return (ushort2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtn(long4 x)
+{
+ ushort2 lo_part = convert_ushort2(x.lo);
+ ushort2 hi_part = convert_ushort2(x.hi);
+ return (ushort4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtn(long8 x)
+{
+ ushort4 lo_part = convert_ushort4(x.lo);
+ ushort4 hi_part = convert_ushort4(x.hi);
+ return (ushort8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtn(long16 x)
+{
+ ushort8 lo_part = convert_ushort8(x.lo);
+ ushort8 hi_part = convert_ushort8(x.hi);
+ return (ushort16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtn(long3 x)
+{
+ ushort2 head = convert_ushort2(x.s01);
+ ushort tail = convert_ushort(x.s2);
+ return (ushort3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> int overloads for the _rtz suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_int*. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtz(long x)
+{
+ int scalar = (int)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtz(long2 x)
+{
+ int lo_part = convert_int(x.lo);
+ int hi_part = convert_int(x.hi);
+ return (int2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtz(long4 x)
+{
+ int2 lo_part = convert_int2(x.lo);
+ int2 hi_part = convert_int2(x.hi);
+ return (int4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtz(long8 x)
+{
+ int4 lo_part = convert_int4(x.lo);
+ int4 hi_part = convert_int4(x.hi);
+ return (int8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtz(long16 x)
+{
+ int8 lo_part = convert_int8(x.lo);
+ int8 hi_part = convert_int8(x.hi);
+ return (int16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtz(long3 x)
+{
+ int2 head = convert_int2(x.s01);
+ int tail = convert_int(x.s2);
+ return (int3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> int overloads for the _rte suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_int*. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rte(long x)
+{
+ int scalar = (int)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rte(long2 x)
+{
+ int lo_part = convert_int(x.lo);
+ int hi_part = convert_int(x.hi);
+ return (int2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rte(long4 x)
+{
+ int2 lo_part = convert_int2(x.lo);
+ int2 hi_part = convert_int2(x.hi);
+ return (int4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rte(long8 x)
+{
+ int4 lo_part = convert_int4(x.lo);
+ int4 hi_part = convert_int4(x.hi);
+ return (int8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rte(long16 x)
+{
+ int8 lo_part = convert_int8(x.lo);
+ int8 hi_part = convert_int8(x.hi);
+ return (int16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rte(long3 x)
+{
+ int2 head = convert_int2(x.s01);
+ int tail = convert_int(x.s2);
+ return (int3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> int overloads for the _rtp suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_int*. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtp(long x)
+{
+ int scalar = (int)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtp(long2 x)
+{
+ int lo_part = convert_int(x.lo);
+ int hi_part = convert_int(x.hi);
+ return (int2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtp(long4 x)
+{
+ int2 lo_part = convert_int2(x.lo);
+ int2 hi_part = convert_int2(x.hi);
+ return (int4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtp(long8 x)
+{
+ int4 lo_part = convert_int4(x.lo);
+ int4 hi_part = convert_int4(x.hi);
+ return (int8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtp(long16 x)
+{
+ int8 lo_part = convert_int8(x.lo);
+ int8 hi_part = convert_int8(x.hi);
+ return (int16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtp(long3 x)
+{
+ int2 head = convert_int2(x.s01);
+ int tail = convert_int(x.s2);
+ return (int3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> int overloads for the _rtn suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_int*. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtn(long x)
+{
+ int scalar = (int)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtn(long2 x)
+{
+ int lo_part = convert_int(x.lo);
+ int hi_part = convert_int(x.hi);
+ return (int2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtn(long4 x)
+{
+ int2 lo_part = convert_int2(x.lo);
+ int2 hi_part = convert_int2(x.hi);
+ return (int4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtn(long8 x)
+{
+ int4 lo_part = convert_int4(x.lo);
+ int4 hi_part = convert_int4(x.hi);
+ return (int8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtn(long16 x)
+{
+ int8 lo_part = convert_int8(x.lo);
+ int8 hi_part = convert_int8(x.hi);
+ return (int16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtn(long3 x)
+{
+ int2 head = convert_int2(x.s01);
+ int tail = convert_int(x.s2);
+ return (int3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> uint overloads for the _rtz suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_uint*. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtz(long x)
+{
+ uint scalar = (uint)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtz(long2 x)
+{
+ uint lo_part = convert_uint(x.lo);
+ uint hi_part = convert_uint(x.hi);
+ return (uint2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtz(long4 x)
+{
+ uint2 lo_part = convert_uint2(x.lo);
+ uint2 hi_part = convert_uint2(x.hi);
+ return (uint4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtz(long8 x)
+{
+ uint4 lo_part = convert_uint4(x.lo);
+ uint4 hi_part = convert_uint4(x.hi);
+ return (uint8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtz(long16 x)
+{
+ uint8 lo_part = convert_uint8(x.lo);
+ uint8 hi_part = convert_uint8(x.hi);
+ return (uint16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtz(long3 x)
+{
+ uint2 head = convert_uint2(x.s01);
+ uint tail = convert_uint(x.s2);
+ return (uint3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> uint overloads for the _rte suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_uint*. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rte(long x)
+{
+ uint scalar = (uint)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rte(long2 x)
+{
+ uint lo_part = convert_uint(x.lo);
+ uint hi_part = convert_uint(x.hi);
+ return (uint2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rte(long4 x)
+{
+ uint2 lo_part = convert_uint2(x.lo);
+ uint2 hi_part = convert_uint2(x.hi);
+ return (uint4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rte(long8 x)
+{
+ uint4 lo_part = convert_uint4(x.lo);
+ uint4 hi_part = convert_uint4(x.hi);
+ return (uint8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rte(long16 x)
+{
+ uint8 lo_part = convert_uint8(x.lo);
+ uint8 hi_part = convert_uint8(x.hi);
+ return (uint16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rte(long3 x)
+{
+ uint2 head = convert_uint2(x.s01);
+ uint tail = convert_uint(x.s2);
+ return (uint3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> uint overloads for the _rtp suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_uint*. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtp(long x)
+{
+ uint scalar = (uint)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtp(long2 x)
+{
+ uint lo_part = convert_uint(x.lo);
+ uint hi_part = convert_uint(x.hi);
+ return (uint2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtp(long4 x)
+{
+ uint2 lo_part = convert_uint2(x.lo);
+ uint2 hi_part = convert_uint2(x.hi);
+ return (uint4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtp(long8 x)
+{
+ uint4 lo_part = convert_uint4(x.lo);
+ uint4 hi_part = convert_uint4(x.hi);
+ return (uint8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtp(long16 x)
+{
+ uint8 lo_part = convert_uint8(x.lo);
+ uint8 hi_part = convert_uint8(x.hi);
+ return (uint16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtp(long3 x)
+{
+ uint2 head = convert_uint2(x.s01);
+ uint tail = convert_uint(x.s2);
+ return (uint3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> uint overloads for the _rtn suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_uint*. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtn(long x)
+{
+ uint scalar = (uint)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtn(long2 x)
+{
+ uint lo_part = convert_uint(x.lo);
+ uint hi_part = convert_uint(x.hi);
+ return (uint2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtn(long4 x)
+{
+ uint2 lo_part = convert_uint2(x.lo);
+ uint2 hi_part = convert_uint2(x.hi);
+ return (uint4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtn(long8 x)
+{
+ uint4 lo_part = convert_uint4(x.lo);
+ uint4 hi_part = convert_uint4(x.hi);
+ return (uint8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtn(long16 x)
+{
+ uint8 lo_part = convert_uint8(x.lo);
+ uint8 hi_part = convert_uint8(x.hi);
+ return (uint16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtn(long3 x)
+{
+ uint2 head = convert_uint2(x.s01);
+ uint tail = convert_uint(x.s2);
+ return (uint3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> long overloads for the _rtz suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_long*. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtz(long x)
+{
+ long scalar = (long)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtz(long2 x)
+{
+ long lo_part = convert_long(x.lo);
+ long hi_part = convert_long(x.hi);
+ return (long2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtz(long4 x)
+{
+ long2 lo_part = convert_long2(x.lo);
+ long2 hi_part = convert_long2(x.hi);
+ return (long4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtz(long8 x)
+{
+ long4 lo_part = convert_long4(x.lo);
+ long4 hi_part = convert_long4(x.hi);
+ return (long8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtz(long16 x)
+{
+ long8 lo_part = convert_long8(x.lo);
+ long8 hi_part = convert_long8(x.hi);
+ return (long16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtz(long3 x)
+{
+ long2 head = convert_long2(x.s01);
+ long tail = convert_long(x.s2);
+ return (long3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> long overloads for the _rte suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_long*. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rte(long x)
+{
+ long scalar = (long)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rte(long2 x)
+{
+ long lo_part = convert_long(x.lo);
+ long hi_part = convert_long(x.hi);
+ return (long2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rte(long4 x)
+{
+ long2 lo_part = convert_long2(x.lo);
+ long2 hi_part = convert_long2(x.hi);
+ return (long4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rte(long8 x)
+{
+ long4 lo_part = convert_long4(x.lo);
+ long4 hi_part = convert_long4(x.hi);
+ return (long8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rte(long16 x)
+{
+ long8 lo_part = convert_long8(x.lo);
+ long8 hi_part = convert_long8(x.hi);
+ return (long16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rte(long3 x)
+{
+ long2 head = convert_long2(x.s01);
+ long tail = convert_long(x.s2);
+ return (long3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> long overloads for the _rtp suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_long*. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtp(long x)
+{
+ long scalar = (long)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtp(long2 x)
+{
+ long lo_part = convert_long(x.lo);
+ long hi_part = convert_long(x.hi);
+ return (long2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtp(long4 x)
+{
+ long2 lo_part = convert_long2(x.lo);
+ long2 hi_part = convert_long2(x.hi);
+ return (long4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtp(long8 x)
+{
+ long4 lo_part = convert_long4(x.lo);
+ long4 hi_part = convert_long4(x.hi);
+ return (long8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtp(long16 x)
+{
+ long8 lo_part = convert_long8(x.lo);
+ long8 hi_part = convert_long8(x.hi);
+ return (long16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtp(long3 x)
+{
+ long2 head = convert_long2(x.s01);
+ long tail = convert_long(x.s2);
+ return (long3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> long overloads for the _rtn suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_long*. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtn(long x)
+{
+ long scalar = (long)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtn(long2 x)
+{
+ long lo_part = convert_long(x.lo);
+ long hi_part = convert_long(x.hi);
+ return (long2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtn(long4 x)
+{
+ long2 lo_part = convert_long2(x.lo);
+ long2 hi_part = convert_long2(x.hi);
+ return (long4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtn(long8 x)
+{
+ long4 lo_part = convert_long4(x.lo);
+ long4 hi_part = convert_long4(x.hi);
+ return (long8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtn(long16 x)
+{
+ long8 lo_part = convert_long8(x.lo);
+ long8 hi_part = convert_long8(x.hi);
+ return (long16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtn(long3 x)
+{
+ long2 head = convert_long2(x.s01);
+ long tail = convert_long(x.s2);
+ return (long3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> ulong overloads for the _rtz suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_ulong*. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtz(long x)
+{
+ ulong scalar = (ulong)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtz(long2 x)
+{
+ ulong lo_part = convert_ulong(x.lo);
+ ulong hi_part = convert_ulong(x.hi);
+ return (ulong2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtz(long4 x)
+{
+ ulong2 lo_part = convert_ulong2(x.lo);
+ ulong2 hi_part = convert_ulong2(x.hi);
+ return (ulong4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtz(long8 x)
+{
+ ulong4 lo_part = convert_ulong4(x.lo);
+ ulong4 hi_part = convert_ulong4(x.hi);
+ return (ulong8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtz(long16 x)
+{
+ ulong8 lo_part = convert_ulong8(x.lo);
+ ulong8 hi_part = convert_ulong8(x.hi);
+ return (ulong16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtz(long3 x)
+{
+ ulong2 head = convert_ulong2(x.s01);
+ ulong tail = convert_ulong(x.s2);
+ return (ulong3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> ulong overloads for the _rte suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_ulong*. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rte(long x)
+{
+ ulong scalar = (ulong)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rte(long2 x)
+{
+ ulong lo_part = convert_ulong(x.lo);
+ ulong hi_part = convert_ulong(x.hi);
+ return (ulong2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rte(long4 x)
+{
+ ulong2 lo_part = convert_ulong2(x.lo);
+ ulong2 hi_part = convert_ulong2(x.hi);
+ return (ulong4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rte(long8 x)
+{
+ ulong4 lo_part = convert_ulong4(x.lo);
+ ulong4 hi_part = convert_ulong4(x.hi);
+ return (ulong8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rte(long16 x)
+{
+ ulong8 lo_part = convert_ulong8(x.lo);
+ ulong8 hi_part = convert_ulong8(x.hi);
+ return (ulong16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rte(long3 x)
+{
+ ulong2 head = convert_ulong2(x.s01);
+ ulong tail = convert_ulong(x.s2);
+ return (ulong3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> ulong overloads for the _rtp suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_ulong*. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtp(long x)
+{
+ ulong scalar = (ulong)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtp(long2 x)
+{
+ ulong lo_part = convert_ulong(x.lo);
+ ulong hi_part = convert_ulong(x.hi);
+ return (ulong2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtp(long4 x)
+{
+ ulong2 lo_part = convert_ulong2(x.lo);
+ ulong2 hi_part = convert_ulong2(x.hi);
+ return (ulong4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtp(long8 x)
+{
+ ulong4 lo_part = convert_ulong4(x.lo);
+ ulong4 hi_part = convert_ulong4(x.hi);
+ return (ulong8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtp(long16 x)
+{
+ ulong8 lo_part = convert_ulong8(x.lo);
+ ulong8 hi_part = convert_ulong8(x.hi);
+ return (ulong16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtp(long3 x)
+{
+ ulong2 head = convert_ulong2(x.s01);
+ ulong tail = convert_ulong(x.s2);
+ return (ulong3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> ulong overloads for the _rtn suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_ulong*. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtn(long x)
+{
+ ulong scalar = (ulong)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtn(long2 x)
+{
+ ulong lo_part = convert_ulong(x.lo);
+ ulong hi_part = convert_ulong(x.hi);
+ return (ulong2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtn(long4 x)
+{
+ ulong2 lo_part = convert_ulong2(x.lo);
+ ulong2 hi_part = convert_ulong2(x.hi);
+ return (ulong4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtn(long8 x)
+{
+ ulong4 lo_part = convert_ulong4(x.lo);
+ ulong4 hi_part = convert_ulong4(x.hi);
+ return (ulong8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtn(long16 x)
+{
+ ulong8 lo_part = convert_ulong8(x.lo);
+ ulong8 hi_part = convert_ulong8(x.hi);
+ return (ulong16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtn(long3 x)
+{
+ ulong2 head = convert_ulong2(x.s01);
+ ulong tail = convert_ulong(x.s2);
+ return (ulong3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> char overloads for the _rtz suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_char*. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtz(ulong x)
+{
+ char scalar = (char)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtz(ulong2 x)
+{
+ char lo_part = convert_char(x.lo);
+ char hi_part = convert_char(x.hi);
+ return (char2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtz(ulong4 x)
+{
+ char2 lo_part = convert_char2(x.lo);
+ char2 hi_part = convert_char2(x.hi);
+ return (char4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtz(ulong8 x)
+{
+ char4 lo_part = convert_char4(x.lo);
+ char4 hi_part = convert_char4(x.hi);
+ return (char8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtz(ulong16 x)
+{
+ char8 lo_part = convert_char8(x.lo);
+ char8 hi_part = convert_char8(x.hi);
+ return (char16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtz(ulong3 x)
+{
+ char2 head = convert_char2(x.s01);
+ char tail = convert_char(x.s2);
+ return (char3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> char overloads for the _rte suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_char*. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rte(ulong x)
+{
+ char scalar = (char)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rte(ulong2 x)
+{
+ char lo_part = convert_char(x.lo);
+ char hi_part = convert_char(x.hi);
+ return (char2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rte(ulong4 x)
+{
+ char2 lo_part = convert_char2(x.lo);
+ char2 hi_part = convert_char2(x.hi);
+ return (char4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rte(ulong8 x)
+{
+ char4 lo_part = convert_char4(x.lo);
+ char4 hi_part = convert_char4(x.hi);
+ return (char8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rte(ulong16 x)
+{
+ char8 lo_part = convert_char8(x.lo);
+ char8 hi_part = convert_char8(x.hi);
+ return (char16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rte(ulong3 x)
+{
+ char2 head = convert_char2(x.s01);
+ char tail = convert_char(x.s2);
+ return (char3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> char overloads for the _rtp suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_char*. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtp(ulong x)
+{
+ char scalar = (char)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtp(ulong2 x)
+{
+ char lo_part = convert_char(x.lo);
+ char hi_part = convert_char(x.hi);
+ return (char2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtp(ulong4 x)
+{
+ char2 lo_part = convert_char2(x.lo);
+ char2 hi_part = convert_char2(x.hi);
+ return (char4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtp(ulong8 x)
+{
+ char4 lo_part = convert_char4(x.lo);
+ char4 hi_part = convert_char4(x.hi);
+ return (char8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtp(ulong16 x)
+{
+ char8 lo_part = convert_char8(x.lo);
+ char8 hi_part = convert_char8(x.hi);
+ return (char16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtp(ulong3 x)
+{
+ char2 head = convert_char2(x.s01);
+ char tail = convert_char(x.s2);
+ return (char3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> char overloads for the _rtn suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_char*. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtn(ulong x)
+{
+ char scalar = (char)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtn(ulong2 x)
+{
+ char lo_part = convert_char(x.lo);
+ char hi_part = convert_char(x.hi);
+ return (char2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtn(ulong4 x)
+{
+ char2 lo_part = convert_char2(x.lo);
+ char2 hi_part = convert_char2(x.hi);
+ return (char4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtn(ulong8 x)
+{
+ char4 lo_part = convert_char4(x.lo);
+ char4 hi_part = convert_char4(x.hi);
+ return (char8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtn(ulong16 x)
+{
+ char8 lo_part = convert_char8(x.lo);
+ char8 hi_part = convert_char8(x.hi);
+ return (char16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtn(ulong3 x)
+{
+ char2 head = convert_char2(x.s01);
+ char tail = convert_char(x.s2);
+ return (char3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> uchar overloads for the _rtz suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtz(ulong x)
+{
+ uchar scalar = (uchar)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtz(ulong2 x)
+{
+ uchar lo_part = convert_uchar(x.lo);
+ uchar hi_part = convert_uchar(x.hi);
+ return (uchar2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtz(ulong4 x)
+{
+ uchar2 lo_part = convert_uchar2(x.lo);
+ uchar2 hi_part = convert_uchar2(x.hi);
+ return (uchar4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtz(ulong8 x)
+{
+ uchar4 lo_part = convert_uchar4(x.lo);
+ uchar4 hi_part = convert_uchar4(x.hi);
+ return (uchar8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtz(ulong16 x)
+{
+ uchar8 lo_part = convert_uchar8(x.lo);
+ uchar8 hi_part = convert_uchar8(x.hi);
+ return (uchar16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtz(ulong3 x)
+{
+ uchar2 head = convert_uchar2(x.s01);
+ uchar tail = convert_uchar(x.s2);
+ return (uchar3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> uchar overloads for the _rte suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rte(ulong x)
+{
+ uchar scalar = (uchar)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rte(ulong2 x)
+{
+ uchar lo_part = convert_uchar(x.lo);
+ uchar hi_part = convert_uchar(x.hi);
+ return (uchar2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rte(ulong4 x)
+{
+ uchar2 lo_part = convert_uchar2(x.lo);
+ uchar2 hi_part = convert_uchar2(x.hi);
+ return (uchar4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rte(ulong8 x)
+{
+ uchar4 lo_part = convert_uchar4(x.lo);
+ uchar4 hi_part = convert_uchar4(x.hi);
+ return (uchar8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rte(ulong16 x)
+{
+ uchar8 lo_part = convert_uchar8(x.lo);
+ uchar8 hi_part = convert_uchar8(x.hi);
+ return (uchar16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rte(ulong3 x)
+{
+ uchar2 head = convert_uchar2(x.s01);
+ uchar tail = convert_uchar(x.s2);
+ return (uchar3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> uchar overloads for the _rtp suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtp(ulong x)
+{
+ uchar scalar = (uchar)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtp(ulong2 x)
+{
+ uchar lo_part = convert_uchar(x.lo);
+ uchar hi_part = convert_uchar(x.hi);
+ return (uchar2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtp(ulong4 x)
+{
+ uchar2 lo_part = convert_uchar2(x.lo);
+ uchar2 hi_part = convert_uchar2(x.hi);
+ return (uchar4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtp(ulong8 x)
+{
+ uchar4 lo_part = convert_uchar4(x.lo);
+ uchar4 hi_part = convert_uchar4(x.hi);
+ return (uchar8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtp(ulong16 x)
+{
+ uchar8 lo_part = convert_uchar8(x.lo);
+ uchar8 hi_part = convert_uchar8(x.hi);
+ return (uchar16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtp(ulong3 x)
+{
+ uchar2 head = convert_uchar2(x.s01);
+ uchar tail = convert_uchar(x.s2);
+ return (uchar3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> uchar overloads for the _rtn suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtn(ulong x)
+{
+ uchar scalar = (uchar)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtn(ulong2 x)
+{
+ uchar lo_part = convert_uchar(x.lo);
+ uchar hi_part = convert_uchar(x.hi);
+ return (uchar2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtn(ulong4 x)
+{
+ uchar2 lo_part = convert_uchar2(x.lo);
+ uchar2 hi_part = convert_uchar2(x.hi);
+ return (uchar4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtn(ulong8 x)
+{
+ uchar4 lo_part = convert_uchar4(x.lo);
+ uchar4 hi_part = convert_uchar4(x.hi);
+ return (uchar8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtn(ulong16 x)
+{
+ uchar8 lo_part = convert_uchar8(x.lo);
+ uchar8 hi_part = convert_uchar8(x.hi);
+ return (uchar16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtn(ulong3 x)
+{
+ uchar2 head = convert_uchar2(x.s01);
+ uchar tail = convert_uchar(x.s2);
+ return (uchar3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> short overloads for the _rtz suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtz(ulong x)
+{
+ short scalar = (short)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtz(ulong2 x)
+{
+ short lo_part = convert_short(x.lo);
+ short hi_part = convert_short(x.hi);
+ return (short2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtz(ulong4 x)
+{
+ short2 lo_part = convert_short2(x.lo);
+ short2 hi_part = convert_short2(x.hi);
+ return (short4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtz(ulong8 x)
+{
+ short4 lo_part = convert_short4(x.lo);
+ short4 hi_part = convert_short4(x.hi);
+ return (short8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtz(ulong16 x)
+{
+ short8 lo_part = convert_short8(x.lo);
+ short8 hi_part = convert_short8(x.hi);
+ return (short16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtz(ulong3 x)
+{
+ short2 head = convert_short2(x.s01);
+ short tail = convert_short(x.s2);
+ return (short3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> short overloads for the _rte suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rte(ulong x)
+{
+ short scalar = (short)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rte(ulong2 x)
+{
+ short lo_part = convert_short(x.lo);
+ short hi_part = convert_short(x.hi);
+ return (short2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rte(ulong4 x)
+{
+ short2 lo_part = convert_short2(x.lo);
+ short2 hi_part = convert_short2(x.hi);
+ return (short4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rte(ulong8 x)
+{
+ short4 lo_part = convert_short4(x.lo);
+ short4 hi_part = convert_short4(x.hi);
+ return (short8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rte(ulong16 x)
+{
+ short8 lo_part = convert_short8(x.lo);
+ short8 hi_part = convert_short8(x.hi);
+ return (short16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rte(ulong3 x)
+{
+ short2 head = convert_short2(x.s01);
+ short tail = convert_short(x.s2);
+ return (short3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> short overloads for the _rtp suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtp(ulong x)
+{
+ short scalar = (short)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtp(ulong2 x)
+{
+ short lo_part = convert_short(x.lo);
+ short hi_part = convert_short(x.hi);
+ return (short2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtp(ulong4 x)
+{
+ short2 lo_part = convert_short2(x.lo);
+ short2 hi_part = convert_short2(x.hi);
+ return (short4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtp(ulong8 x)
+{
+ short4 lo_part = convert_short4(x.lo);
+ short4 hi_part = convert_short4(x.hi);
+ return (short8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtp(ulong16 x)
+{
+ short8 lo_part = convert_short8(x.lo);
+ short8 hi_part = convert_short8(x.hi);
+ return (short16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtp(ulong3 x)
+{
+ short2 head = convert_short2(x.s01);
+ short tail = convert_short(x.s2);
+ return (short3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> short overloads for the _rtn suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtn(ulong x)
+{
+ short scalar = (short)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtn(ulong2 x)
+{
+ short lo_part = convert_short(x.lo);
+ short hi_part = convert_short(x.hi);
+ return (short2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtn(ulong4 x)
+{
+ short2 lo_part = convert_short2(x.lo);
+ short2 hi_part = convert_short2(x.hi);
+ return (short4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtn(ulong8 x)
+{
+ short4 lo_part = convert_short4(x.lo);
+ short4 hi_part = convert_short4(x.hi);
+ return (short8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtn(ulong16 x)
+{
+ short8 lo_part = convert_short8(x.lo);
+ short8 hi_part = convert_short8(x.hi);
+ return (short16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtn(ulong3 x)
+{
+ short2 head = convert_short2(x.s01);
+ short tail = convert_short(x.s2);
+ return (short3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> ushort overloads for the _rtz suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_ushort*. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtz(ulong x)
+{
+ ushort scalar = (ushort)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtz(ulong2 x)
+{
+ ushort lo_part = convert_ushort(x.lo);
+ ushort hi_part = convert_ushort(x.hi);
+ return (ushort2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtz(ulong4 x)
+{
+ ushort2 lo_part = convert_ushort2(x.lo);
+ ushort2 hi_part = convert_ushort2(x.hi);
+ return (ushort4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtz(ulong8 x)
+{
+ ushort4 lo_part = convert_ushort4(x.lo);
+ ushort4 hi_part = convert_ushort4(x.hi);
+ return (ushort8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtz(ulong16 x)
+{
+ ushort8 lo_part = convert_ushort8(x.lo);
+ ushort8 hi_part = convert_ushort8(x.hi);
+ return (ushort16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtz(ulong3 x)
+{
+ ushort2 head = convert_ushort2(x.s01);
+ ushort tail = convert_ushort(x.s2);
+ return (ushort3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> ushort overloads for the _rte suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_ushort*. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rte(ulong x)
+{
+ ushort scalar = (ushort)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rte(ulong2 x)
+{
+ ushort lo_part = convert_ushort(x.lo);
+ ushort hi_part = convert_ushort(x.hi);
+ return (ushort2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rte(ulong4 x)
+{
+ ushort2 lo_part = convert_ushort2(x.lo);
+ ushort2 hi_part = convert_ushort2(x.hi);
+ return (ushort4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rte(ulong8 x)
+{
+ ushort4 lo_part = convert_ushort4(x.lo);
+ ushort4 hi_part = convert_ushort4(x.hi);
+ return (ushort8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rte(ulong16 x)
+{
+ ushort8 lo_part = convert_ushort8(x.lo);
+ ushort8 hi_part = convert_ushort8(x.hi);
+ return (ushort16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rte(ulong3 x)
+{
+ ushort2 head = convert_ushort2(x.s01);
+ ushort tail = convert_ushort(x.s2);
+ return (ushort3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong -> ushort overloads for the _rtp suffix: the scalar form is a bare cast,
+ * the vector forms join two parts converted by the unsuffixed convert_ushort*. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtp(ulong x)
+{
+ ushort scalar = (ushort)x;
+ return scalar;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtp(ulong2 x)
+{
+ ushort lo_part = convert_ushort(x.lo);
+ ushort hi_part = convert_ushort(x.hi);
+ return (ushort2)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtp(ulong4 x)
+{
+ ushort2 lo_part = convert_ushort2(x.lo);
+ ushort2 hi_part = convert_ushort2(x.hi);
+ return (ushort4)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtp(ulong8 x)
+{
+ ushort4 lo_part = convert_ushort4(x.lo);
+ ushort4 hi_part = convert_ushort4(x.hi);
+ return (ushort8)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtp(ulong16 x)
+{
+ ushort8 lo_part = convert_ushort8(x.lo);
+ ushort8 hi_part = convert_ushort8(x.hi);
+ return (ushort16)(lo_part, hi_part);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtp(ulong3 x)
+{
+ ushort2 head = convert_ushort2(x.s01);
+ ushort tail = convert_ushort(x.s2);
+ return (ushort3)(head, tail);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/*
+ * ulong -> ushort conversions carrying the _rtn suffix. The scalar form is a
+ * plain cast; vector forms split the input, convert each part with the
+ * unsuffixed convert_ushort*() overloads, and reassemble.
+ */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtn(ulong x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtn(ulong2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtn(ulong4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtn(ulong8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtn(ulong16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtn(ulong3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> int conversions carrying the _rtz suffix. The scalar form is a
+ * plain cast; vector forms split the input, convert each part with the
+ * unsuffixed convert_int*() overloads, and reassemble.
+ */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtz(ulong x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtz(ulong2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtz(ulong4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtz(ulong8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtz(ulong16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtz(ulong3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> int conversions carrying the _rte suffix. The scalar form is a
+ * plain cast; vector forms split the input, convert each part with the
+ * unsuffixed convert_int*() overloads, and reassemble.
+ */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rte(ulong x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rte(ulong2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rte(ulong4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rte(ulong8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rte(ulong16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rte(ulong3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> int conversions carrying the _rtp suffix. The scalar form is a
+ * plain cast; vector forms split the input, convert each part with the
+ * unsuffixed convert_int*() overloads, and reassemble.
+ */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtp(ulong x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtp(ulong2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtp(ulong4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtp(ulong8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtp(ulong16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtp(ulong3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> int conversions carrying the _rtn suffix. The scalar form is a
+ * plain cast; vector forms split the input, convert each part with the
+ * unsuffixed convert_int*() overloads, and reassemble.
+ */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtn(ulong x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtn(ulong2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtn(ulong4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtn(ulong8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtn(ulong16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtn(ulong3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> uint conversions carrying the _rtz suffix. The scalar form is a
+ * plain cast; vector forms split the input, convert each part with the
+ * unsuffixed convert_uint*() overloads, and reassemble.
+ */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtz(ulong x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtz(ulong2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtz(ulong4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtz(ulong8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtz(ulong16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtz(ulong3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> uint conversions carrying the _rte suffix. The scalar form is a
+ * plain cast; vector forms split the input, convert each part with the
+ * unsuffixed convert_uint*() overloads, and reassemble.
+ */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rte(ulong x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rte(ulong2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rte(ulong4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rte(ulong8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rte(ulong16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rte(ulong3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> uint conversions carrying the _rtp suffix. The scalar form is a
+ * plain cast; vector forms split the input, convert each part with the
+ * unsuffixed convert_uint*() overloads, and reassemble.
+ */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtp(ulong x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtp(ulong2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtp(ulong4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtp(ulong8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtp(ulong16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtp(ulong3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> uint conversions carrying the _rtn suffix. The scalar form is a
+ * plain cast; vector forms split the input, convert each part with the
+ * unsuffixed convert_uint*() overloads, and reassemble.
+ */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtn(ulong x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtn(ulong2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtn(ulong4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtn(ulong8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtn(ulong16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtn(ulong3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> long conversions carrying the _rtz suffix. The scalar form is a
+ * plain cast; vector forms split the input, convert each part with the
+ * unsuffixed convert_long*() overloads, and reassemble.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtz(ulong x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtz(ulong2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtz(ulong4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtz(ulong8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtz(ulong16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtz(ulong3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> long conversions carrying the _rte suffix. The scalar form is a
+ * plain cast; vector forms split the input, convert each part with the
+ * unsuffixed convert_long*() overloads, and reassemble.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rte(ulong x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rte(ulong2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rte(ulong4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rte(ulong8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rte(ulong16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rte(ulong3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> long conversions carrying the _rtp suffix. The scalar form is a
+ * plain cast; vector forms split the input, convert each part with the
+ * unsuffixed convert_long*() overloads, and reassemble.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtp(ulong x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtp(ulong2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtp(ulong4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtp(ulong8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtp(ulong16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtp(ulong3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> long conversions carrying the _rtn suffix. The scalar form is a
+ * plain cast; vector forms split the input, convert each part with the
+ * unsuffixed convert_long*() overloads, and reassemble.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtn(ulong x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtn(ulong2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtn(ulong4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtn(ulong8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtn(ulong16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtn(ulong3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> ulong conversions carrying the _rtz suffix. The scalar form is a
+ * same-type cast; vector forms split the input, pass each part through the
+ * unsuffixed convert_ulong*() overloads, and reassemble.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtz(ulong x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtz(ulong2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtz(ulong4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtz(ulong8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtz(ulong16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtz(ulong3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> ulong conversions carrying the _rte suffix. The scalar form is a
+ * same-type cast; vector forms split the input, pass each part through the
+ * unsuffixed convert_ulong*() overloads, and reassemble.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rte(ulong x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rte(ulong2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rte(ulong4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rte(ulong8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rte(ulong16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rte(ulong3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> ulong conversions carrying the _rtp suffix. The scalar form is a
+ * same-type cast; vector forms split the input, pass each part through the
+ * unsuffixed convert_ulong*() overloads, and reassemble.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtp(ulong x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtp(ulong2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtp(ulong4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtp(ulong8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtp(ulong16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtp(ulong3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> ulong conversions carrying the _rtn suffix (scalar, 2, 4, 8 and 16
+ * components). The scalar form is a same-type cast; vector forms split the
+ * input, pass each half through the unsuffixed convert_ulong*() overloads,
+ * and reassemble.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtn(ulong x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtn(ulong2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtn(ulong4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtn(ulong8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtn(ulong16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+/*
+ * ulong3 -> ulong3 conversion with the _rtn suffix: passes the first two
+ * components through convert_ulong2() and the third through convert_ulong(),
+ * then rebuilds the 3-component vector.
+ */
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtn(ulong3 x)
+{
+ return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
+}
+#endif
+/* Saturating char -> char conversions: source and result types match, so x is returned unchanged. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat(char x) { return x; }
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat(char2 x) { return x; }
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat(char3 x) { return x; }
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat(char4 x) { return x; }
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat(char8 x) { return x; }
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat(char16 x) { return x; }
+/* Saturating char -> uchar conversions: negative inputs are raised to 0 before converting. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat(char x)
+{
+ return convert_uchar(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat(char2 x)
+{
+ return convert_uchar2(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat(char3 x)
+{
+ return convert_uchar3(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat(char4 x)
+{
+ return convert_uchar4(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat(char8 x)
+{
+ return convert_uchar8(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat(char16 x)
+{
+ return convert_uchar16(max(x, (char)0));
+}
+/* Saturating char -> short conversions: no clamping is applied; these forward to convert_short*(). */
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat(char x) { return convert_short(x); }
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat(char2 x) { return convert_short2(x); }
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat(char3 x) { return convert_short3(x); }
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat(char4 x) { return convert_short4(x); }
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat(char8 x) { return convert_short8(x); }
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat(char16 x) { return convert_short16(x); }
+/* Saturating char -> ushort conversions: negative inputs are raised to 0 before converting. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat(char x)
+{
+ return convert_ushort(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat(char2 x)
+{
+ return convert_ushort2(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat(char3 x)
+{
+ return convert_ushort3(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat(char4 x)
+{
+ return convert_ushort4(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat(char8 x)
+{
+ return convert_ushort8(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat(char16 x)
+{
+ return convert_ushort16(max(x, (char)0));
+}
+/* Saturating char -> int conversions: no clamping is applied; these forward to convert_int*(). */
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat(char x) { return convert_int(x); }
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat(char2 x) { return convert_int2(x); }
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat(char3 x) { return convert_int3(x); }
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat(char4 x) { return convert_int4(x); }
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat(char8 x) { return convert_int8(x); }
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat(char16 x) { return convert_int16(x); }
+/* Saturating char -> uint conversions: negative inputs are raised to 0 before converting. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat(char x)
+{
+ return convert_uint(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat(char2 x)
+{
+ return convert_uint2(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat(char3 x)
+{
+ return convert_uint3(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat(char4 x)
+{
+ return convert_uint4(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat(char8 x)
+{
+ return convert_uint8(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat(char16 x)
+{
+ return convert_uint16(max(x, (char)0));
+}
+#ifdef cles_khr_int64
+/* Saturating char -> long conversions: no clamping is applied; these forward to convert_long*(). */
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat(char x) { return convert_long(x); }
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat(char2 x) { return convert_long2(x); }
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat(char3 x) { return convert_long3(x); }
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat(char4 x) { return convert_long4(x); }
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat(char8 x) { return convert_long8(x); }
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat(char16 x) { return convert_long16(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating char -> ulong conversions: negative inputs are raised to 0 before converting. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat(char x)
+{
+ return convert_ulong(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat(char2 x)
+{
+ return convert_ulong2(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat(char3 x)
+{
+ return convert_ulong3(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat(char4 x)
+{
+ return convert_ulong4(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat(char8 x)
+{
+ return convert_ulong8(max(x, (char)0));
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat(char16 x)
+{
+ return convert_ulong16(max(x, (char)0));
+}
+#endif
+/* Saturating uchar -> char conversions: inputs are capped at CHAR_MAX before converting. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat(uchar x)
+{
+ return convert_char(min(x, (uchar)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat(uchar2 x)
+{
+ return convert_char2(min(x, (uchar)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat(uchar3 x)
+{
+ return convert_char3(min(x, (uchar)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat(uchar4 x)
+{
+ return convert_char4(min(x, (uchar)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat(uchar8 x)
+{
+ return convert_char8(min(x, (uchar)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat(uchar16 x)
+{
+ return convert_char16(min(x, (uchar)CHAR_MAX));
+}
+/* Saturating uchar -> uchar conversions: source and result types match, so x is returned unchanged. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat(uchar x) { return x; }
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat(uchar2 x) { return x; }
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat(uchar3 x) { return x; }
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat(uchar4 x) { return x; }
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat(uchar8 x) { return x; }
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat(uchar16 x) { return x; }
+/* Saturating uchar -> short conversions: no clamping is applied; these forward to convert_short*(). */
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat(uchar x) { return convert_short(x); }
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat(uchar2 x) { return convert_short2(x); }
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat(uchar3 x) { return convert_short3(x); }
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat(uchar4 x) { return convert_short4(x); }
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat(uchar8 x) { return convert_short8(x); }
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat(uchar16 x) { return convert_short16(x); }
+/* Saturating uchar -> ushort conversions: no clamping is applied; these forward to convert_ushort*(). */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat(uchar x) { return convert_ushort(x); }
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat(uchar2 x) { return convert_ushort2(x); }
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat(uchar3 x) { return convert_ushort3(x); }
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat(uchar4 x) { return convert_ushort4(x); }
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat(uchar8 x) { return convert_ushort8(x); }
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat(uchar16 x) { return convert_ushort16(x); }
+/* Saturating uchar -> int conversions: no clamping is applied; these forward to convert_int*(). */
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat(uchar x) { return convert_int(x); }
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat(uchar2 x) { return convert_int2(x); }
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat(uchar3 x) { return convert_int3(x); }
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat(uchar4 x) { return convert_int4(x); }
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat(uchar8 x) { return convert_int8(x); }
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat(uchar16 x) { return convert_int16(x); }
+/* Saturating uchar -> uint conversions: no clamping is applied; these forward to convert_uint*(). */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat(uchar x) { return convert_uint(x); }
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat(uchar2 x) { return convert_uint2(x); }
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat(uchar3 x) { return convert_uint3(x); }
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat(uchar4 x) { return convert_uint4(x); }
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat(uchar8 x) { return convert_uint8(x); }
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat(uchar16 x) { return convert_uint16(x); }
+#ifdef cles_khr_int64
+/* Saturating uchar -> long conversions: no clamping is applied; these forward to convert_long*(). */
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat(uchar x) { return convert_long(x); }
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat(uchar2 x) { return convert_long2(x); }
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat(uchar3 x) { return convert_long3(x); }
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat(uchar4 x) { return convert_long4(x); }
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat(uchar8 x) { return convert_long8(x); }
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat(uchar16 x) { return convert_long16(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating uchar -> ulong conversions: no clamping is applied; these forward to convert_ulong*(). */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat(uchar x) { return convert_ulong(x); }
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat(uchar2 x) { return convert_ulong2(x); }
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat(uchar3 x) { return convert_ulong3(x); }
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat(uchar4 x) { return convert_ulong4(x); }
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat(uchar8 x) { return convert_ulong8(x); }
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat(uchar16 x) { return convert_ulong16(x); }
+#endif
+/* Saturating short -> char conversions: inputs are clamped to [CHAR_MIN, CHAR_MAX] before converting. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat(short x)
+{
+ return convert_char(clamp(x, (short)CHAR_MIN, (short)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat(short2 x)
+{
+ return convert_char2(clamp(x, (short)CHAR_MIN, (short)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat(short3 x)
+{
+ return convert_char3(clamp(x, (short)CHAR_MIN, (short)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat(short4 x)
+{
+ return convert_char4(clamp(x, (short)CHAR_MIN, (short)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat(short8 x)
+{
+ return convert_char8(clamp(x, (short)CHAR_MIN, (short)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat(short16 x)
+{
+ return convert_char16(clamp(x, (short)CHAR_MIN, (short)CHAR_MAX));
+}
+/* Saturating short -> uchar conversions: inputs are clamped to [0, UCHAR_MAX] before converting. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat(short x)
+{
+ return convert_uchar(clamp(x, (short)0, (short)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat(short2 x)
+{
+ return convert_uchar2(clamp(x, (short)0, (short)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat(short3 x)
+{
+ return convert_uchar3(clamp(x, (short)0, (short)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat(short4 x)
+{
+ return convert_uchar4(clamp(x, (short)0, (short)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat(short8 x)
+{
+ return convert_uchar8(clamp(x, (short)0, (short)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat(short16 x)
+{
+ return convert_uchar16(clamp(x, (short)0, (short)UCHAR_MAX));
+}
+/* Saturating short -> short conversions: source and result types match, so x is returned unchanged. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat(short x) { return x; }
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat(short2 x) { return x; }
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat(short3 x) { return x; }
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat(short4 x) { return x; }
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat(short8 x) { return x; }
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat(short16 x) { return x; }
+/* Saturating short -> ushort conversions: negative inputs are raised to 0 before converting. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat(short x)
+{
+ return convert_ushort(max(x, (short)0));
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat(short2 x)
+{
+ return convert_ushort2(max(x, (short)0));
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat(short3 x)
+{
+ return convert_ushort3(max(x, (short)0));
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat(short4 x)
+{
+ return convert_ushort4(max(x, (short)0));
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat(short8 x)
+{
+ return convert_ushort8(max(x, (short)0));
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat(short16 x)
+{
+ return convert_ushort16(max(x, (short)0));
+}
+/* Saturating short -> int conversions: no clamping is applied; these forward to convert_int*(). */
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat(short x) { return convert_int(x); }
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat(short2 x) { return convert_int2(x); }
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat(short3 x) { return convert_int3(x); }
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat(short4 x) { return convert_int4(x); }
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat(short8 x) { return convert_int8(x); }
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat(short16 x) { return convert_int16(x); }
+/* Saturating short -> uint conversions: negative inputs are raised to 0 before converting. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat(short x)
+{
+ return convert_uint(max(x, (short)0));
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat(short2 x)
+{
+ return convert_uint2(max(x, (short)0));
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat(short3 x)
+{
+ return convert_uint3(max(x, (short)0));
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat(short4 x)
+{
+ return convert_uint4(max(x, (short)0));
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat(short8 x)
+{
+ return convert_uint8(max(x, (short)0));
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat(short16 x)
+{
+ return convert_uint16(max(x, (short)0));
+}
+#ifdef cles_khr_int64
+/* Saturating short -> long conversions: no clamping is applied; these forward to convert_long*(). */
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat(short x) { return convert_long(x); }
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat(short2 x) { return convert_long2(x); }
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat(short3 x) { return convert_long3(x); }
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat(short4 x) { return convert_long4(x); }
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat(short8 x) { return convert_long8(x); }
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat(short16 x) { return convert_long16(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating short -> ulong conversions: negative inputs are raised to 0 before converting. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat(short x)
+{
+ return convert_ulong(max(x, (short)0));
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat(short2 x)
+{
+ return convert_ulong2(max(x, (short)0));
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat(short3 x)
+{
+ return convert_ulong3(max(x, (short)0));
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat(short4 x)
+{
+ return convert_ulong4(max(x, (short)0));
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat(short8 x)
+{
+ return convert_ulong8(max(x, (short)0));
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat(short16 x)
+{
+ return convert_ulong16(max(x, (short)0));
+}
+#endif
+/* Saturating ushort -> char conversions: inputs are capped at CHAR_MAX before converting. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat(ushort x)
+{
+ return convert_char(min(x, (ushort)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat(ushort2 x)
+{
+ return convert_char2(min(x, (ushort)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat(ushort3 x)
+{
+ return convert_char3(min(x, (ushort)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat(ushort4 x)
+{
+ return convert_char4(min(x, (ushort)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat(ushort8 x)
+{
+ return convert_char8(min(x, (ushort)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat(ushort16 x)
+{
+ return convert_char16(min(x, (ushort)CHAR_MAX));
+}
+/* Saturating ushort -> uchar conversions: inputs are capped at UCHAR_MAX before converting. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat(ushort x)
+{
+ return convert_uchar(min(x, (ushort)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat(ushort2 x)
+{
+ return convert_uchar2(min(x, (ushort)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat(ushort3 x)
+{
+ return convert_uchar3(min(x, (ushort)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat(ushort4 x)
+{
+ return convert_uchar4(min(x, (ushort)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat(ushort8 x)
+{
+ return convert_uchar8(min(x, (ushort)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat(ushort16 x)
+{
+ return convert_uchar16(min(x, (ushort)UCHAR_MAX));
+}
+/* Saturating ushort -> short conversions: inputs are capped at SHRT_MAX before converting. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat(ushort x)
+{
+ return convert_short(min(x, (ushort)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat(ushort2 x)
+{
+ return convert_short2(min(x, (ushort)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat(ushort3 x)
+{
+ return convert_short3(min(x, (ushort)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat(ushort4 x)
+{
+ return convert_short4(min(x, (ushort)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat(ushort8 x)
+{
+ return convert_short8(min(x, (ushort)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat(ushort16 x)
+{
+ return convert_short16(min(x, (ushort)SHRT_MAX));
+}
+/* Saturating ushort -> ushort conversions: source and result types match, so x is returned unchanged. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat(ushort x) { return x; }
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat(ushort2 x) { return x; }
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat(ushort3 x) { return x; }
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat(ushort4 x) { return x; }
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat(ushort8 x) { return x; }
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat(ushort16 x) { return x; }
+/* Saturating ushort -> int conversions: no clamping is applied; these forward to convert_int*(). */
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat(ushort x) { return convert_int(x); }
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat(ushort2 x) { return convert_int2(x); }
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat(ushort3 x) { return convert_int3(x); }
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat(ushort4 x) { return convert_int4(x); }
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat(ushort8 x) { return convert_int8(x); }
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat(ushort16 x) { return convert_int16(x); }
+_CLC_DEF _CLC_OVERLOAD /* Saturating ushort -> uint overloads (all widths): no clamp is applied. */
+uint convert_uint_sat(ushort x)
+{
+ return convert_uint(x); /* forwards to the plain (non-saturating) conversion */
+}
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat(ushort2 x)
+{
+ return convert_uint2(x); /* plain conversion, no clamp */
+}
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat(ushort3 x)
+{
+ return convert_uint3(x); /* plain conversion, no clamp */
+}
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat(ushort4 x)
+{
+ return convert_uint4(x); /* plain conversion, no clamp */
+}
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat(ushort8 x)
+{
+ return convert_uint8(x); /* plain conversion, no clamp */
+}
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat(ushort16 x)
+{
+ return convert_uint16(x); /* plain conversion, no clamp */
+}
+#ifdef cles_khr_int64
+/* ushort -> long, saturating (compiled only when cles_khr_int64 is defined):
+ * no clamp is applied; each overload forwards to the plain conversion. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat(ushort x)
+{
+ return convert_long(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat(ushort2 x)
+{
+ return convert_long2(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat(ushort3 x)
+{
+ return convert_long3(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat(ushort4 x)
+{
+ return convert_long4(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat(ushort8 x)
+{
+ return convert_long8(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat(ushort16 x)
+{
+ return convert_long16(x);
+}
+#endif
+#ifdef cles_khr_int64
+/* ushort -> ulong, saturating (compiled only when cles_khr_int64 is defined):
+ * no clamp is applied; each overload forwards to the plain conversion. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat(ushort x)
+{
+ return convert_ulong(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat(ushort2 x)
+{
+ return convert_ulong2(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat(ushort3 x)
+{
+ return convert_ulong3(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat(ushort4 x)
+{
+ return convert_ulong4(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat(ushort8 x)
+{
+ return convert_ulong8(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat(ushort16 x)
+{
+ return convert_ulong16(x);
+}
+#endif
+/* int -> char, saturating: each element is clamped to [CHAR_MIN, CHAR_MAX]
+ * in the int domain and then narrowed. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat(int x)
+{
+ return convert_char(clamp(x, (int)CHAR_MIN, (int)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat(int2 x)
+{
+ return convert_char2(clamp(x, (int)CHAR_MIN, (int)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat(int3 x)
+{
+ return convert_char3(clamp(x, (int)CHAR_MIN, (int)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat(int4 x)
+{
+ return convert_char4(clamp(x, (int)CHAR_MIN, (int)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat(int8 x)
+{
+ return convert_char8(clamp(x, (int)CHAR_MIN, (int)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat(int16 x)
+{
+ return convert_char16(clamp(x, (int)CHAR_MIN, (int)CHAR_MAX));
+}
+/* int -> uchar, saturating: each element is clamped to [0, UCHAR_MAX] in the
+ * int domain and then narrowed. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat(int x)
+{
+ return convert_uchar(clamp(x, (int)0, (int)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat(int2 x)
+{
+ return convert_uchar2(clamp(x, (int)0, (int)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat(int3 x)
+{
+ return convert_uchar3(clamp(x, (int)0, (int)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat(int4 x)
+{
+ return convert_uchar4(clamp(x, (int)0, (int)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat(int8 x)
+{
+ return convert_uchar8(clamp(x, (int)0, (int)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat(int16 x)
+{
+ return convert_uchar16(clamp(x, (int)0, (int)UCHAR_MAX));
+}
+/* int -> short, saturating: each element is clamped to [SHRT_MIN, SHRT_MAX]
+ * in the int domain and then narrowed. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat(int x)
+{
+ return convert_short(clamp(x, (int)SHRT_MIN, (int)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat(int2 x)
+{
+ return convert_short2(clamp(x, (int)SHRT_MIN, (int)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat(int3 x)
+{
+ return convert_short3(clamp(x, (int)SHRT_MIN, (int)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat(int4 x)
+{
+ return convert_short4(clamp(x, (int)SHRT_MIN, (int)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat(int8 x)
+{
+ return convert_short8(clamp(x, (int)SHRT_MIN, (int)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat(int16 x)
+{
+ return convert_short16(clamp(x, (int)SHRT_MIN, (int)SHRT_MAX));
+}
+/* int -> ushort, saturating: each element is clamped to [0, USHRT_MAX] in
+ * the int domain and then narrowed. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat(int x)
+{
+ return convert_ushort(clamp(x, (int)0, (int)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat(int2 x)
+{
+ return convert_ushort2(clamp(x, (int)0, (int)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat(int3 x)
+{
+ return convert_ushort3(clamp(x, (int)0, (int)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat(int4 x)
+{
+ return convert_ushort4(clamp(x, (int)0, (int)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat(int8 x)
+{
+ return convert_ushort8(clamp(x, (int)0, (int)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat(int16 x)
+{
+ return convert_ushort16(clamp(x, (int)0, (int)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD /* Saturating int -> int overloads (all widths): identity. */
+int convert_int_sat(int x)
+{
+ return x; /* source and destination types match, so nothing is clamped */
+}
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat(int2 x)
+{
+ return x; /* identity: same type in and out */
+}
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat(int3 x)
+{
+ return x; /* identity: same type in and out */
+}
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat(int4 x)
+{
+ return x; /* identity: same type in and out */
+}
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat(int8 x)
+{
+ return x; /* identity: same type in and out */
+}
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat(int16 x)
+{
+ return x; /* identity: same type in and out */
+}
+/* int -> uint, saturating: negative elements are raised to 0 before the
+ * conversion; no upper clamp is applied. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat(int x)
+{
+ return convert_uint(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat(int2 x)
+{
+ return convert_uint2(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat(int3 x)
+{
+ return convert_uint3(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat(int4 x)
+{
+ return convert_uint4(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat(int8 x)
+{
+ return convert_uint8(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat(int16 x)
+{
+ return convert_uint16(max(x, (int)0));
+}
+#ifdef cles_khr_int64
+/* int -> long, saturating (compiled only when cles_khr_int64 is defined):
+ * no clamp is applied; each overload forwards to the plain conversion. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat(int x)
+{
+ return convert_long(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat(int2 x)
+{
+ return convert_long2(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat(int3 x)
+{
+ return convert_long3(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat(int4 x)
+{
+ return convert_long4(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat(int8 x)
+{
+ return convert_long8(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat(int16 x)
+{
+ return convert_long16(x);
+}
+#endif
+#ifdef cles_khr_int64
+/* int -> ulong, saturating (compiled only when cles_khr_int64 is defined):
+ * negative elements are raised to 0 before the conversion; no upper clamp
+ * is applied. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat(int x)
+{
+ return convert_ulong(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat(int2 x)
+{
+ return convert_ulong2(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat(int3 x)
+{
+ return convert_ulong3(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat(int4 x)
+{
+ return convert_ulong4(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat(int8 x)
+{
+ return convert_ulong8(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat(int16 x)
+{
+ return convert_ulong16(max(x, (int)0));
+}
+#endif
+/* uint -> char, saturating: the source is unsigned, so each element is
+ * capped at CHAR_MAX and then narrowed; no lower clamp is applied. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat(uint x)
+{
+ return convert_char(min(x, (uint)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat(uint2 x)
+{
+ return convert_char2(min(x, (uint)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat(uint3 x)
+{
+ return convert_char3(min(x, (uint)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat(uint4 x)
+{
+ return convert_char4(min(x, (uint)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat(uint8 x)
+{
+ return convert_char8(min(x, (uint)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat(uint16 x)
+{
+ return convert_char16(min(x, (uint)CHAR_MAX));
+}
+/* uint -> uchar, saturating: each element is capped at UCHAR_MAX and then
+ * narrowed. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat(uint x)
+{
+ return convert_uchar(min(x, (uint)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat(uint2 x)
+{
+ return convert_uchar2(min(x, (uint)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat(uint3 x)
+{
+ return convert_uchar3(min(x, (uint)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat(uint4 x)
+{
+ return convert_uchar4(min(x, (uint)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat(uint8 x)
+{
+ return convert_uchar8(min(x, (uint)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat(uint16 x)
+{
+ return convert_uchar16(min(x, (uint)UCHAR_MAX));
+}
+/* uint -> short, saturating: the source is unsigned, so each element is
+ * capped at SHRT_MAX and then narrowed; no lower clamp is applied. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat(uint x)
+{
+ return convert_short(min(x, (uint)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat(uint2 x)
+{
+ return convert_short2(min(x, (uint)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat(uint3 x)
+{
+ return convert_short3(min(x, (uint)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat(uint4 x)
+{
+ return convert_short4(min(x, (uint)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat(uint8 x)
+{
+ return convert_short8(min(x, (uint)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat(uint16 x)
+{
+ return convert_short16(min(x, (uint)SHRT_MAX));
+}
+/* uint -> ushort, saturating: each element is capped at USHRT_MAX and then
+ * narrowed. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat(uint x)
+{
+ return convert_ushort(min(x, (uint)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat(uint2 x)
+{
+ return convert_ushort2(min(x, (uint)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat(uint3 x)
+{
+ return convert_ushort3(min(x, (uint)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat(uint4 x)
+{
+ return convert_ushort4(min(x, (uint)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat(uint8 x)
+{
+ return convert_ushort8(min(x, (uint)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat(uint16 x)
+{
+ return convert_ushort16(min(x, (uint)USHRT_MAX));
+}
+/* uint -> int, saturating: the source is unsigned, so each element is capped
+ * at INT_MAX and then converted; no lower clamp is applied. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat(uint x)
+{
+ return convert_int(min(x, (uint)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat(uint2 x)
+{
+ return convert_int2(min(x, (uint)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat(uint3 x)
+{
+ return convert_int3(min(x, (uint)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat(uint4 x)
+{
+ return convert_int4(min(x, (uint)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat(uint8 x)
+{
+ return convert_int8(min(x, (uint)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat(uint16 x)
+{
+ return convert_int16(min(x, (uint)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD /* Saturating uint -> uint overloads (all widths): identity. */
+uint convert_uint_sat(uint x)
+{
+ return x; /* source and destination types match, so nothing is clamped */
+}
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat(uint2 x)
+{
+ return x; /* identity: same type in and out */
+}
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat(uint3 x)
+{
+ return x; /* identity: same type in and out */
+}
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat(uint4 x)
+{
+ return x; /* identity: same type in and out */
+}
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat(uint8 x)
+{
+ return x; /* identity: same type in and out */
+}
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat(uint16 x)
+{
+ return x; /* identity: same type in and out */
+}
+#ifdef cles_khr_int64
+/* uint -> long, saturating (compiled only when cles_khr_int64 is defined):
+ * no clamp is applied; each overload forwards to the plain conversion. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat(uint x)
+{
+ return convert_long(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat(uint2 x)
+{
+ return convert_long2(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat(uint3 x)
+{
+ return convert_long3(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat(uint4 x)
+{
+ return convert_long4(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat(uint8 x)
+{
+ return convert_long8(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat(uint16 x)
+{
+ return convert_long16(x);
+}
+#endif
+#ifdef cles_khr_int64
+/* uint -> ulong, saturating (compiled only when cles_khr_int64 is defined):
+ * no clamp is applied; each overload forwards to the plain conversion. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat(uint x)
+{
+ return convert_ulong(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat(uint2 x)
+{
+ return convert_ulong2(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat(uint3 x)
+{
+ return convert_ulong3(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat(uint4 x)
+{
+ return convert_ulong4(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat(uint8 x)
+{
+ return convert_ulong8(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat(uint16 x)
+{
+ return convert_ulong16(x);
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> char, saturating (compiled only when cles_khr_int64 is defined):
+ * each element is clamped to [CHAR_MIN, CHAR_MAX] in the long domain and
+ * then narrowed. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat(long x)
+{
+ return convert_char(clamp(x, (long)CHAR_MIN, (long)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat(long2 x)
+{
+ return convert_char2(clamp(x, (long)CHAR_MIN, (long)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat(long3 x)
+{
+ return convert_char3(clamp(x, (long)CHAR_MIN, (long)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat(long4 x)
+{
+ return convert_char4(clamp(x, (long)CHAR_MIN, (long)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat(long8 x)
+{
+ return convert_char8(clamp(x, (long)CHAR_MIN, (long)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat(long16 x)
+{
+ return convert_char16(clamp(x, (long)CHAR_MIN, (long)CHAR_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> uchar, saturating (compiled only when cles_khr_int64 is defined):
+ * each element is clamped to [0, UCHAR_MAX] in the long domain and then
+ * narrowed. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat(long x)
+{
+ return convert_uchar(clamp(x, (long)0, (long)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat(long2 x)
+{
+ return convert_uchar2(clamp(x, (long)0, (long)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat(long3 x)
+{
+ return convert_uchar3(clamp(x, (long)0, (long)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat(long4 x)
+{
+ return convert_uchar4(clamp(x, (long)0, (long)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat(long8 x)
+{
+ return convert_uchar8(clamp(x, (long)0, (long)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat(long16 x)
+{
+ return convert_uchar16(clamp(x, (long)0, (long)UCHAR_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> short, saturating (compiled only when cles_khr_int64 is defined):
+ * each element is clamped to [SHRT_MIN, SHRT_MAX] in the long domain and
+ * then narrowed. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat(long x)
+{
+ return convert_short(clamp(x, (long)SHRT_MIN, (long)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat(long2 x)
+{
+ return convert_short2(clamp(x, (long)SHRT_MIN, (long)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat(long3 x)
+{
+ return convert_short3(clamp(x, (long)SHRT_MIN, (long)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat(long4 x)
+{
+ return convert_short4(clamp(x, (long)SHRT_MIN, (long)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat(long8 x)
+{
+ return convert_short8(clamp(x, (long)SHRT_MIN, (long)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat(long16 x)
+{
+ return convert_short16(clamp(x, (long)SHRT_MIN, (long)SHRT_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> ushort, saturating (compiled only when cles_khr_int64 is defined):
+ * each element is clamped to [0, USHRT_MAX] in the long domain and then
+ * narrowed. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat(long x)
+{
+ return convert_ushort(clamp(x, (long)0, (long)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat(long2 x)
+{
+ return convert_ushort2(clamp(x, (long)0, (long)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat(long3 x)
+{
+ return convert_ushort3(clamp(x, (long)0, (long)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat(long4 x)
+{
+ return convert_ushort4(clamp(x, (long)0, (long)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat(long8 x)
+{
+ return convert_ushort8(clamp(x, (long)0, (long)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat(long16 x)
+{
+ return convert_ushort16(clamp(x, (long)0, (long)USHRT_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> int, saturating (compiled only when cles_khr_int64 is defined):
+ * each element is clamped to [INT_MIN, INT_MAX] in the long domain and then
+ * narrowed. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat(long x)
+{
+ return convert_int(clamp(x, (long)INT_MIN, (long)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat(long2 x)
+{
+ return convert_int2(clamp(x, (long)INT_MIN, (long)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat(long3 x)
+{
+ return convert_int3(clamp(x, (long)INT_MIN, (long)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat(long4 x)
+{
+ return convert_int4(clamp(x, (long)INT_MIN, (long)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat(long8 x)
+{
+ return convert_int8(clamp(x, (long)INT_MIN, (long)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat(long16 x)
+{
+ return convert_int16(clamp(x, (long)INT_MIN, (long)INT_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> uint, saturating (compiled only when cles_khr_int64 is defined):
+ * each element is clamped to [0, UINT_MAX] in the long domain and then
+ * narrowed. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat(long x)
+{
+ return convert_uint(clamp(x, (long)0, (long)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat(long2 x)
+{
+ return convert_uint2(clamp(x, (long)0, (long)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat(long3 x)
+{
+ return convert_uint3(clamp(x, (long)0, (long)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat(long4 x)
+{
+ return convert_uint4(clamp(x, (long)0, (long)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat(long8 x)
+{
+ return convert_uint8(clamp(x, (long)0, (long)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat(long16 x)
+{
+ return convert_uint16(clamp(x, (long)0, (long)UINT_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> long, saturating (compiled only when cles_khr_int64 is defined):
+ * source and destination types match, so every overload is the identity. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat(long x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat(long2 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat(long3 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat(long4 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat(long8 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat(long16 x)
+{
+ return x;
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> ulong, saturating (compiled only when cles_khr_int64 is defined):
+ * negative elements are raised to 0 before the conversion; no upper clamp
+ * is applied. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat(long x)
+{
+ return convert_ulong(max(x, (long)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat(long2 x)
+{
+ return convert_ulong2(max(x, (long)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat(long3 x)
+{
+ return convert_ulong3(max(x, (long)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat(long4 x)
+{
+ return convert_ulong4(max(x, (long)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat(long8 x)
+{
+ return convert_ulong8(max(x, (long)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat(long16 x)
+{
+ return convert_ulong16(max(x, (long)0));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> char, saturating (compiled only when cles_khr_int64 is defined):
+ * the source is unsigned, so each element is capped at CHAR_MAX and then
+ * narrowed; no lower clamp is applied. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat(ulong x)
+{
+ return convert_char(min(x, (ulong)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat(ulong2 x)
+{
+ return convert_char2(min(x, (ulong)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat(ulong3 x)
+{
+ return convert_char3(min(x, (ulong)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat(ulong4 x)
+{
+ return convert_char4(min(x, (ulong)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat(ulong8 x)
+{
+ return convert_char8(min(x, (ulong)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat(ulong16 x)
+{
+ return convert_char16(min(x, (ulong)CHAR_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> uchar, saturating (compiled only when cles_khr_int64 is defined):
+ * each element is capped at UCHAR_MAX and then narrowed. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat(ulong x)
+{
+ return convert_uchar(min(x, (ulong)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat(ulong2 x)
+{
+ return convert_uchar2(min(x, (ulong)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat(ulong3 x)
+{
+ return convert_uchar3(min(x, (ulong)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat(ulong4 x)
+{
+ return convert_uchar4(min(x, (ulong)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat(ulong8 x)
+{
+ return convert_uchar8(min(x, (ulong)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat(ulong16 x)
+{
+ return convert_uchar16(min(x, (ulong)UCHAR_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> short, saturating (compiled only when cles_khr_int64 is defined):
+ * the source is unsigned, so each element is capped at SHRT_MAX and then
+ * narrowed; no lower clamp is applied. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat(ulong x)
+{
+ return convert_short(min(x, (ulong)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat(ulong2 x)
+{
+ return convert_short2(min(x, (ulong)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat(ulong3 x)
+{
+ return convert_short3(min(x, (ulong)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat(ulong4 x)
+{
+ return convert_short4(min(x, (ulong)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat(ulong8 x)
+{
+ return convert_short8(min(x, (ulong)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat(ulong16 x)
+{
+ return convert_short16(min(x, (ulong)SHRT_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> ushort, saturating (compiled only when cles_khr_int64 is defined):
+ * each element is capped at USHRT_MAX and then narrowed. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat(ulong x)
+{
+ return convert_ushort(min(x, (ulong)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat(ulong2 x)
+{
+ return convert_ushort2(min(x, (ulong)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat(ulong3 x)
+{
+ return convert_ushort3(min(x, (ulong)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat(ulong4 x)
+{
+ return convert_ushort4(min(x, (ulong)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat(ulong8 x)
+{
+ return convert_ushort8(min(x, (ulong)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat(ulong16 x)
+{
+ return convert_ushort16(min(x, (ulong)USHRT_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> int, saturating (compiled only when cles_khr_int64 is defined):
+ * the source is unsigned, so each element is capped at INT_MAX and then
+ * narrowed; no lower clamp is applied. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat(ulong x)
+{
+ return convert_int(min(x, (ulong)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat(ulong2 x)
+{
+ return convert_int2(min(x, (ulong)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat(ulong3 x)
+{
+ return convert_int3(min(x, (ulong)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat(ulong4 x)
+{
+ return convert_int4(min(x, (ulong)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat(ulong8 x)
+{
+ return convert_int8(min(x, (ulong)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat(ulong16 x)
+{
+ return convert_int16(min(x, (ulong)INT_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> uint, saturating (compiled only when cles_khr_int64 is defined):
+ * each element is capped at UINT_MAX and then narrowed. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat(ulong x)
+{
+ return convert_uint(min(x, (ulong)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat(ulong2 x)
+{
+ return convert_uint2(min(x, (ulong)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat(ulong3 x)
+{
+ return convert_uint3(min(x, (ulong)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat(ulong4 x)
+{
+ return convert_uint4(min(x, (ulong)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat(ulong8 x)
+{
+ return convert_uint8(min(x, (ulong)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat(ulong16 x)
+{
+ return convert_uint16(min(x, (ulong)UINT_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> long, saturating (compiled only when cles_khr_int64 is defined):
+ * the source is unsigned, so each element is capped at LONG_MAX and then
+ * converted; no lower clamp is applied. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat(ulong x)
+{
+ return convert_long(min(x, (ulong)LONG_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat(ulong2 x)
+{
+ return convert_long2(min(x, (ulong)LONG_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat(ulong3 x)
+{
+ return convert_long3(min(x, (ulong)LONG_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat(ulong4 x)
+{
+ return convert_long4(min(x, (ulong)LONG_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat(ulong8 x)
+{
+ return convert_long8(min(x, (ulong)LONG_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat(ulong16 x)
+{
+ return convert_long16(min(x, (ulong)LONG_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> ulong, saturating (compiled only when cles_khr_int64 is defined):
+ * source and destination types match, so every overload is the identity. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat(ulong x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat(ulong2 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat(ulong3 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat(ulong4 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat(ulong8 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat(ulong16 x)
+{
+ return x;
+}
+#endif
+/* float -> char, saturating: the plain conversion result is kept unless the
+ * input compares below CHAR_MIN or above CHAR_MAX, in which case select()
+ * substitutes the corresponding bound. Each comparison result is converted
+ * to the destination element type so it can be used as select()'s condition.
+ * NOTE(review): for a NaN input both comparisons are false, so the result is
+ * whatever the plain conversion yields for NaN -- confirm that is intended. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat(float x)
+{
+ char below = convert_char(x < (float)CHAR_MIN);
+ char above = convert_char(x > (float)CHAR_MAX);
+ char r = select(convert_char(x), (char)CHAR_MIN, below);
+ return select(r, (char)CHAR_MAX, above);
+}
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat(float2 x)
+{
+ char2 below = convert_char2(x < (float2)CHAR_MIN);
+ char2 above = convert_char2(x > (float2)CHAR_MAX);
+ char2 r = select(convert_char2(x), (char2)CHAR_MIN, below);
+ return select(r, (char2)CHAR_MAX, above);
+}
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat(float3 x)
+{
+ char3 below = convert_char3(x < (float3)CHAR_MIN);
+ char3 above = convert_char3(x > (float3)CHAR_MAX);
+ char3 r = select(convert_char3(x), (char3)CHAR_MIN, below);
+ return select(r, (char3)CHAR_MAX, above);
+}
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat(float4 x)
+{
+ char4 below = convert_char4(x < (float4)CHAR_MIN);
+ char4 above = convert_char4(x > (float4)CHAR_MAX);
+ char4 r = select(convert_char4(x), (char4)CHAR_MIN, below);
+ return select(r, (char4)CHAR_MAX, above);
+}
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat(float8 x)
+{
+ char8 below = convert_char8(x < (float8)CHAR_MIN);
+ char8 above = convert_char8(x > (float8)CHAR_MAX);
+ char8 r = select(convert_char8(x), (char8)CHAR_MIN, below);
+ return select(r, (char8)CHAR_MAX, above);
+}
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat(float16 x)
+{
+ char16 below = convert_char16(x < (float16)CHAR_MIN);
+ char16 above = convert_char16(x > (float16)CHAR_MAX);
+ char16 r = select(convert_char16(x), (char16)CHAR_MIN, below);
+ return select(r, (char16)CHAR_MAX, above);
+}
+/* float -> uchar, saturating: the plain conversion result is kept unless the
+ * input compares below 0 or above UCHAR_MAX, in which case select()
+ * substitutes the corresponding bound. Each comparison result is converted
+ * to char and reinterpreted as uchar to serve as select()'s condition.
+ * NOTE(review): for a NaN input both comparisons are false, so the result is
+ * whatever the plain conversion yields for NaN -- confirm that is intended. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat(float x)
+{
+ uchar below = as_uchar(convert_char(x < (float)0));
+ uchar above = as_uchar(convert_char(x > (float)UCHAR_MAX));
+ uchar r = select(convert_uchar(x), (uchar)0, below);
+ return select(r, (uchar)UCHAR_MAX, above);
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat(float2 x)
+{
+ uchar2 below = as_uchar2(convert_char2(x < (float2)0));
+ uchar2 above = as_uchar2(convert_char2(x > (float2)UCHAR_MAX));
+ uchar2 r = select(convert_uchar2(x), (uchar2)0, below);
+ return select(r, (uchar2)UCHAR_MAX, above);
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat(float3 x)
+{
+ uchar3 below = as_uchar3(convert_char3(x < (float3)0));
+ uchar3 above = as_uchar3(convert_char3(x > (float3)UCHAR_MAX));
+ uchar3 r = select(convert_uchar3(x), (uchar3)0, below);
+ return select(r, (uchar3)UCHAR_MAX, above);
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat(float4 x)
+{
+ uchar4 below = as_uchar4(convert_char4(x < (float4)0));
+ uchar4 above = as_uchar4(convert_char4(x > (float4)UCHAR_MAX));
+ uchar4 r = select(convert_uchar4(x), (uchar4)0, below);
+ return select(r, (uchar4)UCHAR_MAX, above);
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat(float8 x)
+{
+ uchar8 below = as_uchar8(convert_char8(x < (float8)0));
+ uchar8 above = as_uchar8(convert_char8(x > (float8)UCHAR_MAX));
+ uchar8 r = select(convert_uchar8(x), (uchar8)0, below);
+ return select(r, (uchar8)UCHAR_MAX, above);
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat(float16 x)
+{
+ uchar16 below = as_uchar16(convert_char16(x < (float16)0));
+ uchar16 above = as_uchar16(convert_char16(x > (float16)UCHAR_MAX));
+ uchar16 r = select(convert_uchar16(x), (uchar16)0, below);
+ return select(r, (uchar16)UCHAR_MAX, above);
+}
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat(float x)
+{
+ short y = convert_short(x);
+ y = select(y, (short)SHRT_MIN, convert_short(x < (float)SHRT_MIN));
+ y = select(y, (short)SHRT_MAX, convert_short(x > (float)SHRT_MAX));
+ return y;
+}
+/* Saturating float2 -> short2: convert, then force SHRT_MIN where x < SHRT_MIN and SHRT_MAX where x > SHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat(float2 x)
+{
+ const short2 below = convert_short2(x < (float2)SHRT_MIN);
+ const short2 above = convert_short2(x > (float2)SHRT_MAX);
+ const short2 floored = select(convert_short2(x), (short2)SHRT_MIN, below);
+ return select(floored, (short2)SHRT_MAX, above);
+}
+/* Saturating float3 -> short3: convert, then force SHRT_MIN where x < SHRT_MIN and SHRT_MAX where x > SHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat(float3 x)
+{
+ const short3 below = convert_short3(x < (float3)SHRT_MIN);
+ const short3 above = convert_short3(x > (float3)SHRT_MAX);
+ const short3 floored = select(convert_short3(x), (short3)SHRT_MIN, below);
+ return select(floored, (short3)SHRT_MAX, above);
+}
+/* Saturating float4 -> short4: convert, then force SHRT_MIN where x < SHRT_MIN and SHRT_MAX where x > SHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat(float4 x)
+{
+ const short4 below = convert_short4(x < (float4)SHRT_MIN);
+ const short4 above = convert_short4(x > (float4)SHRT_MAX);
+ const short4 floored = select(convert_short4(x), (short4)SHRT_MIN, below);
+ return select(floored, (short4)SHRT_MAX, above);
+}
+/* Saturating float8 -> short8: convert, then force SHRT_MIN where x < SHRT_MIN and SHRT_MAX where x > SHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat(float8 x)
+{
+ const short8 below = convert_short8(x < (float8)SHRT_MIN);
+ const short8 above = convert_short8(x > (float8)SHRT_MAX);
+ const short8 floored = select(convert_short8(x), (short8)SHRT_MIN, below);
+ return select(floored, (short8)SHRT_MAX, above);
+}
+/* Saturating float16 -> short16: convert, then force SHRT_MIN where x < SHRT_MIN and SHRT_MAX where x > SHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat(float16 x)
+{
+ const short16 below = convert_short16(x < (float16)SHRT_MIN);
+ const short16 above = convert_short16(x > (float16)SHRT_MAX);
+ const short16 floored = select(convert_short16(x), (short16)SHRT_MIN, below);
+ return select(floored, (short16)SHRT_MAX, above);
+}
+/* Saturating float -> ushort: convert, then force 0 where x < 0 and USHRT_MAX where x > USHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat(float x)
+{
+ const ushort below = as_ushort(convert_short(x < (float)0));
+ const ushort above = as_ushort(convert_short(x > (float)USHRT_MAX));
+ const ushort floored = select(convert_ushort(x), (ushort)0, below);
+ return select(floored, (ushort)USHRT_MAX, above);
+}
+/* Saturating float2 -> ushort2: convert, then force 0 where x < 0 and USHRT_MAX where x > USHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat(float2 x)
+{
+ const ushort2 below = as_ushort2(convert_short2(x < (float2)0));
+ const ushort2 above = as_ushort2(convert_short2(x > (float2)USHRT_MAX));
+ const ushort2 floored = select(convert_ushort2(x), (ushort2)0, below);
+ return select(floored, (ushort2)USHRT_MAX, above);
+}
+/* Saturating float3 -> ushort3: convert, then force 0 where x < 0 and USHRT_MAX where x > USHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat(float3 x)
+{
+ const ushort3 below = as_ushort3(convert_short3(x < (float3)0));
+ const ushort3 above = as_ushort3(convert_short3(x > (float3)USHRT_MAX));
+ const ushort3 floored = select(convert_ushort3(x), (ushort3)0, below);
+ return select(floored, (ushort3)USHRT_MAX, above);
+}
+/* Saturating float4 -> ushort4: convert, then force 0 where x < 0 and USHRT_MAX where x > USHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat(float4 x)
+{
+ const ushort4 below = as_ushort4(convert_short4(x < (float4)0));
+ const ushort4 above = as_ushort4(convert_short4(x > (float4)USHRT_MAX));
+ const ushort4 floored = select(convert_ushort4(x), (ushort4)0, below);
+ return select(floored, (ushort4)USHRT_MAX, above);
+}
+/* Saturating float8 -> ushort8: convert, then force 0 where x < 0 and USHRT_MAX where x > USHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat(float8 x)
+{
+ const ushort8 below = as_ushort8(convert_short8(x < (float8)0));
+ const ushort8 above = as_ushort8(convert_short8(x > (float8)USHRT_MAX));
+ const ushort8 floored = select(convert_ushort8(x), (ushort8)0, below);
+ return select(floored, (ushort8)USHRT_MAX, above);
+}
+/* Saturating float16 -> ushort16: convert, then force 0 where x < 0 and USHRT_MAX where x > USHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat(float16 x)
+{
+ const ushort16 below = as_ushort16(convert_short16(x < (float16)0));
+ const ushort16 above = as_ushort16(convert_short16(x > (float16)USHRT_MAX));
+ const ushort16 floored = select(convert_ushort16(x), (ushort16)0, below);
+ return select(floored, (ushort16)USHRT_MAX, above);
+}
+/* Saturating float -> int. (float)INT_MAX rounds up to 2^31, which int cannot hold,
+ * so the upper clamp tests >= (not >) to also catch x == 2^31. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat(float x)
+{
+ int y = convert_int(x);
+ y = select(y, (int)INT_MIN, convert_int(x < (float)INT_MIN));
+ y = select(y, (int)INT_MAX, convert_int(x >= (float)INT_MAX));
+ return y;
+}
+/* Saturating float2 -> int2. (float)INT_MAX rounds up to 2^31, which int cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^31. */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat(float2 x)
+{
+ int2 y = convert_int2(x);
+ y = select(y, (int2)INT_MIN, convert_int2(x < (float2)INT_MIN));
+ y = select(y, (int2)INT_MAX, convert_int2(x >= (float2)INT_MAX));
+ return y;
+}
+/* Saturating float3 -> int3. (float)INT_MAX rounds up to 2^31, which int cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^31. */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat(float3 x)
+{
+ int3 y = convert_int3(x);
+ y = select(y, (int3)INT_MIN, convert_int3(x < (float3)INT_MIN));
+ y = select(y, (int3)INT_MAX, convert_int3(x >= (float3)INT_MAX));
+ return y;
+}
+/* Saturating float4 -> int4. (float)INT_MAX rounds up to 2^31, which int cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^31. */
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat(float4 x)
+{
+ int4 y = convert_int4(x);
+ y = select(y, (int4)INT_MIN, convert_int4(x < (float4)INT_MIN));
+ y = select(y, (int4)INT_MAX, convert_int4(x >= (float4)INT_MAX));
+ return y;
+}
+/* Saturating float8 -> int8. (float)INT_MAX rounds up to 2^31, which int cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^31. */
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat(float8 x)
+{
+ int8 y = convert_int8(x);
+ y = select(y, (int8)INT_MIN, convert_int8(x < (float8)INT_MIN));
+ y = select(y, (int8)INT_MAX, convert_int8(x >= (float8)INT_MAX));
+ return y;
+}
+/* Saturating float16 -> int16. (float)INT_MAX rounds up to 2^31, which int cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^31. */
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat(float16 x)
+{
+ int16 y = convert_int16(x);
+ y = select(y, (int16)INT_MIN, convert_int16(x < (float16)INT_MIN));
+ y = select(y, (int16)INT_MAX, convert_int16(x >= (float16)INT_MAX));
+ return y;
+}
+/* Saturating float -> uint. (float)UINT_MAX rounds up to 2^32, which uint cannot hold,
+ * so the upper clamp tests >= (not >) to also catch x == 2^32. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat(float x)
+{
+ uint y = convert_uint(x);
+ y = select(y, (uint)0, as_uint(convert_int(x < (float)0)));
+ y = select(y, (uint)UINT_MAX, as_uint(convert_int(x >= (float)UINT_MAX)));
+ return y;
+}
+/* Saturating float2 -> uint2. (float)UINT_MAX rounds up to 2^32, which uint cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^32. */
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat(float2 x)
+{
+ uint2 y = convert_uint2(x);
+ y = select(y, (uint2)0, as_uint2(convert_int2(x < (float2)0)));
+ y = select(y, (uint2)UINT_MAX, as_uint2(convert_int2(x >= (float2)UINT_MAX)));
+ return y;
+}
+/* Saturating float3 -> uint3. (float)UINT_MAX rounds up to 2^32, which uint cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^32. */
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat(float3 x)
+{
+ uint3 y = convert_uint3(x);
+ y = select(y, (uint3)0, as_uint3(convert_int3(x < (float3)0)));
+ y = select(y, (uint3)UINT_MAX, as_uint3(convert_int3(x >= (float3)UINT_MAX)));
+ return y;
+}
+/* Saturating float4 -> uint4. (float)UINT_MAX rounds up to 2^32, which uint cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^32. */
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat(float4 x)
+{
+ uint4 y = convert_uint4(x);
+ y = select(y, (uint4)0, as_uint4(convert_int4(x < (float4)0)));
+ y = select(y, (uint4)UINT_MAX, as_uint4(convert_int4(x >= (float4)UINT_MAX)));
+ return y;
+}
+/* Saturating float8 -> uint8. (float)UINT_MAX rounds up to 2^32, which uint cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^32. */
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat(float8 x)
+{
+ uint8 y = convert_uint8(x);
+ y = select(y, (uint8)0, as_uint8(convert_int8(x < (float8)0)));
+ y = select(y, (uint8)UINT_MAX, as_uint8(convert_int8(x >= (float8)UINT_MAX)));
+ return y;
+}
+/* Saturating float16 -> uint16. (float)UINT_MAX rounds up to 2^32, which uint cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^32. */
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat(float16 x)
+{
+ uint16 y = convert_uint16(x);
+ y = select(y, (uint16)0, as_uint16(convert_int16(x < (float16)0)));
+ y = select(y, (uint16)UINT_MAX, as_uint16(convert_int16(x >= (float16)UINT_MAX)));
+ return y;
+}
+#ifdef cles_khr_int64
+/* Saturating float -> long. (float)LONG_MAX rounds up to 2^63, which long cannot hold,
+ * so the upper clamp tests >= (not >) to also catch x == 2^63. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat(float x)
+{
+ long y = convert_long(x);
+ y = select(y, (long)LONG_MIN, convert_long(x < (float)LONG_MIN));
+ y = select(y, (long)LONG_MAX, convert_long(x >= (float)LONG_MAX));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float2 -> long2. (float)LONG_MAX rounds up to 2^63, which long cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^63. */
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat(float2 x)
+{
+ long2 y = convert_long2(x);
+ y = select(y, (long2)LONG_MIN, convert_long2(x < (float2)LONG_MIN));
+ y = select(y, (long2)LONG_MAX, convert_long2(x >= (float2)LONG_MAX));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float3 -> long3. (float)LONG_MAX rounds up to 2^63, which long cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^63. */
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat(float3 x)
+{
+ long3 y = convert_long3(x);
+ y = select(y, (long3)LONG_MIN, convert_long3(x < (float3)LONG_MIN));
+ y = select(y, (long3)LONG_MAX, convert_long3(x >= (float3)LONG_MAX));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float4 -> long4. (float)LONG_MAX rounds up to 2^63, which long cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^63. */
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat(float4 x)
+{
+ long4 y = convert_long4(x);
+ y = select(y, (long4)LONG_MIN, convert_long4(x < (float4)LONG_MIN));
+ y = select(y, (long4)LONG_MAX, convert_long4(x >= (float4)LONG_MAX));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float8 -> long8. (float)LONG_MAX rounds up to 2^63, which long cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^63. */
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat(float8 x)
+{
+ long8 y = convert_long8(x);
+ y = select(y, (long8)LONG_MIN, convert_long8(x < (float8)LONG_MIN));
+ y = select(y, (long8)LONG_MAX, convert_long8(x >= (float8)LONG_MAX));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float16 -> long16. (float)LONG_MAX rounds up to 2^63, which long cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^63. */
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat(float16 x)
+{
+ long16 y = convert_long16(x);
+ y = select(y, (long16)LONG_MIN, convert_long16(x < (float16)LONG_MIN));
+ y = select(y, (long16)LONG_MAX, convert_long16(x >= (float16)LONG_MAX));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float -> ulong. (float)ULONG_MAX rounds up to 2^64, which ulong cannot hold,
+ * so the upper clamp tests >= (not >) to also catch x == 2^64. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat(float x)
+{
+ ulong y = convert_ulong(x);
+ y = select(y, (ulong)0, as_ulong(convert_long(x < (float)0)));
+ y = select(y, (ulong)ULONG_MAX, as_ulong(convert_long(x >= (float)ULONG_MAX)));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float2 -> ulong2. (float)ULONG_MAX rounds up to 2^64, which ulong cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^64. */
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat(float2 x)
+{
+ ulong2 y = convert_ulong2(x);
+ y = select(y, (ulong2)0, as_ulong2(convert_long2(x < (float2)0)));
+ y = select(y, (ulong2)ULONG_MAX, as_ulong2(convert_long2(x >= (float2)ULONG_MAX)));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float3 -> ulong3. (float)ULONG_MAX rounds up to 2^64, which ulong cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^64. */
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat(float3 x)
+{
+ ulong3 y = convert_ulong3(x);
+ y = select(y, (ulong3)0, as_ulong3(convert_long3(x < (float3)0)));
+ y = select(y, (ulong3)ULONG_MAX, as_ulong3(convert_long3(x >= (float3)ULONG_MAX)));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float4 -> ulong4. (float)ULONG_MAX rounds up to 2^64, which ulong cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^64. */
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat(float4 x)
+{
+ ulong4 y = convert_ulong4(x);
+ y = select(y, (ulong4)0, as_ulong4(convert_long4(x < (float4)0)));
+ y = select(y, (ulong4)ULONG_MAX, as_ulong4(convert_long4(x >= (float4)ULONG_MAX)));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float8 -> ulong8. (float)ULONG_MAX rounds up to 2^64, which ulong cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^64. */
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat(float8 x)
+{
+ ulong8 y = convert_ulong8(x);
+ y = select(y, (ulong8)0, as_ulong8(convert_long8(x < (float8)0)));
+ y = select(y, (ulong8)ULONG_MAX, as_ulong8(convert_long8(x >= (float8)ULONG_MAX)));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float16 -> ulong16. (float)ULONG_MAX rounds up to 2^64, which ulong cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^64. */
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat(float16 x)
+{
+ ulong16 y = convert_ulong16(x);
+ y = select(y, (ulong16)0, as_ulong16(convert_long16(x < (float16)0)));
+ y = select(y, (ulong16)ULONG_MAX, as_ulong16(convert_long16(x >= (float16)ULONG_MAX)));
+ return y;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double -> char: convert, then force CHAR_MIN where x < CHAR_MIN and CHAR_MAX where x > CHAR_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat(double x)
+{
+ const char below = convert_char(x < (double)CHAR_MIN);
+ const char above = convert_char(x > (double)CHAR_MAX);
+ const char floored = select(convert_char(x), (char)CHAR_MIN, below);
+ return select(floored, (char)CHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double2 -> char2: convert, then force CHAR_MIN where x < CHAR_MIN and CHAR_MAX where x > CHAR_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat(double2 x)
+{
+ const char2 below = convert_char2(x < (double2)CHAR_MIN);
+ const char2 above = convert_char2(x > (double2)CHAR_MAX);
+ const char2 floored = select(convert_char2(x), (char2)CHAR_MIN, below);
+ return select(floored, (char2)CHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double3 -> char3: convert, then force CHAR_MIN where x < CHAR_MIN and CHAR_MAX where x > CHAR_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat(double3 x)
+{
+ const char3 below = convert_char3(x < (double3)CHAR_MIN);
+ const char3 above = convert_char3(x > (double3)CHAR_MAX);
+ const char3 floored = select(convert_char3(x), (char3)CHAR_MIN, below);
+ return select(floored, (char3)CHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double4 -> char4: convert, then force CHAR_MIN where x < CHAR_MIN and CHAR_MAX where x > CHAR_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat(double4 x)
+{
+ const char4 below = convert_char4(x < (double4)CHAR_MIN);
+ const char4 above = convert_char4(x > (double4)CHAR_MAX);
+ const char4 floored = select(convert_char4(x), (char4)CHAR_MIN, below);
+ return select(floored, (char4)CHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double8 -> char8: convert, then force CHAR_MIN where x < CHAR_MIN and CHAR_MAX where x > CHAR_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat(double8 x)
+{
+ const char8 below = convert_char8(x < (double8)CHAR_MIN);
+ const char8 above = convert_char8(x > (double8)CHAR_MAX);
+ const char8 floored = select(convert_char8(x), (char8)CHAR_MIN, below);
+ return select(floored, (char8)CHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double16 -> char16: convert, then force CHAR_MIN where x < CHAR_MIN and CHAR_MAX where x > CHAR_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat(double16 x)
+{
+ const char16 below = convert_char16(x < (double16)CHAR_MIN);
+ const char16 above = convert_char16(x > (double16)CHAR_MAX);
+ const char16 floored = select(convert_char16(x), (char16)CHAR_MIN, below);
+ return select(floored, (char16)CHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double -> uchar: convert, then force 0 where x < 0 and UCHAR_MAX where x > UCHAR_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat(double x)
+{
+ const uchar below = as_uchar(convert_char(x < (double)0));
+ const uchar above = as_uchar(convert_char(x > (double)UCHAR_MAX));
+ const uchar floored = select(convert_uchar(x), (uchar)0, below);
+ return select(floored, (uchar)UCHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double2 -> uchar2: convert, then force 0 where x < 0 and UCHAR_MAX where x > UCHAR_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat(double2 x)
+{
+ const uchar2 below = as_uchar2(convert_char2(x < (double2)0));
+ const uchar2 above = as_uchar2(convert_char2(x > (double2)UCHAR_MAX));
+ const uchar2 floored = select(convert_uchar2(x), (uchar2)0, below);
+ return select(floored, (uchar2)UCHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double3 -> uchar3: convert, then force 0 where x < 0 and UCHAR_MAX where x > UCHAR_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat(double3 x)
+{
+ const uchar3 below = as_uchar3(convert_char3(x < (double3)0));
+ const uchar3 above = as_uchar3(convert_char3(x > (double3)UCHAR_MAX));
+ const uchar3 floored = select(convert_uchar3(x), (uchar3)0, below);
+ return select(floored, (uchar3)UCHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double4 -> uchar4: convert, then force 0 where x < 0 and UCHAR_MAX where x > UCHAR_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat(double4 x)
+{
+ const uchar4 below = as_uchar4(convert_char4(x < (double4)0));
+ const uchar4 above = as_uchar4(convert_char4(x > (double4)UCHAR_MAX));
+ const uchar4 floored = select(convert_uchar4(x), (uchar4)0, below);
+ return select(floored, (uchar4)UCHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double8 -> uchar8: convert, then force 0 where x < 0 and UCHAR_MAX where x > UCHAR_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat(double8 x)
+{
+ const uchar8 below = as_uchar8(convert_char8(x < (double8)0));
+ const uchar8 above = as_uchar8(convert_char8(x > (double8)UCHAR_MAX));
+ const uchar8 floored = select(convert_uchar8(x), (uchar8)0, below);
+ return select(floored, (uchar8)UCHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double16 -> uchar16: convert, then force 0 where x < 0 and UCHAR_MAX where x > UCHAR_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat(double16 x)
+{
+ const uchar16 below = as_uchar16(convert_char16(x < (double16)0));
+ const uchar16 above = as_uchar16(convert_char16(x > (double16)UCHAR_MAX));
+ const uchar16 floored = select(convert_uchar16(x), (uchar16)0, below);
+ return select(floored, (uchar16)UCHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double -> short: convert, then force SHRT_MIN where x < SHRT_MIN and SHRT_MAX where x > SHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat(double x)
+{
+ const short below = convert_short(x < (double)SHRT_MIN);
+ const short above = convert_short(x > (double)SHRT_MAX);
+ const short floored = select(convert_short(x), (short)SHRT_MIN, below);
+ return select(floored, (short)SHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double2 -> short2: convert, then force SHRT_MIN where x < SHRT_MIN and SHRT_MAX where x > SHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat(double2 x)
+{
+ const short2 below = convert_short2(x < (double2)SHRT_MIN);
+ const short2 above = convert_short2(x > (double2)SHRT_MAX);
+ const short2 floored = select(convert_short2(x), (short2)SHRT_MIN, below);
+ return select(floored, (short2)SHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double3 -> short3: convert, then force SHRT_MIN where x < SHRT_MIN and SHRT_MAX where x > SHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat(double3 x)
+{
+ const short3 below = convert_short3(x < (double3)SHRT_MIN);
+ const short3 above = convert_short3(x > (double3)SHRT_MAX);
+ const short3 floored = select(convert_short3(x), (short3)SHRT_MIN, below);
+ return select(floored, (short3)SHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double4 -> short4: convert, then force SHRT_MIN where x < SHRT_MIN and SHRT_MAX where x > SHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat(double4 x)
+{
+ const short4 below = convert_short4(x < (double4)SHRT_MIN);
+ const short4 above = convert_short4(x > (double4)SHRT_MAX);
+ const short4 floored = select(convert_short4(x), (short4)SHRT_MIN, below);
+ return select(floored, (short4)SHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double8 -> short8: convert, then force SHRT_MIN where x < SHRT_MIN and SHRT_MAX where x > SHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat(double8 x)
+{
+ const short8 below = convert_short8(x < (double8)SHRT_MIN);
+ const short8 above = convert_short8(x > (double8)SHRT_MAX);
+ const short8 floored = select(convert_short8(x), (short8)SHRT_MIN, below);
+ return select(floored, (short8)SHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double16 -> short16: convert, then force SHRT_MIN where x < SHRT_MIN and SHRT_MAX where x > SHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat(double16 x)
+{
+ const short16 below = convert_short16(x < (double16)SHRT_MIN);
+ const short16 above = convert_short16(x > (double16)SHRT_MAX);
+ const short16 floored = select(convert_short16(x), (short16)SHRT_MIN, below);
+ return select(floored, (short16)SHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double -> ushort: convert, then force 0 where x < 0 and USHRT_MAX where x > USHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat(double x)
+{
+ const ushort below = as_ushort(convert_short(x < (double)0));
+ const ushort above = as_ushort(convert_short(x > (double)USHRT_MAX));
+ const ushort floored = select(convert_ushort(x), (ushort)0, below);
+ return select(floored, (ushort)USHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double2 -> ushort2: convert, then force 0 where x < 0 and USHRT_MAX where x > USHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat(double2 x)
+{
+ const ushort2 below = as_ushort2(convert_short2(x < (double2)0));
+ const ushort2 above = as_ushort2(convert_short2(x > (double2)USHRT_MAX));
+ const ushort2 floored = select(convert_ushort2(x), (ushort2)0, below);
+ return select(floored, (ushort2)USHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double3 -> ushort3: convert, then force 0 where x < 0 and USHRT_MAX where x > USHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat(double3 x)
+{
+ const ushort3 below = as_ushort3(convert_short3(x < (double3)0));
+ const ushort3 above = as_ushort3(convert_short3(x > (double3)USHRT_MAX));
+ const ushort3 floored = select(convert_ushort3(x), (ushort3)0, below);
+ return select(floored, (ushort3)USHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double4 -> ushort4: convert, then force 0 where x < 0 and USHRT_MAX where x > USHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat(double4 x)
+{
+ const ushort4 below = as_ushort4(convert_short4(x < (double4)0));
+ const ushort4 above = as_ushort4(convert_short4(x > (double4)USHRT_MAX));
+ const ushort4 floored = select(convert_ushort4(x), (ushort4)0, below);
+ return select(floored, (ushort4)USHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double8 -> ushort8: convert, then force 0 where x < 0 and USHRT_MAX where x > USHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat(double8 x)
+{
+ const ushort8 below = as_ushort8(convert_short8(x < (double8)0));
+ const ushort8 above = as_ushort8(convert_short8(x > (double8)USHRT_MAX));
+ const ushort8 floored = select(convert_ushort8(x), (ushort8)0, below);
+ return select(floored, (ushort8)USHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double16 -> ushort16: convert, then force 0 where x < 0 and USHRT_MAX where x > USHRT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat(double16 x)
+{
+ const ushort16 below = as_ushort16(convert_short16(x < (double16)0));
+ const ushort16 above = as_ushort16(convert_short16(x > (double16)USHRT_MAX));
+ const ushort16 floored = select(convert_ushort16(x), (ushort16)0, below);
+ return select(floored, (ushort16)USHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double -> int: convert, then force INT_MIN where x < INT_MIN and INT_MAX where x > INT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat(double x)
+{
+ const int below = convert_int(x < (double)INT_MIN);
+ const int above = convert_int(x > (double)INT_MAX);
+ const int floored = select(convert_int(x), (int)INT_MIN, below);
+ return select(floored, (int)INT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double2 -> int2: convert, then force INT_MIN where x < INT_MIN and INT_MAX where x > INT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat(double2 x)
+{
+ const int2 below = convert_int2(x < (double2)INT_MIN);
+ const int2 above = convert_int2(x > (double2)INT_MAX);
+ const int2 floored = select(convert_int2(x), (int2)INT_MIN, below);
+ return select(floored, (int2)INT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double3 -> int3: convert, then force INT_MIN where x < INT_MIN and INT_MAX where x > INT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat(double3 x)
+{
+ const int3 below = convert_int3(x < (double3)INT_MIN);
+ const int3 above = convert_int3(x > (double3)INT_MAX);
+ const int3 floored = select(convert_int3(x), (int3)INT_MIN, below);
+ return select(floored, (int3)INT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double4 -> int4: convert, then force INT_MIN where x < INT_MIN and INT_MAX where x > INT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat(double4 x)
+{
+ const int4 below = convert_int4(x < (double4)INT_MIN);
+ const int4 above = convert_int4(x > (double4)INT_MAX);
+ const int4 floored = select(convert_int4(x), (int4)INT_MIN, below);
+ return select(floored, (int4)INT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double8 -> int8: convert, then force INT_MIN where x < INT_MIN and INT_MAX where x > INT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat(double8 x)
+{
+ const int8 below = convert_int8(x < (double8)INT_MIN);
+ const int8 above = convert_int8(x > (double8)INT_MAX);
+ const int8 floored = select(convert_int8(x), (int8)INT_MIN, below);
+ return select(floored, (int8)INT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double16 -> int16: convert, then force INT_MIN where x < INT_MIN and INT_MAX where x > INT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat(double16 x)
+{
+ const int16 below = convert_int16(x < (double16)INT_MIN);
+ const int16 above = convert_int16(x > (double16)INT_MAX);
+ const int16 floored = select(convert_int16(x), (int16)INT_MIN, below);
+ return select(floored, (int16)INT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double -> uint: convert, then force 0 where x < 0 and UINT_MAX where x > UINT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat(double x)
+{
+ const uint below = as_uint(convert_int(x < (double)0));
+ const uint above = as_uint(convert_int(x > (double)UINT_MAX));
+ const uint floored = select(convert_uint(x), (uint)0, below);
+ return select(floored, (uint)UINT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double2 -> uint2: convert, then force 0 where x < 0 and UINT_MAX where x > UINT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat(double2 x)
+{
+ const uint2 below = as_uint2(convert_int2(x < (double2)0));
+ const uint2 above = as_uint2(convert_int2(x > (double2)UINT_MAX));
+ const uint2 floored = select(convert_uint2(x), (uint2)0, below);
+ return select(floored, (uint2)UINT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double3 -> uint3: convert, then force 0 where x < 0 and UINT_MAX where x > UINT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat(double3 x)
+{
+ const uint3 below = as_uint3(convert_int3(x < (double3)0));
+ const uint3 above = as_uint3(convert_int3(x > (double3)UINT_MAX));
+ const uint3 floored = select(convert_uint3(x), (uint3)0, below);
+ return select(floored, (uint3)UINT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double4 -> uint4: convert, then force 0 where x < 0 and UINT_MAX where x > UINT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat(double4 x)
+{
+ const uint4 below = as_uint4(convert_int4(x < (double4)0));
+ const uint4 above = as_uint4(convert_int4(x > (double4)UINT_MAX));
+ const uint4 floored = select(convert_uint4(x), (uint4)0, below);
+ return select(floored, (uint4)UINT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double8 -> uint8: convert, then force 0 where x < 0 and UINT_MAX where x > UINT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat(double8 x)
+{
+ const uint8 below = as_uint8(convert_int8(x < (double8)0));
+ const uint8 above = as_uint8(convert_int8(x > (double8)UINT_MAX));
+ const uint8 floored = select(convert_uint8(x), (uint8)0, below);
+ return select(floored, (uint8)UINT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double16 -> uint16: convert, then force 0 where x < 0 and UINT_MAX where x > UINT_MAX. */
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat(double16 x)
+{
+ const uint16 below = as_uint16(convert_int16(x < (double16)0));
+ const uint16 above = as_uint16(convert_int16(x > (double16)UINT_MAX));
+ const uint16 floored = select(convert_uint16(x), (uint16)0, below);
+ return select(floored, (uint16)UINT_MAX, above);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double -> long. (double)LONG_MAX rounds up to 2^63, which long cannot hold,
+ * so the upper clamp tests >= (not >) to also catch x == 2^63. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat(double x)
+{
+ long y = convert_long(x);
+ y = select(y, (long)LONG_MIN, convert_long(x < (double)LONG_MIN));
+ y = select(y, (long)LONG_MAX, convert_long(x >= (double)LONG_MAX));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double2 -> long2. (double)LONG_MAX rounds up to 2^63, which long cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^63. */
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat(double2 x)
+{
+ long2 y = convert_long2(x);
+ y = select(y, (long2)LONG_MIN, convert_long2(x < (double2)LONG_MIN));
+ y = select(y, (long2)LONG_MAX, convert_long2(x >= (double2)LONG_MAX));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double3 -> long3. (double)LONG_MAX rounds up to 2^63, which long cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^63. */
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat(double3 x)
+{
+ long3 y = convert_long3(x);
+ y = select(y, (long3)LONG_MIN, convert_long3(x < (double3)LONG_MIN));
+ y = select(y, (long3)LONG_MAX, convert_long3(x >= (double3)LONG_MAX));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double4 -> long4. (double)LONG_MAX rounds up to 2^63, which long cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^63. */
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat(double4 x)
+{
+ long4 y = convert_long4(x);
+ y = select(y, (long4)LONG_MIN, convert_long4(x < (double4)LONG_MIN));
+ y = select(y, (long4)LONG_MAX, convert_long4(x >= (double4)LONG_MAX));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double8 -> long8. (double)LONG_MAX rounds up to 2^63, which long cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^63. */
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat(double8 x)
+{
+ long8 y = convert_long8(x);
+ y = select(y, (long8)LONG_MIN, convert_long8(x < (double8)LONG_MIN));
+ y = select(y, (long8)LONG_MAX, convert_long8(x >= (double8)LONG_MAX));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double16 -> long16. (double)LONG_MAX rounds up to 2^63, which long cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^63. */
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat(double16 x)
+{
+ long16 y = convert_long16(x);
+ y = select(y, (long16)LONG_MIN, convert_long16(x < (double16)LONG_MIN));
+ y = select(y, (long16)LONG_MAX, convert_long16(x >= (double16)LONG_MAX));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double -> ulong. (double)ULONG_MAX rounds up to 2^64, which ulong cannot hold,
+ * so the upper clamp tests >= (not >) to also catch x == 2^64. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat(double x)
+{
+ ulong y = convert_ulong(x);
+ y = select(y, (ulong)0, as_ulong(convert_long(x < (double)0)));
+ y = select(y, (ulong)ULONG_MAX, as_ulong(convert_long(x >= (double)ULONG_MAX)));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double2 -> ulong2. (double)ULONG_MAX rounds up to 2^64, which ulong cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^64. */
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat(double2 x)
+{
+ ulong2 y = convert_ulong2(x);
+ y = select(y, (ulong2)0, as_ulong2(convert_long2(x < (double2)0)));
+ y = select(y, (ulong2)ULONG_MAX, as_ulong2(convert_long2(x >= (double2)ULONG_MAX)));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double3 -> ulong3. (double)ULONG_MAX rounds up to 2^64, which ulong cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^64. */
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat(double3 x)
+{
+ ulong3 y = convert_ulong3(x);
+ y = select(y, (ulong3)0, as_ulong3(convert_long3(x < (double3)0)));
+ y = select(y, (ulong3)ULONG_MAX, as_ulong3(convert_long3(x >= (double3)ULONG_MAX)));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double4 -> ulong4. (double)ULONG_MAX rounds up to 2^64, which ulong cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^64. */
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat(double4 x)
+{
+ ulong4 y = convert_ulong4(x);
+ y = select(y, (ulong4)0, as_ulong4(convert_long4(x < (double4)0)));
+ y = select(y, (ulong4)ULONG_MAX, as_ulong4(convert_long4(x >= (double4)ULONG_MAX)));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double8 -> ulong8. (double)ULONG_MAX rounds up to 2^64, which ulong cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^64. */
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat(double8 x)
+{
+ ulong8 y = convert_ulong8(x);
+ y = select(y, (ulong8)0, as_ulong8(convert_long8(x < (double8)0)));
+ y = select(y, (ulong8)ULONG_MAX, as_ulong8(convert_long8(x >= (double8)ULONG_MAX)));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double16 -> ulong16. (double)ULONG_MAX rounds up to 2^64, which ulong cannot hold,
+ * so the upper clamp tests >= (not >) to also catch lanes equal to 2^64. */
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat(double16 x)
+{
+ ulong16 y = convert_ulong16(x);
+ y = select(y, (ulong16)0, as_ulong16(convert_long16(x < (double16)0)));
+ y = select(y, (ulong16)ULONG_MAX, as_ulong16(convert_long16(x >= (double16)ULONG_MAX)));
+ return y;
+}
+#endif
+/* _rtz flavour for a char source: forwards straight to convert_char_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtz(char x)
+{
+ const char converted = convert_char_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char source: forwards straight to convert_char_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rte(char x)
+{
+ const char converted = convert_char_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char source: forwards straight to convert_char_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtp(char x)
+{
+ const char converted = convert_char_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char source: forwards straight to convert_char_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtn(char x)
+{
+ const char converted = convert_char_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char2 source: forwards straight to convert_char2_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtz(char2 x)
+{
+ const char2 converted = convert_char2_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char2 source: forwards straight to convert_char2_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rte(char2 x)
+{
+ const char2 converted = convert_char2_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char2 source: forwards straight to convert_char2_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtp(char2 x)
+{
+ const char2 converted = convert_char2_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char2 source: forwards straight to convert_char2_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtn(char2 x)
+{
+ const char2 converted = convert_char2_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char3 source: forwards straight to convert_char3_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtz(char3 x)
+{
+ const char3 converted = convert_char3_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char3 source: forwards straight to convert_char3_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rte(char3 x)
+{
+ const char3 converted = convert_char3_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char3 source: forwards straight to convert_char3_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtp(char3 x)
+{
+ const char3 converted = convert_char3_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char3 source: forwards straight to convert_char3_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtn(char3 x)
+{
+ const char3 converted = convert_char3_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char4 source: forwards straight to convert_char4_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtz(char4 x)
+{
+ const char4 converted = convert_char4_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char4 source: forwards straight to convert_char4_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rte(char4 x)
+{
+ const char4 converted = convert_char4_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char4 source: forwards straight to convert_char4_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtp(char4 x)
+{
+ const char4 converted = convert_char4_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char4 source: forwards straight to convert_char4_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtn(char4 x)
+{
+ const char4 converted = convert_char4_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char8 source: forwards straight to convert_char8_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtz(char8 x)
+{
+ const char8 converted = convert_char8_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char8 source: forwards straight to convert_char8_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rte(char8 x)
+{
+ const char8 converted = convert_char8_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char8 source: forwards straight to convert_char8_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtp(char8 x)
+{
+ const char8 converted = convert_char8_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char8 source: forwards straight to convert_char8_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtn(char8 x)
+{
+ const char8 converted = convert_char8_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char16 source: forwards straight to convert_char16_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtz(char16 x)
+{
+ const char16 converted = convert_char16_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char16 source: forwards straight to convert_char16_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rte(char16 x)
+{
+ const char16 converted = convert_char16_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char16 source: forwards straight to convert_char16_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtp(char16 x)
+{
+ const char16 converted = convert_char16_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char16 source: forwards straight to convert_char16_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtn(char16 x)
+{
+ const char16 converted = convert_char16_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char source: forwards straight to convert_uchar_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtz(char x)
+{
+ const uchar converted = convert_uchar_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char source: forwards straight to convert_uchar_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rte(char x)
+{
+ const uchar converted = convert_uchar_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char source: forwards straight to convert_uchar_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtp(char x)
+{
+ const uchar converted = convert_uchar_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char source: forwards straight to convert_uchar_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtn(char x)
+{
+ const uchar converted = convert_uchar_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char2 source: forwards straight to convert_uchar2_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtz(char2 x)
+{
+ const uchar2 converted = convert_uchar2_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char2 source: forwards straight to convert_uchar2_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rte(char2 x)
+{
+ const uchar2 converted = convert_uchar2_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char2 source: forwards straight to convert_uchar2_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtp(char2 x)
+{
+ const uchar2 converted = convert_uchar2_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char2 source: forwards straight to convert_uchar2_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtn(char2 x)
+{
+ const uchar2 converted = convert_uchar2_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char3 source: forwards straight to convert_uchar3_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtz(char3 x)
+{
+ const uchar3 converted = convert_uchar3_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char3 source: forwards straight to convert_uchar3_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rte(char3 x)
+{
+ const uchar3 converted = convert_uchar3_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char3 source: forwards straight to convert_uchar3_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtp(char3 x)
+{
+ const uchar3 converted = convert_uchar3_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char3 source: forwards straight to convert_uchar3_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtn(char3 x)
+{
+ const uchar3 converted = convert_uchar3_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char4 source: forwards straight to convert_uchar4_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtz(char4 x)
+{
+ const uchar4 converted = convert_uchar4_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char4 source: forwards straight to convert_uchar4_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rte(char4 x)
+{
+ const uchar4 converted = convert_uchar4_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char4 source: forwards straight to convert_uchar4_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtp(char4 x)
+{
+ const uchar4 converted = convert_uchar4_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char4 source: forwards straight to convert_uchar4_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtn(char4 x)
+{
+ const uchar4 converted = convert_uchar4_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char8 source: forwards straight to convert_uchar8_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtz(char8 x)
+{
+ const uchar8 converted = convert_uchar8_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char8 source: forwards straight to convert_uchar8_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rte(char8 x)
+{
+ const uchar8 converted = convert_uchar8_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char8 source: forwards straight to convert_uchar8_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtp(char8 x)
+{
+ const uchar8 converted = convert_uchar8_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char8 source: forwards straight to convert_uchar8_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtn(char8 x)
+{
+ const uchar8 converted = convert_uchar8_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char16 source: forwards straight to convert_uchar16_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtz(char16 x)
+{
+ const uchar16 converted = convert_uchar16_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char16 source: forwards straight to convert_uchar16_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rte(char16 x)
+{
+ const uchar16 converted = convert_uchar16_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char16 source: forwards straight to convert_uchar16_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtp(char16 x)
+{
+ const uchar16 converted = convert_uchar16_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char16 source: forwards straight to convert_uchar16_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtn(char16 x)
+{
+ const uchar16 converted = convert_uchar16_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char source: forwards straight to convert_short_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtz(char x)
+{
+ const short converted = convert_short_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char source: forwards straight to convert_short_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rte(char x)
+{
+ const short converted = convert_short_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char source: forwards straight to convert_short_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtp(char x)
+{
+ const short converted = convert_short_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char source: forwards straight to convert_short_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtn(char x)
+{
+ const short converted = convert_short_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char2 source: forwards straight to convert_short2_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtz(char2 x)
+{
+ const short2 converted = convert_short2_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char2 source: forwards straight to convert_short2_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rte(char2 x)
+{
+ const short2 converted = convert_short2_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char2 source: forwards straight to convert_short2_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtp(char2 x)
+{
+ const short2 converted = convert_short2_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char2 source: forwards straight to convert_short2_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtn(char2 x)
+{
+ const short2 converted = convert_short2_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char3 source: forwards straight to convert_short3_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtz(char3 x)
+{
+ const short3 converted = convert_short3_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char3 source: forwards straight to convert_short3_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rte(char3 x)
+{
+ const short3 converted = convert_short3_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char3 source: forwards straight to convert_short3_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtp(char3 x)
+{
+ const short3 converted = convert_short3_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char3 source: forwards straight to convert_short3_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtn(char3 x)
+{
+ const short3 converted = convert_short3_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char4 source: forwards straight to convert_short4_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtz(char4 x)
+{
+ const short4 converted = convert_short4_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char4 source: forwards straight to convert_short4_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rte(char4 x)
+{
+ const short4 converted = convert_short4_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char4 source: forwards straight to convert_short4_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtp(char4 x)
+{
+ const short4 converted = convert_short4_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char4 source: forwards straight to convert_short4_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtn(char4 x)
+{
+ const short4 converted = convert_short4_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char8 source: forwards straight to convert_short8_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtz(char8 x)
+{
+ const short8 converted = convert_short8_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char8 source: forwards straight to convert_short8_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rte(char8 x)
+{
+ const short8 converted = convert_short8_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char8 source: forwards straight to convert_short8_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtp(char8 x)
+{
+ const short8 converted = convert_short8_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char8 source: forwards straight to convert_short8_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtn(char8 x)
+{
+ const short8 converted = convert_short8_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char16 source: forwards straight to convert_short16_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtz(char16 x)
+{
+ const short16 converted = convert_short16_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char16 source: forwards straight to convert_short16_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rte(char16 x)
+{
+ const short16 converted = convert_short16_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char16 source: forwards straight to convert_short16_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtp(char16 x)
+{
+ const short16 converted = convert_short16_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char16 source: forwards straight to convert_short16_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtn(char16 x)
+{
+ const short16 converted = convert_short16_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char source: forwards straight to convert_ushort_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtz(char x)
+{
+ const ushort converted = convert_ushort_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char source: forwards straight to convert_ushort_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rte(char x)
+{
+ const ushort converted = convert_ushort_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char source: forwards straight to convert_ushort_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtp(char x)
+{
+ const ushort converted = convert_ushort_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char source: forwards straight to convert_ushort_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtn(char x)
+{
+ const ushort converted = convert_ushort_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char2 source: forwards straight to convert_ushort2_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtz(char2 x)
+{
+ const ushort2 converted = convert_ushort2_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char2 source: forwards straight to convert_ushort2_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rte(char2 x)
+{
+ const ushort2 converted = convert_ushort2_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char2 source: forwards straight to convert_ushort2_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtp(char2 x)
+{
+ const ushort2 converted = convert_ushort2_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char2 source: forwards straight to convert_ushort2_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtn(char2 x)
+{
+ const ushort2 converted = convert_ushort2_sat(x);
+ return converted;
+}
+
+/* _rtz flavour for a char3 source: forwards straight to convert_ushort3_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtz(char3 x)
+{
+ const ushort3 converted = convert_ushort3_sat(x);
+ return converted;
+}
+
+/* _rte flavour for a char3 source: forwards straight to convert_ushort3_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rte(char3 x)
+{
+ const ushort3 converted = convert_ushort3_sat(x);
+ return converted;
+}
+
+/* _rtp flavour for a char3 source: forwards straight to convert_ushort3_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtp(char3 x)
+{
+ const ushort3 converted = convert_ushort3_sat(x);
+ return converted;
+}
+
+/* _rtn flavour for a char3 source: forwards straight to convert_ushort3_sat(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtn(char3 x)
+{
+ const ushort3 converted = convert_ushort3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtz(char4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rte(char4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtp(char4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtn(char4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtz(char8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rte(char8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtp(char8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtn(char8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtz(char16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rte(char16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtp(char16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtn(char16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* char -> int family: each _rtz/_rte/_rtp/_rtn variant forwards x to the same-width convert_int*_sat overload and returns its result; the four suffixes share one body. */
+int convert_int_sat_rtz(char x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rte(char x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtp(char x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtn(char x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* char2 -> int2 */
+int2 convert_int2_sat_rtz(char2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rte(char2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtp(char2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtn(char2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* char3 -> int3 */
+int3 convert_int3_sat_rtz(char3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rte(char3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtp(char3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtn(char3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* char4 -> int4 */
+int4 convert_int4_sat_rtz(char4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rte(char4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtp(char4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtn(char4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* char8 -> int8 */
+int8 convert_int8_sat_rtz(char8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rte(char8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtp(char8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtn(char8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* char16 -> int16 */
+int16 convert_int16_sat_rtz(char16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rte(char16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtp(char16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtn(char16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* char -> uint family: each _rtz/_rte/_rtp/_rtn variant forwards x to the same-width convert_uint*_sat overload and returns its result; the four suffixes share one body. */
+uint convert_uint_sat_rtz(char x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rte(char x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtp(char x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtn(char x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* char2 -> uint2 */
+uint2 convert_uint2_sat_rtz(char2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rte(char2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtp(char2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtn(char2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* char3 -> uint3 */
+uint3 convert_uint3_sat_rtz(char3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rte(char3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtp(char3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtn(char3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* char4 -> uint4 */
+uint4 convert_uint4_sat_rtz(char4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rte(char4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtp(char4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtn(char4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* char8 -> uint8 */
+uint8 convert_uint8_sat_rtz(char8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rte(char8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtp(char8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtn(char8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* char16 -> uint16 */
+uint16 convert_uint16_sat_rtz(char16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rte(char16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtp(char16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtn(char16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* char -> long family: each _rtz/_rte/_rtp/_rtn variant forwards x to the same-width convert_long*_sat overload and returns its result. Every definition sits in its own #ifdef cles_khr_int64, so none is compiled when that macro is undefined. */
+long convert_long_sat_rtz(char x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rte(char x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtp(char x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtn(char x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* char2 -> long2 */
+long2 convert_long2_sat_rtz(char2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rte(char2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtp(char2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtn(char2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* char3 -> long3 */
+long3 convert_long3_sat_rtz(char3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rte(char3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtp(char3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtn(char3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* char4 -> long4 */
+long4 convert_long4_sat_rtz(char4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rte(char4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtp(char4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtn(char4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* char8 -> long8 */
+long8 convert_long8_sat_rtz(char8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rte(char8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtp(char8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtn(char8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* char16 -> long16 */
+long16 convert_long16_sat_rtz(char16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rte(char16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtp(char16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtn(char16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* char -> ulong family: each _rtz/_rte/_rtp/_rtn variant forwards x to the same-width convert_ulong*_sat overload and returns its result. Every definition sits in its own #ifdef cles_khr_int64, so none is compiled when that macro is undefined. */
+ulong convert_ulong_sat_rtz(char x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rte(char x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtp(char x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtn(char x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* char2 -> ulong2 */
+ulong2 convert_ulong2_sat_rtz(char2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rte(char2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtp(char2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtn(char2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* char3 -> ulong3 */
+ulong3 convert_ulong3_sat_rtz(char3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rte(char3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtp(char3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtn(char3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* char4 -> ulong4 */
+ulong4 convert_ulong4_sat_rtz(char4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rte(char4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtp(char4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtn(char4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* char8 -> ulong8 */
+ulong8 convert_ulong8_sat_rtz(char8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rte(char8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtp(char8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtn(char8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* char16 -> ulong16 */
+ulong16 convert_ulong16_sat_rtz(char16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rte(char16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtp(char16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtn(char16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+_CLC_DEF _CLC_OVERLOAD /* uchar -> char family: each _rtz/_rte/_rtp/_rtn variant forwards x to the same-width convert_char*_sat overload and returns its result; the four suffixes share one body. */
+char convert_char_sat_rtz(uchar x)
+{
+ return convert_char_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rte(uchar x)
+{
+ return convert_char_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtp(uchar x)
+{
+ return convert_char_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtn(uchar x)
+{
+ return convert_char_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar2 -> char2 */
+char2 convert_char2_sat_rtz(uchar2 x)
+{
+ return convert_char2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rte(uchar2 x)
+{
+ return convert_char2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtp(uchar2 x)
+{
+ return convert_char2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtn(uchar2 x)
+{
+ return convert_char2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar3 -> char3 */
+char3 convert_char3_sat_rtz(uchar3 x)
+{
+ return convert_char3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rte(uchar3 x)
+{
+ return convert_char3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtp(uchar3 x)
+{
+ return convert_char3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtn(uchar3 x)
+{
+ return convert_char3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar4 -> char4 */
+char4 convert_char4_sat_rtz(uchar4 x)
+{
+ return convert_char4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rte(uchar4 x)
+{
+ return convert_char4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtp(uchar4 x)
+{
+ return convert_char4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtn(uchar4 x)
+{
+ return convert_char4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar8 -> char8 */
+char8 convert_char8_sat_rtz(uchar8 x)
+{
+ return convert_char8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rte(uchar8 x)
+{
+ return convert_char8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtp(uchar8 x)
+{
+ return convert_char8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtn(uchar8 x)
+{
+ return convert_char8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar16 -> char16 */
+char16 convert_char16_sat_rtz(uchar16 x)
+{
+ return convert_char16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rte(uchar16 x)
+{
+ return convert_char16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtp(uchar16 x)
+{
+ return convert_char16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtn(uchar16 x)
+{
+ return convert_char16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar -> uchar family (same source and destination type): each _rtz/_rte/_rtp/_rtn variant forwards x to the same-width convert_uchar*_sat overload and returns its result; the four suffixes share one body. */
+uchar convert_uchar_sat_rtz(uchar x)
+{
+ return convert_uchar_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rte(uchar x)
+{
+ return convert_uchar_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtp(uchar x)
+{
+ return convert_uchar_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtn(uchar x)
+{
+ return convert_uchar_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar2 -> uchar2 */
+uchar2 convert_uchar2_sat_rtz(uchar2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rte(uchar2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtp(uchar2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtn(uchar2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar3 -> uchar3 */
+uchar3 convert_uchar3_sat_rtz(uchar3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rte(uchar3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtp(uchar3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtn(uchar3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar4 -> uchar4 */
+uchar4 convert_uchar4_sat_rtz(uchar4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rte(uchar4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtp(uchar4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtn(uchar4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar8 -> uchar8 */
+uchar8 convert_uchar8_sat_rtz(uchar8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rte(uchar8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtp(uchar8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtn(uchar8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar16 -> uchar16 */
+uchar16 convert_uchar16_sat_rtz(uchar16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rte(uchar16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtp(uchar16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtn(uchar16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar -> short family: each _rtz/_rte/_rtp/_rtn variant forwards x to the same-width convert_short*_sat overload and returns its result; the four suffixes share one body. */
+short convert_short_sat_rtz(uchar x)
+{
+ return convert_short_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rte(uchar x)
+{
+ return convert_short_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtp(uchar x)
+{
+ return convert_short_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtn(uchar x)
+{
+ return convert_short_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar2 -> short2 */
+short2 convert_short2_sat_rtz(uchar2 x)
+{
+ return convert_short2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rte(uchar2 x)
+{
+ return convert_short2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtp(uchar2 x)
+{
+ return convert_short2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtn(uchar2 x)
+{
+ return convert_short2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar3 -> short3 */
+short3 convert_short3_sat_rtz(uchar3 x)
+{
+ return convert_short3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rte(uchar3 x)
+{
+ return convert_short3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtp(uchar3 x)
+{
+ return convert_short3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtn(uchar3 x)
+{
+ return convert_short3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar4 -> short4 */
+short4 convert_short4_sat_rtz(uchar4 x)
+{
+ return convert_short4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rte(uchar4 x)
+{
+ return convert_short4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtp(uchar4 x)
+{
+ return convert_short4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtn(uchar4 x)
+{
+ return convert_short4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar8 -> short8 */
+short8 convert_short8_sat_rtz(uchar8 x)
+{
+ return convert_short8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rte(uchar8 x)
+{
+ return convert_short8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtp(uchar8 x)
+{
+ return convert_short8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtn(uchar8 x)
+{
+ return convert_short8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar16 -> short16 */
+short16 convert_short16_sat_rtz(uchar16 x)
+{
+ return convert_short16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rte(uchar16 x)
+{
+ return convert_short16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtp(uchar16 x)
+{
+ return convert_short16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtn(uchar16 x)
+{
+ return convert_short16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar -> ushort family: each _rtz/_rte/_rtp/_rtn variant forwards x to the same-width convert_ushort*_sat overload and returns its result; the four suffixes share one body. */
+ushort convert_ushort_sat_rtz(uchar x)
+{
+ return convert_ushort_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rte(uchar x)
+{
+ return convert_ushort_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtp(uchar x)
+{
+ return convert_ushort_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtn(uchar x)
+{
+ return convert_ushort_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar2 -> ushort2 */
+ushort2 convert_ushort2_sat_rtz(uchar2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rte(uchar2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtp(uchar2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtn(uchar2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar3 -> ushort3 */
+ushort3 convert_ushort3_sat_rtz(uchar3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rte(uchar3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtp(uchar3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtn(uchar3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar4 -> ushort4 */
+ushort4 convert_ushort4_sat_rtz(uchar4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rte(uchar4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtp(uchar4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtn(uchar4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar8 -> ushort8 */
+ushort8 convert_ushort8_sat_rtz(uchar8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rte(uchar8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtp(uchar8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtn(uchar8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar16 -> ushort16 */
+ushort16 convert_ushort16_sat_rtz(uchar16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rte(uchar16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtp(uchar16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtn(uchar16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar -> int family: each _rtz/_rte/_rtp/_rtn variant forwards x to the same-width convert_int*_sat overload and returns its result; the four suffixes share one body. */
+int convert_int_sat_rtz(uchar x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rte(uchar x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtp(uchar x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtn(uchar x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar2 -> int2 */
+int2 convert_int2_sat_rtz(uchar2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rte(uchar2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtp(uchar2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtn(uchar2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar3 -> int3 */
+int3 convert_int3_sat_rtz(uchar3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rte(uchar3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtp(uchar3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtn(uchar3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar4 -> int4 */
+int4 convert_int4_sat_rtz(uchar4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rte(uchar4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtp(uchar4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtn(uchar4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar8 -> int8 */
+int8 convert_int8_sat_rtz(uchar8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rte(uchar8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtp(uchar8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtn(uchar8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar16 -> int16 */
+int16 convert_int16_sat_rtz(uchar16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rte(uchar16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtp(uchar16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtn(uchar16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar -> uint family: each _rtz/_rte/_rtp/_rtn variant forwards x to the same-width convert_uint*_sat overload and returns its result; the four suffixes share one body. */
+uint convert_uint_sat_rtz(uchar x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rte(uchar x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtp(uchar x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtn(uchar x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar2 -> uint2 */
+uint2 convert_uint2_sat_rtz(uchar2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rte(uchar2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtp(uchar2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtn(uchar2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar3 -> uint3 */
+uint3 convert_uint3_sat_rtz(uchar3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rte(uchar3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtp(uchar3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtn(uchar3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar4 -> uint4 */
+uint4 convert_uint4_sat_rtz(uchar4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rte(uchar4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtp(uchar4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtn(uchar4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar8 -> uint8 */
+uint8 convert_uint8_sat_rtz(uchar8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rte(uchar8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtp(uchar8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtn(uchar8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD /* uchar16 -> uint16 */
+uint16 convert_uint16_sat_rtz(uchar16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rte(uchar16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtp(uchar16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtn(uchar16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* uchar -> long family: each _rtz/_rte/_rtp/_rtn variant forwards x to the same-width convert_long*_sat overload and returns its result. Every definition sits in its own #ifdef cles_khr_int64, so none is compiled when that macro is undefined. */
+long convert_long_sat_rtz(uchar x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rte(uchar x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtp(uchar x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtn(uchar x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* uchar2 -> long2 */
+long2 convert_long2_sat_rtz(uchar2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rte(uchar2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtp(uchar2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtn(uchar2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* uchar3 -> long3 */
+long3 convert_long3_sat_rtz(uchar3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rte(uchar3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtp(uchar3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtn(uchar3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* uchar4 -> long4 */
+long4 convert_long4_sat_rtz(uchar4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rte(uchar4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtp(uchar4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtn(uchar4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* uchar8 -> long8 */
+long8 convert_long8_sat_rtz(uchar8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rte(uchar8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtp(uchar8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtn(uchar8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* uchar16 -> long16 */
+long16 convert_long16_sat_rtz(uchar16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rte(uchar16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtp(uchar16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtn(uchar16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* uchar -> ulong family: each _rtz/_rte/_rtp/_rtn variant forwards x to the same-width convert_ulong*_sat overload and returns its result. Every definition sits in its own #ifdef cles_khr_int64, so none is compiled when that macro is undefined. */
+ulong convert_ulong_sat_rtz(uchar x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rte(uchar x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtp(uchar x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtn(uchar x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* uchar2 -> ulong2 */
+ulong2 convert_ulong2_sat_rtz(uchar2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rte(uchar2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtp(uchar2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtn(uchar2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* uchar3 -> ulong3 */
+ulong3 convert_ulong3_sat_rtz(uchar3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rte(uchar3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtp(uchar3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtn(uchar3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* uchar4 -> ulong4 */
+ulong4 convert_ulong4_sat_rtz(uchar4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rte(uchar4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtp(uchar4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtn(uchar4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* uchar8 -> ulong8 */
+ulong8 convert_ulong8_sat_rtz(uchar8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rte(uchar8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtp(uchar8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtn(uchar8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD /* uchar16 -> ulong16 */
+ulong16 convert_ulong16_sat_rtz(uchar16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rte(uchar16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtp(uchar16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtn(uchar16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+/*
+ * short -> char saturating conversions with a rounding-mode suffix, for the
+ * scalar type and vector widths 2, 3, 4, 8 and 16.
+ *
+ * CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) emits the four overloads
+ * convert_DST_sat_{rtz,rte,rtp,rtn}(SRC x); each one returns
+ * convert_DST_sat(x), so the suffix does not alter the call that is made.
+ */
+#define CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CLC_SAT_ROUNDING_WRAPPERS(char, short)
+CLC_SAT_ROUNDING_WRAPPERS(char2, short2)
+CLC_SAT_ROUNDING_WRAPPERS(char3, short3)
+CLC_SAT_ROUNDING_WRAPPERS(char4, short4)
+CLC_SAT_ROUNDING_WRAPPERS(char8, short8)
+CLC_SAT_ROUNDING_WRAPPERS(char16, short16)
+
+#undef CLC_SAT_ROUNDING_WRAPPERS
+
+/*
+ * short -> uchar saturating conversions with a rounding-mode suffix, for the
+ * scalar type and vector widths 2, 3, 4, 8 and 16.
+ *
+ * CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) emits the four overloads
+ * convert_DST_sat_{rtz,rte,rtp,rtn}(SRC x); each one returns
+ * convert_DST_sat(x), so the suffix does not alter the call that is made.
+ */
+#define CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CLC_SAT_ROUNDING_WRAPPERS(uchar, short)
+CLC_SAT_ROUNDING_WRAPPERS(uchar2, short2)
+CLC_SAT_ROUNDING_WRAPPERS(uchar3, short3)
+CLC_SAT_ROUNDING_WRAPPERS(uchar4, short4)
+CLC_SAT_ROUNDING_WRAPPERS(uchar8, short8)
+CLC_SAT_ROUNDING_WRAPPERS(uchar16, short16)
+
+#undef CLC_SAT_ROUNDING_WRAPPERS
+
+/*
+ * short -> short saturating conversions with a rounding-mode suffix, for the
+ * scalar type and vector widths 2, 3, 4, 8 and 16.
+ *
+ * CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) emits the four overloads
+ * convert_DST_sat_{rtz,rte,rtp,rtn}(SRC x); each one returns
+ * convert_DST_sat(x), so the suffix does not alter the call that is made.
+ */
+#define CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CLC_SAT_ROUNDING_WRAPPERS(short, short)
+CLC_SAT_ROUNDING_WRAPPERS(short2, short2)
+CLC_SAT_ROUNDING_WRAPPERS(short3, short3)
+CLC_SAT_ROUNDING_WRAPPERS(short4, short4)
+CLC_SAT_ROUNDING_WRAPPERS(short8, short8)
+CLC_SAT_ROUNDING_WRAPPERS(short16, short16)
+
+#undef CLC_SAT_ROUNDING_WRAPPERS
+
+/*
+ * short -> ushort saturating conversions with a rounding-mode suffix, for the
+ * scalar type and vector widths 2, 3, 4, 8 and 16.
+ *
+ * CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) emits the four overloads
+ * convert_DST_sat_{rtz,rte,rtp,rtn}(SRC x); each one returns
+ * convert_DST_sat(x), so the suffix does not alter the call that is made.
+ */
+#define CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CLC_SAT_ROUNDING_WRAPPERS(ushort, short)
+CLC_SAT_ROUNDING_WRAPPERS(ushort2, short2)
+CLC_SAT_ROUNDING_WRAPPERS(ushort3, short3)
+CLC_SAT_ROUNDING_WRAPPERS(ushort4, short4)
+CLC_SAT_ROUNDING_WRAPPERS(ushort8, short8)
+CLC_SAT_ROUNDING_WRAPPERS(ushort16, short16)
+
+#undef CLC_SAT_ROUNDING_WRAPPERS
+
+/*
+ * short -> int saturating conversions with a rounding-mode suffix, for the
+ * scalar type and vector widths 2, 3, 4, 8 and 16.
+ *
+ * CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) emits the four overloads
+ * convert_DST_sat_{rtz,rte,rtp,rtn}(SRC x); each one returns
+ * convert_DST_sat(x), so the suffix does not alter the call that is made.
+ */
+#define CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CLC_SAT_ROUNDING_WRAPPERS(int, short)
+CLC_SAT_ROUNDING_WRAPPERS(int2, short2)
+CLC_SAT_ROUNDING_WRAPPERS(int3, short3)
+CLC_SAT_ROUNDING_WRAPPERS(int4, short4)
+CLC_SAT_ROUNDING_WRAPPERS(int8, short8)
+CLC_SAT_ROUNDING_WRAPPERS(int16, short16)
+
+#undef CLC_SAT_ROUNDING_WRAPPERS
+
+/*
+ * short -> uint saturating conversions with a rounding-mode suffix, for the
+ * scalar type and vector widths 2, 3, 4, 8 and 16.
+ *
+ * CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) emits the four overloads
+ * convert_DST_sat_{rtz,rte,rtp,rtn}(SRC x); each one returns
+ * convert_DST_sat(x), so the suffix does not alter the call that is made.
+ */
+#define CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CLC_SAT_ROUNDING_WRAPPERS(uint, short)
+CLC_SAT_ROUNDING_WRAPPERS(uint2, short2)
+CLC_SAT_ROUNDING_WRAPPERS(uint3, short3)
+CLC_SAT_ROUNDING_WRAPPERS(uint4, short4)
+CLC_SAT_ROUNDING_WRAPPERS(uint8, short8)
+CLC_SAT_ROUNDING_WRAPPERS(uint16, short16)
+
+#undef CLC_SAT_ROUNDING_WRAPPERS
+
+#ifdef cles_khr_int64
+/*
+ * short -> long saturating conversions with a rounding-mode suffix, for the
+ * scalar type and vector widths 2, 3, 4, 8 and 16.
+ *
+ * CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) emits the four overloads
+ * convert_DST_sat_{rtz,rte,rtp,rtn}(SRC x); each one returns
+ * convert_DST_sat(x), so the suffix does not alter the call that is made.
+ * Everything here is compiled only when cles_khr_int64 is defined.
+ */
+#define CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CLC_SAT_ROUNDING_WRAPPERS(long, short)
+CLC_SAT_ROUNDING_WRAPPERS(long2, short2)
+CLC_SAT_ROUNDING_WRAPPERS(long3, short3)
+CLC_SAT_ROUNDING_WRAPPERS(long4, short4)
+CLC_SAT_ROUNDING_WRAPPERS(long8, short8)
+CLC_SAT_ROUNDING_WRAPPERS(long16, short16)
+
+#undef CLC_SAT_ROUNDING_WRAPPERS
+
+#endif
+#ifdef cles_khr_int64
+/*
+ * short -> ulong saturating conversions with a rounding-mode suffix, for the
+ * scalar type and vector widths 2, 3, 4, 8 and 16.
+ *
+ * CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) emits the four overloads
+ * convert_DST_sat_{rtz,rte,rtp,rtn}(SRC x); each one returns
+ * convert_DST_sat(x), so the suffix does not alter the call that is made.
+ * Everything here is compiled only when cles_khr_int64 is defined.
+ */
+#define CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CLC_SAT_ROUNDING_WRAPPERS(ulong, short)
+CLC_SAT_ROUNDING_WRAPPERS(ulong2, short2)
+CLC_SAT_ROUNDING_WRAPPERS(ulong3, short3)
+CLC_SAT_ROUNDING_WRAPPERS(ulong4, short4)
+CLC_SAT_ROUNDING_WRAPPERS(ulong8, short8)
+CLC_SAT_ROUNDING_WRAPPERS(ulong16, short16)
+
+#undef CLC_SAT_ROUNDING_WRAPPERS
+
+#endif
+/*
+ * ushort -> char saturating conversions with a rounding-mode suffix, for the
+ * scalar type and vector widths 2, 3, 4, 8 and 16.
+ *
+ * CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) emits the four overloads
+ * convert_DST_sat_{rtz,rte,rtp,rtn}(SRC x); each one returns
+ * convert_DST_sat(x), so the suffix does not alter the call that is made.
+ */
+#define CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CLC_SAT_ROUNDING_WRAPPERS(char, ushort)
+CLC_SAT_ROUNDING_WRAPPERS(char2, ushort2)
+CLC_SAT_ROUNDING_WRAPPERS(char3, ushort3)
+CLC_SAT_ROUNDING_WRAPPERS(char4, ushort4)
+CLC_SAT_ROUNDING_WRAPPERS(char8, ushort8)
+CLC_SAT_ROUNDING_WRAPPERS(char16, ushort16)
+
+#undef CLC_SAT_ROUNDING_WRAPPERS
+
+/*
+ * ushort -> uchar saturating conversions with a rounding-mode suffix, for the
+ * scalar type and vector widths 2, 3, 4, 8 and 16.
+ *
+ * CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) emits the four overloads
+ * convert_DST_sat_{rtz,rte,rtp,rtn}(SRC x); each one returns
+ * convert_DST_sat(x), so the suffix does not alter the call that is made.
+ */
+#define CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CLC_SAT_ROUNDING_WRAPPERS(uchar, ushort)
+CLC_SAT_ROUNDING_WRAPPERS(uchar2, ushort2)
+CLC_SAT_ROUNDING_WRAPPERS(uchar3, ushort3)
+CLC_SAT_ROUNDING_WRAPPERS(uchar4, ushort4)
+CLC_SAT_ROUNDING_WRAPPERS(uchar8, ushort8)
+CLC_SAT_ROUNDING_WRAPPERS(uchar16, ushort16)
+
+#undef CLC_SAT_ROUNDING_WRAPPERS
+
+/*
+ * ushort -> short saturating conversions with a rounding-mode suffix, for the
+ * scalar type and vector widths 2, 3, 4, 8 and 16.
+ *
+ * CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) emits the four overloads
+ * convert_DST_sat_{rtz,rte,rtp,rtn}(SRC x); each one returns
+ * convert_DST_sat(x), so the suffix does not alter the call that is made.
+ */
+#define CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CLC_SAT_ROUNDING_WRAPPERS(short, ushort)
+CLC_SAT_ROUNDING_WRAPPERS(short2, ushort2)
+CLC_SAT_ROUNDING_WRAPPERS(short3, ushort3)
+CLC_SAT_ROUNDING_WRAPPERS(short4, ushort4)
+CLC_SAT_ROUNDING_WRAPPERS(short8, ushort8)
+CLC_SAT_ROUNDING_WRAPPERS(short16, ushort16)
+
+#undef CLC_SAT_ROUNDING_WRAPPERS
+
+/*
+ * ushort -> ushort saturating conversions with a rounding-mode suffix, for
+ * the scalar type and vector widths 2, 3, 4, 8 and 16.
+ *
+ * CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) emits the four overloads
+ * convert_DST_sat_{rtz,rte,rtp,rtn}(SRC x); each one returns
+ * convert_DST_sat(x), so the suffix does not alter the call that is made.
+ */
+#define CLC_SAT_ROUNDING_WRAPPERS(DST, SRC) \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+ _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CLC_SAT_ROUNDING_WRAPPERS(ushort, ushort)
+CLC_SAT_ROUNDING_WRAPPERS(ushort2, ushort2)
+CLC_SAT_ROUNDING_WRAPPERS(ushort3, ushort3)
+CLC_SAT_ROUNDING_WRAPPERS(ushort4, ushort4)
+CLC_SAT_ROUNDING_WRAPPERS(ushort8, ushort8)
+CLC_SAT_ROUNDING_WRAPPERS(ushort16, ushort16)
+
+#undef CLC_SAT_ROUNDING_WRAPPERS
+
+/*
+ * Saturating ushort -> int conversions carrying an explicit rounding
+ * suffix (_rtz, _rte, _rtp, _rtn), for the scalar type and the
+ * 2/3/4/8/16-wide vectors.  Every variant forwards its argument unchanged
+ * to the matching convert_<type>_sat overload, so the suffix has no effect
+ * on the result.
+ */
+#define CONVERT_SAT_ROUNDING_ALIASES(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CONVERT_SAT_ROUNDING_ALIASES(int,   ushort)
+CONVERT_SAT_ROUNDING_ALIASES(int2,  ushort2)
+CONVERT_SAT_ROUNDING_ALIASES(int3,  ushort3)
+CONVERT_SAT_ROUNDING_ALIASES(int4,  ushort4)
+CONVERT_SAT_ROUNDING_ALIASES(int8,  ushort8)
+CONVERT_SAT_ROUNDING_ALIASES(int16, ushort16)
+
+#undef CONVERT_SAT_ROUNDING_ALIASES
+
+/*
+ * Saturating ushort -> uint conversions carrying an explicit rounding
+ * suffix (_rtz, _rte, _rtp, _rtn), for the scalar type and the
+ * 2/3/4/8/16-wide vectors.  Every variant forwards its argument unchanged
+ * to the matching convert_<type>_sat overload, so the suffix has no effect
+ * on the result.
+ */
+#define CONVERT_SAT_ROUNDING_ALIASES(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CONVERT_SAT_ROUNDING_ALIASES(uint,   ushort)
+CONVERT_SAT_ROUNDING_ALIASES(uint2,  ushort2)
+CONVERT_SAT_ROUNDING_ALIASES(uint3,  ushort3)
+CONVERT_SAT_ROUNDING_ALIASES(uint4,  ushort4)
+CONVERT_SAT_ROUNDING_ALIASES(uint8,  ushort8)
+CONVERT_SAT_ROUNDING_ALIASES(uint16, ushort16)
+
+#undef CONVERT_SAT_ROUNDING_ALIASES
+
+#ifdef cles_khr_int64
+/*
+ * Saturating ushort -> long conversions carrying an explicit rounding
+ * suffix (_rtz, _rte, _rtp, _rtn), for the scalar type and the
+ * 2/3/4/8/16-wide vectors.  Only compiled when cles_khr_int64 is defined.
+ * Every variant forwards its argument unchanged to the matching
+ * convert_<type>_sat overload, so the suffix has no effect on the result.
+ */
+#define CONVERT_SAT_ROUNDING_ALIASES(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CONVERT_SAT_ROUNDING_ALIASES(long,   ushort)
+CONVERT_SAT_ROUNDING_ALIASES(long2,  ushort2)
+CONVERT_SAT_ROUNDING_ALIASES(long3,  ushort3)
+CONVERT_SAT_ROUNDING_ALIASES(long4,  ushort4)
+CONVERT_SAT_ROUNDING_ALIASES(long8,  ushort8)
+CONVERT_SAT_ROUNDING_ALIASES(long16, ushort16)
+
+#undef CONVERT_SAT_ROUNDING_ALIASES
+
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating ushort -> ulong conversions carrying an explicit rounding
+ * suffix (_rtz, _rte, _rtp, _rtn), for the scalar type and the
+ * 2/3/4/8/16-wide vectors.  Only compiled when cles_khr_int64 is defined.
+ * Every variant forwards its argument unchanged to the matching
+ * convert_<type>_sat overload, so the suffix has no effect on the result.
+ */
+#define CONVERT_SAT_ROUNDING_ALIASES(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CONVERT_SAT_ROUNDING_ALIASES(ulong,   ushort)
+CONVERT_SAT_ROUNDING_ALIASES(ulong2,  ushort2)
+CONVERT_SAT_ROUNDING_ALIASES(ulong3,  ushort3)
+CONVERT_SAT_ROUNDING_ALIASES(ulong4,  ushort4)
+CONVERT_SAT_ROUNDING_ALIASES(ulong8,  ushort8)
+CONVERT_SAT_ROUNDING_ALIASES(ulong16, ushort16)
+
+#undef CONVERT_SAT_ROUNDING_ALIASES
+
+#endif
+/*
+ * Saturating int -> char conversions carrying an explicit rounding
+ * suffix (_rtz, _rte, _rtp, _rtn), for the scalar type and the
+ * 2/3/4/8/16-wide vectors.  Every variant forwards its argument unchanged
+ * to the matching convert_<type>_sat overload, so the suffix has no effect
+ * on the result.
+ */
+#define CONVERT_SAT_ROUNDING_ALIASES(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CONVERT_SAT_ROUNDING_ALIASES(char,   int)
+CONVERT_SAT_ROUNDING_ALIASES(char2,  int2)
+CONVERT_SAT_ROUNDING_ALIASES(char3,  int3)
+CONVERT_SAT_ROUNDING_ALIASES(char4,  int4)
+CONVERT_SAT_ROUNDING_ALIASES(char8,  int8)
+CONVERT_SAT_ROUNDING_ALIASES(char16, int16)
+
+#undef CONVERT_SAT_ROUNDING_ALIASES
+
+/*
+ * Saturating int -> uchar conversions carrying an explicit rounding
+ * suffix (_rtz, _rte, _rtp, _rtn), for the scalar type and the
+ * 2/3/4/8/16-wide vectors.  Every variant forwards its argument unchanged
+ * to the matching convert_<type>_sat overload, so the suffix has no effect
+ * on the result.
+ */
+#define CONVERT_SAT_ROUNDING_ALIASES(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CONVERT_SAT_ROUNDING_ALIASES(uchar,   int)
+CONVERT_SAT_ROUNDING_ALIASES(uchar2,  int2)
+CONVERT_SAT_ROUNDING_ALIASES(uchar3,  int3)
+CONVERT_SAT_ROUNDING_ALIASES(uchar4,  int4)
+CONVERT_SAT_ROUNDING_ALIASES(uchar8,  int8)
+CONVERT_SAT_ROUNDING_ALIASES(uchar16, int16)
+
+#undef CONVERT_SAT_ROUNDING_ALIASES
+
+/*
+ * Saturating int -> short conversions carrying an explicit rounding
+ * suffix (_rtz, _rte, _rtp, _rtn), for the scalar type and the
+ * 2/3/4/8/16-wide vectors.  Every variant forwards its argument unchanged
+ * to the matching convert_<type>_sat overload, so the suffix has no effect
+ * on the result.
+ */
+#define CONVERT_SAT_ROUNDING_ALIASES(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CONVERT_SAT_ROUNDING_ALIASES(short,   int)
+CONVERT_SAT_ROUNDING_ALIASES(short2,  int2)
+CONVERT_SAT_ROUNDING_ALIASES(short3,  int3)
+CONVERT_SAT_ROUNDING_ALIASES(short4,  int4)
+CONVERT_SAT_ROUNDING_ALIASES(short8,  int8)
+CONVERT_SAT_ROUNDING_ALIASES(short16, int16)
+
+#undef CONVERT_SAT_ROUNDING_ALIASES
+
+/*
+ * Saturating int -> ushort conversions carrying an explicit rounding
+ * suffix (_rtz, _rte, _rtp, _rtn), for the scalar type and the
+ * 2/3/4/8/16-wide vectors.  Every variant forwards its argument unchanged
+ * to the matching convert_<type>_sat overload, so the suffix has no effect
+ * on the result.
+ */
+#define CONVERT_SAT_ROUNDING_ALIASES(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CONVERT_SAT_ROUNDING_ALIASES(ushort,   int)
+CONVERT_SAT_ROUNDING_ALIASES(ushort2,  int2)
+CONVERT_SAT_ROUNDING_ALIASES(ushort3,  int3)
+CONVERT_SAT_ROUNDING_ALIASES(ushort4,  int4)
+CONVERT_SAT_ROUNDING_ALIASES(ushort8,  int8)
+CONVERT_SAT_ROUNDING_ALIASES(ushort16, int16)
+
+#undef CONVERT_SAT_ROUNDING_ALIASES
+
+/*
+ * Saturating int -> int conversions carrying an explicit rounding
+ * suffix (_rtz, _rte, _rtp, _rtn), for the scalar type and the
+ * 2/3/4/8/16-wide vectors.  Every variant forwards its argument unchanged
+ * to the matching convert_<type>_sat overload, so the suffix has no effect
+ * on the result.
+ */
+#define CONVERT_SAT_ROUNDING_ALIASES(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CONVERT_SAT_ROUNDING_ALIASES(int,   int)
+CONVERT_SAT_ROUNDING_ALIASES(int2,  int2)
+CONVERT_SAT_ROUNDING_ALIASES(int3,  int3)
+CONVERT_SAT_ROUNDING_ALIASES(int4,  int4)
+CONVERT_SAT_ROUNDING_ALIASES(int8,  int8)
+CONVERT_SAT_ROUNDING_ALIASES(int16, int16)
+
+#undef CONVERT_SAT_ROUNDING_ALIASES
+
+/*
+ * Saturating int -> uint conversions carrying an explicit rounding
+ * suffix (_rtz, _rte, _rtp, _rtn), for the scalar type and the
+ * 2/3/4/8/16-wide vectors.  Every variant forwards its argument unchanged
+ * to the matching convert_<type>_sat overload, so the suffix has no effect
+ * on the result.
+ */
+#define CONVERT_SAT_ROUNDING_ALIASES(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CONVERT_SAT_ROUNDING_ALIASES(uint,   int)
+CONVERT_SAT_ROUNDING_ALIASES(uint2,  int2)
+CONVERT_SAT_ROUNDING_ALIASES(uint3,  int3)
+CONVERT_SAT_ROUNDING_ALIASES(uint4,  int4)
+CONVERT_SAT_ROUNDING_ALIASES(uint8,  int8)
+CONVERT_SAT_ROUNDING_ALIASES(uint16, int16)
+
+#undef CONVERT_SAT_ROUNDING_ALIASES
+
+#ifdef cles_khr_int64
+/*
+ * Saturating int -> long conversions carrying an explicit rounding
+ * suffix (_rtz, _rte, _rtp, _rtn), for the scalar type and the
+ * 2/3/4/8/16-wide vectors.  Only compiled when cles_khr_int64 is defined.
+ * Every variant forwards its argument unchanged to the matching
+ * convert_<type>_sat overload, so the suffix has no effect on the result.
+ */
+#define CONVERT_SAT_ROUNDING_ALIASES(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD \
+    DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+CONVERT_SAT_ROUNDING_ALIASES(long,   int)
+CONVERT_SAT_ROUNDING_ALIASES(long2,  int2)
+CONVERT_SAT_ROUNDING_ALIASES(long3,  int3)
+CONVERT_SAT_ROUNDING_ALIASES(long4,  int4)
+CONVERT_SAT_ROUNDING_ALIASES(long8,  int8)
+CONVERT_SAT_ROUNDING_ALIASES(long16, int16)
+
+#undef CONVERT_SAT_ROUNDING_ALIASES
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtz(int x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rte(int x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtp(int x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtn(int x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtz(int2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rte(int2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtp(int2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtn(int2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+/* int3 -> ulong3, saturating: all rounding-mode variants forward to convert_ulong3_sat. */
+_CLC_DEF _CLC_OVERLOAD ulong3
+convert_ulong3_sat_rtz(int3 x) { return convert_ulong3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong3
+convert_ulong3_sat_rte(int3 x) { return convert_ulong3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong3
+convert_ulong3_sat_rtp(int3 x) { return convert_ulong3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong3
+convert_ulong3_sat_rtn(int3 x) { return convert_ulong3_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* int4 -> ulong4, saturating: all rounding-mode variants forward to convert_ulong4_sat. */
+_CLC_DEF _CLC_OVERLOAD ulong4
+convert_ulong4_sat_rtz(int4 x) { return convert_ulong4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong4
+convert_ulong4_sat_rte(int4 x) { return convert_ulong4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong4
+convert_ulong4_sat_rtp(int4 x) { return convert_ulong4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong4
+convert_ulong4_sat_rtn(int4 x) { return convert_ulong4_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* int8 -> ulong8, saturating: all rounding-mode variants forward to convert_ulong8_sat. */
+_CLC_DEF _CLC_OVERLOAD ulong8
+convert_ulong8_sat_rtz(int8 x) { return convert_ulong8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong8
+convert_ulong8_sat_rte(int8 x) { return convert_ulong8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong8
+convert_ulong8_sat_rtp(int8 x) { return convert_ulong8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong8
+convert_ulong8_sat_rtn(int8 x) { return convert_ulong8_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* int16 -> ulong16, saturating: all rounding-mode variants forward to convert_ulong16_sat. */
+_CLC_DEF _CLC_OVERLOAD ulong16
+convert_ulong16_sat_rtz(int16 x) { return convert_ulong16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong16
+convert_ulong16_sat_rte(int16 x) { return convert_ulong16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong16
+convert_ulong16_sat_rtp(int16 x) { return convert_ulong16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong16
+convert_ulong16_sat_rtn(int16 x) { return convert_ulong16_sat(x); }
+
+#endif /* cles_khr_int64 */
+/* uint -> char, saturating: all rounding-mode variants forward to convert_char_sat. */
+_CLC_DEF _CLC_OVERLOAD char
+convert_char_sat_rtz(uint x) { return convert_char_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char
+convert_char_sat_rte(uint x) { return convert_char_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char
+convert_char_sat_rtp(uint x) { return convert_char_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char
+convert_char_sat_rtn(uint x) { return convert_char_sat(x); }
+
+/* uint2 -> char2, saturating: all rounding-mode variants forward to convert_char2_sat. */
+_CLC_DEF _CLC_OVERLOAD char2
+convert_char2_sat_rtz(uint2 x) { return convert_char2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char2
+convert_char2_sat_rte(uint2 x) { return convert_char2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char2
+convert_char2_sat_rtp(uint2 x) { return convert_char2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char2
+convert_char2_sat_rtn(uint2 x) { return convert_char2_sat(x); }
+
+/* uint3 -> char3, saturating: all rounding-mode variants forward to convert_char3_sat. */
+_CLC_DEF _CLC_OVERLOAD char3
+convert_char3_sat_rtz(uint3 x) { return convert_char3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char3
+convert_char3_sat_rte(uint3 x) { return convert_char3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char3
+convert_char3_sat_rtp(uint3 x) { return convert_char3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char3
+convert_char3_sat_rtn(uint3 x) { return convert_char3_sat(x); }
+
+/* uint4 -> char4, saturating: all rounding-mode variants forward to convert_char4_sat. */
+_CLC_DEF _CLC_OVERLOAD char4
+convert_char4_sat_rtz(uint4 x) { return convert_char4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char4
+convert_char4_sat_rte(uint4 x) { return convert_char4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char4
+convert_char4_sat_rtp(uint4 x) { return convert_char4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char4
+convert_char4_sat_rtn(uint4 x) { return convert_char4_sat(x); }
+
+/* uint8 -> char8, saturating: all rounding-mode variants forward to convert_char8_sat. */
+_CLC_DEF _CLC_OVERLOAD char8
+convert_char8_sat_rtz(uint8 x) { return convert_char8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char8
+convert_char8_sat_rte(uint8 x) { return convert_char8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char8
+convert_char8_sat_rtp(uint8 x) { return convert_char8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char8
+convert_char8_sat_rtn(uint8 x) { return convert_char8_sat(x); }
+
+/* uint16 -> char16, saturating: all rounding-mode variants forward to convert_char16_sat. */
+_CLC_DEF _CLC_OVERLOAD char16
+convert_char16_sat_rtz(uint16 x) { return convert_char16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char16
+convert_char16_sat_rte(uint16 x) { return convert_char16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char16
+convert_char16_sat_rtp(uint16 x) { return convert_char16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char16
+convert_char16_sat_rtn(uint16 x) { return convert_char16_sat(x); }
+
+/* uint -> uchar, saturating: all rounding-mode variants forward to convert_uchar_sat. */
+_CLC_DEF _CLC_OVERLOAD uchar
+convert_uchar_sat_rtz(uint x) { return convert_uchar_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar
+convert_uchar_sat_rte(uint x) { return convert_uchar_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar
+convert_uchar_sat_rtp(uint x) { return convert_uchar_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar
+convert_uchar_sat_rtn(uint x) { return convert_uchar_sat(x); }
+
+/* uint2 -> uchar2, saturating: all rounding-mode variants forward to convert_uchar2_sat. */
+_CLC_DEF _CLC_OVERLOAD uchar2
+convert_uchar2_sat_rtz(uint2 x) { return convert_uchar2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar2
+convert_uchar2_sat_rte(uint2 x) { return convert_uchar2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar2
+convert_uchar2_sat_rtp(uint2 x) { return convert_uchar2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar2
+convert_uchar2_sat_rtn(uint2 x) { return convert_uchar2_sat(x); }
+
+/* uint3 -> uchar3, saturating: all rounding-mode variants forward to convert_uchar3_sat. */
+_CLC_DEF _CLC_OVERLOAD uchar3
+convert_uchar3_sat_rtz(uint3 x) { return convert_uchar3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar3
+convert_uchar3_sat_rte(uint3 x) { return convert_uchar3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar3
+convert_uchar3_sat_rtp(uint3 x) { return convert_uchar3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar3
+convert_uchar3_sat_rtn(uint3 x) { return convert_uchar3_sat(x); }
+
+/* uint4 -> uchar4, saturating: all rounding-mode variants forward to convert_uchar4_sat. */
+_CLC_DEF _CLC_OVERLOAD uchar4
+convert_uchar4_sat_rtz(uint4 x) { return convert_uchar4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar4
+convert_uchar4_sat_rte(uint4 x) { return convert_uchar4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar4
+convert_uchar4_sat_rtp(uint4 x) { return convert_uchar4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar4
+convert_uchar4_sat_rtn(uint4 x) { return convert_uchar4_sat(x); }
+
+/* uint8 -> uchar8, saturating: all rounding-mode variants forward to convert_uchar8_sat. */
+_CLC_DEF _CLC_OVERLOAD uchar8
+convert_uchar8_sat_rtz(uint8 x) { return convert_uchar8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar8
+convert_uchar8_sat_rte(uint8 x) { return convert_uchar8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar8
+convert_uchar8_sat_rtp(uint8 x) { return convert_uchar8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar8
+convert_uchar8_sat_rtn(uint8 x) { return convert_uchar8_sat(x); }
+
+/* uint16 -> uchar16, saturating: all rounding-mode variants forward to convert_uchar16_sat. */
+_CLC_DEF _CLC_OVERLOAD uchar16
+convert_uchar16_sat_rtz(uint16 x) { return convert_uchar16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar16
+convert_uchar16_sat_rte(uint16 x) { return convert_uchar16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar16
+convert_uchar16_sat_rtp(uint16 x) { return convert_uchar16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar16
+convert_uchar16_sat_rtn(uint16 x) { return convert_uchar16_sat(x); }
+
+/* uint -> short, saturating: all rounding-mode variants forward to convert_short_sat. */
+_CLC_DEF _CLC_OVERLOAD short
+convert_short_sat_rtz(uint x) { return convert_short_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short
+convert_short_sat_rte(uint x) { return convert_short_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short
+convert_short_sat_rtp(uint x) { return convert_short_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short
+convert_short_sat_rtn(uint x) { return convert_short_sat(x); }
+
+/* uint2 -> short2, saturating: all rounding-mode variants forward to convert_short2_sat. */
+_CLC_DEF _CLC_OVERLOAD short2
+convert_short2_sat_rtz(uint2 x) { return convert_short2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short2
+convert_short2_sat_rte(uint2 x) { return convert_short2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short2
+convert_short2_sat_rtp(uint2 x) { return convert_short2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short2
+convert_short2_sat_rtn(uint2 x) { return convert_short2_sat(x); }
+
+/* uint3 -> short3, saturating: all rounding-mode variants forward to convert_short3_sat. */
+_CLC_DEF _CLC_OVERLOAD short3
+convert_short3_sat_rtz(uint3 x) { return convert_short3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short3
+convert_short3_sat_rte(uint3 x) { return convert_short3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short3
+convert_short3_sat_rtp(uint3 x) { return convert_short3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short3
+convert_short3_sat_rtn(uint3 x) { return convert_short3_sat(x); }
+
+/* uint4 -> short4, saturating: all rounding-mode variants forward to convert_short4_sat. */
+_CLC_DEF _CLC_OVERLOAD short4
+convert_short4_sat_rtz(uint4 x) { return convert_short4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short4
+convert_short4_sat_rte(uint4 x) { return convert_short4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short4
+convert_short4_sat_rtp(uint4 x) { return convert_short4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short4
+convert_short4_sat_rtn(uint4 x) { return convert_short4_sat(x); }
+
+/* uint8 -> short8, saturating: all rounding-mode variants forward to convert_short8_sat. */
+_CLC_DEF _CLC_OVERLOAD short8
+convert_short8_sat_rtz(uint8 x) { return convert_short8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short8
+convert_short8_sat_rte(uint8 x) { return convert_short8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short8
+convert_short8_sat_rtp(uint8 x) { return convert_short8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short8
+convert_short8_sat_rtn(uint8 x) { return convert_short8_sat(x); }
+
+/* uint16 -> short16, saturating: all rounding-mode variants forward to convert_short16_sat. */
+_CLC_DEF _CLC_OVERLOAD short16
+convert_short16_sat_rtz(uint16 x) { return convert_short16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short16
+convert_short16_sat_rte(uint16 x) { return convert_short16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short16
+convert_short16_sat_rtp(uint16 x) { return convert_short16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short16
+convert_short16_sat_rtn(uint16 x) { return convert_short16_sat(x); }
+
+/* uint -> ushort, saturating: all rounding-mode variants forward to convert_ushort_sat. */
+_CLC_DEF _CLC_OVERLOAD ushort
+convert_ushort_sat_rtz(uint x) { return convert_ushort_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ushort
+convert_ushort_sat_rte(uint x) { return convert_ushort_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ushort
+convert_ushort_sat_rtp(uint x) { return convert_ushort_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ushort
+convert_ushort_sat_rtn(uint x) { return convert_ushort_sat(x); }
+
+/* uint2 -> ushort2, saturating: all rounding-mode variants forward to convert_ushort2_sat. */
+_CLC_DEF _CLC_OVERLOAD ushort2
+convert_ushort2_sat_rtz(uint2 x) { return convert_ushort2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ushort2
+convert_ushort2_sat_rte(uint2 x) { return convert_ushort2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ushort2
+convert_ushort2_sat_rtp(uint2 x) { return convert_ushort2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ushort2
+convert_ushort2_sat_rtn(uint2 x) { return convert_ushort2_sat(x); }
+
+/* uint3 -> ushort3, saturating: all rounding-mode variants forward to convert_ushort3_sat. */
+_CLC_DEF _CLC_OVERLOAD ushort3
+convert_ushort3_sat_rtz(uint3 x) { return convert_ushort3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ushort3
+convert_ushort3_sat_rte(uint3 x) { return convert_ushort3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ushort3
+convert_ushort3_sat_rtp(uint3 x) { return convert_ushort3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ushort3
+convert_ushort3_sat_rtn(uint3 x) { return convert_ushort3_sat(x); }
+
+/* uint4 -> ushort4, saturating: all rounding-mode variants forward to convert_ushort4_sat. */
+_CLC_DEF _CLC_OVERLOAD ushort4
+convert_ushort4_sat_rtz(uint4 x) { return convert_ushort4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ushort4
+convert_ushort4_sat_rte(uint4 x) { return convert_ushort4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ushort4
+convert_ushort4_sat_rtp(uint4 x) { return convert_ushort4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ushort4
+convert_ushort4_sat_rtn(uint4 x) { return convert_ushort4_sat(x); }
+
+/* uint8 -> ushort8, saturating: all rounding-mode variants forward to convert_ushort8_sat. */
+_CLC_DEF _CLC_OVERLOAD ushort8
+convert_ushort8_sat_rtz(uint8 x) { return convert_ushort8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ushort8
+convert_ushort8_sat_rte(uint8 x) { return convert_ushort8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ushort8
+convert_ushort8_sat_rtp(uint8 x) { return convert_ushort8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ushort8
+convert_ushort8_sat_rtn(uint8 x) { return convert_ushort8_sat(x); }
+
+/* uint16 -> ushort16, saturating: all rounding-mode variants forward to convert_ushort16_sat. */
+_CLC_DEF _CLC_OVERLOAD ushort16
+convert_ushort16_sat_rtz(uint16 x) { return convert_ushort16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ushort16
+convert_ushort16_sat_rte(uint16 x) { return convert_ushort16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ushort16
+convert_ushort16_sat_rtp(uint16 x) { return convert_ushort16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ushort16
+convert_ushort16_sat_rtn(uint16 x) { return convert_ushort16_sat(x); }
+
+/* uint -> int, saturating: all rounding-mode variants forward to convert_int_sat. */
+_CLC_DEF _CLC_OVERLOAD int
+convert_int_sat_rtz(uint x) { return convert_int_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD int
+convert_int_sat_rte(uint x) { return convert_int_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD int
+convert_int_sat_rtp(uint x) { return convert_int_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD int
+convert_int_sat_rtn(uint x) { return convert_int_sat(x); }
+
+/* uint2 -> int2, saturating: all rounding-mode variants forward to convert_int2_sat. */
+_CLC_DEF _CLC_OVERLOAD int2
+convert_int2_sat_rtz(uint2 x) { return convert_int2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD int2
+convert_int2_sat_rte(uint2 x) { return convert_int2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD int2
+convert_int2_sat_rtp(uint2 x) { return convert_int2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD int2
+convert_int2_sat_rtn(uint2 x) { return convert_int2_sat(x); }
+
+/* uint3 -> int3, saturating: all rounding-mode variants forward to convert_int3_sat. */
+_CLC_DEF _CLC_OVERLOAD int3
+convert_int3_sat_rtz(uint3 x) { return convert_int3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD int3
+convert_int3_sat_rte(uint3 x) { return convert_int3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD int3
+convert_int3_sat_rtp(uint3 x) { return convert_int3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD int3
+convert_int3_sat_rtn(uint3 x) { return convert_int3_sat(x); }
+
+/* uint4 -> int4, saturating: all rounding-mode variants forward to convert_int4_sat. */
+_CLC_DEF _CLC_OVERLOAD int4
+convert_int4_sat_rtz(uint4 x) { return convert_int4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD int4
+convert_int4_sat_rte(uint4 x) { return convert_int4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD int4
+convert_int4_sat_rtp(uint4 x) { return convert_int4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD int4
+convert_int4_sat_rtn(uint4 x) { return convert_int4_sat(x); }
+
+/* uint8 -> int8, saturating: all rounding-mode variants forward to convert_int8_sat. */
+_CLC_DEF _CLC_OVERLOAD int8
+convert_int8_sat_rtz(uint8 x) { return convert_int8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD int8
+convert_int8_sat_rte(uint8 x) { return convert_int8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD int8
+convert_int8_sat_rtp(uint8 x) { return convert_int8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD int8
+convert_int8_sat_rtn(uint8 x) { return convert_int8_sat(x); }
+
+/* uint16 -> int16, saturating: all rounding-mode variants forward to convert_int16_sat. */
+_CLC_DEF _CLC_OVERLOAD int16
+convert_int16_sat_rtz(uint16 x) { return convert_int16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD int16
+convert_int16_sat_rte(uint16 x) { return convert_int16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD int16
+convert_int16_sat_rtp(uint16 x) { return convert_int16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD int16
+convert_int16_sat_rtn(uint16 x) { return convert_int16_sat(x); }
+
+/* uint -> uint, saturating: all rounding-mode variants forward to convert_uint_sat. */
+_CLC_DEF _CLC_OVERLOAD uint
+convert_uint_sat_rtz(uint x) { return convert_uint_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uint
+convert_uint_sat_rte(uint x) { return convert_uint_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uint
+convert_uint_sat_rtp(uint x) { return convert_uint_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uint
+convert_uint_sat_rtn(uint x) { return convert_uint_sat(x); }
+
+/* uint2 -> uint2, saturating: all rounding-mode variants forward to convert_uint2_sat. */
+_CLC_DEF _CLC_OVERLOAD uint2
+convert_uint2_sat_rtz(uint2 x) { return convert_uint2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uint2
+convert_uint2_sat_rte(uint2 x) { return convert_uint2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uint2
+convert_uint2_sat_rtp(uint2 x) { return convert_uint2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uint2
+convert_uint2_sat_rtn(uint2 x) { return convert_uint2_sat(x); }
+
+/* uint3 -> uint3, saturating: all rounding-mode variants forward to convert_uint3_sat. */
+_CLC_DEF _CLC_OVERLOAD uint3
+convert_uint3_sat_rtz(uint3 x) { return convert_uint3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uint3
+convert_uint3_sat_rte(uint3 x) { return convert_uint3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uint3
+convert_uint3_sat_rtp(uint3 x) { return convert_uint3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uint3
+convert_uint3_sat_rtn(uint3 x) { return convert_uint3_sat(x); }
+
+/* uint4 -> uint4, saturating: all rounding-mode variants forward to convert_uint4_sat. */
+_CLC_DEF _CLC_OVERLOAD uint4
+convert_uint4_sat_rtz(uint4 x) { return convert_uint4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uint4
+convert_uint4_sat_rte(uint4 x) { return convert_uint4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uint4
+convert_uint4_sat_rtp(uint4 x) { return convert_uint4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uint4
+convert_uint4_sat_rtn(uint4 x) { return convert_uint4_sat(x); }
+
+/* uint8 -> uint8, saturating: all rounding-mode variants forward to convert_uint8_sat. */
+_CLC_DEF _CLC_OVERLOAD uint8
+convert_uint8_sat_rtz(uint8 x) { return convert_uint8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uint8
+convert_uint8_sat_rte(uint8 x) { return convert_uint8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uint8
+convert_uint8_sat_rtp(uint8 x) { return convert_uint8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uint8
+convert_uint8_sat_rtn(uint8 x) { return convert_uint8_sat(x); }
+
+/* uint16 -> uint16, saturating: all rounding-mode variants forward to convert_uint16_sat. */
+_CLC_DEF _CLC_OVERLOAD uint16
+convert_uint16_sat_rtz(uint16 x) { return convert_uint16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uint16
+convert_uint16_sat_rte(uint16 x) { return convert_uint16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uint16
+convert_uint16_sat_rtp(uint16 x) { return convert_uint16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uint16
+convert_uint16_sat_rtn(uint16 x) { return convert_uint16_sat(x); }
+
+#ifdef cles_khr_int64
+/* uint -> long, saturating: all rounding-mode variants forward to convert_long_sat. */
+_CLC_DEF _CLC_OVERLOAD long
+convert_long_sat_rtz(uint x) { return convert_long_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD long
+convert_long_sat_rte(uint x) { return convert_long_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD long
+convert_long_sat_rtp(uint x) { return convert_long_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD long
+convert_long_sat_rtn(uint x) { return convert_long_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* uint2 -> long2, saturating: all rounding-mode variants forward to convert_long2_sat. */
+_CLC_DEF _CLC_OVERLOAD long2
+convert_long2_sat_rtz(uint2 x) { return convert_long2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD long2
+convert_long2_sat_rte(uint2 x) { return convert_long2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD long2
+convert_long2_sat_rtp(uint2 x) { return convert_long2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD long2
+convert_long2_sat_rtn(uint2 x) { return convert_long2_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* uint3 -> long3, saturating: all rounding-mode variants forward to convert_long3_sat. */
+_CLC_DEF _CLC_OVERLOAD long3
+convert_long3_sat_rtz(uint3 x) { return convert_long3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD long3
+convert_long3_sat_rte(uint3 x) { return convert_long3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD long3
+convert_long3_sat_rtp(uint3 x) { return convert_long3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD long3
+convert_long3_sat_rtn(uint3 x) { return convert_long3_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* uint4 -> long4, saturating: all rounding-mode variants forward to convert_long4_sat. */
+_CLC_DEF _CLC_OVERLOAD long4
+convert_long4_sat_rtz(uint4 x) { return convert_long4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD long4
+convert_long4_sat_rte(uint4 x) { return convert_long4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD long4
+convert_long4_sat_rtp(uint4 x) { return convert_long4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD long4
+convert_long4_sat_rtn(uint4 x) { return convert_long4_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* uint8 -> long8, saturating: all rounding-mode variants forward to convert_long8_sat. */
+_CLC_DEF _CLC_OVERLOAD long8
+convert_long8_sat_rtz(uint8 x) { return convert_long8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD long8
+convert_long8_sat_rte(uint8 x) { return convert_long8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD long8
+convert_long8_sat_rtp(uint8 x) { return convert_long8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD long8
+convert_long8_sat_rtn(uint8 x) { return convert_long8_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* uint16 -> long16, saturating: all rounding-mode variants forward to convert_long16_sat. */
+_CLC_DEF _CLC_OVERLOAD long16
+convert_long16_sat_rtz(uint16 x) { return convert_long16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD long16
+convert_long16_sat_rte(uint16 x) { return convert_long16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD long16
+convert_long16_sat_rtp(uint16 x) { return convert_long16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD long16
+convert_long16_sat_rtn(uint16 x) { return convert_long16_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* uint -> ulong, saturating: all rounding-mode variants forward to convert_ulong_sat. */
+_CLC_DEF _CLC_OVERLOAD ulong
+convert_ulong_sat_rtz(uint x) { return convert_ulong_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong
+convert_ulong_sat_rte(uint x) { return convert_ulong_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong
+convert_ulong_sat_rtp(uint x) { return convert_ulong_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong
+convert_ulong_sat_rtn(uint x) { return convert_ulong_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* uint2 -> ulong2, saturating: all rounding-mode variants forward to convert_ulong2_sat. */
+_CLC_DEF _CLC_OVERLOAD ulong2
+convert_ulong2_sat_rtz(uint2 x) { return convert_ulong2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong2
+convert_ulong2_sat_rte(uint2 x) { return convert_ulong2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong2
+convert_ulong2_sat_rtp(uint2 x) { return convert_ulong2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong2
+convert_ulong2_sat_rtn(uint2 x) { return convert_ulong2_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* uint3 -> ulong3, saturating: all rounding-mode variants forward to convert_ulong3_sat. */
+_CLC_DEF _CLC_OVERLOAD ulong3
+convert_ulong3_sat_rtz(uint3 x) { return convert_ulong3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong3
+convert_ulong3_sat_rte(uint3 x) { return convert_ulong3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong3
+convert_ulong3_sat_rtp(uint3 x) { return convert_ulong3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong3
+convert_ulong3_sat_rtn(uint3 x) { return convert_ulong3_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* uint4 -> ulong4, saturating: all rounding-mode variants forward to convert_ulong4_sat. */
+_CLC_DEF _CLC_OVERLOAD ulong4
+convert_ulong4_sat_rtz(uint4 x) { return convert_ulong4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong4
+convert_ulong4_sat_rte(uint4 x) { return convert_ulong4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong4
+convert_ulong4_sat_rtp(uint4 x) { return convert_ulong4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong4
+convert_ulong4_sat_rtn(uint4 x) { return convert_ulong4_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* uint8 -> ulong8, saturating: all rounding-mode variants forward to convert_ulong8_sat. */
+_CLC_DEF _CLC_OVERLOAD ulong8
+convert_ulong8_sat_rtz(uint8 x) { return convert_ulong8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong8
+convert_ulong8_sat_rte(uint8 x) { return convert_ulong8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong8
+convert_ulong8_sat_rtp(uint8 x) { return convert_ulong8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong8
+convert_ulong8_sat_rtn(uint8 x) { return convert_ulong8_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* uint16 -> ulong16, saturating: all rounding-mode variants forward to convert_ulong16_sat. */
+_CLC_DEF _CLC_OVERLOAD ulong16
+convert_ulong16_sat_rtz(uint16 x) { return convert_ulong16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong16
+convert_ulong16_sat_rte(uint16 x) { return convert_ulong16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong16
+convert_ulong16_sat_rtp(uint16 x) { return convert_ulong16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD ulong16
+convert_ulong16_sat_rtn(uint16 x) { return convert_ulong16_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> char, saturating: all rounding-mode variants forward to convert_char_sat. */
+_CLC_DEF _CLC_OVERLOAD char
+convert_char_sat_rtz(long x) { return convert_char_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char
+convert_char_sat_rte(long x) { return convert_char_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char
+convert_char_sat_rtp(long x) { return convert_char_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char
+convert_char_sat_rtn(long x) { return convert_char_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long2 -> char2, saturating: all rounding-mode variants forward to convert_char2_sat. */
+_CLC_DEF _CLC_OVERLOAD char2
+convert_char2_sat_rtz(long2 x) { return convert_char2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char2
+convert_char2_sat_rte(long2 x) { return convert_char2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char2
+convert_char2_sat_rtp(long2 x) { return convert_char2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char2
+convert_char2_sat_rtn(long2 x) { return convert_char2_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long3 -> char3, saturating: all rounding-mode variants forward to convert_char3_sat. */
+_CLC_DEF _CLC_OVERLOAD char3
+convert_char3_sat_rtz(long3 x) { return convert_char3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char3
+convert_char3_sat_rte(long3 x) { return convert_char3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char3
+convert_char3_sat_rtp(long3 x) { return convert_char3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char3
+convert_char3_sat_rtn(long3 x) { return convert_char3_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long4 -> char4, saturating: all rounding-mode variants forward to convert_char4_sat. */
+_CLC_DEF _CLC_OVERLOAD char4
+convert_char4_sat_rtz(long4 x) { return convert_char4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char4
+convert_char4_sat_rte(long4 x) { return convert_char4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char4
+convert_char4_sat_rtp(long4 x) { return convert_char4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char4
+convert_char4_sat_rtn(long4 x) { return convert_char4_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long8 -> char8, saturating: all rounding-mode variants forward to convert_char8_sat. */
+_CLC_DEF _CLC_OVERLOAD char8
+convert_char8_sat_rtz(long8 x) { return convert_char8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char8
+convert_char8_sat_rte(long8 x) { return convert_char8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char8
+convert_char8_sat_rtp(long8 x) { return convert_char8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char8
+convert_char8_sat_rtn(long8 x) { return convert_char8_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long16 -> char16, saturating: all rounding-mode variants forward to convert_char16_sat. */
+_CLC_DEF _CLC_OVERLOAD char16
+convert_char16_sat_rtz(long16 x) { return convert_char16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char16
+convert_char16_sat_rte(long16 x) { return convert_char16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char16
+convert_char16_sat_rtp(long16 x) { return convert_char16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD char16
+convert_char16_sat_rtn(long16 x) { return convert_char16_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> uchar, saturating: all rounding-mode variants forward to convert_uchar_sat. */
+_CLC_DEF _CLC_OVERLOAD uchar
+convert_uchar_sat_rtz(long x) { return convert_uchar_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar
+convert_uchar_sat_rte(long x) { return convert_uchar_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar
+convert_uchar_sat_rtp(long x) { return convert_uchar_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar
+convert_uchar_sat_rtn(long x) { return convert_uchar_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long2 -> uchar2, saturating: all rounding-mode variants forward to convert_uchar2_sat. */
+_CLC_DEF _CLC_OVERLOAD uchar2
+convert_uchar2_sat_rtz(long2 x) { return convert_uchar2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar2
+convert_uchar2_sat_rte(long2 x) { return convert_uchar2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar2
+convert_uchar2_sat_rtp(long2 x) { return convert_uchar2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar2
+convert_uchar2_sat_rtn(long2 x) { return convert_uchar2_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long3 -> uchar3, saturating: all rounding-mode variants forward to convert_uchar3_sat. */
+_CLC_DEF _CLC_OVERLOAD uchar3
+convert_uchar3_sat_rtz(long3 x) { return convert_uchar3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar3
+convert_uchar3_sat_rte(long3 x) { return convert_uchar3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar3
+convert_uchar3_sat_rtp(long3 x) { return convert_uchar3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar3
+convert_uchar3_sat_rtn(long3 x) { return convert_uchar3_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long4 -> uchar4, saturating: all rounding-mode variants forward to convert_uchar4_sat. */
+_CLC_DEF _CLC_OVERLOAD uchar4
+convert_uchar4_sat_rtz(long4 x) { return convert_uchar4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar4
+convert_uchar4_sat_rte(long4 x) { return convert_uchar4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar4
+convert_uchar4_sat_rtp(long4 x) { return convert_uchar4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar4
+convert_uchar4_sat_rtn(long4 x) { return convert_uchar4_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long8 -> uchar8, saturating: all rounding-mode variants forward to convert_uchar8_sat. */
+_CLC_DEF _CLC_OVERLOAD uchar8
+convert_uchar8_sat_rtz(long8 x) { return convert_uchar8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar8
+convert_uchar8_sat_rte(long8 x) { return convert_uchar8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar8
+convert_uchar8_sat_rtp(long8 x) { return convert_uchar8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar8
+convert_uchar8_sat_rtn(long8 x) { return convert_uchar8_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long16 -> uchar16, saturating: all rounding-mode variants forward to convert_uchar16_sat. */
+_CLC_DEF _CLC_OVERLOAD uchar16
+convert_uchar16_sat_rtz(long16 x) { return convert_uchar16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar16
+convert_uchar16_sat_rte(long16 x) { return convert_uchar16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar16
+convert_uchar16_sat_rtp(long16 x) { return convert_uchar16_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD uchar16
+convert_uchar16_sat_rtn(long16 x) { return convert_uchar16_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long -> short, saturating: all rounding-mode variants forward to convert_short_sat. */
+_CLC_DEF _CLC_OVERLOAD short
+convert_short_sat_rtz(long x) { return convert_short_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short
+convert_short_sat_rte(long x) { return convert_short_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short
+convert_short_sat_rtp(long x) { return convert_short_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short
+convert_short_sat_rtn(long x) { return convert_short_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long2 -> short2, saturating: all rounding-mode variants forward to convert_short2_sat. */
+_CLC_DEF _CLC_OVERLOAD short2
+convert_short2_sat_rtz(long2 x) { return convert_short2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short2
+convert_short2_sat_rte(long2 x) { return convert_short2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short2
+convert_short2_sat_rtp(long2 x) { return convert_short2_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short2
+convert_short2_sat_rtn(long2 x) { return convert_short2_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long3 -> short3, saturating: all rounding-mode variants forward to convert_short3_sat. */
+_CLC_DEF _CLC_OVERLOAD short3
+convert_short3_sat_rtz(long3 x) { return convert_short3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short3
+convert_short3_sat_rte(long3 x) { return convert_short3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short3
+convert_short3_sat_rtp(long3 x) { return convert_short3_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short3
+convert_short3_sat_rtn(long3 x) { return convert_short3_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long4 -> short4, saturating: all rounding-mode variants forward to convert_short4_sat. */
+_CLC_DEF _CLC_OVERLOAD short4
+convert_short4_sat_rtz(long4 x) { return convert_short4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short4
+convert_short4_sat_rte(long4 x) { return convert_short4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short4
+convert_short4_sat_rtp(long4 x) { return convert_short4_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short4
+convert_short4_sat_rtn(long4 x) { return convert_short4_sat(x); }
+
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* long8 -> short8, saturating: all rounding-mode variants forward to convert_short8_sat. */
+_CLC_DEF _CLC_OVERLOAD short8
+convert_short8_sat_rtz(long8 x) { return convert_short8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short8
+convert_short8_sat_rte(long8 x) { return convert_short8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short8
+convert_short8_sat_rtp(long8 x) { return convert_short8_sat(x); }
+
+_CLC_DEF _CLC_OVERLOAD short8
+convert_short8_sat_rtn(long8 x) { return convert_short8_sat(x); }
+
+#endif /* cles_khr_int64 */
+#if defined(cles_khr_int64)
+/* short16 from long16: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_short16_sat. */
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtz(long16 x)
+{
+ const short16 converted = convert_short16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rte(long16 x)
+{
+ const short16 converted = convert_short16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtp(long16 x)
+{
+ const short16 converted = convert_short16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtn(long16 x)
+{
+ const short16 converted = convert_short16_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* ushort from long: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_ushort_sat. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtz(long x)
+{
+ const ushort converted = convert_ushort_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rte(long x)
+{
+ const ushort converted = convert_ushort_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtp(long x)
+{
+ const ushort converted = convert_ushort_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtn(long x)
+{
+ const ushort converted = convert_ushort_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* ushort2 from long2: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_ushort2_sat. */
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtz(long2 x)
+{
+ const ushort2 converted = convert_ushort2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rte(long2 x)
+{
+ const ushort2 converted = convert_ushort2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtp(long2 x)
+{
+ const ushort2 converted = convert_ushort2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtn(long2 x)
+{
+ const ushort2 converted = convert_ushort2_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* ushort3 from long3: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_ushort3_sat. */
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtz(long3 x)
+{
+ const ushort3 converted = convert_ushort3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rte(long3 x)
+{
+ const ushort3 converted = convert_ushort3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtp(long3 x)
+{
+ const ushort3 converted = convert_ushort3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtn(long3 x)
+{
+ const ushort3 converted = convert_ushort3_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* ushort4 from long4: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_ushort4_sat. */
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtz(long4 x)
+{
+ const ushort4 converted = convert_ushort4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rte(long4 x)
+{
+ const ushort4 converted = convert_ushort4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtp(long4 x)
+{
+ const ushort4 converted = convert_ushort4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtn(long4 x)
+{
+ const ushort4 converted = convert_ushort4_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* ushort8 from long8: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_ushort8_sat. */
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtz(long8 x)
+{
+ const ushort8 converted = convert_ushort8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rte(long8 x)
+{
+ const ushort8 converted = convert_ushort8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtp(long8 x)
+{
+ const ushort8 converted = convert_ushort8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtn(long8 x)
+{
+ const ushort8 converted = convert_ushort8_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* ushort16 from long16: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_ushort16_sat. */
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtz(long16 x)
+{
+ const ushort16 converted = convert_ushort16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rte(long16 x)
+{
+ const ushort16 converted = convert_ushort16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtp(long16 x)
+{
+ const ushort16 converted = convert_ushort16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtn(long16 x)
+{
+ const ushort16 converted = convert_ushort16_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* int from long: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_int_sat. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtz(long x)
+{
+ const int converted = convert_int_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rte(long x)
+{
+ const int converted = convert_int_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtp(long x)
+{
+ const int converted = convert_int_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtn(long x)
+{
+ const int converted = convert_int_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* int2 from long2: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_int2_sat. */
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtz(long2 x)
+{
+ const int2 converted = convert_int2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rte(long2 x)
+{
+ const int2 converted = convert_int2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtp(long2 x)
+{
+ const int2 converted = convert_int2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtn(long2 x)
+{
+ const int2 converted = convert_int2_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* int3 from long3: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_int3_sat. */
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtz(long3 x)
+{
+ const int3 converted = convert_int3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rte(long3 x)
+{
+ const int3 converted = convert_int3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtp(long3 x)
+{
+ const int3 converted = convert_int3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtn(long3 x)
+{
+ const int3 converted = convert_int3_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* int4 from long4: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_int4_sat. */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtz(long4 x)
+{
+ const int4 converted = convert_int4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rte(long4 x)
+{
+ const int4 converted = convert_int4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtp(long4 x)
+{
+ const int4 converted = convert_int4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtn(long4 x)
+{
+ const int4 converted = convert_int4_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* int8 from long8: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_int8_sat. */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtz(long8 x)
+{
+ const int8 converted = convert_int8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rte(long8 x)
+{
+ const int8 converted = convert_int8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtp(long8 x)
+{
+ const int8 converted = convert_int8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtn(long8 x)
+{
+ const int8 converted = convert_int8_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* int16 from long16: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_int16_sat. */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtz(long16 x)
+{
+ const int16 converted = convert_int16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rte(long16 x)
+{
+ const int16 converted = convert_int16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtp(long16 x)
+{
+ const int16 converted = convert_int16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtn(long16 x)
+{
+ const int16 converted = convert_int16_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* uint from long: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_uint_sat. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtz(long x)
+{
+ const uint converted = convert_uint_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rte(long x)
+{
+ const uint converted = convert_uint_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtp(long x)
+{
+ const uint converted = convert_uint_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtn(long x)
+{
+ const uint converted = convert_uint_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* uint2 from long2: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_uint2_sat. */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtz(long2 x)
+{
+ const uint2 converted = convert_uint2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rte(long2 x)
+{
+ const uint2 converted = convert_uint2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtp(long2 x)
+{
+ const uint2 converted = convert_uint2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtn(long2 x)
+{
+ const uint2 converted = convert_uint2_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* uint3 from long3: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_uint3_sat. */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtz(long3 x)
+{
+ const uint3 converted = convert_uint3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rte(long3 x)
+{
+ const uint3 converted = convert_uint3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtp(long3 x)
+{
+ const uint3 converted = convert_uint3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtn(long3 x)
+{
+ const uint3 converted = convert_uint3_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* uint4 from long4: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_uint4_sat. */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtz(long4 x)
+{
+ const uint4 converted = convert_uint4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rte(long4 x)
+{
+ const uint4 converted = convert_uint4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtp(long4 x)
+{
+ const uint4 converted = convert_uint4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtn(long4 x)
+{
+ const uint4 converted = convert_uint4_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* uint8 from long8: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_uint8_sat. */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtz(long8 x)
+{
+ const uint8 converted = convert_uint8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rte(long8 x)
+{
+ const uint8 converted = convert_uint8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtp(long8 x)
+{
+ const uint8 converted = convert_uint8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtn(long8 x)
+{
+ const uint8 converted = convert_uint8_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* uint16 from long16: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_uint16_sat. */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtz(long16 x)
+{
+ const uint16 converted = convert_uint16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rte(long16 x)
+{
+ const uint16 converted = convert_uint16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtp(long16 x)
+{
+ const uint16 converted = convert_uint16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtn(long16 x)
+{
+ const uint16 converted = convert_uint16_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* long from long: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_long_sat. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtz(long x)
+{
+ const long converted = convert_long_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rte(long x)
+{
+ const long converted = convert_long_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtp(long x)
+{
+ const long converted = convert_long_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtn(long x)
+{
+ const long converted = convert_long_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* long2 from long2: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_long2_sat. */
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtz(long2 x)
+{
+ const long2 converted = convert_long2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rte(long2 x)
+{
+ const long2 converted = convert_long2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtp(long2 x)
+{
+ const long2 converted = convert_long2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtn(long2 x)
+{
+ const long2 converted = convert_long2_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* long3 from long3: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_long3_sat. */
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtz(long3 x)
+{
+ const long3 converted = convert_long3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rte(long3 x)
+{
+ const long3 converted = convert_long3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtp(long3 x)
+{
+ const long3 converted = convert_long3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtn(long3 x)
+{
+ const long3 converted = convert_long3_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* long4 from long4: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_long4_sat. */
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rtz(long4 x)
+{
+ const long4 converted = convert_long4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rte(long4 x)
+{
+ const long4 converted = convert_long4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rtp(long4 x)
+{
+ const long4 converted = convert_long4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rtn(long4 x)
+{
+ const long4 converted = convert_long4_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* long8 from long8: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_long8_sat. */
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rtz(long8 x)
+{
+ const long8 converted = convert_long8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rte(long8 x)
+{
+ const long8 converted = convert_long8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rtp(long8 x)
+{
+ const long8 converted = convert_long8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rtn(long8 x)
+{
+ const long8 converted = convert_long8_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* long16 from long16: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_long16_sat. */
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rtz(long16 x)
+{
+ const long16 converted = convert_long16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rte(long16 x)
+{
+ const long16 converted = convert_long16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rtp(long16 x)
+{
+ const long16 converted = convert_long16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rtn(long16 x)
+{
+ const long16 converted = convert_long16_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* ulong from long: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_ulong_sat. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rtz(long x)
+{
+ const ulong converted = convert_ulong_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rte(long x)
+{
+ const ulong converted = convert_ulong_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rtp(long x)
+{
+ const ulong converted = convert_ulong_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rtn(long x)
+{
+ const ulong converted = convert_ulong_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* ulong2 from long2: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_ulong2_sat. */
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rtz(long2 x)
+{
+ const ulong2 converted = convert_ulong2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rte(long2 x)
+{
+ const ulong2 converted = convert_ulong2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rtp(long2 x)
+{
+ const ulong2 converted = convert_ulong2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rtn(long2 x)
+{
+ const ulong2 converted = convert_ulong2_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* ulong3 from long3: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_ulong3_sat. */
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rtz(long3 x)
+{
+ const ulong3 converted = convert_ulong3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rte(long3 x)
+{
+ const ulong3 converted = convert_ulong3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rtp(long3 x)
+{
+ const ulong3 converted = convert_ulong3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rtn(long3 x)
+{
+ const ulong3 converted = convert_ulong3_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* ulong4 from long4: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_ulong4_sat. */
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rtz(long4 x)
+{
+ const ulong4 converted = convert_ulong4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rte(long4 x)
+{
+ const ulong4 converted = convert_ulong4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rtp(long4 x)
+{
+ const ulong4 converted = convert_ulong4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rtn(long4 x)
+{
+ const ulong4 converted = convert_ulong4_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* ulong8 from long8: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_ulong8_sat. */
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rtz(long8 x)
+{
+ const ulong8 converted = convert_ulong8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rte(long8 x)
+{
+ const ulong8 converted = convert_ulong8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rtp(long8 x)
+{
+ const ulong8 converted = convert_ulong8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rtn(long8 x)
+{
+ const ulong8 converted = convert_ulong8_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* ulong16 from long16: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_ulong16_sat. */
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtz(long16 x)
+{
+ const ulong16 converted = convert_ulong16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rte(long16 x)
+{
+ const ulong16 converted = convert_ulong16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtp(long16 x)
+{
+ const ulong16 converted = convert_ulong16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtn(long16 x)
+{
+ const ulong16 converted = convert_ulong16_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* char from ulong: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_char_sat. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtz(ulong x)
+{
+ const char converted = convert_char_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rte(ulong x)
+{
+ const char converted = convert_char_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtp(ulong x)
+{
+ const char converted = convert_char_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtn(ulong x)
+{
+ const char converted = convert_char_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* char2 from ulong2: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_char2_sat. */
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtz(ulong2 x)
+{
+ const char2 converted = convert_char2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rte(ulong2 x)
+{
+ const char2 converted = convert_char2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtp(ulong2 x)
+{
+ const char2 converted = convert_char2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtn(ulong2 x)
+{
+ const char2 converted = convert_char2_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* char3 from ulong3: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_char3_sat. */
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtz(ulong3 x)
+{
+ const char3 converted = convert_char3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rte(ulong3 x)
+{
+ const char3 converted = convert_char3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtp(ulong3 x)
+{
+ const char3 converted = convert_char3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtn(ulong3 x)
+{
+ const char3 converted = convert_char3_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* char4 from ulong4: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_char4_sat. */
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtz(ulong4 x)
+{
+ const char4 converted = convert_char4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rte(ulong4 x)
+{
+ const char4 converted = convert_char4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtp(ulong4 x)
+{
+ const char4 converted = convert_char4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtn(ulong4 x)
+{
+ const char4 converted = convert_char4_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* char8 from ulong8: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_char8_sat. */
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtz(ulong8 x)
+{
+ const char8 converted = convert_char8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rte(ulong8 x)
+{
+ const char8 converted = convert_char8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtp(ulong8 x)
+{
+ const char8 converted = convert_char8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtn(ulong8 x)
+{
+ const char8 converted = convert_char8_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* char16 from ulong16: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_char16_sat. */
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtz(ulong16 x)
+{
+ const char16 converted = convert_char16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rte(ulong16 x)
+{
+ const char16 converted = convert_char16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtp(ulong16 x)
+{
+ const char16 converted = convert_char16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtn(ulong16 x)
+{
+ const char16 converted = convert_char16_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* uchar from ulong: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_uchar_sat. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtz(ulong x)
+{
+ const uchar converted = convert_uchar_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rte(ulong x)
+{
+ const uchar converted = convert_uchar_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtp(ulong x)
+{
+ const uchar converted = convert_uchar_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtn(ulong x)
+{
+ const uchar converted = convert_uchar_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* uchar2 from ulong2: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_uchar2_sat. */
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtz(ulong2 x)
+{
+ const uchar2 converted = convert_uchar2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rte(ulong2 x)
+{
+ const uchar2 converted = convert_uchar2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtp(ulong2 x)
+{
+ const uchar2 converted = convert_uchar2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtn(ulong2 x)
+{
+ const uchar2 converted = convert_uchar2_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* uchar3 from ulong3: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_uchar3_sat. */
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtz(ulong3 x)
+{
+ const uchar3 converted = convert_uchar3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rte(ulong3 x)
+{
+ const uchar3 converted = convert_uchar3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtp(ulong3 x)
+{
+ const uchar3 converted = convert_uchar3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtn(ulong3 x)
+{
+ const uchar3 converted = convert_uchar3_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* uchar4 from ulong4: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_uchar4_sat. */
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rtz(ulong4 x)
+{
+ const uchar4 converted = convert_uchar4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rte(ulong4 x)
+{
+ const uchar4 converted = convert_uchar4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rtp(ulong4 x)
+{
+ const uchar4 converted = convert_uchar4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rtn(ulong4 x)
+{
+ const uchar4 converted = convert_uchar4_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* uchar8 from ulong8: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_uchar8_sat. */
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rtz(ulong8 x)
+{
+ const uchar8 converted = convert_uchar8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rte(ulong8 x)
+{
+ const uchar8 converted = convert_uchar8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rtp(ulong8 x)
+{
+ const uchar8 converted = convert_uchar8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rtn(ulong8 x)
+{
+ const uchar8 converted = convert_uchar8_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* uchar16 from ulong16: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_uchar16_sat. */
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rtz(ulong16 x)
+{
+ const uchar16 converted = convert_uchar16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rte(ulong16 x)
+{
+ const uchar16 converted = convert_uchar16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rtp(ulong16 x)
+{
+ const uchar16 converted = convert_uchar16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rtn(ulong16 x)
+{
+ const uchar16 converted = convert_uchar16_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* short from ulong: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_short_sat. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rtz(ulong x)
+{
+ const short converted = convert_short_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rte(ulong x)
+{
+ const short converted = convert_short_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rtp(ulong x)
+{
+ const short converted = convert_short_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rtn(ulong x)
+{
+ const short converted = convert_short_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* short2 from ulong2: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_short2_sat. */
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rtz(ulong2 x)
+{
+ const short2 converted = convert_short2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rte(ulong2 x)
+{
+ const short2 converted = convert_short2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rtp(ulong2 x)
+{
+ const short2 converted = convert_short2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rtn(ulong2 x)
+{
+ const short2 converted = convert_short2_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* short3 from ulong3: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_short3_sat. */
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rtz(ulong3 x)
+{
+ const short3 converted = convert_short3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rte(ulong3 x)
+{
+ const short3 converted = convert_short3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rtp(ulong3 x)
+{
+ const short3 converted = convert_short3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rtn(ulong3 x)
+{
+ const short3 converted = convert_short3_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* short4 from ulong4: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_short4_sat. */
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rtz(ulong4 x)
+{
+ const short4 converted = convert_short4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rte(ulong4 x)
+{
+ const short4 converted = convert_short4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rtp(ulong4 x)
+{
+ const short4 converted = convert_short4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rtn(ulong4 x)
+{
+ const short4 converted = convert_short4_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* short8 from ulong8: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_short8_sat. */
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rtz(ulong8 x)
+{
+ const short8 converted = convert_short8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rte(ulong8 x)
+{
+ const short8 converted = convert_short8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rtp(ulong8 x)
+{
+ const short8 converted = convert_short8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rtn(ulong8 x)
+{
+ const short8 converted = convert_short8_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* short16 from ulong16: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_short16_sat. */
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtz(ulong16 x)
+{
+ const short16 converted = convert_short16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rte(ulong16 x)
+{
+ const short16 converted = convert_short16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtp(ulong16 x)
+{
+ const short16 converted = convert_short16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtn(ulong16 x)
+{
+ const short16 converted = convert_short16_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* ushort from ulong: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_ushort_sat. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtz(ulong x)
+{
+ const ushort converted = convert_ushort_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rte(ulong x)
+{
+ const ushort converted = convert_ushort_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtp(ulong x)
+{
+ const ushort converted = convert_ushort_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtn(ulong x)
+{
+ const ushort converted = convert_ushort_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* ushort2 from ulong2: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_ushort2_sat. */
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtz(ulong2 x)
+{
+ const ushort2 converted = convert_ushort2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rte(ulong2 x)
+{
+ const ushort2 converted = convert_ushort2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtp(ulong2 x)
+{
+ const ushort2 converted = convert_ushort2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtn(ulong2 x)
+{
+ const ushort2 converted = convert_ushort2_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* ushort3 from ulong3: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_ushort3_sat. */
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtz(ulong3 x)
+{
+ const ushort3 converted = convert_ushort3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rte(ulong3 x)
+{
+ const ushort3 converted = convert_ushort3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtp(ulong3 x)
+{
+ const ushort3 converted = convert_ushort3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtn(ulong3 x)
+{
+ const ushort3 converted = convert_ushort3_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* ushort4 from ulong4: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_ushort4_sat. */
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtz(ulong4 x)
+{
+ const ushort4 converted = convert_ushort4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rte(ulong4 x)
+{
+ const ushort4 converted = convert_ushort4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtp(ulong4 x)
+{
+ const ushort4 converted = convert_ushort4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtn(ulong4 x)
+{
+ const ushort4 converted = convert_ushort4_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* ushort8 from ulong8: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_ushort8_sat. */
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtz(ulong8 x)
+{
+ const ushort8 converted = convert_ushort8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rte(ulong8 x)
+{
+ const ushort8 converted = convert_ushort8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtp(ulong8 x)
+{
+ const ushort8 converted = convert_ushort8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtn(ulong8 x)
+{
+ const ushort8 converted = convert_ushort8_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* ushort16 from ulong16: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_ushort16_sat. */
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtz(ulong16 x)
+{
+ const ushort16 converted = convert_ushort16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rte(ulong16 x)
+{
+ const ushort16 converted = convert_ushort16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtp(ulong16 x)
+{
+ const ushort16 converted = convert_ushort16_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtn(ulong16 x)
+{
+ const ushort16 converted = convert_ushort16_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* int from ulong: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_int_sat. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtz(ulong x)
+{
+ const int converted = convert_int_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rte(ulong x)
+{
+ const int converted = convert_int_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtp(ulong x)
+{
+ const int converted = convert_int_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtn(ulong x)
+{
+ const int converted = convert_int_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* int2 from ulong2: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_int2_sat. */
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtz(ulong2 x)
+{
+ const int2 converted = convert_int2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rte(ulong2 x)
+{
+ const int2 converted = convert_int2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtp(ulong2 x)
+{
+ const int2 converted = convert_int2_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtn(ulong2 x)
+{
+ const int2 converted = convert_int2_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* int3 from ulong3: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_int3_sat. */
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtz(ulong3 x)
+{
+ const int3 converted = convert_int3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rte(ulong3 x)
+{
+ const int3 converted = convert_int3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtp(ulong3 x)
+{
+ const int3 converted = convert_int3_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtn(ulong3 x)
+{
+ const int3 converted = convert_int3_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* int4 from ulong4: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_int4_sat. */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtz(ulong4 x)
+{
+ const int4 converted = convert_int4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rte(ulong4 x)
+{
+ const int4 converted = convert_int4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtp(ulong4 x)
+{
+ const int4 converted = convert_int4_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtn(ulong4 x)
+{
+ const int4 converted = convert_int4_sat(x);
+ return converted;
+}
+
+#endif
+#if defined(cles_khr_int64)
+/* int8 from ulong8: the rounding suffix is not consulted; every overload
+ * below forwards x unchanged to convert_int8_sat. */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtz(ulong8 x)
+{
+ const int8 converted = convert_int8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rte(ulong8 x)
+{
+ const int8 converted = convert_int8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtp(ulong8 x)
+{
+ const int8 converted = convert_int8_sat(x);
+ return converted;
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtn(ulong8 x)
+{
+ const int8 converted = convert_int8_sat(x);
+ return converted;
+}
+
+#endif
+#ifdef cles_khr_int64
+/* ulong16 to int16 saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_int16_sat(x). */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtz(ulong16 x) {
+ return convert_int16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rte(ulong16 x) {
+ return convert_int16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtp(ulong16 x) {
+ return convert_int16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtn(ulong16 x) {
+ return convert_int16_sat(x);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong to uint saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_uint_sat(x). */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtz(ulong x) {
+ return convert_uint_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rte(ulong x) {
+ return convert_uint_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtp(ulong x) {
+ return convert_uint_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtn(ulong x) {
+ return convert_uint_sat(x);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong2 to uint2 saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_uint2_sat(x). */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtz(ulong2 x) {
+ return convert_uint2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rte(ulong2 x) {
+ return convert_uint2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtp(ulong2 x) {
+ return convert_uint2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtn(ulong2 x) {
+ return convert_uint2_sat(x);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong3 to uint3 saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_uint3_sat(x). */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtz(ulong3 x) {
+ return convert_uint3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rte(ulong3 x) {
+ return convert_uint3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtp(ulong3 x) {
+ return convert_uint3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtn(ulong3 x) {
+ return convert_uint3_sat(x);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong4 to uint4 saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_uint4_sat(x). */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtz(ulong4 x) {
+ return convert_uint4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rte(ulong4 x) {
+ return convert_uint4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtp(ulong4 x) {
+ return convert_uint4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtn(ulong4 x) {
+ return convert_uint4_sat(x);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong8 to uint8 saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_uint8_sat(x). */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtz(ulong8 x) {
+ return convert_uint8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rte(ulong8 x) {
+ return convert_uint8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtp(ulong8 x) {
+ return convert_uint8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtn(ulong8 x) {
+ return convert_uint8_sat(x);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong16 to uint16 saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_uint16_sat(x). */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtz(ulong16 x) {
+ return convert_uint16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rte(ulong16 x) {
+ return convert_uint16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtp(ulong16 x) {
+ return convert_uint16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtn(ulong16 x) {
+ return convert_uint16_sat(x);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong to long saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_long_sat(x). */
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtz(ulong x) {
+ return convert_long_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rte(ulong x) {
+ return convert_long_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtp(ulong x) {
+ return convert_long_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtn(ulong x) {
+ return convert_long_sat(x);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong2 to long2 saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_long2_sat(x). */
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtz(ulong2 x) {
+ return convert_long2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rte(ulong2 x) {
+ return convert_long2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtp(ulong2 x) {
+ return convert_long2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtn(ulong2 x) {
+ return convert_long2_sat(x);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong3 to long3 saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_long3_sat(x). */
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtz(ulong3 x) {
+ return convert_long3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rte(ulong3 x) {
+ return convert_long3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtp(ulong3 x) {
+ return convert_long3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtn(ulong3 x) {
+ return convert_long3_sat(x);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong4 to long4 saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_long4_sat(x). */
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rtz(ulong4 x) {
+ return convert_long4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rte(ulong4 x) {
+ return convert_long4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rtp(ulong4 x) {
+ return convert_long4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rtn(ulong4 x) {
+ return convert_long4_sat(x);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong8 to long8 saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_long8_sat(x). */
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rtz(ulong8 x) {
+ return convert_long8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rte(ulong8 x) {
+ return convert_long8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rtp(ulong8 x) {
+ return convert_long8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rtn(ulong8 x) {
+ return convert_long8_sat(x);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong16 to long16 saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_long16_sat(x). */
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rtz(ulong16 x) {
+ return convert_long16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rte(ulong16 x) {
+ return convert_long16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rtp(ulong16 x) {
+ return convert_long16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rtn(ulong16 x) {
+ return convert_long16_sat(x);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong to ulong saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_ulong_sat(x). */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rtz(ulong x) {
+ return convert_ulong_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rte(ulong x) {
+ return convert_ulong_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rtp(ulong x) {
+ return convert_ulong_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rtn(ulong x) {
+ return convert_ulong_sat(x);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong2 to ulong2 saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_ulong2_sat(x). */
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rtz(ulong2 x) {
+ return convert_ulong2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rte(ulong2 x) {
+ return convert_ulong2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rtp(ulong2 x) {
+ return convert_ulong2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rtn(ulong2 x) {
+ return convert_ulong2_sat(x);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong3 to ulong3 saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_ulong3_sat(x). */
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rtz(ulong3 x) {
+ return convert_ulong3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rte(ulong3 x) {
+ return convert_ulong3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rtp(ulong3 x) {
+ return convert_ulong3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rtn(ulong3 x) {
+ return convert_ulong3_sat(x);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong4 to ulong4 saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_ulong4_sat(x). */
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rtz(ulong4 x) {
+ return convert_ulong4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rte(ulong4 x) {
+ return convert_ulong4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rtp(ulong4 x) {
+ return convert_ulong4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rtn(ulong4 x) {
+ return convert_ulong4_sat(x);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong8 to ulong8 saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_ulong8_sat(x). */
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rtz(ulong8 x) {
+ return convert_ulong8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rte(ulong8 x) {
+ return convert_ulong8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rtp(ulong8 x) {
+ return convert_ulong8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rtn(ulong8 x) {
+ return convert_ulong8_sat(x);
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* ulong16 to ulong16 saturating conversions. Each rounding-suffixed variant
+ * makes the same call: convert_ulong16_sat(x). */
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtz(ulong16 x) {
+ return convert_ulong16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rte(ulong16 x) {
+ return convert_ulong16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtp(ulong16 x) {
+ return convert_ulong16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtn(ulong16 x) {
+ return convert_ulong16_sat(x);
+}
+#endif /* cles_khr_int64 */
+/* float to char conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_char_sat(), the rest call convert_char(). */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtz(float x) {
+ return convert_char(x);
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtz(float x) {
+ return convert_char_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_rte(float x) {
+ return convert_char(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rte(float x) {
+ return convert_char_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtp(float x) {
+ return convert_char(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtp(float x) {
+ return convert_char_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtn(float x) {
+ return convert_char(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtn(float x) {
+ return convert_char_sat(floor(x));
+}
+/* float2 to char2 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_char2_sat(), the rest call convert_char2(). */
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtz(float2 x) {
+ return convert_char2(x);
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtz(float2 x) {
+ return convert_char2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rte(float2 x) {
+ return convert_char2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rte(float2 x) {
+ return convert_char2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtp(float2 x) {
+ return convert_char2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtp(float2 x) {
+ return convert_char2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtn(float2 x) {
+ return convert_char2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtn(float2 x) {
+ return convert_char2_sat(floor(x));
+}
+/* float3 to char3 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_char3_sat(), the rest call convert_char3(). */
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtz(float3 x) {
+ return convert_char3(x);
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtz(float3 x) {
+ return convert_char3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rte(float3 x) {
+ return convert_char3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rte(float3 x) {
+ return convert_char3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtp(float3 x) {
+ return convert_char3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtp(float3 x) {
+ return convert_char3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtn(float3 x) {
+ return convert_char3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtn(float3 x) {
+ return convert_char3_sat(floor(x));
+}
+/* float4 to char4 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_char4_sat(), the rest call convert_char4(). */
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtz(float4 x) {
+ return convert_char4(x);
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtz(float4 x) {
+ return convert_char4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rte(float4 x) {
+ return convert_char4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rte(float4 x) {
+ return convert_char4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtp(float4 x) {
+ return convert_char4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtp(float4 x) {
+ return convert_char4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtn(float4 x) {
+ return convert_char4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtn(float4 x) {
+ return convert_char4_sat(floor(x));
+}
+/* float8 to char8 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_char8_sat(), the rest call convert_char8(). */
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtz(float8 x) {
+ return convert_char8(x);
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtz(float8 x) {
+ return convert_char8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rte(float8 x) {
+ return convert_char8(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rte(float8 x) {
+ return convert_char8_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtp(float8 x) {
+ return convert_char8(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtp(float8 x) {
+ return convert_char8_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtn(float8 x) {
+ return convert_char8(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtn(float8 x) {
+ return convert_char8_sat(floor(x));
+}
+/* float16 to char16 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_char16_sat(), the rest call convert_char16(). */
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtz(float16 x) {
+ return convert_char16(x);
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtz(float16 x) {
+ return convert_char16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rte(float16 x) {
+ return convert_char16(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rte(float16 x) {
+ return convert_char16_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtp(float16 x) {
+ return convert_char16(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtp(float16 x) {
+ return convert_char16_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtn(float16 x) {
+ return convert_char16(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtn(float16 x) {
+ return convert_char16_sat(floor(x));
+}
+/* float to uchar conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_uchar_sat(), the rest call convert_uchar(). */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtz(float x) {
+ return convert_uchar(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtz(float x) {
+ return convert_uchar_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rte(float x) {
+ return convert_uchar(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rte(float x) {
+ return convert_uchar_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtp(float x) {
+ return convert_uchar(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtp(float x) {
+ return convert_uchar_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtn(float x) {
+ return convert_uchar(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtn(float x) {
+ return convert_uchar_sat(floor(x));
+}
+/* float2 to uchar2 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_uchar2_sat(), the rest call convert_uchar2(). */
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtz(float2 x) {
+ return convert_uchar2(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtz(float2 x) {
+ return convert_uchar2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rte(float2 x) {
+ return convert_uchar2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rte(float2 x) {
+ return convert_uchar2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtp(float2 x) {
+ return convert_uchar2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtp(float2 x) {
+ return convert_uchar2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtn(float2 x) {
+ return convert_uchar2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtn(float2 x) {
+ return convert_uchar2_sat(floor(x));
+}
+/* float3 to uchar3 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_uchar3_sat(), the rest call convert_uchar3(). */
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtz(float3 x) {
+ return convert_uchar3(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtz(float3 x) {
+ return convert_uchar3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rte(float3 x) {
+ return convert_uchar3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rte(float3 x) {
+ return convert_uchar3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtp(float3 x) {
+ return convert_uchar3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtp(float3 x) {
+ return convert_uchar3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtn(float3 x) {
+ return convert_uchar3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtn(float3 x) {
+ return convert_uchar3_sat(floor(x));
+}
+/* float4 to uchar4 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_uchar4_sat(), the rest call convert_uchar4(). */
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtz(float4 x) {
+ return convert_uchar4(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rtz(float4 x) {
+ return convert_uchar4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rte(float4 x) {
+ return convert_uchar4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rte(float4 x) {
+ return convert_uchar4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtp(float4 x) {
+ return convert_uchar4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rtp(float4 x) {
+ return convert_uchar4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtn(float4 x) {
+ return convert_uchar4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rtn(float4 x) {
+ return convert_uchar4_sat(floor(x));
+}
+/* float8 to uchar8 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_uchar8_sat(), the rest call convert_uchar8(). */
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtz(float8 x) {
+ return convert_uchar8(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rtz(float8 x) {
+ return convert_uchar8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rte(float8 x) {
+ return convert_uchar8(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rte(float8 x) {
+ return convert_uchar8_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtp(float8 x) {
+ return convert_uchar8(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rtp(float8 x) {
+ return convert_uchar8_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtn(float8 x) {
+ return convert_uchar8(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rtn(float8 x) {
+ return convert_uchar8_sat(floor(x));
+}
+/* float16 to uchar16 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_uchar16_sat(), the rest call convert_uchar16(). */
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtz(float16 x) {
+ return convert_uchar16(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rtz(float16 x) {
+ return convert_uchar16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rte(float16 x) {
+ return convert_uchar16(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rte(float16 x) {
+ return convert_uchar16_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtp(float16 x) {
+ return convert_uchar16(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rtp(float16 x) {
+ return convert_uchar16_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtn(float16 x) {
+ return convert_uchar16(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rtn(float16 x) {
+ return convert_uchar16_sat(floor(x));
+}
+/* float to short conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_short_sat(), the rest call convert_short(). */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtz(float x) {
+ return convert_short(x);
+}
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rtz(float x) {
+ return convert_short_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD short convert_short_rte(float x) {
+ return convert_short(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rte(float x) {
+ return convert_short_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtp(float x) {
+ return convert_short(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rtp(float x) {
+ return convert_short_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtn(float x) {
+ return convert_short(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rtn(float x) {
+ return convert_short_sat(floor(x));
+}
+/* float2 to short2 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_short2_sat(), the rest call convert_short2(). */
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtz(float2 x) {
+ return convert_short2(x);
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rtz(float2 x) {
+ return convert_short2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rte(float2 x) {
+ return convert_short2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rte(float2 x) {
+ return convert_short2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtp(float2 x) {
+ return convert_short2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rtp(float2 x) {
+ return convert_short2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtn(float2 x) {
+ return convert_short2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rtn(float2 x) {
+ return convert_short2_sat(floor(x));
+}
+/* float3 to short3 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_short3_sat(), the rest call convert_short3(). */
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtz(float3 x) {
+ return convert_short3(x);
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rtz(float3 x) {
+ return convert_short3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rte(float3 x) {
+ return convert_short3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rte(float3 x) {
+ return convert_short3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtp(float3 x) {
+ return convert_short3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rtp(float3 x) {
+ return convert_short3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtn(float3 x) {
+ return convert_short3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rtn(float3 x) {
+ return convert_short3_sat(floor(x));
+}
+/* float4 to short4 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_short4_sat(), the rest call convert_short4(). */
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtz(float4 x) {
+ return convert_short4(x);
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rtz(float4 x) {
+ return convert_short4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rte(float4 x) {
+ return convert_short4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rte(float4 x) {
+ return convert_short4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtp(float4 x) {
+ return convert_short4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rtp(float4 x) {
+ return convert_short4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtn(float4 x) {
+ return convert_short4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rtn(float4 x) {
+ return convert_short4_sat(floor(x));
+}
+/* float8 to short8 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_short8_sat(), the rest call convert_short8(). */
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtz(float8 x) {
+ return convert_short8(x);
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rtz(float8 x) {
+ return convert_short8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rte(float8 x) {
+ return convert_short8(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rte(float8 x) {
+ return convert_short8_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtp(float8 x) {
+ return convert_short8(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rtp(float8 x) {
+ return convert_short8_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtn(float8 x) {
+ return convert_short8(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rtn(float8 x) {
+ return convert_short8_sat(floor(x));
+}
+/* float16 to short16 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_short16_sat(), the rest call convert_short16(). */
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtz(float16 x) {
+ return convert_short16(x);
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtz(float16 x) {
+ return convert_short16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rte(float16 x) {
+ return convert_short16(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rte(float16 x) {
+ return convert_short16_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtp(float16 x) {
+ return convert_short16(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtp(float16 x) {
+ return convert_short16_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtn(float16 x) {
+ return convert_short16(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtn(float16 x) {
+ return convert_short16_sat(floor(x));
+}
+/* float to ushort conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_ushort_sat(), the rest call convert_ushort(). */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtz(float x) {
+ return convert_ushort(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtz(float x) {
+ return convert_ushort_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rte(float x) {
+ return convert_ushort(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rte(float x) {
+ return convert_ushort_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtp(float x) {
+ return convert_ushort(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtp(float x) {
+ return convert_ushort_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtn(float x) {
+ return convert_ushort(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtn(float x) {
+ return convert_ushort_sat(floor(x));
+}
+/* float2 to ushort2 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_ushort2_sat(), the rest call convert_ushort2(). */
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtz(float2 x) {
+ return convert_ushort2(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtz(float2 x) {
+ return convert_ushort2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rte(float2 x) {
+ return convert_ushort2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rte(float2 x) {
+ return convert_ushort2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtp(float2 x) {
+ return convert_ushort2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtp(float2 x) {
+ return convert_ushort2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtn(float2 x) {
+ return convert_ushort2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtn(float2 x) {
+ return convert_ushort2_sat(floor(x));
+}
+/* float3 to ushort3 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_ushort3_sat(), the rest call convert_ushort3(). */
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtz(float3 x) {
+ return convert_ushort3(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtz(float3 x) {
+ return convert_ushort3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rte(float3 x) {
+ return convert_ushort3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rte(float3 x) {
+ return convert_ushort3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtp(float3 x) {
+ return convert_ushort3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtp(float3 x) {
+ return convert_ushort3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtn(float3 x) {
+ return convert_ushort3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtn(float3 x) {
+ return convert_ushort3_sat(floor(x));
+}
+/* float4 to ushort4 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_ushort4_sat(), the rest call convert_ushort4(). */
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtz(float4 x) {
+ return convert_ushort4(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtz(float4 x) {
+ return convert_ushort4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rte(float4 x) {
+ return convert_ushort4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rte(float4 x) {
+ return convert_ushort4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtp(float4 x) {
+ return convert_ushort4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtp(float4 x) {
+ return convert_ushort4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtn(float4 x) {
+ return convert_ushort4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtn(float4 x) {
+ return convert_ushort4_sat(floor(x));
+}
+/* float8 to ushort8 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_ushort8_sat(), the rest call convert_ushort8(). */
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtz(float8 x) {
+ return convert_ushort8(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtz(float8 x) {
+ return convert_ushort8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rte(float8 x) {
+ return convert_ushort8(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rte(float8 x) {
+ return convert_ushort8_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtp(float8 x) {
+ return convert_ushort8(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtp(float8 x) {
+ return convert_ushort8_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtn(float8 x) {
+ return convert_ushort8(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtn(float8 x) {
+ return convert_ushort8_sat(floor(x));
+}
+/* float16 to ushort16 conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_ushort16_sat(), the rest call convert_ushort16(). */
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtz(float16 x) {
+ return convert_ushort16(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtz(float16 x) {
+ return convert_ushort16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rte(float16 x) {
+ return convert_ushort16(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rte(float16 x) {
+ return convert_ushort16_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtp(float16 x) {
+ return convert_ushort16(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtp(float16 x) {
+ return convert_ushort16_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtn(float16 x) {
+ return convert_ushort16(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtn(float16 x) {
+ return convert_ushort16_sat(floor(x));
+}
+/* float to int conversions with an explicit rounding suffix.
+ * _rtz forwards x as-is; _rte, _rtp and _rtn pass x through rint(),
+ * ceil() and floor() respectively before converting. The _sat variants
+ * call convert_int_sat(), the rest call convert_int(). */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtz(float x) {
+ return convert_int(x);
+}
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtz(float x) {
+ return convert_int_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD int convert_int_rte(float x) {
+ return convert_int(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rte(float x) {
+ return convert_int_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtp(float x) {
+ return convert_int(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtp(float x) {
+ return convert_int_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtn(float x) {
+ return convert_int(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtn(float x) {
+ return convert_int_sat(floor(x));
+}
+/* float2 -> int2 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_int2_sat(). */
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtz(float2 x)
+{
+ return convert_int2(x);
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtz(float2 x)
+{
+ return convert_int2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rte(float2 x)
+{
+ return convert_int2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rte(float2 x)
+{
+ return convert_int2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtp(float2 x)
+{
+ return convert_int2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtp(float2 x)
+{
+ return convert_int2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtn(float2 x)
+{
+ return convert_int2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtn(float2 x)
+{
+ return convert_int2_sat(floor(x));
+}
+/* float3 -> int3 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_int3_sat(). */
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtz(float3 x)
+{
+ return convert_int3(x);
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtz(float3 x)
+{
+ return convert_int3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rte(float3 x)
+{
+ return convert_int3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rte(float3 x)
+{
+ return convert_int3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtp(float3 x)
+{
+ return convert_int3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtp(float3 x)
+{
+ return convert_int3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtn(float3 x)
+{
+ return convert_int3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtn(float3 x)
+{
+ return convert_int3_sat(floor(x));
+}
+/* float4 -> int4 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_int4_sat(). */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtz(float4 x)
+{
+ return convert_int4(x);
+}
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtz(float4 x)
+{
+ return convert_int4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rte(float4 x)
+{
+ return convert_int4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rte(float4 x)
+{
+ return convert_int4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtp(float4 x)
+{
+ return convert_int4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtp(float4 x)
+{
+ return convert_int4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtn(float4 x)
+{
+ return convert_int4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtn(float4 x)
+{
+ return convert_int4_sat(floor(x));
+}
+/* float8 -> int8 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_int8_sat(). */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtz(float8 x)
+{
+ return convert_int8(x);
+}
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtz(float8 x)
+{
+ return convert_int8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rte(float8 x)
+{
+ return convert_int8(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rte(float8 x)
+{
+ return convert_int8_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtp(float8 x)
+{
+ return convert_int8(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtp(float8 x)
+{
+ return convert_int8_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtn(float8 x)
+{
+ return convert_int8(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtn(float8 x)
+{
+ return convert_int8_sat(floor(x));
+}
+/* float16 -> int16 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_int16_sat(). */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtz(float16 x)
+{
+ return convert_int16(x);
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtz(float16 x)
+{
+ return convert_int16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rte(float16 x)
+{
+ return convert_int16(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rte(float16 x)
+{
+ return convert_int16_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtp(float16 x)
+{
+ return convert_int16(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtp(float16 x)
+{
+ return convert_int16_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtn(float16 x)
+{
+ return convert_int16(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtn(float16 x)
+{
+ return convert_int16_sat(floor(x));
+}
+/* float -> uint conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_uint_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtz(float x)
+{
+ return convert_uint(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtz(float x)
+{
+ return convert_uint_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rte(float x)
+{
+ return convert_uint(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rte(float x)
+{
+ return convert_uint_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtp(float x)
+{
+ return convert_uint(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtp(float x)
+{
+ return convert_uint_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtn(float x)
+{
+ return convert_uint(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtn(float x)
+{
+ return convert_uint_sat(floor(x));
+}
+/* float2 -> uint2 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_uint2_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtz(float2 x)
+{
+ return convert_uint2(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtz(float2 x)
+{
+ return convert_uint2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rte(float2 x)
+{
+ return convert_uint2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rte(float2 x)
+{
+ return convert_uint2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtp(float2 x)
+{
+ return convert_uint2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtp(float2 x)
+{
+ return convert_uint2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtn(float2 x)
+{
+ return convert_uint2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtn(float2 x)
+{
+ return convert_uint2_sat(floor(x));
+}
+/* float3 -> uint3 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_uint3_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtz(float3 x)
+{
+ return convert_uint3(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtz(float3 x)
+{
+ return convert_uint3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rte(float3 x)
+{
+ return convert_uint3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rte(float3 x)
+{
+ return convert_uint3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtp(float3 x)
+{
+ return convert_uint3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtp(float3 x)
+{
+ return convert_uint3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtn(float3 x)
+{
+ return convert_uint3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtn(float3 x)
+{
+ return convert_uint3_sat(floor(x));
+}
+/* float4 -> uint4 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_uint4_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtz(float4 x)
+{
+ return convert_uint4(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtz(float4 x)
+{
+ return convert_uint4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rte(float4 x)
+{
+ return convert_uint4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rte(float4 x)
+{
+ return convert_uint4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtp(float4 x)
+{
+ return convert_uint4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtp(float4 x)
+{
+ return convert_uint4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtn(float4 x)
+{
+ return convert_uint4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtn(float4 x)
+{
+ return convert_uint4_sat(floor(x));
+}
+/* float8 -> uint8 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_uint8_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtz(float8 x)
+{
+ return convert_uint8(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtz(float8 x)
+{
+ return convert_uint8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rte(float8 x)
+{
+ return convert_uint8(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rte(float8 x)
+{
+ return convert_uint8_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtp(float8 x)
+{
+ return convert_uint8(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtp(float8 x)
+{
+ return convert_uint8_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtn(float8 x)
+{
+ return convert_uint8(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtn(float8 x)
+{
+ return convert_uint8_sat(floor(x));
+}
+/* float16 -> uint16 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_uint16_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtz(float16 x)
+{
+ return convert_uint16(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtz(float16 x)
+{
+ return convert_uint16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rte(float16 x)
+{
+ return convert_uint16(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rte(float16 x)
+{
+ return convert_uint16_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtp(float16 x)
+{
+ return convert_uint16(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtp(float16 x)
+{
+ return convert_uint16_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtn(float16 x)
+{
+ return convert_uint16(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtn(float16 x)
+{
+ return convert_uint16_sat(floor(x));
+}
+#ifdef cles_khr_int64
+/* float -> long conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_long_sat().
+ * One guard covers the whole family; all members need cles_khr_int64. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtz(float x)
+{
+ return convert_long(x);
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtz(float x)
+{
+ return convert_long_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_rte(float x)
+{
+ return convert_long(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rte(float x)
+{
+ return convert_long_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtp(float x)
+{
+ return convert_long(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtp(float x)
+{
+ return convert_long_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtn(float x)
+{
+ return convert_long(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtn(float x)
+{
+ return convert_long_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* float2 -> long2 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_long2_sat().
+ * One guard covers the whole family; all members need cles_khr_int64. */
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtz(float2 x)
+{
+ return convert_long2(x);
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtz(float2 x)
+{
+ return convert_long2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rte(float2 x)
+{
+ return convert_long2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rte(float2 x)
+{
+ return convert_long2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtp(float2 x)
+{
+ return convert_long2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtp(float2 x)
+{
+ return convert_long2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtn(float2 x)
+{
+ return convert_long2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtn(float2 x)
+{
+ return convert_long2_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* float3 -> long3 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_long3_sat().
+ * One guard covers the whole family; all members need cles_khr_int64. */
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtz(float3 x)
+{
+ return convert_long3(x);
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtz(float3 x)
+{
+ return convert_long3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rte(float3 x)
+{
+ return convert_long3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rte(float3 x)
+{
+ return convert_long3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtp(float3 x)
+{
+ return convert_long3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtp(float3 x)
+{
+ return convert_long3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtn(float3 x)
+{
+ return convert_long3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtn(float3 x)
+{
+ return convert_long3_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* float4 -> long4 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_long4_sat().
+ * One guard covers the whole family; all members need cles_khr_int64. */
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtz(float4 x)
+{
+ return convert_long4(x);
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rtz(float4 x)
+{
+ return convert_long4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rte(float4 x)
+{
+ return convert_long4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rte(float4 x)
+{
+ return convert_long4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtp(float4 x)
+{
+ return convert_long4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rtp(float4 x)
+{
+ return convert_long4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtn(float4 x)
+{
+ return convert_long4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rtn(float4 x)
+{
+ return convert_long4_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* float8 -> long8 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_long8_sat().
+ * One guard covers the whole family; all members need cles_khr_int64. */
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtz(float8 x)
+{
+ return convert_long8(x);
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rtz(float8 x)
+{
+ return convert_long8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rte(float8 x)
+{
+ return convert_long8(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rte(float8 x)
+{
+ return convert_long8_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtp(float8 x)
+{
+ return convert_long8(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rtp(float8 x)
+{
+ return convert_long8_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtn(float8 x)
+{
+ return convert_long8(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rtn(float8 x)
+{
+ return convert_long8_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* float16 -> long16 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_long16_sat().
+ * One guard covers the whole family; all members need cles_khr_int64. */
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtz(float16 x)
+{
+ return convert_long16(x);
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rtz(float16 x)
+{
+ return convert_long16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rte(float16 x)
+{
+ return convert_long16(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rte(float16 x)
+{
+ return convert_long16_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtp(float16 x)
+{
+ return convert_long16(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rtp(float16 x)
+{
+ return convert_long16_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtn(float16 x)
+{
+ return convert_long16(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rtn(float16 x)
+{
+ return convert_long16_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* float -> ulong conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_ulong_sat().
+ * One guard covers the whole family; all members need cles_khr_int64. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtz(float x)
+{
+ return convert_ulong(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rtz(float x)
+{
+ return convert_ulong_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rte(float x)
+{
+ return convert_ulong(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rte(float x)
+{
+ return convert_ulong_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtp(float x)
+{
+ return convert_ulong(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rtp(float x)
+{
+ return convert_ulong_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtn(float x)
+{
+ return convert_ulong(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rtn(float x)
+{
+ return convert_ulong_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* float2 -> ulong2 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_ulong2_sat().
+ * One guard covers the whole family; all members need cles_khr_int64. */
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtz(float2 x)
+{
+ return convert_ulong2(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rtz(float2 x)
+{
+ return convert_ulong2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rte(float2 x)
+{
+ return convert_ulong2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rte(float2 x)
+{
+ return convert_ulong2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtp(float2 x)
+{
+ return convert_ulong2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rtp(float2 x)
+{
+ return convert_ulong2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtn(float2 x)
+{
+ return convert_ulong2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rtn(float2 x)
+{
+ return convert_ulong2_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* float3 -> ulong3 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_ulong3_sat().
+ * One guard covers the whole family; all members need cles_khr_int64. */
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtz(float3 x)
+{
+ return convert_ulong3(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rtz(float3 x)
+{
+ return convert_ulong3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rte(float3 x)
+{
+ return convert_ulong3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rte(float3 x)
+{
+ return convert_ulong3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtp(float3 x)
+{
+ return convert_ulong3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rtp(float3 x)
+{
+ return convert_ulong3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtn(float3 x)
+{
+ return convert_ulong3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rtn(float3 x)
+{
+ return convert_ulong3_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* float4 -> ulong4 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_ulong4_sat().
+ * One guard covers the whole family; all members need cles_khr_int64. */
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtz(float4 x)
+{
+ return convert_ulong4(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rtz(float4 x)
+{
+ return convert_ulong4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rte(float4 x)
+{
+ return convert_ulong4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rte(float4 x)
+{
+ return convert_ulong4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtp(float4 x)
+{
+ return convert_ulong4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rtp(float4 x)
+{
+ return convert_ulong4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtn(float4 x)
+{
+ return convert_ulong4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rtn(float4 x)
+{
+ return convert_ulong4_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* float8 -> ulong8 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_ulong8_sat().
+ * One guard covers the whole family; all members need cles_khr_int64. */
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtz(float8 x)
+{
+ return convert_ulong8(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rtz(float8 x)
+{
+ return convert_ulong8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rte(float8 x)
+{
+ return convert_ulong8(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rte(float8 x)
+{
+ return convert_ulong8_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtp(float8 x)
+{
+ return convert_ulong8(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rtp(float8 x)
+{
+ return convert_ulong8_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtn(float8 x)
+{
+ return convert_ulong8(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rtn(float8 x)
+{
+ return convert_ulong8_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+#ifdef cles_khr_int64
+/* float16 -> ulong16 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_ulong16_sat().
+ * One guard covers the whole family; all members need cles_khr_int64. */
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtz(float16 x)
+{
+ return convert_ulong16(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtz(float16 x)
+{
+ return convert_ulong16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rte(float16 x)
+{
+ return convert_ulong16(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rte(float16 x)
+{
+ return convert_ulong16_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtp(float16 x)
+{
+ return convert_ulong16(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtp(float16 x)
+{
+ return convert_ulong16_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtn(float16 x)
+{
+ return convert_ulong16(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtn(float16 x)
+{
+ return convert_ulong16_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+#ifdef cl_khr_fp64
+/* double -> char conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_char_sat().
+ * One guard covers the whole family; all members need cl_khr_fp64. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtz(double x)
+{
+ return convert_char(x);
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtz(double x)
+{
+ return convert_char_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_rte(double x)
+{
+ return convert_char(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rte(double x)
+{
+ return convert_char_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtp(double x)
+{
+ return convert_char(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtp(double x)
+{
+ return convert_char_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtn(double x)
+{
+ return convert_char(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtn(double x)
+{
+ return convert_char_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+#ifdef cl_khr_fp64
+/* double2 -> char2 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_char2_sat().
+ * One guard covers the whole family; all members need cl_khr_fp64. */
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtz(double2 x)
+{
+ return convert_char2(x);
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtz(double2 x)
+{
+ return convert_char2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rte(double2 x)
+{
+ return convert_char2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rte(double2 x)
+{
+ return convert_char2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtp(double2 x)
+{
+ return convert_char2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtp(double2 x)
+{
+ return convert_char2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtn(double2 x)
+{
+ return convert_char2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtn(double2 x)
+{
+ return convert_char2_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+#ifdef cl_khr_fp64
+/* double3 -> char3 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_char3_sat().
+ * One guard covers the whole family; all members need cl_khr_fp64. */
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtz(double3 x)
+{
+ return convert_char3(x);
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtz(double3 x)
+{
+ return convert_char3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rte(double3 x)
+{
+ return convert_char3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rte(double3 x)
+{
+ return convert_char3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtp(double3 x)
+{
+ return convert_char3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtp(double3 x)
+{
+ return convert_char3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtn(double3 x)
+{
+ return convert_char3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtn(double3 x)
+{
+ return convert_char3_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+#ifdef cl_khr_fp64
+/* double4 -> char4 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_char4_sat().
+ * One guard covers the whole family; all members need cl_khr_fp64. */
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtz(double4 x)
+{
+ return convert_char4(x);
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtz(double4 x)
+{
+ return convert_char4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rte(double4 x)
+{
+ return convert_char4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rte(double4 x)
+{
+ return convert_char4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtp(double4 x)
+{
+ return convert_char4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtp(double4 x)
+{
+ return convert_char4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtn(double4 x)
+{
+ return convert_char4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtn(double4 x)
+{
+ return convert_char4_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+#ifdef cl_khr_fp64
+/* double8 -> char8 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_char8_sat().
+ * One guard covers the whole family; all members need cl_khr_fp64. */
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtz(double8 x)
+{
+ return convert_char8(x);
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtz(double8 x)
+{
+ return convert_char8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rte(double8 x)
+{
+ return convert_char8(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rte(double8 x)
+{
+ return convert_char8_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtp(double8 x)
+{
+ return convert_char8(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtp(double8 x)
+{
+ return convert_char8_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtn(double8 x)
+{
+ return convert_char8(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtn(double8 x)
+{
+ return convert_char8_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+#ifdef cl_khr_fp64
+/* double16 -> char16 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_char16_sat().
+ * One guard covers the whole family; all members need cl_khr_fp64. */
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtz(double16 x)
+{
+ return convert_char16(x);
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtz(double16 x)
+{
+ return convert_char16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rte(double16 x)
+{
+ return convert_char16(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rte(double16 x)
+{
+ return convert_char16_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtp(double16 x)
+{
+ return convert_char16(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtp(double16 x)
+{
+ return convert_char16_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtn(double16 x)
+{
+ return convert_char16(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtn(double16 x)
+{
+ return convert_char16_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+#ifdef cl_khr_fp64
+/* double -> uchar conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_uchar_sat().
+ * One guard covers the whole family; all members need cl_khr_fp64. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtz(double x)
+{
+ return convert_uchar(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtz(double x)
+{
+ return convert_uchar_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rte(double x)
+{
+ return convert_uchar(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rte(double x)
+{
+ return convert_uchar_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtp(double x)
+{
+ return convert_uchar(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtp(double x)
+{
+ return convert_uchar_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtn(double x)
+{
+ return convert_uchar(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtn(double x)
+{
+ return convert_uchar_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+#ifdef cl_khr_fp64
+/* double2 -> uchar2 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_uchar2_sat().
+ * One guard covers the whole family; all members need cl_khr_fp64. */
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtz(double2 x)
+{
+ return convert_uchar2(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtz(double2 x)
+{
+ return convert_uchar2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rte(double2 x)
+{
+ return convert_uchar2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rte(double2 x)
+{
+ return convert_uchar2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtp(double2 x)
+{
+ return convert_uchar2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtp(double2 x)
+{
+ return convert_uchar2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtn(double2 x)
+{
+ return convert_uchar2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtn(double2 x)
+{
+ return convert_uchar2_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+#ifdef cl_khr_fp64
+/* double3 -> uchar3 conversions with explicit rounding-mode suffixes:
+ * _rtz forwards the argument unchanged, _rte pre-rounds with rint(),
+ * _rtp with ceil(), _rtn with floor(); _sat forms call convert_uchar3_sat().
+ * One guard covers the whole family; all members need cl_khr_fp64. */
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtz(double3 x)
+{
+ return convert_uchar3(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtz(double3 x)
+{
+ return convert_uchar3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rte(double3 x)
+{
+ return convert_uchar3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rte(double3 x)
+{
+ return convert_uchar3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtp(double3 x)
+{
+ return convert_uchar3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtp(double3 x)
+{
+ return convert_uchar3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtn(double3 x)
+{
+ return convert_uchar3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtn(double3 x)
+{
+ return convert_uchar3_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+#ifdef cl_khr_fp64
+/*
+ * double4 -> uchar4 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_uchar4[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtz(double4 x)
+{
+ return convert_uchar4(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtz(double4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rte(double4 x)
+{
+ return convert_uchar4(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rte(double4 x)
+{
+ return convert_uchar4_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtp(double4 x)
+{
+ return convert_uchar4(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtp(double4 x)
+{
+ return convert_uchar4_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtn(double4 x)
+{
+ return convert_uchar4(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtn(double4 x)
+{
+ return convert_uchar4_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double8 -> uchar8 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_uchar8[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtz(double8 x)
+{
+ return convert_uchar8(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtz(double8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rte(double8 x)
+{
+ return convert_uchar8(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rte(double8 x)
+{
+ return convert_uchar8_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtp(double8 x)
+{
+ return convert_uchar8(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtp(double8 x)
+{
+ return convert_uchar8_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtn(double8 x)
+{
+ return convert_uchar8(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtn(double8 x)
+{
+ return convert_uchar8_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double16 -> uchar16 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_uchar16[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtz(double16 x)
+{
+ return convert_uchar16(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtz(double16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rte(double16 x)
+{
+ return convert_uchar16(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rte(double16 x)
+{
+ return convert_uchar16_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtp(double16 x)
+{
+ return convert_uchar16(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtp(double16 x)
+{
+ return convert_uchar16_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtn(double16 x)
+{
+ return convert_uchar16(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtn(double16 x)
+{
+ return convert_uchar16_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double -> short conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_short[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtz(double x)
+{
+ return convert_short(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtz(double x)
+{
+ return convert_short_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rte(double x)
+{
+ return convert_short(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rte(double x)
+{
+ return convert_short_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtp(double x)
+{
+ return convert_short(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtp(double x)
+{
+ return convert_short_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtn(double x)
+{
+ return convert_short(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtn(double x)
+{
+ return convert_short_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double2 -> short2 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_short2[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtz(double2 x)
+{
+ return convert_short2(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtz(double2 x)
+{
+ return convert_short2_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rte(double2 x)
+{
+ return convert_short2(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rte(double2 x)
+{
+ return convert_short2_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtp(double2 x)
+{
+ return convert_short2(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtp(double2 x)
+{
+ return convert_short2_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtn(double2 x)
+{
+ return convert_short2(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtn(double2 x)
+{
+ return convert_short2_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double3 -> short3 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_short3[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtz(double3 x)
+{
+ return convert_short3(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtz(double3 x)
+{
+ return convert_short3_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rte(double3 x)
+{
+ return convert_short3(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rte(double3 x)
+{
+ return convert_short3_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtp(double3 x)
+{
+ return convert_short3(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtp(double3 x)
+{
+ return convert_short3_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtn(double3 x)
+{
+ return convert_short3(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtn(double3 x)
+{
+ return convert_short3_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double4 -> short4 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_short4[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtz(double4 x)
+{
+ return convert_short4(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtz(double4 x)
+{
+ return convert_short4_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rte(double4 x)
+{
+ return convert_short4(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rte(double4 x)
+{
+ return convert_short4_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtp(double4 x)
+{
+ return convert_short4(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtp(double4 x)
+{
+ return convert_short4_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtn(double4 x)
+{
+ return convert_short4(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtn(double4 x)
+{
+ return convert_short4_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double8 -> short8 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_short8[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtz(double8 x)
+{
+ return convert_short8(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtz(double8 x)
+{
+ return convert_short8_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rte(double8 x)
+{
+ return convert_short8(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rte(double8 x)
+{
+ return convert_short8_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtp(double8 x)
+{
+ return convert_short8(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtp(double8 x)
+{
+ return convert_short8_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtn(double8 x)
+{
+ return convert_short8(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtn(double8 x)
+{
+ return convert_short8_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double16 -> short16 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_short16[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtz(double16 x)
+{
+ return convert_short16(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtz(double16 x)
+{
+ return convert_short16_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rte(double16 x)
+{
+ return convert_short16(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rte(double16 x)
+{
+ return convert_short16_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtp(double16 x)
+{
+ return convert_short16(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtp(double16 x)
+{
+ return convert_short16_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtn(double16 x)
+{
+ return convert_short16(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtn(double16 x)
+{
+ return convert_short16_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double -> ushort conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_ushort[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtz(double x)
+{
+ return convert_ushort(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtz(double x)
+{
+ return convert_ushort_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rte(double x)
+{
+ return convert_ushort(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rte(double x)
+{
+ return convert_ushort_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtp(double x)
+{
+ return convert_ushort(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtp(double x)
+{
+ return convert_ushort_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtn(double x)
+{
+ return convert_ushort(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtn(double x)
+{
+ return convert_ushort_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double2 -> ushort2 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_ushort2[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtz(double2 x)
+{
+ return convert_ushort2(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtz(double2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rte(double2 x)
+{
+ return convert_ushort2(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rte(double2 x)
+{
+ return convert_ushort2_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtp(double2 x)
+{
+ return convert_ushort2(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtp(double2 x)
+{
+ return convert_ushort2_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtn(double2 x)
+{
+ return convert_ushort2(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtn(double2 x)
+{
+ return convert_ushort2_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double3 -> ushort3 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_ushort3[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtz(double3 x)
+{
+ return convert_ushort3(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtz(double3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rte(double3 x)
+{
+ return convert_ushort3(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rte(double3 x)
+{
+ return convert_ushort3_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtp(double3 x)
+{
+ return convert_ushort3(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtp(double3 x)
+{
+ return convert_ushort3_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtn(double3 x)
+{
+ return convert_ushort3(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtn(double3 x)
+{
+ return convert_ushort3_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double4 -> ushort4 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_ushort4[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtz(double4 x)
+{
+ return convert_ushort4(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtz(double4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rte(double4 x)
+{
+ return convert_ushort4(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rte(double4 x)
+{
+ return convert_ushort4_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtp(double4 x)
+{
+ return convert_ushort4(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtp(double4 x)
+{
+ return convert_ushort4_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtn(double4 x)
+{
+ return convert_ushort4(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtn(double4 x)
+{
+ return convert_ushort4_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double8 -> ushort8 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_ushort8[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtz(double8 x)
+{
+ return convert_ushort8(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtz(double8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rte(double8 x)
+{
+ return convert_ushort8(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rte(double8 x)
+{
+ return convert_ushort8_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtp(double8 x)
+{
+ return convert_ushort8(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtp(double8 x)
+{
+ return convert_ushort8_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtn(double8 x)
+{
+ return convert_ushort8(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtn(double8 x)
+{
+ return convert_ushort8_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double16 -> ushort16 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_ushort16[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtz(double16 x)
+{
+ return convert_ushort16(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtz(double16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rte(double16 x)
+{
+ return convert_ushort16(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rte(double16 x)
+{
+ return convert_ushort16_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtp(double16 x)
+{
+ return convert_ushort16(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtp(double16 x)
+{
+ return convert_ushort16_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtn(double16 x)
+{
+ return convert_ushort16(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtn(double16 x)
+{
+ return convert_ushort16_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double -> int conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_int[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtz(double x)
+{
+ return convert_int(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtz(double x)
+{
+ return convert_int_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rte(double x)
+{
+ return convert_int(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rte(double x)
+{
+ return convert_int_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtp(double x)
+{
+ return convert_int(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtp(double x)
+{
+ return convert_int_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtn(double x)
+{
+ return convert_int(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtn(double x)
+{
+ return convert_int_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double2 -> int2 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_int2[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtz(double2 x)
+{
+ return convert_int2(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtz(double2 x)
+{
+ return convert_int2_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rte(double2 x)
+{
+ return convert_int2(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rte(double2 x)
+{
+ return convert_int2_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtp(double2 x)
+{
+ return convert_int2(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtp(double2 x)
+{
+ return convert_int2_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtn(double2 x)
+{
+ return convert_int2(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtn(double2 x)
+{
+ return convert_int2_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double3 -> int3 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_int3[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtz(double3 x)
+{
+ return convert_int3(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtz(double3 x)
+{
+ return convert_int3_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rte(double3 x)
+{
+ return convert_int3(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rte(double3 x)
+{
+ return convert_int3_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtp(double3 x)
+{
+ return convert_int3(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtp(double3 x)
+{
+ return convert_int3_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtn(double3 x)
+{
+ return convert_int3(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtn(double3 x)
+{
+ return convert_int3_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double4 -> int4 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_int4[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtz(double4 x)
+{
+ return convert_int4(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtz(double4 x)
+{
+ return convert_int4_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rte(double4 x)
+{
+ return convert_int4(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rte(double4 x)
+{
+ return convert_int4_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtp(double4 x)
+{
+ return convert_int4(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtp(double4 x)
+{
+ return convert_int4_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtn(double4 x)
+{
+ return convert_int4(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtn(double4 x)
+{
+ return convert_int4_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double8 -> int8 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_int8[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtz(double8 x)
+{
+ return convert_int8(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtz(double8 x)
+{
+ return convert_int8_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rte(double8 x)
+{
+ return convert_int8(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rte(double8 x)
+{
+ return convert_int8_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtp(double8 x)
+{
+ return convert_int8(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtp(double8 x)
+{
+ return convert_int8_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtn(double8 x)
+{
+ return convert_int8(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtn(double8 x)
+{
+ return convert_int8_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double16 -> int16 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_int16[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtz(double16 x)
+{
+ return convert_int16(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtz(double16 x)
+{
+ return convert_int16_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rte(double16 x)
+{
+ return convert_int16(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rte(double16 x)
+{
+ return convert_int16_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtp(double16 x)
+{
+ return convert_int16(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtp(double16 x)
+{
+ return convert_int16_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtn(double16 x)
+{
+ return convert_int16(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtn(double16 x)
+{
+ return convert_int16_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double -> uint conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_uint[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtz(double x)
+{
+ return convert_uint(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtz(double x)
+{
+ return convert_uint_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rte(double x)
+{
+ return convert_uint(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rte(double x)
+{
+ return convert_uint_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtp(double x)
+{
+ return convert_uint(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtp(double x)
+{
+ return convert_uint_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtn(double x)
+{
+ return convert_uint(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtn(double x)
+{
+ return convert_uint_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double2 -> uint2 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_uint2[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtz(double2 x)
+{
+ return convert_uint2(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtz(double2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rte(double2 x)
+{
+ return convert_uint2(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rte(double2 x)
+{
+ return convert_uint2_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtp(double2 x)
+{
+ return convert_uint2(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtp(double2 x)
+{
+ return convert_uint2_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtn(double2 x)
+{
+ return convert_uint2(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtn(double2 x)
+{
+ return convert_uint2_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double3 -> uint3 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_uint3[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtz(double3 x)
+{
+ return convert_uint3(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtz(double3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rte(double3 x)
+{
+ return convert_uint3(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rte(double3 x)
+{
+ return convert_uint3_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtp(double3 x)
+{
+ return convert_uint3(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtp(double3 x)
+{
+ return convert_uint3_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtn(double3 x)
+{
+ return convert_uint3(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtn(double3 x)
+{
+ return convert_uint3_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double4 -> uint4 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_uint4[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtz(double4 x)
+{
+ return convert_uint4(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtz(double4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rte(double4 x)
+{
+ return convert_uint4(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rte(double4 x)
+{
+ return convert_uint4_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtp(double4 x)
+{
+ return convert_uint4(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtp(double4 x)
+{
+ return convert_uint4_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtn(double4 x)
+{
+ return convert_uint4(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtn(double4 x)
+{
+ return convert_uint4_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double8 -> uint8 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_uint8[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtz(double8 x)
+{
+ return convert_uint8(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtz(double8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rte(double8 x)
+{
+ return convert_uint8(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rte(double8 x)
+{
+ return convert_uint8_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtp(double8 x)
+{
+ return convert_uint8(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtp(double8 x)
+{
+ return convert_uint8_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtn(double8 x)
+{
+ return convert_uint8(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtn(double8 x)
+{
+ return convert_uint8_sat(floor(x));
+}
+#endif
+#ifdef cl_khr_fp64
+/*
+ * double16 -> uint16 conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_uint16[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtz(double16 x)
+{
+ return convert_uint16(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtz(double16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rte(double16 x)
+{
+ return convert_uint16(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rte(double16 x)
+{
+ return convert_uint16_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtp(double16 x)
+{
+ return convert_uint16(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtp(double16 x)
+{
+ return convert_uint16_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtn(double16 x)
+{
+ return convert_uint16(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtn(double16 x)
+{
+ return convert_uint16_sat(floor(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/*
+ * double -> long conversions with an explicit rounding mode, plain and _sat.
+ * x is pre-rounded to an integral value (rint for _rte, ceil for _rtp,
+ * floor for _rtn, nothing for _rtz) and handed to convert_long[_sat].
+ */
+/* _rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtz(double x)
+{
+ return convert_long(x);
+}
+
+/* _sat_rtz: no pre-rounding, forwarded unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtz(double x)
+{
+ return convert_long_sat(x);
+}
+
+/* _rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rte(double x)
+{
+ return convert_long(rint(x));
+}
+
+/* _sat_rte: pre-round with rint(). */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rte(double x)
+{
+ return convert_long_sat(rint(x));
+}
+
+/* _rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtp(double x)
+{
+ return convert_long(ceil(x));
+}
+
+/* _sat_rtp: pre-round with ceil(). */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtp(double x)
+{
+ return convert_long_sat(ceil(x));
+}
+
+/* _rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtn(double x)
+{
+ return convert_long(floor(x));
+}
+
+/* _sat_rtn: pre-round with floor(). */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtn(double x)
+{
+ return convert_long_sat(floor(x));
+}
+#endif
+/*
+ * double2 -> long2 conversions with an explicit rounding mode.
+ * _rtz forwards straight to the plain conversion; the other modes round in
+ * floating point first (rint for _rte, ceil for _rtp, floor for _rtn).
+ * The _sat variants call the saturating conversion instead.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtz(double2 x)
+{
+ return convert_long2(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtz(double2 x)
+{
+ return convert_long2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rte(double2 x)
+{
+ return convert_long2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rte(double2 x)
+{
+ return convert_long2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtp(double2 x)
+{
+ return convert_long2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtp(double2 x)
+{
+ return convert_long2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtn(double2 x)
+{
+ return convert_long2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtn(double2 x)
+{
+ return convert_long2_sat(floor(x));
+}
+#endif
+/*
+ * double3 -> long3 conversions with an explicit rounding mode.
+ * _rtz forwards straight to the plain conversion; the other modes round in
+ * floating point first (rint for _rte, ceil for _rtp, floor for _rtn).
+ * The _sat variants call the saturating conversion instead.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtz(double3 x)
+{
+ return convert_long3(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtz(double3 x)
+{
+ return convert_long3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rte(double3 x)
+{
+ return convert_long3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rte(double3 x)
+{
+ return convert_long3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtp(double3 x)
+{
+ return convert_long3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtp(double3 x)
+{
+ return convert_long3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtn(double3 x)
+{
+ return convert_long3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtn(double3 x)
+{
+ return convert_long3_sat(floor(x));
+}
+#endif
+/*
+ * double4 -> long4 conversions with an explicit rounding mode.
+ * _rtz forwards straight to the plain conversion; the other modes round in
+ * floating point first (rint for _rte, ceil for _rtp, floor for _rtn).
+ * The _sat variants call the saturating conversion instead.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtz(double4 x)
+{
+ return convert_long4(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtz(double4 x)
+{
+ return convert_long4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rte(double4 x)
+{
+ return convert_long4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rte(double4 x)
+{
+ return convert_long4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtp(double4 x)
+{
+ return convert_long4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtp(double4 x)
+{
+ return convert_long4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtn(double4 x)
+{
+ return convert_long4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtn(double4 x)
+{
+ return convert_long4_sat(floor(x));
+}
+#endif
+/*
+ * double8 -> long8 conversions with an explicit rounding mode.
+ * _rtz forwards straight to the plain conversion; the other modes round in
+ * floating point first (rint for _rte, ceil for _rtp, floor for _rtn).
+ * The _sat variants call the saturating conversion instead.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtz(double8 x)
+{
+ return convert_long8(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtz(double8 x)
+{
+ return convert_long8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rte(double8 x)
+{
+ return convert_long8(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rte(double8 x)
+{
+ return convert_long8_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtp(double8 x)
+{
+ return convert_long8(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtp(double8 x)
+{
+ return convert_long8_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtn(double8 x)
+{
+ return convert_long8(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtn(double8 x)
+{
+ return convert_long8_sat(floor(x));
+}
+#endif
+/*
+ * double16 -> long16 conversions with an explicit rounding mode.
+ * _rtz forwards straight to the plain conversion; the other modes round in
+ * floating point first (rint for _rte, ceil for _rtp, floor for _rtn).
+ * The _sat variants call the saturating conversion instead.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtz(double16 x)
+{
+ return convert_long16(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtz(double16 x)
+{
+ return convert_long16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rte(double16 x)
+{
+ return convert_long16(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rte(double16 x)
+{
+ return convert_long16_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtp(double16 x)
+{
+ return convert_long16(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtp(double16 x)
+{
+ return convert_long16_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtn(double16 x)
+{
+ return convert_long16(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtn(double16 x)
+{
+ return convert_long16_sat(floor(x));
+}
+#endif
+/*
+ * double -> ulong conversions with an explicit rounding mode.
+ * _rtz forwards straight to the plain conversion; the other modes round in
+ * floating point first (rint for _rte, ceil for _rtp, floor for _rtn).
+ * The _sat variants call the saturating conversion instead.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtz(double x)
+{
+ return convert_ulong(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtz(double x)
+{
+ return convert_ulong_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rte(double x)
+{
+ return convert_ulong(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rte(double x)
+{
+ return convert_ulong_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtp(double x)
+{
+ return convert_ulong(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtp(double x)
+{
+ return convert_ulong_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtn(double x)
+{
+ return convert_ulong(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtn(double x)
+{
+ return convert_ulong_sat(floor(x));
+}
+#endif
+/*
+ * double2 -> ulong2 conversions with an explicit rounding mode.
+ * _rtz forwards straight to the plain conversion; the other modes round in
+ * floating point first (rint for _rte, ceil for _rtp, floor for _rtn).
+ * The _sat variants call the saturating conversion instead.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtz(double2 x)
+{
+ return convert_ulong2(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtz(double2 x)
+{
+ return convert_ulong2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rte(double2 x)
+{
+ return convert_ulong2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rte(double2 x)
+{
+ return convert_ulong2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtp(double2 x)
+{
+ return convert_ulong2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtp(double2 x)
+{
+ return convert_ulong2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtn(double2 x)
+{
+ return convert_ulong2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtn(double2 x)
+{
+ return convert_ulong2_sat(floor(x));
+}
+#endif
+/*
+ * double3 -> ulong3 conversions with an explicit rounding mode.
+ * _rtz forwards straight to the plain conversion; the other modes round in
+ * floating point first (rint for _rte, ceil for _rtp, floor for _rtn).
+ * The _sat variants call the saturating conversion instead.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtz(double3 x)
+{
+ return convert_ulong3(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtz(double3 x)
+{
+ return convert_ulong3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rte(double3 x)
+{
+ return convert_ulong3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rte(double3 x)
+{
+ return convert_ulong3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtp(double3 x)
+{
+ return convert_ulong3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtp(double3 x)
+{
+ return convert_ulong3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtn(double3 x)
+{
+ return convert_ulong3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtn(double3 x)
+{
+ return convert_ulong3_sat(floor(x));
+}
+#endif
+/*
+ * double4 -> ulong4 conversions with an explicit rounding mode.
+ * _rtz forwards straight to the plain conversion; the other modes round in
+ * floating point first (rint for _rte, ceil for _rtp, floor for _rtn).
+ * The _sat variants call the saturating conversion instead.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtz(double4 x)
+{
+ return convert_ulong4(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtz(double4 x)
+{
+ return convert_ulong4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rte(double4 x)
+{
+ return convert_ulong4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rte(double4 x)
+{
+ return convert_ulong4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtp(double4 x)
+{
+ return convert_ulong4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtp(double4 x)
+{
+ return convert_ulong4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtn(double4 x)
+{
+ return convert_ulong4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtn(double4 x)
+{
+ return convert_ulong4_sat(floor(x));
+}
+#endif
+/*
+ * double8 -> ulong8 conversions with an explicit rounding mode.
+ * _rtz forwards straight to the plain conversion; the other modes round in
+ * floating point first (rint for _rte, ceil for _rtp, floor for _rtn).
+ * The _sat variants call the saturating conversion instead.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtz(double8 x)
+{
+ return convert_ulong8(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtz(double8 x)
+{
+ return convert_ulong8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rte(double8 x)
+{
+ return convert_ulong8(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rte(double8 x)
+{
+ return convert_ulong8_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtp(double8 x)
+{
+ return convert_ulong8(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtp(double8 x)
+{
+ return convert_ulong8_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtn(double8 x)
+{
+ return convert_ulong8(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtn(double8 x)
+{
+ return convert_ulong8_sat(floor(x));
+}
+#endif
+/*
+ * double16 -> ulong16 conversions with an explicit rounding mode.
+ * _rtz forwards straight to the plain conversion; the other modes round in
+ * floating point first (rint for _rte, ceil for _rtp, floor for _rtn).
+ * The _sat variants call the saturating conversion instead.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtz(double16 x)
+{
+ return convert_ulong16(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtz(double16 x)
+{
+ return convert_ulong16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rte(double16 x)
+{
+ return convert_ulong16(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rte(double16 x)
+{
+ return convert_ulong16_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtp(double16 x)
+{
+ return convert_ulong16(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtp(double16 x)
+{
+ return convert_ulong16_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtn(double16 x)
+{
+ return convert_ulong16(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtn(double16 x)
+{
+ return convert_ulong16_sat(floor(x));
+}
+#endif
+/*
+ * char -> float conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(char x)
+{
+ float r = convert_float(x);
+ char y = convert_char(r); /* round-trip of r; must not read y itself */
+ uchar abs_x = abs(x);
+ uchar abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(char x)
+{
+ return convert_float(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(char x)
+{
+ float r = convert_float(x);
+ char y = convert_char(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(char x)
+{
+ float r = convert_float(x);
+ char y = convert_char(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+}
+/*
+ * char2 -> float2 conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(char2 x)
+{
+ float2 r = convert_float2(x);
+ char2 y = convert_char2(r); /* round-trip of r; must not read y itself */
+ uchar2 abs_x = abs(x);
+ uchar2 abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(char2 x)
+{
+ return convert_float2(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(char2 x)
+{
+ float2 r = convert_float2(x);
+ char2 y = convert_char2(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(char2 x)
+{
+ float2 r = convert_float2(x);
+ char2 y = convert_char2(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+}
+/*
+ * char3 -> float3 conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(char3 x)
+{
+ float3 r = convert_float3(x);
+ char3 y = convert_char3(r); /* round-trip of r; must not read y itself */
+ uchar3 abs_x = abs(x);
+ uchar3 abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(char3 x)
+{
+ return convert_float3(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(char3 x)
+{
+ float3 r = convert_float3(x);
+ char3 y = convert_char3(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(char3 x)
+{
+ float3 r = convert_float3(x);
+ char3 y = convert_char3(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+}
+/*
+ * char4 -> float4 conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(char4 x)
+{
+ float4 r = convert_float4(x);
+ char4 y = convert_char4(r); /* round-trip of r; must not read y itself */
+ uchar4 abs_x = abs(x);
+ uchar4 abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(char4 x)
+{
+ return convert_float4(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(char4 x)
+{
+ float4 r = convert_float4(x);
+ char4 y = convert_char4(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(char4 x)
+{
+ float4 r = convert_float4(x);
+ char4 y = convert_char4(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+}
+/*
+ * char8 -> float8 conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(char8 x)
+{
+ float8 r = convert_float8(x);
+ char8 y = convert_char8(r); /* round-trip of r; must not read y itself */
+ uchar8 abs_x = abs(x);
+ uchar8 abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(char8 x)
+{
+ return convert_float8(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(char8 x)
+{
+ float8 r = convert_float8(x);
+ char8 y = convert_char8(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(char8 x)
+{
+ float8 r = convert_float8(x);
+ char8 y = convert_char8(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+}
+/*
+ * char16 -> float16 conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(char16 x)
+{
+ float16 r = convert_float16(x);
+ char16 y = convert_char16(r); /* round-trip of r; must not read y itself */
+ uchar16 abs_x = abs(x);
+ uchar16 abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(char16 x)
+{
+ return convert_float16(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(char16 x)
+{
+ float16 r = convert_float16(x);
+ char16 y = convert_char16(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(char16 x)
+{
+ float16 r = convert_float16(x);
+ char16 y = convert_char16(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+}
+/*
+ * char -> double conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(char x)
+{
+ double r = convert_double(x);
+ char y = convert_char(r); /* round-trip of r; must not read y itself */
+ uchar abs_x = abs(x);
+ uchar abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(char x)
+{
+ return convert_double(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(char x)
+{
+ double r = convert_double(x);
+ char y = convert_char(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(char x)
+{
+ double r = convert_double(x);
+ char y = convert_char(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+}
+#endif
+/*
+ * char2 -> double2 conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(char2 x)
+{
+ double2 r = convert_double2(x);
+ char2 y = convert_char2(r); /* round-trip of r; must not read y itself */
+ uchar2 abs_x = abs(x);
+ uchar2 abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(char2 x)
+{
+ return convert_double2(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(char2 x)
+{
+ double2 r = convert_double2(x);
+ char2 y = convert_char2(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(char2 x)
+{
+ double2 r = convert_double2(x);
+ char2 y = convert_char2(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+}
+#endif
+/*
+ * char3 -> double3 conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(char3 x)
+{
+ double3 r = convert_double3(x);
+ char3 y = convert_char3(r); /* round-trip of r; must not read y itself */
+ uchar3 abs_x = abs(x);
+ uchar3 abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(char3 x)
+{
+ return convert_double3(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(char3 x)
+{
+ double3 r = convert_double3(x);
+ char3 y = convert_char3(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(char3 x)
+{
+ double3 r = convert_double3(x);
+ char3 y = convert_char3(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+}
+#endif
+/*
+ * char4 -> double4 conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(char4 x)
+{
+ double4 r = convert_double4(x);
+ char4 y = convert_char4(r); /* round-trip of r; must not read y itself */
+ uchar4 abs_x = abs(x);
+ uchar4 abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(char4 x)
+{
+ return convert_double4(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(char4 x)
+{
+ double4 r = convert_double4(x);
+ char4 y = convert_char4(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(char4 x)
+{
+ double4 r = convert_double4(x);
+ char4 y = convert_char4(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+}
+#endif
+/*
+ * char8 -> double8 conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(char8 x)
+{
+ double8 r = convert_double8(x);
+ char8 y = convert_char8(r); /* round-trip of r; must not read y itself */
+ uchar8 abs_x = abs(x);
+ uchar8 abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(char8 x)
+{
+ return convert_double8(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(char8 x)
+{
+ double8 r = convert_double8(x);
+ char8 y = convert_char8(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(char8 x)
+{
+ double8 r = convert_double8(x);
+ char8 y = convert_char8(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+}
+#endif
+/*
+ * char16 -> double16 conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(char16 x)
+{
+ double16 r = convert_double16(x);
+ char16 y = convert_char16(r); /* round-trip of r; must not read y itself */
+ uchar16 abs_x = abs(x);
+ uchar16 abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(char16 x)
+{
+ return convert_double16(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(char16 x)
+{
+ double16 r = convert_double16(x);
+ char16 y = convert_char16(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(char16 x)
+{
+ double16 r = convert_double16(x);
+ char16 y = convert_char16(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+}
+#endif
+/*
+ * uchar -> float conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(uchar x)
+{
+ float r = convert_float(x);
+ uchar y = convert_uchar(r); /* round-trip of r; must not read y itself */
+ uchar abs_x = abs(x);
+ uchar abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(uchar x)
+{
+ return convert_float(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(uchar x)
+{
+ float r = convert_float(x);
+ uchar y = convert_uchar(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(uchar x)
+{
+ float r = convert_float(x);
+ uchar y = convert_uchar(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+}
+/*
+ * uchar2 -> float2 conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(uchar2 x)
+{
+ float2 r = convert_float2(x);
+ uchar2 y = convert_uchar2(r); /* round-trip of r; must not read y itself */
+ uchar2 abs_x = abs(x);
+ uchar2 abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(uchar2 x)
+{
+ return convert_float2(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(uchar2 x)
+{
+ float2 r = convert_float2(x);
+ uchar2 y = convert_uchar2(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(uchar2 x)
+{
+ float2 r = convert_float2(x);
+ uchar2 y = convert_uchar2(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+}
+/*
+ * uchar3 -> float3 conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(uchar3 x)
+{
+ float3 r = convert_float3(x);
+ uchar3 y = convert_uchar3(r); /* round-trip of r; must not read y itself */
+ uchar3 abs_x = abs(x);
+ uchar3 abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(uchar3 x)
+{
+ return convert_float3(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(uchar3 x)
+{
+ float3 r = convert_float3(x);
+ uchar3 y = convert_uchar3(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(uchar3 x)
+{
+ float3 r = convert_float3(x);
+ uchar3 y = convert_uchar3(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+}
+/*
+ * uchar4 -> float4 conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(uchar4 x)
+{
+ float4 r = convert_float4(x);
+ uchar4 y = convert_uchar4(r); /* round-trip of r; must not read y itself */
+ uchar4 abs_x = abs(x);
+ uchar4 abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(uchar4 x)
+{
+ return convert_float4(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(uchar4 x)
+{
+ float4 r = convert_float4(x);
+ uchar4 y = convert_uchar4(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(uchar4 x)
+{
+ float4 r = convert_float4(x);
+ uchar4 y = convert_uchar4(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+}
+/*
+ * uchar8 -> float8 conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(uchar8 x)
+{
+ float8 r = convert_float8(x);
+ uchar8 y = convert_uchar8(r); /* round-trip of r; must not read y itself */
+ uchar8 abs_x = abs(x);
+ uchar8 abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(uchar8 x)
+{
+ return convert_float8(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(uchar8 x)
+{
+ float8 r = convert_float8(x);
+ uchar8 y = convert_uchar8(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(uchar8 x)
+{
+ float8 r = convert_float8(x);
+ uchar8 y = convert_uchar8(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+}
+/*
+ * uchar16 -> float16 conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(uchar16 x)
+{
+ float16 r = convert_float16(x);
+ uchar16 y = convert_uchar16(r); /* round-trip of r; must not read y itself */
+ uchar16 abs_x = abs(x);
+ uchar16 abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(uchar16 x)
+{
+ return convert_float16(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(uchar16 x)
+{
+ float16 r = convert_float16(x);
+ uchar16 y = convert_uchar16(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(uchar16 x)
+{
+ float16 r = convert_float16(x);
+ uchar16 y = convert_uchar16(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+}
+/*
+ * uchar -> double conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(uchar x)
+{
+ double r = convert_double(x);
+ uchar y = convert_uchar(r); /* round-trip of r; must not read y itself */
+ uchar abs_x = abs(x);
+ uchar abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(uchar x)
+{
+ return convert_double(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(uchar x)
+{
+ double r = convert_double(x);
+ uchar y = convert_uchar(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(uchar x)
+{
+ double r = convert_double(x);
+ uchar y = convert_uchar(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+}
+#endif
+/*
+ * uchar2 -> double2 conversions with an explicit rounding mode.
+ * r is the plain conversion of x and y is r converted back to the source
+ * type, so comparing y with x shows in which direction r was rounded;
+ * nextafter() then moves r one representable value the other way.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(uchar2 x)
+{
+ double2 r = convert_double2(x);
+ uchar2 y = convert_uchar2(r); /* round-trip of r; must not read y itself */
+ uchar2 abs_x = abs(x);
+ uchar2 abs_y = abs(y);
+ /* |y| > |x|: r moved away from zero, so step it back toward zero. */
+ return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+}
+/* _rte returns the plain conversion unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(uchar2 x)
+{
+ return convert_double2(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(uchar2 x)
+{
+ double2 r = convert_double2(x);
+ uchar2 y = convert_uchar2(r);
+ /* y < x: r ended up below x, so step toward +infinity. */
+ return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(uchar2 x)
+{
+ double2 r = convert_double2(x);
+ uchar2 y = convert_uchar2(r);
+ /* y > x: r ended up above x, so step toward -infinity. */
+ return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts uchar3 to double3, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(uchar3 x)
+{
+ double3 r = convert_double3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ uchar3 y = convert_uchar3(r);
+ uchar3 abs_x = abs(x);
+ uchar3 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts uchar3 to double3 for the _rte suffix by forwarding to convert_double3(). */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(uchar3 x)
+{
+ const double3 nearest = convert_double3(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts uchar3 to double3, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(uchar3 x)
+{
+ double3 r = convert_double3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ uchar3 y = convert_uchar3(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts uchar3 to double3, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(uchar3 x)
+{
+ double3 r = convert_double3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ uchar3 y = convert_uchar3(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts uchar4 to double4, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(uchar4 x)
+{
+ double4 r = convert_double4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ uchar4 y = convert_uchar4(r);
+ uchar4 abs_x = abs(x);
+ uchar4 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts uchar4 to double4 for the _rte suffix by forwarding to convert_double4(). */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(uchar4 x)
+{
+ const double4 nearest = convert_double4(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts uchar4 to double4, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(uchar4 x)
+{
+ double4 r = convert_double4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ uchar4 y = convert_uchar4(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts uchar4 to double4, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(uchar4 x)
+{
+ double4 r = convert_double4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ uchar4 y = convert_uchar4(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts uchar8 to double8, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(uchar8 x)
+{
+ double8 r = convert_double8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ uchar8 y = convert_uchar8(r);
+ uchar8 abs_x = abs(x);
+ uchar8 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts uchar8 to double8 for the _rte suffix by forwarding to convert_double8(). */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(uchar8 x)
+{
+ const double8 nearest = convert_double8(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts uchar8 to double8, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(uchar8 x)
+{
+ double8 r = convert_double8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ uchar8 y = convert_uchar8(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts uchar8 to double8, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(uchar8 x)
+{
+ double8 r = convert_double8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ uchar8 y = convert_uchar8(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts uchar16 to double16, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(uchar16 x)
+{
+ double16 r = convert_double16(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ uchar16 y = convert_uchar16(r);
+ uchar16 abs_x = abs(x);
+ uchar16 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts uchar16 to double16 for the _rte suffix by forwarding to convert_double16(). */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(uchar16 x)
+{
+ const double16 nearest = convert_double16(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts uchar16 to double16, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(uchar16 x)
+{
+ double16 r = convert_double16(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ uchar16 y = convert_uchar16(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts uchar16 to double16, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(uchar16 x)
+{
+ double16 r = convert_double16(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ uchar16 y = convert_uchar16(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+}
+#endif
+/* Converts short to float, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(short x)
+{
+ float r = convert_float(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short y = convert_short(r);
+ ushort abs_x = abs(x);
+ ushort abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+}
+/* Converts short to float for the _rte suffix by forwarding to convert_float(). */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(short x)
+{
+ const float nearest = convert_float(x);
+ return nearest;
+}
+/* Converts short to float, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(short x)
+{
+ float r = convert_float(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short y = convert_short(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+}
+/* Converts short to float, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(short x)
+{
+ float r = convert_float(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short y = convert_short(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+}
+/* Converts short2 to float2, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(short2 x)
+{
+ float2 r = convert_float2(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short2 y = convert_short2(r);
+ ushort2 abs_x = abs(x);
+ ushort2 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+}
+/* Converts short2 to float2 for the _rte suffix by forwarding to convert_float2(). */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(short2 x)
+{
+ const float2 nearest = convert_float2(x);
+ return nearest;
+}
+/* Converts short2 to float2, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(short2 x)
+{
+ float2 r = convert_float2(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short2 y = convert_short2(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+}
+/* Converts short2 to float2, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(short2 x)
+{
+ float2 r = convert_float2(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short2 y = convert_short2(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+}
+/* Converts short3 to float3, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(short3 x)
+{
+ float3 r = convert_float3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short3 y = convert_short3(r);
+ ushort3 abs_x = abs(x);
+ ushort3 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+}
+/* Converts short3 to float3 for the _rte suffix by forwarding to convert_float3(). */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(short3 x)
+{
+ const float3 nearest = convert_float3(x);
+ return nearest;
+}
+/* Converts short3 to float3, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(short3 x)
+{
+ float3 r = convert_float3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short3 y = convert_short3(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+}
+/* Converts short3 to float3, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(short3 x)
+{
+ float3 r = convert_float3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short3 y = convert_short3(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+}
+/* Converts short4 to float4, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(short4 x)
+{
+ float4 r = convert_float4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short4 y = convert_short4(r);
+ ushort4 abs_x = abs(x);
+ ushort4 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+}
+/* Converts short4 to float4 for the _rte suffix by forwarding to convert_float4(). */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(short4 x)
+{
+ const float4 nearest = convert_float4(x);
+ return nearest;
+}
+/* Converts short4 to float4, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(short4 x)
+{
+ float4 r = convert_float4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short4 y = convert_short4(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+}
+/* Converts short4 to float4, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(short4 x)
+{
+ float4 r = convert_float4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short4 y = convert_short4(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+}
+/* Converts short8 to float8, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(short8 x)
+{
+ float8 r = convert_float8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short8 y = convert_short8(r);
+ ushort8 abs_x = abs(x);
+ ushort8 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+}
+/* Converts short8 to float8 for the _rte suffix by forwarding to convert_float8(). */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(short8 x)
+{
+ const float8 nearest = convert_float8(x);
+ return nearest;
+}
+/* Converts short8 to float8, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(short8 x)
+{
+ float8 r = convert_float8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short8 y = convert_short8(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+}
+/* Converts short8 to float8, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(short8 x)
+{
+ float8 r = convert_float8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short8 y = convert_short8(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+}
+/* Converts short16 to float16, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(short16 x)
+{
+ float16 r = convert_float16(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short16 y = convert_short16(r);
+ ushort16 abs_x = abs(x);
+ ushort16 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+}
+/* Converts short16 to float16 for the _rte suffix by forwarding to convert_float16(). */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(short16 x)
+{
+ const float16 nearest = convert_float16(x);
+ return nearest;
+}
+/* Converts short16 to float16, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(short16 x)
+{
+ float16 r = convert_float16(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short16 y = convert_short16(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+}
+/* Converts short16 to float16, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(short16 x)
+{
+ float16 r = convert_float16(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short16 y = convert_short16(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+}
+#ifdef cl_khr_fp64
+/* Converts short to double, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(short x)
+{
+ double r = convert_double(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short y = convert_short(r);
+ ushort abs_x = abs(x);
+ ushort abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short to double for the _rte suffix by forwarding to convert_double(). */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(short x)
+{
+ const double nearest = convert_double(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short to double, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(short x)
+{
+ double r = convert_double(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short y = convert_short(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short to double, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(short x)
+{
+ double r = convert_double(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short y = convert_short(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short2 to double2, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(short2 x)
+{
+ double2 r = convert_double2(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short2 y = convert_short2(r);
+ ushort2 abs_x = abs(x);
+ ushort2 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short2 to double2 for the _rte suffix by forwarding to convert_double2(). */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(short2 x)
+{
+ const double2 nearest = convert_double2(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short2 to double2, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(short2 x)
+{
+ double2 r = convert_double2(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short2 y = convert_short2(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short2 to double2, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(short2 x)
+{
+ double2 r = convert_double2(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short2 y = convert_short2(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short3 to double3, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(short3 x)
+{
+ double3 r = convert_double3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short3 y = convert_short3(r);
+ ushort3 abs_x = abs(x);
+ ushort3 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short3 to double3 for the _rte suffix by forwarding to convert_double3(). */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(short3 x)
+{
+ const double3 nearest = convert_double3(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short3 to double3, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(short3 x)
+{
+ double3 r = convert_double3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short3 y = convert_short3(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short3 to double3, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(short3 x)
+{
+ double3 r = convert_double3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short3 y = convert_short3(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short4 to double4, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(short4 x)
+{
+ double4 r = convert_double4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short4 y = convert_short4(r);
+ ushort4 abs_x = abs(x);
+ ushort4 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short4 to double4 for the _rte suffix by forwarding to convert_double4(). */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(short4 x)
+{
+ const double4 nearest = convert_double4(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short4 to double4, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(short4 x)
+{
+ double4 r = convert_double4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short4 y = convert_short4(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short4 to double4, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(short4 x)
+{
+ double4 r = convert_double4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short4 y = convert_short4(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short8 to double8, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(short8 x)
+{
+ double8 r = convert_double8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short8 y = convert_short8(r);
+ ushort8 abs_x = abs(x);
+ ushort8 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short8 to double8 for the _rte suffix by forwarding to convert_double8(). */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(short8 x)
+{
+ const double8 nearest = convert_double8(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short8 to double8, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(short8 x)
+{
+ double8 r = convert_double8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short8 y = convert_short8(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short8 to double8, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(short8 x)
+{
+ double8 r = convert_double8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short8 y = convert_short8(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short16 to double16, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(short16 x)
+{
+ double16 r = convert_double16(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short16 y = convert_short16(r);
+ ushort16 abs_x = abs(x);
+ ushort16 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short16 to double16 for the _rte suffix by forwarding to convert_double16(). */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(short16 x)
+{
+ const double16 nearest = convert_double16(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short16 to double16, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(short16 x)
+{
+ double16 r = convert_double16(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short16 y = convert_short16(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts short16 to double16, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(short16 x)
+{
+ double16 r = convert_double16(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ short16 y = convert_short16(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+}
+#endif
+/* Converts ushort to float, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(ushort x)
+{
+ float r = convert_float(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort y = convert_ushort(r);
+ ushort abs_x = abs(x);
+ ushort abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+}
+/* Converts ushort to float for the _rte suffix by forwarding to convert_float(). */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(ushort x)
+{
+ const float nearest = convert_float(x);
+ return nearest;
+}
+/* Converts ushort to float, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(ushort x)
+{
+ float r = convert_float(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort y = convert_ushort(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+}
+/* Converts ushort to float, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(ushort x)
+{
+ float r = convert_float(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort y = convert_ushort(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+}
+/* Converts ushort2 to float2, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(ushort2 x)
+{
+ float2 r = convert_float2(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort2 y = convert_ushort2(r);
+ ushort2 abs_x = abs(x);
+ ushort2 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+}
+/* Converts ushort2 to float2 for the _rte suffix by forwarding to convert_float2(). */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(ushort2 x)
+{
+ const float2 nearest = convert_float2(x);
+ return nearest;
+}
+/* Converts ushort2 to float2, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(ushort2 x)
+{
+ float2 r = convert_float2(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort2 y = convert_ushort2(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+}
+/* Converts ushort2 to float2, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(ushort2 x)
+{
+ float2 r = convert_float2(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort2 y = convert_ushort2(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+}
+/* Converts ushort3 to float3, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(ushort3 x)
+{
+ float3 r = convert_float3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort3 y = convert_ushort3(r);
+ ushort3 abs_x = abs(x);
+ ushort3 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+}
+/* Converts ushort3 to float3 for the _rte suffix by forwarding to convert_float3(). */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(ushort3 x)
+{
+ const float3 nearest = convert_float3(x);
+ return nearest;
+}
+/* Converts ushort3 to float3, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(ushort3 x)
+{
+ float3 r = convert_float3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort3 y = convert_ushort3(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+}
+/* Converts ushort3 to float3, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(ushort3 x)
+{
+ float3 r = convert_float3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort3 y = convert_ushort3(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+}
+/* Converts ushort4 to float4, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(ushort4 x)
+{
+ float4 r = convert_float4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort4 y = convert_ushort4(r);
+ ushort4 abs_x = abs(x);
+ ushort4 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+}
+/* Converts ushort4 to float4 for the _rte suffix by forwarding to convert_float4(). */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(ushort4 x)
+{
+ const float4 nearest = convert_float4(x);
+ return nearest;
+}
+/* Converts ushort4 to float4, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(ushort4 x)
+{
+ float4 r = convert_float4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort4 y = convert_ushort4(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+}
+/* Converts ushort4 to float4, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(ushort4 x)
+{
+ float4 r = convert_float4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort4 y = convert_ushort4(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+}
+/* Converts ushort8 to float8, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(ushort8 x)
+{
+ float8 r = convert_float8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort8 y = convert_ushort8(r);
+ ushort8 abs_x = abs(x);
+ ushort8 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+}
+/* Converts ushort8 to float8 for the _rte suffix by forwarding to convert_float8(). */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(ushort8 x)
+{
+ const float8 nearest = convert_float8(x);
+ return nearest;
+}
+/* Converts ushort8 to float8, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(ushort8 x)
+{
+ float8 r = convert_float8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort8 y = convert_ushort8(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+}
+/* Converts ushort8 to float8, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(ushort8 x)
+{
+ float8 r = convert_float8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort8 y = convert_ushort8(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+}
+/* Converts ushort16 to float16, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(ushort16 x)
+{
+ float16 r = convert_float16(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort16 y = convert_ushort16(r);
+ ushort16 abs_x = abs(x);
+ ushort16 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+}
+/* Converts ushort16 to float16 for the _rte suffix by forwarding to convert_float16(). */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(ushort16 x)
+{
+ const float16 nearest = convert_float16(x);
+ return nearest;
+}
+/* Converts ushort16 to float16, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(ushort16 x)
+{
+ float16 r = convert_float16(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort16 y = convert_ushort16(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+}
+/* Converts ushort16 to float16, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(ushort16 x)
+{
+ float16 r = convert_float16(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort16 y = convert_ushort16(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+}
+#ifdef cl_khr_fp64
+/* Converts ushort to double, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(ushort x)
+{
+ double r = convert_double(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort y = convert_ushort(r);
+ ushort abs_x = abs(x);
+ ushort abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort to double for the _rte suffix by forwarding to convert_double(). */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(ushort x)
+{
+ const double nearest = convert_double(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort to double, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(ushort x)
+{
+ double r = convert_double(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort y = convert_ushort(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort to double, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(ushort x)
+{
+ double r = convert_double(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort y = convert_ushort(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort2 to double2, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(ushort2 x)
+{
+ double2 r = convert_double2(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort2 y = convert_ushort2(r);
+ ushort2 abs_x = abs(x);
+ ushort2 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort2 to double2 for the _rte suffix by forwarding to convert_double2(). */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(ushort2 x)
+{
+ const double2 nearest = convert_double2(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort2 to double2, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(ushort2 x)
+{
+ double2 r = convert_double2(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort2 y = convert_ushort2(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort2 to double2, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(ushort2 x)
+{
+ double2 r = convert_double2(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort2 y = convert_ushort2(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort3 to double3, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(ushort3 x)
+{
+ double3 r = convert_double3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort3 y = convert_ushort3(r);
+ ushort3 abs_x = abs(x);
+ ushort3 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort3 to double3 for the _rte suffix by forwarding to convert_double3(). */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(ushort3 x)
+{
+ const double3 nearest = convert_double3(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort3 to double3, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(ushort3 x)
+{
+ double3 r = convert_double3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort3 y = convert_ushort3(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort3 to double3, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(ushort3 x)
+{
+ double3 r = convert_double3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort3 y = convert_ushort3(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort4 to double4, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(ushort4 x)
+{
+ double4 r = convert_double4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort4 y = convert_ushort4(r);
+ ushort4 abs_x = abs(x);
+ ushort4 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort4 to double4 for the _rte suffix by forwarding to convert_double4(). */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(ushort4 x)
+{
+ const double4 nearest = convert_double4(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort4 to double4, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(ushort4 x)
+{
+ double4 r = convert_double4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort4 y = convert_ushort4(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort4 to double4, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(ushort4 x)
+{
+ double4 r = convert_double4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort4 y = convert_ushort4(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort8 to double8, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(ushort8 x)
+{
+ double8 r = convert_double8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort8 y = convert_ushort8(r);
+ ushort8 abs_x = abs(x);
+ ushort8 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort8 to double8 for the _rte suffix by forwarding to convert_double8(). */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(ushort8 x)
+{
+ const double8 nearest = convert_double8(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort8 to double8, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(ushort8 x)
+{
+ double8 r = convert_double8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort8 y = convert_ushort8(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort8 to double8, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(ushort8 x)
+{
+ double8 r = convert_double8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort8 y = convert_ushort8(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort16 to double16, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(ushort16 x)
+{
+ double16 r = convert_double16(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort16 y = convert_ushort16(r);
+ ushort16 abs_x = abs(x);
+ ushort16 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort16 to double16 for the _rte suffix by forwarding to convert_double16(). */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(ushort16 x)
+{
+ const double16 nearest = convert_double16(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort16 to double16, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(ushort16 x)
+{
+ double16 r = convert_double16(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort16 y = convert_ushort16(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts ushort16 to double16, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(ushort16 x)
+{
+ double16 r = convert_double16(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ ushort16 y = convert_ushort16(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+}
+#endif
+/* Converts int to float, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(int x)
+{
+ float r = convert_float(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ /* NOTE(review): r may round up to 2^31, past the int range; confirm convert_int(r) there. */
+ int y = convert_int(r);
+ uint abs_x = abs(x);
+ uint abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+}
+/* Converts int to float for the _rte suffix by forwarding to convert_float(). */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(int x)
+{
+ const float nearest = convert_float(x);
+ return nearest;
+}
+/* Converts int to float, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(int x)
+{
+ float r = convert_float(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ /* NOTE(review): r may round up to 2^31, past the int range; confirm convert_int(r) there. */
+ int y = convert_int(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+}
+/* Converts int to float, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(int x)
+{
+ float r = convert_float(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ /* NOTE(review): r may round up to 2^31, past the int range; confirm convert_int(r) there. */
+ int y = convert_int(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+}
+/* Converts int2 to float2, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(int2 x)
+{
+ float2 r = convert_float2(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ /* NOTE(review): r may round up to 2^31, past the int range; confirm convert_int2(r) there. */
+ int2 y = convert_int2(r);
+ uint2 abs_x = abs(x);
+ uint2 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+}
+/* Converts int2 to float2 for the _rte suffix by forwarding to convert_float2(). */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(int2 x)
+{
+ const float2 nearest = convert_float2(x);
+ return nearest;
+}
+/* Converts int2 to float2, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(int2 x)
+{
+ float2 r = convert_float2(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ /* NOTE(review): r may round up to 2^31, past the int range; confirm convert_int2(r) there. */
+ int2 y = convert_int2(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+}
+/* Converts int2 to float2, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(int2 x)
+{
+ float2 r = convert_float2(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ /* NOTE(review): r may round up to 2^31, past the int range; confirm convert_int2(r) there. */
+ int2 y = convert_int2(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+}
+/* Converts int3 to float3, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(int3 x)
+{
+ float3 r = convert_float3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ /* NOTE(review): r may round up to 2^31, past the int range; confirm convert_int3(r) there. */
+ int3 y = convert_int3(r);
+ uint3 abs_x = abs(x);
+ uint3 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+}
+/* Converts int3 to float3 for the _rte suffix by forwarding to convert_float3(). */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(int3 x)
+{
+ const float3 nearest = convert_float3(x);
+ return nearest;
+}
+/* Converts int3 to float3, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(int3 x)
+{
+ float3 r = convert_float3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ /* NOTE(review): r may round up to 2^31, past the int range; confirm convert_int3(r) there. */
+ int3 y = convert_int3(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+}
+/* Converts int3 to float3, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(int3 x)
+{
+ float3 r = convert_float3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ /* NOTE(review): r may round up to 2^31, past the int range; confirm convert_int3(r) there. */
+ int3 y = convert_int3(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+}
+/* Converts int4 to float4, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(int4 x)
+{
+ float4 r = convert_float4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ /* NOTE(review): r may round up to 2^31, past the int range; confirm convert_int4(r) there. */
+ int4 y = convert_int4(r);
+ uint4 abs_x = abs(x);
+ uint4 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+}
+/* Converts int4 to float4 for the _rte suffix by forwarding to convert_float4(). */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(int4 x)
+{
+ const float4 nearest = convert_float4(x);
+ return nearest;
+}
+/* Converts int4 to float4, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(int4 x)
+{
+ float4 r = convert_float4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ /* NOTE(review): r may round up to 2^31, past the int range; confirm convert_int4(r) there. */
+ int4 y = convert_int4(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+}
+/* Converts int4 to float4, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(int4 x)
+{
+ float4 r = convert_float4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ /* NOTE(review): r may round up to 2^31, past the int range; confirm convert_int4(r) there. */
+ int4 y = convert_int4(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+}
+/* Converts int8 to float8, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(int8 x)
+{
+ float8 r = convert_float8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ /* NOTE(review): r may round up to 2^31, past the int range; confirm convert_int8(r) there. */
+ int8 y = convert_int8(r);
+ uint8 abs_x = abs(x);
+ uint8 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+}
+/* Converts int8 to float8 for the _rte suffix by forwarding to convert_float8(). */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(int8 x)
+{
+ const float8 nearest = convert_float8(x);
+ return nearest;
+}
+/* Converts int8 to float8, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(int8 x)
+{
+ float8 r = convert_float8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ /* NOTE(review): r may round up to 2^31, past the int range; confirm convert_int8(r) there. */
+ int8 y = convert_int8(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+}
+/* Converts int8 to float8, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(int8 x)
+{
+ float8 r = convert_float8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ /* NOTE(review): r may round up to 2^31, past the int range; confirm convert_int8(r) there. */
+ int8 y = convert_int8(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+}
+/* Converts int16 to float16, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(int16 x)
+{
+ float16 r = convert_float16(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ /* NOTE(review): r may round up to 2^31, past the int range; confirm convert_int16(r) there. */
+ int16 y = convert_int16(r);
+ uint16 abs_x = abs(x);
+ uint16 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+}
+/* Converts int16 to float16 for the _rte suffix by forwarding to convert_float16(). */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(int16 x)
+{
+ const float16 nearest = convert_float16(x);
+ return nearest;
+}
+/* Converts int16 to float16, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(int16 x)
+{
+ float16 r = convert_float16(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ /* NOTE(review): r may round up to 2^31, past the int range; confirm convert_int16(r) there. */
+ int16 y = convert_int16(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+}
+/* Converts int16 to float16, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(int16 x)
+{
+ float16 r = convert_float16(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ /* NOTE(review): r may round up to 2^31, past the int range; confirm convert_int16(r) there. */
+ int16 y = convert_int16(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+}
+#ifdef cl_khr_fp64
+/* Converts int to double, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(int x)
+{
+ double r = convert_double(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ int y = convert_int(r);
+ uint abs_x = abs(x);
+ uint abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts int to double for the _rte suffix by forwarding to convert_double(). */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(int x)
+{
+ const double nearest = convert_double(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts int to double, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(int x)
+{
+ double r = convert_double(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ int y = convert_int(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts int to double, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(int x)
+{
+ double r = convert_double(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ int y = convert_int(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts int2 to double2, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(int2 x)
+{
+ double2 r = convert_double2(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ int2 y = convert_int2(r);
+ uint2 abs_x = abs(x);
+ uint2 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts int2 to double2 for the _rte suffix by forwarding to convert_double2(). */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(int2 x)
+{
+ const double2 nearest = convert_double2(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts int2 to double2, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(int2 x)
+{
+ double2 r = convert_double2(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ int2 y = convert_int2(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts int2 to double2, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(int2 x)
+{
+ double2 r = convert_double2(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ int2 y = convert_int2(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts int3 to double3, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(int3 x)
+{
+ double3 r = convert_double3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ int3 y = convert_int3(r);
+ uint3 abs_x = abs(x);
+ uint3 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts int3 to double3 for the _rte suffix by forwarding to convert_double3(). */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(int3 x)
+{
+ const double3 nearest = convert_double3(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts int3 to double3, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(int3 x)
+{
+ double3 r = convert_double3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ int3 y = convert_int3(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts int3 to double3, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(int3 x)
+{
+ double3 r = convert_double3(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ int3 y = convert_int3(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts int4 to double4, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(int4 x)
+{
+ double4 r = convert_double4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ int4 y = convert_int4(r);
+ uint4 abs_x = abs(x);
+ uint4 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts int4 to double4 for the _rte suffix by forwarding to convert_double4(). */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(int4 x)
+{
+ const double4 nearest = convert_double4(x);
+ return nearest;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts int4 to double4, rounding toward positive infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(int4 x)
+{
+ double4 r = convert_double4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ int4 y = convert_int4(r);
+ /* Move r one step up if the default conversion came out below x. */
+ return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts int4 to double4, rounding toward negative infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(int4 x)
+{
+ double4 r = convert_double4(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ int4 y = convert_int4(r);
+ /* Move r one step down if the default conversion came out above x. */
+ return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Converts int8 to double8, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(int8 x)
+{
+ double8 r = convert_double8(x);
+ /* Round-trip r back to the source type so it can be compared with x. */
+ int8 y = convert_int8(r);
+ uint8 abs_x = abs(x);
+ uint8 abs_y = abs(y);
+ /* Move r one step toward zero if the default conversion overshot |x|. */
+ return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* int8 -> double8, _rte variant: hands back the unsuffixed
+ * convert_double8(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(int8 x)
+{
+ double8 converted = convert_double8(x);
+ return converted;
+}
+#endif
+#ifdef cl_khr_fp64
+/* int8 -> double8 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(int8 x)
+{
+ double8 r = convert_double8(x);
+ int8 y = convert_int8(r);
+ return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* int8 -> double8 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(int8 x)
+{
+ double8 r = convert_double8(x);
+ int8 y = convert_int8(r);
+ return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* int16 -> double16 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(int16 x)
+{
+ double16 r = convert_double16(x);
+ int16 y = convert_int16(r);
+ uint16 abs_x = abs(x);
+ uint16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* int16 -> double16, _rte variant: hands back the unsuffixed
+ * convert_double16(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(int16 x)
+{
+ double16 converted = convert_double16(x);
+ return converted;
+}
+#endif
+#ifdef cl_khr_fp64
+/* int16 -> double16 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(int16 x)
+{
+ double16 r = convert_double16(x);
+ int16 y = convert_int16(r);
+ return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* int16 -> double16 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(int16 x)
+{
+ double16 r = convert_double16(x);
+ int16 y = convert_int16(r);
+ return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+}
+#endif
+/* uint -> float toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(uint x)
+{
+ float r = convert_float(x);
+ uint y = convert_uint(r);
+ uint abs_x = abs(x);
+ uint abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+}
+/* uint -> float, _rte variant: hands back the unsuffixed
+ * convert_float(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(uint x)
+{
+ float converted = convert_float(x);
+ return converted;
+}
+/* uint -> float toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(uint x)
+{
+ float r = convert_float(x);
+ uint y = convert_uint(r);
+ return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+}
+/* uint -> float toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(uint x)
+{
+ float r = convert_float(x);
+ uint y = convert_uint(r);
+ return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+}
+/* uint2 -> float2 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(uint2 x)
+{
+ float2 r = convert_float2(x);
+ uint2 y = convert_uint2(r);
+ uint2 abs_x = abs(x);
+ uint2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+}
+/* uint2 -> float2, _rte variant: hands back the unsuffixed
+ * convert_float2(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(uint2 x)
+{
+ float2 converted = convert_float2(x);
+ return converted;
+}
+/* uint2 -> float2 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(uint2 x)
+{
+ float2 r = convert_float2(x);
+ uint2 y = convert_uint2(r);
+ return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+}
+/* uint2 -> float2 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(uint2 x)
+{
+ float2 r = convert_float2(x);
+ uint2 y = convert_uint2(r);
+ return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+}
+/* uint3 -> float3 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(uint3 x)
+{
+ float3 r = convert_float3(x);
+ uint3 y = convert_uint3(r);
+ uint3 abs_x = abs(x);
+ uint3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+}
+/* uint3 -> float3, _rte variant: hands back the unsuffixed
+ * convert_float3(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(uint3 x)
+{
+ float3 converted = convert_float3(x);
+ return converted;
+}
+/* uint3 -> float3 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(uint3 x)
+{
+ float3 r = convert_float3(x);
+ uint3 y = convert_uint3(r);
+ return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+}
+/* uint3 -> float3 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(uint3 x)
+{
+ float3 r = convert_float3(x);
+ uint3 y = convert_uint3(r);
+ return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+}
+/* uint4 -> float4 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(uint4 x)
+{
+ float4 r = convert_float4(x);
+ uint4 y = convert_uint4(r);
+ uint4 abs_x = abs(x);
+ uint4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+}
+/* uint4 -> float4, _rte variant: hands back the unsuffixed
+ * convert_float4(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(uint4 x)
+{
+ float4 converted = convert_float4(x);
+ return converted;
+}
+/* uint4 -> float4 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(uint4 x)
+{
+ float4 r = convert_float4(x);
+ uint4 y = convert_uint4(r);
+ return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+}
+/* uint4 -> float4 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(uint4 x)
+{
+ float4 r = convert_float4(x);
+ uint4 y = convert_uint4(r);
+ return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+}
+/* uint8 -> float8 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(uint8 x)
+{
+ float8 r = convert_float8(x);
+ uint8 y = convert_uint8(r);
+ uint8 abs_x = abs(x);
+ uint8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+}
+/* uint8 -> float8, _rte variant: hands back the unsuffixed
+ * convert_float8(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(uint8 x)
+{
+ float8 converted = convert_float8(x);
+ return converted;
+}
+/* uint8 -> float8 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(uint8 x)
+{
+ float8 r = convert_float8(x);
+ uint8 y = convert_uint8(r);
+ return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+}
+/* uint8 -> float8 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(uint8 x)
+{
+ float8 r = convert_float8(x);
+ uint8 y = convert_uint8(r);
+ return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+}
+/* uint16 -> float16 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(uint16 x)
+{
+ float16 r = convert_float16(x);
+ uint16 y = convert_uint16(r);
+ uint16 abs_x = abs(x);
+ uint16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+}
+/* uint16 -> float16, _rte variant: hands back the unsuffixed
+ * convert_float16(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(uint16 x)
+{
+ float16 converted = convert_float16(x);
+ return converted;
+}
+/* uint16 -> float16 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(uint16 x)
+{
+ float16 r = convert_float16(x);
+ uint16 y = convert_uint16(r);
+ return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+}
+/* uint16 -> float16 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(uint16 x)
+{
+ float16 r = convert_float16(x);
+ uint16 y = convert_uint16(r);
+ return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+}
+#ifdef cl_khr_fp64
+/* uint -> double toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(uint x)
+{
+ double r = convert_double(x);
+ uint y = convert_uint(r);
+ uint abs_x = abs(x);
+ uint abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint -> double, _rte variant: hands back the unsuffixed
+ * convert_double(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(uint x)
+{
+ double converted = convert_double(x);
+ return converted;
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint -> double toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(uint x)
+{
+ double r = convert_double(x);
+ uint y = convert_uint(r);
+ return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint -> double toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(uint x)
+{
+ double r = convert_double(x);
+ uint y = convert_uint(r);
+ return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint2 -> double2 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(uint2 x)
+{
+ double2 r = convert_double2(x);
+ uint2 y = convert_uint2(r);
+ uint2 abs_x = abs(x);
+ uint2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint2 -> double2, _rte variant: hands back the unsuffixed
+ * convert_double2(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(uint2 x)
+{
+ double2 converted = convert_double2(x);
+ return converted;
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint2 -> double2 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(uint2 x)
+{
+ double2 r = convert_double2(x);
+ uint2 y = convert_uint2(r);
+ return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint2 -> double2 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(uint2 x)
+{
+ double2 r = convert_double2(x);
+ uint2 y = convert_uint2(r);
+ return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint3 -> double3 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(uint3 x)
+{
+ double3 r = convert_double3(x);
+ uint3 y = convert_uint3(r);
+ uint3 abs_x = abs(x);
+ uint3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint3 -> double3, _rte variant: hands back the unsuffixed
+ * convert_double3(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(uint3 x)
+{
+ double3 converted = convert_double3(x);
+ return converted;
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint3 -> double3 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(uint3 x)
+{
+ double3 r = convert_double3(x);
+ uint3 y = convert_uint3(r);
+ return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint3 -> double3 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(uint3 x)
+{
+ double3 r = convert_double3(x);
+ uint3 y = convert_uint3(r);
+ return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint4 -> double4 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(uint4 x)
+{
+ double4 r = convert_double4(x);
+ uint4 y = convert_uint4(r);
+ uint4 abs_x = abs(x);
+ uint4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint4 -> double4, _rte variant: hands back the unsuffixed
+ * convert_double4(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(uint4 x)
+{
+ double4 converted = convert_double4(x);
+ return converted;
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint4 -> double4 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(uint4 x)
+{
+ double4 r = convert_double4(x);
+ uint4 y = convert_uint4(r);
+ return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint4 -> double4 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(uint4 x)
+{
+ double4 r = convert_double4(x);
+ uint4 y = convert_uint4(r);
+ return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint8 -> double8 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(uint8 x)
+{
+ double8 r = convert_double8(x);
+ uint8 y = convert_uint8(r);
+ uint8 abs_x = abs(x);
+ uint8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint8 -> double8, _rte variant: hands back the unsuffixed
+ * convert_double8(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(uint8 x)
+{
+ double8 converted = convert_double8(x);
+ return converted;
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint8 -> double8 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(uint8 x)
+{
+ double8 r = convert_double8(x);
+ uint8 y = convert_uint8(r);
+ return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint8 -> double8 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(uint8 x)
+{
+ double8 r = convert_double8(x);
+ uint8 y = convert_uint8(r);
+ return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint16 -> double16 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(uint16 x)
+{
+ double16 r = convert_double16(x);
+ uint16 y = convert_uint16(r);
+ uint16 abs_x = abs(x);
+ uint16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint16 -> double16, _rte variant: hands back the unsuffixed
+ * convert_double16(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(uint16 x)
+{
+ double16 converted = convert_double16(x);
+ return converted;
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint16 -> double16 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(uint16 x)
+{
+ double16 r = convert_double16(x);
+ uint16 y = convert_uint16(r);
+ return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* uint16 -> double16 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(uint16 x)
+{
+ double16 r = convert_double16(x);
+ uint16 y = convert_uint16(r);
+ return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> float toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(long x)
+{
+ float r = convert_float(x);
+ long y = convert_long(r);
+ ulong abs_x = abs(x);
+ ulong abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> float, _rte variant: hands back the unsuffixed
+ * convert_float(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(long x)
+{
+ float converted = convert_float(x);
+ return converted;
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> float toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(long x)
+{
+ float r = convert_float(x);
+ long y = convert_long(r);
+ return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> float toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(long x)
+{
+ float r = convert_float(x);
+ long y = convert_long(r);
+ return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* long2 -> float2 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(long2 x)
+{
+ float2 r = convert_float2(x);
+ long2 y = convert_long2(r);
+ ulong2 abs_x = abs(x);
+ ulong2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* long2 -> float2, _rte variant: hands back the unsuffixed
+ * convert_float2(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(long2 x)
+{
+ float2 converted = convert_float2(x);
+ return converted;
+}
+#endif
+#ifdef cles_khr_int64
+/* long2 -> float2 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(long2 x)
+{
+ float2 r = convert_float2(x);
+ long2 y = convert_long2(r);
+ return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* long2 -> float2 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(long2 x)
+{
+ float2 r = convert_float2(x);
+ long2 y = convert_long2(r);
+ return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* long3 -> float3 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(long3 x)
+{
+ float3 r = convert_float3(x);
+ long3 y = convert_long3(r);
+ ulong3 abs_x = abs(x);
+ ulong3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* long3 -> float3, _rte variant: hands back the unsuffixed
+ * convert_float3(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(long3 x)
+{
+ float3 converted = convert_float3(x);
+ return converted;
+}
+#endif
+#ifdef cles_khr_int64
+/* long3 -> float3 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(long3 x)
+{
+ float3 r = convert_float3(x);
+ long3 y = convert_long3(r);
+ return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* long3 -> float3 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(long3 x)
+{
+ float3 r = convert_float3(x);
+ long3 y = convert_long3(r);
+ return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* long4 -> float4 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(long4 x)
+{
+ float4 r = convert_float4(x);
+ long4 y = convert_long4(r);
+ ulong4 abs_x = abs(x);
+ ulong4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* long4 -> float4, _rte variant: hands back the unsuffixed
+ * convert_float4(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(long4 x)
+{
+ float4 converted = convert_float4(x);
+ return converted;
+}
+#endif
+#ifdef cles_khr_int64
+/* long4 -> float4 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(long4 x)
+{
+ float4 r = convert_float4(x);
+ long4 y = convert_long4(r);
+ return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* long4 -> float4 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(long4 x)
+{
+ float4 r = convert_float4(x);
+ long4 y = convert_long4(r);
+ return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* long8 -> float8 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(long8 x)
+{
+ float8 r = convert_float8(x);
+ long8 y = convert_long8(r);
+ ulong8 abs_x = abs(x);
+ ulong8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* long8 -> float8, _rte variant: hands back the unsuffixed
+ * convert_float8(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(long8 x)
+{
+ float8 converted = convert_float8(x);
+ return converted;
+}
+#endif
+#ifdef cles_khr_int64
+/* long8 -> float8 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(long8 x)
+{
+ float8 r = convert_float8(x);
+ long8 y = convert_long8(r);
+ return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* long8 -> float8 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(long8 x)
+{
+ float8 r = convert_float8(x);
+ long8 y = convert_long8(r);
+ return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* long16 -> float16 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(long16 x)
+{
+ float16 r = convert_float16(x);
+ long16 y = convert_long16(r);
+ ulong16 abs_x = abs(x);
+ ulong16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* long16 -> float16, _rte variant: hands back the unsuffixed
+ * convert_float16(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(long16 x)
+{
+ float16 converted = convert_float16(x);
+ return converted;
+}
+#endif
+#ifdef cles_khr_int64
+/* long16 -> float16 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(long16 x)
+{
+ float16 r = convert_float16(x);
+ long16 y = convert_long16(r);
+ return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* long16 -> float16 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(long16 x)
+{
+ float16 r = convert_float16(x);
+ long16 y = convert_long16(r);
+ return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long -> double toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(long x)
+{
+ double r = convert_double(x);
+ long y = convert_long(r);
+ ulong abs_x = abs(x);
+ ulong abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long -> double, _rte variant: hands back the unsuffixed
+ * convert_double(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(long x)
+{
+ double converted = convert_double(x);
+ return converted;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long -> double toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(long x)
+{
+ double r = convert_double(x);
+ long y = convert_long(r);
+ return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long -> double toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(long x)
+{
+ double r = convert_double(x);
+ long y = convert_long(r);
+ return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long2 -> double2 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(long2 x)
+{
+ double2 r = convert_double2(x);
+ long2 y = convert_long2(r);
+ ulong2 abs_x = abs(x);
+ ulong2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long2 -> double2, _rte variant: hands back the unsuffixed
+ * convert_double2(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(long2 x)
+{
+ double2 converted = convert_double2(x);
+ return converted;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long2 -> double2 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(long2 x)
+{
+ double2 r = convert_double2(x);
+ long2 y = convert_long2(r);
+ return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long2 -> double2 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(long2 x)
+{
+ double2 r = convert_double2(x);
+ long2 y = convert_long2(r);
+ return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long3 -> double3 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(long3 x)
+{
+ double3 r = convert_double3(x);
+ long3 y = convert_long3(r);
+ ulong3 abs_x = abs(x);
+ ulong3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long3 -> double3, _rte variant: hands back the unsuffixed
+ * convert_double3(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(long3 x)
+{
+ double3 converted = convert_double3(x);
+ return converted;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long3 -> double3 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(long3 x)
+{
+ double3 r = convert_double3(x);
+ long3 y = convert_long3(r);
+ return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long3 -> double3 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(long3 x)
+{
+ double3 r = convert_double3(x);
+ long3 y = convert_long3(r);
+ return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long4 -> double4 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(long4 x)
+{
+ double4 r = convert_double4(x);
+ long4 y = convert_long4(r);
+ ulong4 abs_x = abs(x);
+ ulong4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long4 -> double4, _rte variant: hands back the unsuffixed
+ * convert_double4(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(long4 x)
+{
+ double4 converted = convert_double4(x);
+ return converted;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long4 -> double4 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(long4 x)
+{
+ double4 r = convert_double4(x);
+ long4 y = convert_long4(r);
+ return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long4 -> double4 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(long4 x)
+{
+ double4 r = convert_double4(x);
+ long4 y = convert_long4(r);
+ return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long8 -> double8 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(long8 x)
+{
+ double8 r = convert_double8(x);
+ long8 y = convert_long8(r);
+ ulong8 abs_x = abs(x);
+ ulong8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long8 -> double8, _rte variant: hands back the unsuffixed
+ * convert_double8(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(long8 x)
+{
+ double8 converted = convert_double8(x);
+ return converted;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long8 -> double8 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(long8 x)
+{
+ double8 r = convert_double8(x);
+ long8 y = convert_long8(r);
+ return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long8 -> double8 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(long8 x)
+{
+ double8 r = convert_double8(x);
+ long8 y = convert_long8(r);
+ return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long16 -> double16 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(long16 x)
+{
+ double16 r = convert_double16(x);
+ long16 y = convert_long16(r);
+ ulong16 abs_x = abs(x);
+ ulong16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long16 -> double16, _rte variant: hands back the unsuffixed
+ * convert_double16(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(long16 x)
+{
+ double16 converted = convert_double16(x);
+ return converted;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long16 -> double16 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(long16 x)
+{
+ double16 r = convert_double16(x);
+ long16 y = convert_long16(r);
+ return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long16 -> double16 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(long16 x)
+{
+ double16 r = convert_double16(x);
+ long16 y = convert_long16(r);
+ return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> float toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(ulong x)
+{
+ float r = convert_float(x);
+ ulong y = convert_ulong(r);
+ ulong abs_x = abs(x);
+ ulong abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> float, _rte variant: hands back the unsuffixed
+ * convert_float(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(ulong x)
+{
+ float converted = convert_float(x);
+ return converted;
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> float toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(ulong x)
+{
+ float r = convert_float(x);
+ ulong y = convert_ulong(r);
+ return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> float toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(ulong x)
+{
+ float r = convert_float(x);
+ ulong y = convert_ulong(r);
+ return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong2 -> float2 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(ulong2 x)
+{
+ float2 r = convert_float2(x);
+ ulong2 y = convert_ulong2(r);
+ ulong2 abs_x = abs(x);
+ ulong2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong2 -> float2, _rte variant: hands back the unsuffixed
+ * convert_float2(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(ulong2 x)
+{
+ float2 converted = convert_float2(x);
+ return converted;
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong2 -> float2 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(ulong2 x)
+{
+ float2 r = convert_float2(x);
+ ulong2 y = convert_ulong2(r);
+ return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong2 -> float2 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(ulong2 x)
+{
+ float2 r = convert_float2(x);
+ ulong2 y = convert_ulong2(r);
+ return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong3 -> float3 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(ulong3 x)
+{
+ float3 r = convert_float3(x);
+ ulong3 y = convert_ulong3(r);
+ ulong3 abs_x = abs(x);
+ ulong3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong3 -> float3, _rte variant: hands back the unsuffixed
+ * convert_float3(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(ulong3 x)
+{
+ float3 converted = convert_float3(x);
+ return converted;
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong3 -> float3 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(ulong3 x)
+{
+ float3 r = convert_float3(x);
+ ulong3 y = convert_ulong3(r);
+ return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong3 -> float3 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(ulong3 x)
+{
+ float3 r = convert_float3(x);
+ ulong3 y = convert_ulong3(r);
+ return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong4 -> float4 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(ulong4 x)
+{
+ float4 r = convert_float4(x);
+ ulong4 y = convert_ulong4(r);
+ ulong4 abs_x = abs(x);
+ ulong4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong4 -> float4, _rte variant: hands back the unsuffixed
+ * convert_float4(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(ulong4 x)
+{
+ float4 converted = convert_float4(x);
+ return converted;
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong4 -> float4 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(ulong4 x)
+{
+ float4 r = convert_float4(x);
+ ulong4 y = convert_ulong4(r);
+ return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong4 -> float4 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(ulong4 x)
+{
+ float4 r = convert_float4(x);
+ ulong4 y = convert_ulong4(r);
+ return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong8 -> float8 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(ulong8 x)
+{
+ float8 r = convert_float8(x);
+ ulong8 y = convert_ulong8(r);
+ ulong8 abs_x = abs(x);
+ ulong8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong8 -> float8, _rte variant: hands back the unsuffixed
+ * convert_float8(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(ulong8 x)
+{
+ float8 converted = convert_float8(x);
+ return converted;
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong8 -> float8 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(ulong8 x)
+{
+ float8 r = convert_float8(x);
+ ulong8 y = convert_ulong8(r);
+ return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong8 -> float8 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(ulong8 x)
+{
+ float8 r = convert_float8(x);
+ ulong8 y = convert_ulong8(r);
+ return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong16 -> float16 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(ulong16 x)
+{
+ float16 r = convert_float16(x);
+ ulong16 y = convert_ulong16(r);
+ ulong16 abs_x = abs(x);
+ ulong16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong16 -> float16, _rte variant: hands back the unsuffixed
+ * convert_float16(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(ulong16 x)
+{
+ float16 converted = convert_float16(x);
+ return converted;
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong16 -> float16 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(ulong16 x)
+{
+ float16 r = convert_float16(x);
+ ulong16 y = convert_ulong16(r);
+ return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong16 -> float16 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(ulong16 x)
+{
+ float16 r = convert_float16(x);
+ ulong16 y = convert_ulong16(r);
+ return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong -> double toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(ulong x)
+{
+ double r = convert_double(x);
+ ulong y = convert_ulong(r);
+ ulong abs_x = abs(x);
+ ulong abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong -> double, _rte variant: hands back the unsuffixed
+ * convert_double(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(ulong x)
+{
+ double converted = convert_double(x);
+ return converted;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong -> double toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(ulong x)
+{
+ double r = convert_double(x);
+ ulong y = convert_ulong(r);
+ return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong -> double toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(ulong x)
+{
+ double r = convert_double(x);
+ ulong y = convert_ulong(r);
+ return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong2 -> double2 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(ulong2 x)
+{
+ double2 r = convert_double2(x);
+ ulong2 y = convert_ulong2(r);
+ ulong2 abs_x = abs(x);
+ ulong2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong2 -> double2, _rte variant: hands back the unsuffixed
+ * convert_double2(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(ulong2 x)
+{
+ double2 converted = convert_double2(x);
+ return converted;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong2 -> double2 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(ulong2 x)
+{
+ double2 r = convert_double2(x);
+ ulong2 y = convert_ulong2(r);
+ return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong2 -> double2 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(ulong2 x)
+{
+ double2 r = convert_double2(x);
+ ulong2 y = convert_ulong2(r);
+ return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong3 -> double3 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(ulong3 x)
+{
+ double3 r = convert_double3(x);
+ ulong3 y = convert_ulong3(r);
+ ulong3 abs_x = abs(x);
+ ulong3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong3 -> double3, _rte variant: hands back the unsuffixed
+ * convert_double3(x) result unchanged. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(ulong3 x)
+{
+ double3 converted = convert_double3(x);
+ return converted;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong3 -> double3 toward +infinity: nudge r up where its round-trip value
+ * is below x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(ulong3 x)
+{
+ double3 r = convert_double3(x);
+ ulong3 y = convert_ulong3(r);
+ return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong3 -> double3 toward -infinity: nudge r down where its round-trip
+ * value is above x. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(ulong3 x)
+{
+ double3 r = convert_double3(x);
+ ulong3 y = convert_ulong3(r);
+ return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong4 -> double4 toward zero: nudge r toward 0 where its round-trip
+ * magnitude exceeds |x|. Fix: y reads r (it was initialised from itself). */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(ulong4 x)
+{
+ double4 r = convert_double4(x);
+ ulong4 y = convert_ulong4(r);
+ ulong4 abs_x = abs(x);
+ ulong4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* _rte variant of ulong4 -> double4: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(ulong4 x)
+{
+  const double4 r = convert_double4(x);
+  return r;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong4 -> double4, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(ulong4 x)
+{
+  double4 r = convert_double4(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  ulong4 y = convert_ulong4(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong4 -> double4, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(ulong4 x)
+{
+  double4 r = convert_double4(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  ulong4 y = convert_ulong4(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong8 -> double8, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(ulong8 x)
+{
+  double8 r = convert_double8(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  ulong8 y = convert_ulong8(r);
+  ulong8 abs_x = abs(x);
+  ulong8 abs_y = abs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* _rte variant of ulong8 -> double8: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(ulong8 x)
+{
+  const double8 r = convert_double8(x);
+  return r;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong8 -> double8, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(ulong8 x)
+{
+  double8 r = convert_double8(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  ulong8 y = convert_ulong8(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong8 -> double8, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(ulong8 x)
+{
+  double8 r = convert_double8(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  ulong8 y = convert_ulong8(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong16 -> double16, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(ulong16 x)
+{
+  double16 r = convert_double16(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  ulong16 y = convert_ulong16(r);
+  ulong16 abs_x = abs(x);
+  ulong16 abs_y = abs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* _rte variant of ulong16 -> double16: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(ulong16 x)
+{
+  const double16 r = convert_double16(x);
+  return r;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong16 -> double16, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(ulong16 x)
+{
+  double16 r = convert_double16(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  ulong16 y = convert_ulong16(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong16 -> double16, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(ulong16 x)
+{
+  double16 r = convert_double16(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  ulong16 y = convert_ulong16(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+}
+#endif
+/* float -> float, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(float x)
+{
+  float r = convert_float(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float y = convert_float(r);
+  float abs_x = fabs(x);
+  float abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+}
+/* _rte variant of float -> float: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(float x)
+{
+  const float r = convert_float(x);
+  return r;
+}
+/* float -> float, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(float x)
+{
+  float r = convert_float(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float y = convert_float(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+}
+/* float -> float, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(float x)
+{
+  float r = convert_float(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float y = convert_float(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+}
+/* float2 -> float2, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(float2 x)
+{
+  float2 r = convert_float2(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float2 y = convert_float2(r);
+  float2 abs_x = fabs(x);
+  float2 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+}
+/* _rte variant of float2 -> float2: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(float2 x)
+{
+  const float2 r = convert_float2(x);
+  return r;
+}
+/* float2 -> float2, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(float2 x)
+{
+  float2 r = convert_float2(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float2 y = convert_float2(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+}
+/* float2 -> float2, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(float2 x)
+{
+  float2 r = convert_float2(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float2 y = convert_float2(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+}
+/* float3 -> float3, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(float3 x)
+{
+  float3 r = convert_float3(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float3 y = convert_float3(r);
+  float3 abs_x = fabs(x);
+  float3 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+}
+/* _rte variant of float3 -> float3: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(float3 x)
+{
+  const float3 r = convert_float3(x);
+  return r;
+}
+/* float3 -> float3, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(float3 x)
+{
+  float3 r = convert_float3(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float3 y = convert_float3(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+}
+/* float3 -> float3, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(float3 x)
+{
+  float3 r = convert_float3(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float3 y = convert_float3(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+}
+/* float4 -> float4, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(float4 x)
+{
+  float4 r = convert_float4(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float4 y = convert_float4(r);
+  float4 abs_x = fabs(x);
+  float4 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+}
+/* _rte variant of float4 -> float4: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(float4 x)
+{
+  const float4 r = convert_float4(x);
+  return r;
+}
+/* float4 -> float4, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(float4 x)
+{
+  float4 r = convert_float4(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float4 y = convert_float4(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+}
+/* float4 -> float4, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(float4 x)
+{
+  float4 r = convert_float4(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float4 y = convert_float4(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+}
+/* float8 -> float8, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(float8 x)
+{
+  float8 r = convert_float8(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float8 y = convert_float8(r);
+  float8 abs_x = fabs(x);
+  float8 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+}
+/* _rte variant of float8 -> float8: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(float8 x)
+{
+  const float8 r = convert_float8(x);
+  return r;
+}
+/* float8 -> float8, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(float8 x)
+{
+  float8 r = convert_float8(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float8 y = convert_float8(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+}
+/* float8 -> float8, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(float8 x)
+{
+  float8 r = convert_float8(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float8 y = convert_float8(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+}
+/* float16 -> float16, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(float16 x)
+{
+  float16 r = convert_float16(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float16 y = convert_float16(r);
+  float16 abs_x = fabs(x);
+  float16 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+}
+/* _rte variant of float16 -> float16: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(float16 x)
+{
+  const float16 r = convert_float16(x);
+  return r;
+}
+/* float16 -> float16, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(float16 x)
+{
+  float16 r = convert_float16(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float16 y = convert_float16(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+}
+/* float16 -> float16, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(float16 x)
+{
+  float16 r = convert_float16(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float16 y = convert_float16(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+}
+#ifdef cl_khr_fp64
+/* float -> double, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(float x)
+{
+  double r = convert_double(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float y = convert_float(r);
+  float abs_x = fabs(x);
+  float abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* _rte variant of float -> double: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(float x)
+{
+  const double r = convert_double(x);
+  return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* float -> double, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(float x)
+{
+  double r = convert_double(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float y = convert_float(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* float -> double, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(float x)
+{
+  double r = convert_double(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float y = convert_float(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* float2 -> double2, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(float2 x)
+{
+  double2 r = convert_double2(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float2 y = convert_float2(r);
+  float2 abs_x = fabs(x);
+  float2 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* _rte variant of float2 -> double2: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(float2 x)
+{
+  const double2 r = convert_double2(x);
+  return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* float2 -> double2, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(float2 x)
+{
+  double2 r = convert_double2(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float2 y = convert_float2(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* float2 -> double2, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(float2 x)
+{
+  double2 r = convert_double2(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float2 y = convert_float2(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* float3 -> double3, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(float3 x)
+{
+  double3 r = convert_double3(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float3 y = convert_float3(r);
+  float3 abs_x = fabs(x);
+  float3 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* _rte variant of float3 -> double3: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(float3 x)
+{
+  const double3 r = convert_double3(x);
+  return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* float3 -> double3, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(float3 x)
+{
+  double3 r = convert_double3(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float3 y = convert_float3(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* float3 -> double3, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(float3 x)
+{
+  double3 r = convert_double3(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float3 y = convert_float3(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* float4 -> double4, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(float4 x)
+{
+  double4 r = convert_double4(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float4 y = convert_float4(r);
+  float4 abs_x = fabs(x);
+  float4 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* _rte variant of float4 -> double4: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(float4 x)
+{
+  const double4 r = convert_double4(x);
+  return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* float4 -> double4, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(float4 x)
+{
+  double4 r = convert_double4(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float4 y = convert_float4(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* float4 -> double4, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(float4 x)
+{
+  double4 r = convert_double4(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float4 y = convert_float4(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* float8 -> double8, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(float8 x)
+{
+  double8 r = convert_double8(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float8 y = convert_float8(r);
+  float8 abs_x = fabs(x);
+  float8 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* _rte variant of float8 -> double8: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(float8 x)
+{
+  const double8 r = convert_double8(x);
+  return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* float8 -> double8, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(float8 x)
+{
+  double8 r = convert_double8(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float8 y = convert_float8(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* float8 -> double8, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(float8 x)
+{
+  double8 r = convert_double8(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float8 y = convert_float8(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* float16 -> double16, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(float16 x)
+{
+  double16 r = convert_double16(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float16 y = convert_float16(r);
+  float16 abs_x = fabs(x);
+  float16 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* _rte variant of float16 -> double16: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(float16 x)
+{
+  const double16 r = convert_double16(x);
+  return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* float16 -> double16, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(float16 x)
+{
+  double16 r = convert_double16(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float16 y = convert_float16(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* float16 -> double16, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(float16 x)
+{
+  double16 r = convert_double16(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  float16 y = convert_float16(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> float, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(double x)
+{
+  float r = convert_float(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double y = convert_double(r);
+  double abs_x = fabs(x);
+  double abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* _rte variant of double -> float: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(double x)
+{
+  const float r = convert_float(x);
+  return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> float, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(double x)
+{
+  float r = convert_float(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double y = convert_double(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> float, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(double x)
+{
+  float r = convert_float(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double y = convert_double(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double2 -> float2, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(double2 x)
+{
+  float2 r = convert_float2(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double2 y = convert_double2(r);
+  double2 abs_x = fabs(x);
+  double2 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* _rte variant of double2 -> float2: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(double2 x)
+{
+  const float2 r = convert_float2(x);
+  return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double2 -> float2, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(double2 x)
+{
+  float2 r = convert_float2(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double2 y = convert_double2(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double2 -> float2, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(double2 x)
+{
+  float2 r = convert_float2(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double2 y = convert_double2(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double3 -> float3, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(double3 x)
+{
+  float3 r = convert_float3(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double3 y = convert_double3(r);
+  double3 abs_x = fabs(x);
+  double3 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* _rte variant of double3 -> float3: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(double3 x)
+{
+  const float3 r = convert_float3(x);
+  return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double3 -> float3, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(double3 x)
+{
+  float3 r = convert_float3(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double3 y = convert_double3(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double3 -> float3, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(double3 x)
+{
+  float3 r = convert_float3(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double3 y = convert_double3(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double4 -> float4, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(double4 x)
+{
+  float4 r = convert_float4(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double4 y = convert_double4(r);
+  double4 abs_x = fabs(x);
+  double4 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* _rte variant of double4 -> float4: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(double4 x)
+{
+  const float4 r = convert_float4(x);
+  return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double4 -> float4, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(double4 x)
+{
+  float4 r = convert_float4(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double4 y = convert_double4(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double4 -> float4, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(double4 x)
+{
+  float4 r = convert_float4(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double4 y = convert_double4(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double8 -> float8, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(double8 x)
+{
+  float8 r = convert_float8(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double8 y = convert_double8(r);
+  double8 abs_x = fabs(x);
+  double8 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* _rte variant of double8 -> float8: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(double8 x)
+{
+  const float8 r = convert_float8(x);
+  return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double8 -> float8, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(double8 x)
+{
+  float8 r = convert_float8(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double8 y = convert_double8(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double8 -> float8, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(double8 x)
+{
+  float8 r = convert_float8(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double8 y = convert_double8(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double16 -> float16, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(double16 x)
+{
+  float16 r = convert_float16(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double16 y = convert_double16(r);
+  double16 abs_x = fabs(x);
+  double16 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* _rte variant of double16 -> float16: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(double16 x)
+{
+  const float16 r = convert_float16(x);
+  return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double16 -> float16, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(double16 x)
+{
+  float16 r = convert_float16(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double16 y = convert_double16(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double16 -> float16, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(double16 x)
+{
+  float16 r = convert_float16(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double16 y = convert_double16(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> double, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(double x)
+{
+  double r = convert_double(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double y = convert_double(r);
+  double abs_x = fabs(x);
+  double abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* _rte variant of double -> double: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(double x)
+{
+  const double r = convert_double(x);
+  return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> double, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(double x)
+{
+  double r = convert_double(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double y = convert_double(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> double, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(double x)
+{
+  double r = convert_double(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double y = convert_double(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double2 -> double2, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(double2 x)
+{
+  double2 r = convert_double2(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double2 y = convert_double2(r);
+  double2 abs_x = fabs(x);
+  double2 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* _rte variant of double2 -> double2: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(double2 x)
+{
+  const double2 r = convert_double2(x);
+  return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double2 -> double2, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(double2 x)
+{
+  double2 r = convert_double2(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double2 y = convert_double2(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double2 -> double2, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(double2 x)
+{
+  double2 r = convert_double2(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double2 y = convert_double2(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double3 -> double3, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(double3 x)
+{
+  double3 r = convert_double3(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double3 y = convert_double3(r);
+  double3 abs_x = fabs(x);
+  double3 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* _rte variant of double3 -> double3: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(double3 x)
+{
+  const double3 r = convert_double3(x);
+  return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double3 -> double3, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(double3 x)
+{
+  double3 r = convert_double3(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double3 y = convert_double3(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double3 -> double3, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(double3 x)
+{
+  double3 r = convert_double3(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double3 y = convert_double3(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double4 -> double4, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(double4 x)
+{
+  double4 r = convert_double4(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double4 y = convert_double4(r);
+  double4 abs_x = fabs(x);
+  double4 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* _rte variant of double4 -> double4: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(double4 x)
+{
+  const double4 r = convert_double4(x);
+  return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double4 -> double4, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(double4 x)
+{
+  double4 r = convert_double4(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double4 y = convert_double4(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double4 -> double4, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(double4 x)
+{
+  double4 r = convert_double4(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double4 y = convert_double4(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double8 -> double8, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(double8 x)
+{
+  double8 r = convert_double8(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double8 y = convert_double8(r);
+  double8 abs_x = fabs(x);
+  double8 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* _rte variant of double8 -> double8: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(double8 x)
+{
+  const double8 r = convert_double8(x);
+  return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double8 -> double8, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(double8 x)
+{
+  double8 r = convert_double8(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double8 y = convert_double8(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double8 -> double8, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(double8 x)
+{
+  double8 r = convert_double8(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double8 y = convert_double8(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double16 -> double16, rounding toward zero. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(double16 x)
+{
+  double16 r = convert_double16(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double16 y = convert_double16(r);
+  double16 abs_x = fabs(x);
+  double16 abs_y = fabs(y);
+  /* Where |y| > |x| the conversion moved away from zero: step r back toward zero. */
+  return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* _rte variant of double16 -> double16: forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(double16 x)
+{
+  const double16 r = convert_double16(x);
+  return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double16 -> double16, rounding toward +infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(double16 x)
+{
+  double16 r = convert_double16(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double16 y = convert_double16(r);
+  /* Where y < x the conversion went down: step r to the next value up. */
+  return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* double16 -> double16, rounding toward -infinity. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(double16 x)
+{
+  double16 r = convert_double16(x);
+  /* Back-convert r (y must not be read in its own initializer). */
+  double16 y = convert_double16(r);
+  /* Where y > x the conversion went up: step r to the next value down. */
+  return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+}
+#endif
+
+
+#endif // ASW
diff --git a/src/builtins/cross.cl b/src/builtins/cross.cl
new file mode 100644
index 0000000..a3e019f
--- /dev/null
+++ b/src/builtins/cross.cl
@@ -0,0 +1,59 @@
+/******************************************************************************
+ * Copyright (c) 2011-2013, Peter Collingbourne <peter@pcc.me.uk>
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/* Cross product p0 x p1 of two float3 vectors. */
+_CLC_OVERLOAD _CLC_DEF float3 cross(float3 p0, float3 p1)
+{
+  const float cx = p0.y*p1.z - p0.z*p1.y;
+  const float cy = p0.z*p1.x - p0.x*p1.z;
+  const float cz = p0.x*p1.y - p0.y*p1.x;
+  return (float3)(cx, cy, cz);
+}
+
+/* Cross product of the xyz parts of two float4 vectors; the input w
+ * components are not read and the result's w component is 0. */
+_CLC_OVERLOAD _CLC_DEF float4 cross(float4 p0, float4 p1)
+{
+  const float cx = p0.y*p1.z - p0.z*p1.y;
+  const float cy = p0.z*p1.x - p0.x*p1.z;
+  const float cz = p0.x*p1.y - p0.y*p1.x;
+  return (float4)(cx, cy, cz, 0.f);
+}
+
+/* Cross product p0 x p1 of two double3 vectors. */
+_CLC_OVERLOAD _CLC_DEF double3 cross(double3 p0, double3 p1)
+{
+  const double cx = p0.y*p1.z - p0.z*p1.y;
+  const double cy = p0.z*p1.x - p0.x*p1.z;
+  const double cz = p0.x*p1.y - p0.y*p1.x;
+  return (double3)(cx, cy, cz);
+}
+
+/* Cross product of the xyz parts of two double4 vectors; the input w
+ * components are not read and the result's w component is 0. */
+_CLC_OVERLOAD _CLC_DEF double4 cross(double4 p0, double4 p1)
+{
+  const double cx = p0.y*p1.z - p0.z*p1.y;
+  const double cy = p0.z*p1.x - p0.x*p1.z;
+  const double cz = p0.x*p1.y - p0.y*p1.x;
+  return (double4)(cx, cy, cz, 0.);
+}
diff --git a/src/builtins/degrees.cl b/src/builtins/degrees.cl
new file mode 100644
index 0000000..329e0f1
--- /dev/null
+++ b/src/builtins/degrees.cl
@@ -0,0 +1,41 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/* Instantiate IMPLEMENTATION for the 3-, 4-, 8- and 16-wide vector variants
+ * of `type`; scalar and 2-wide variants are not generated by this macro.
+ * NOTE(review): _VEC_TYPE is not defined in this file - presumably clc.h
+ * supplies it and it builds the vector type name; confirm. */
+#define EXPAND_SIZES(type) \
+ IMPLEMENTATION(_VEC_TYPE(type,3)) \
+ IMPLEMENTATION(_VEC_TYPE(type,4)) \
+ IMPLEMENTATION(_VEC_TYPE(type,8)) \
+ IMPLEMENTATION(_VEC_TYPE(type,16)) \
+
+/* Define the degrees() and radians() overloads for one gentype:
+ *   degrees(r) = r * 180 * M_1_PI
+ *   radians(d) = d * M_PI / 180 */
+#define IMPLEMENTATION(gentype) \
+_CLC_OVERLOAD _CLC_DEF gentype degrees(gentype radians) { return radians * (gentype)180.0 * (gentype)M_1_PI; } \
+_CLC_OVERLOAD _CLC_DEF gentype radians(gentype degrees) { return degrees * (gentype)M_PI / (gentype)180.0; }
+
+/* Emit the overloads for the float and double vector types. */
+EXPAND_SIZES(float)
+EXPAND_SIZES(double)
diff --git a/src/builtins/dot.cl b/src/builtins/dot.cl
new file mode 100644
index 0000000..0b16d66
--- /dev/null
+++ b/src/builtins/dot.cl
@@ -0,0 +1,41 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/* Dot product of two float3 vectors: sum of the component-wise products. */
+_CLC_OVERLOAD _CLC_DEF float dot(float3 p0, float3 p1)
+{
+  const float sum = p0.x*p1.x + p0.y*p1.y + p0.z*p1.z;
+  return sum;
+}
+
+/* Dot product of two float4 vectors: sum of the component-wise products. */
+_CLC_OVERLOAD _CLC_DEF float dot(float4 p0, float4 p1)
+{
+  const float sum = p0.x*p1.x + p0.y*p1.y + p0.z*p1.z + p0.w*p1.w;
+  return sum;
+}
+
+/* Dot product of two double3 vectors: sum of the component-wise products. */
+_CLC_OVERLOAD _CLC_DEF double dot(double3 p0, double3 p1)
+{
+  const double sum = p0.x*p1.x + p0.y*p1.y + p0.z*p1.z;
+  return sum;
+}
+
+/* Dot product of two double4 vectors: sum of the component-wise products. */
+_CLC_OVERLOAD _CLC_DEF double dot(double4 p0, double4 p1)
+{
+  const double sum = p0.x*p1.x + p0.y*p1.y + p0.z*p1.z + p0.w*p1.w;
+  return sum;
+}
+
diff --git a/src/builtins/fract.cl b/src/builtins/fract.cl
new file mode 100644
index 0000000..11f08e8
--- /dev/null
+++ b/src/builtins/fract.cl
@@ -0,0 +1,93 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/*
+ * SCALAR(type, the_max): function body for the scalar fract() overloads.
+ *
+ * Expects the enclosing signature to provide `x` (the input) and `ptr`
+ * (where floor(x) is stored).  `the_max` is the clamp applied to the
+ * fractional part.
+ *
+ *   - floor(x) is always written through ptr.
+ *   - NaN input is returned unchanged.
+ *   - +/-inf and +/-0 return a zero carrying the sign of x.  Without this
+ *     test inf - inf is NaN, which fmin() turns into the_max, and
+ *     -0 - (-0) yields +0.
+ *   - Anything else returns fmin(x - floor(x), the_max).
+ */
+#define SCALAR(type, the_max) \
+{ \
+    type the_floor = floor(x); \
+    *ptr = the_floor; \
+    if (isnan(x)) return x; \
+    if (isinf(x) || x == (type) (0)) return copysign((type) (0), x); \
+    return fmin(x - the_floor, (type) (the_max)); \
+}
+
+/*
+ * BODY(type, the_max): function body for the vector fract() overloads.
+ *
+ * Expects the enclosing signature to provide `x` (the input vector) and
+ * `ptr` (where floor(x) is stored).  Works lane by lane:
+ *
+ *   - floor(x) is always written through ptr.
+ *   - The default result is fmin(x - floor(x), the_max).
+ *   - Lanes holding +/-inf or +/-0 are replaced by a zero carrying the sign
+ *     of x.  Without this, inf - inf is NaN, which fmin() turns into
+ *     the_max, and -0 - (-0) yields +0.
+ *   - Lanes holding NaN return x itself; this select runs last so it wins.
+ */
+#define BODY(type, the_max) \
+{ \
+    type the_floor = floor(x); \
+    *ptr = the_floor; \
+    type result = fmin(x - the_floor, (type) (the_max)); \
+    result = select(result, copysign((type) (0), x), \
+                    isinf(x) | (x == (type) (0))); \
+    return select(result, x, isnan(x)); \
+}
+
+/*
+ * fract() overloads.
+ *
+ * FRACT_SPACES emits one definition per address space (global, local,
+ * private) for a single value type, using the body macro named by
+ * body_kind.  FRACT_ALL runs it for the scalar type (SCALAR body) and for
+ * the 2/3/4/8/16-wide vectors (BODY body) of one base type, passing the
+ * clamp constant through unchanged.
+ */
+#define FRACT_SPACES(gentype, body_kind, limit) \
+_CLC_OVERLOAD _CLC_DEF gentype fract(gentype x, global gentype * ptr) body_kind(gentype, limit) \
+_CLC_OVERLOAD _CLC_DEF gentype fract(gentype x, local gentype * ptr) body_kind(gentype, limit) \
+_CLC_OVERLOAD _CLC_DEF gentype fract(gentype x, private gentype * ptr) body_kind(gentype, limit)
+
+#define FRACT_ALL(base, limit) \
+FRACT_SPACES(base, SCALAR, limit) \
+FRACT_SPACES(base##2, BODY, limit) \
+FRACT_SPACES(base##3, BODY, limit) \
+FRACT_SPACES(base##4, BODY, limit) \
+FRACT_SPACES(base##8, BODY, limit) \
+FRACT_SPACES(base##16, BODY, limit)
+
+FRACT_ALL(float, 0x1.fffffep-1f)
+FRACT_ALL(double, 0x1.fffffffffffffp-1)
+
+#undef FRACT_ALL
+#undef FRACT_SPACES
+
diff --git a/src/builtins/frexp.cl b/src/builtins/frexp.cl
new file mode 100644
index 0000000..e02cf90
--- /dev/null
+++ b/src/builtins/frexp.cl
@@ -0,0 +1,76 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/*
+ * frexp() overloads for float and double, scalar and 2/3/4/8/16-wide, with
+ * the exponent pointer in the global, local or private address space.
+ *
+ * The function bodies come from SCALAR_BODY / VECTOR_BODY, which this file
+ * does not define (supplied via clc.h); they are handed the element type,
+ * the vector width where applicable, the scalar routine to use and the
+ * pointee element type (int).
+ */
+#define FREXP_SCALAR(type, impl) \
+_CLC_OVERLOAD _CLC_DEF type frexp(type x, global int * ptr) SCALAR_BODY(type, impl, int) \
+_CLC_OVERLOAD _CLC_DEF type frexp(type x, local int * ptr) SCALAR_BODY(type, impl, int) \
+_CLC_OVERLOAD _CLC_DEF type frexp(type x, private int * ptr) SCALAR_BODY(type, impl, int)
+
+#define FREXP_VECTOR(type, n, impl) \
+_CLC_OVERLOAD _CLC_DEF type##n frexp(type##n x, global int##n * ptr) VECTOR_BODY(type, n, impl, int) \
+_CLC_OVERLOAD _CLC_DEF type##n frexp(type##n x, local int##n * ptr) VECTOR_BODY(type, n, impl, int) \
+_CLC_OVERLOAD _CLC_DEF type##n frexp(type##n x, private int##n * ptr) VECTOR_BODY(type, n, impl, int)
+
+#define FREXP_ALL(type, impl) \
+FREXP_SCALAR(type, impl) \
+FREXP_VECTOR(type, 2, impl) \
+FREXP_VECTOR(type, 3, impl) \
+FREXP_VECTOR(type, 4, impl) \
+FREXP_VECTOR(type, 8, impl) \
+FREXP_VECTOR(type, 16, impl)
+
+FREXP_ALL(float, __builtin_frexpf)
+FREXP_ALL(double, __builtin_frexp)
+
+#undef FREXP_ALL
+#undef FREXP_VECTOR
+#undef FREXP_SCALAR
diff --git a/src/builtins/hadd.cl b/src/builtins/hadd.cl
new file mode 100644
index 0000000..c96324f
--- /dev/null
+++ b/src/builtins/hadd.cl
@@ -0,0 +1,44 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Peter Collingbourne <peter@pcc.me.uk>
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/*
+ * IMPLEMENTATION(gentype): defines hadd() and rhadd() for one vector type.
+ *
+ * Both halve each operand before adding, so the full-width sum x + y is
+ * never formed.  The bit lost by the two shifts is then added back:
+ *   hadd  - one extra only when both low bits are set;
+ *   rhadd - one extra when either low bit is set.
+ */
+#define IMPLEMENTATION(gentype) \
+    _CLC_OVERLOAD _CLC_DEF gentype hadd(gentype x, gentype y) \
+    { \
+        const gentype one = (gentype)1; \
+        return (x >> one) + (y >> one) + ((x & y) & one); \
+    } \
+    _CLC_OVERLOAD _CLC_DEF gentype rhadd(gentype x, gentype y) \
+    { \
+        const gentype one = (gentype)1; \
+        return (x >> one) + (y >> one) + ((x | y) & one); \
+    }
+
+/* EXPAND_SIZES(type): runs IMPLEMENTATION for the 2/3/4/8/16-wide vectors
+ * of one element type. */
+#define EXPAND_SIZES(type) \
+    IMPLEMENTATION(_VEC_TYPE(type,2)) \
+    IMPLEMENTATION(_VEC_TYPE(type,3)) \
+    IMPLEMENTATION(_VEC_TYPE(type,4)) \
+    IMPLEMENTATION(_VEC_TYPE(type,8)) \
+    IMPLEMENTATION(_VEC_TYPE(type,16))
+
+/* Defined outside this file; presumably applies EXPAND_SIZES to every
+ * integer element type - confirm against clc.h. */
+_EXPAND_INTEGER_TYPES()
diff --git a/src/builtins/length.cl b/src/builtins/length.cl
new file mode 100644
index 0000000..2cfefa1
--- /dev/null
+++ b/src/builtins/length.cl
@@ -0,0 +1,109 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/*
+ * length(float2): Euclidean norm of a 2-component vector.
+ *
+ * The larger magnitude is factored out, big * sqrt(1 + (small/big)^2), so
+ * the components are never squared directly.
+ *
+ * Special cases:
+ *   - an infinite component returns +inf (otherwise inf/inf below would
+ *     produce NaN); this matches the 3- and 4-component overloads, which
+ *     return their infinite maximum term;
+ *   - (0, 0) returns 0;
+ *   - (NaN, 0) returns NaN instead of 0.
+ */
+_CLC_OVERLOAD _CLC_DEF float length(float2 p)
+{
+    float r;
+    p = fabs(p);
+
+    /* After fabs() any infinity is +inf, so it can be returned as is. */
+    if (isinf(p.x)) return p.x;
+    if (isinf(p.y)) return p.y;
+
+    if (p.x > p.y)
+    {
+        r = p.y/p.x;
+        return p.x * sqrt(1+r*r);
+    }
+    if (p.y != 0)
+    {
+        r = p.x/p.y;
+        return p.y * sqrt(1+r*r);
+    }
+
+    /* Here p.y == 0 and p.x is not greater than it: p.x is 0 or NaN. */
+    return p.x;
+}
+
+/*
+ * length(double2): Euclidean norm of a 2-component vector.
+ *
+ * The larger magnitude is factored out, big * sqrt(1 + (small/big)^2), so
+ * the components are never squared directly.
+ *
+ * Special cases:
+ *   - an infinite component returns +inf (otherwise inf/inf below would
+ *     produce NaN); this matches the 3- and 4-component overloads, which
+ *     return their infinite maximum term;
+ *   - (0, 0) returns 0;
+ *   - (NaN, 0) returns NaN instead of 0.
+ */
+_CLC_OVERLOAD _CLC_DEF double length(double2 p)
+{
+    double r;
+    p = fabs(p);
+
+    /* After fabs() any infinity is +inf, so it can be returned as is. */
+    if (isinf(p.x)) return p.x;
+    if (isinf(p.y)) return p.y;
+
+    if (p.x > p.y)
+    {
+        r = p.y/p.x;
+        return p.x * sqrt(1+r*r);
+    }
+    if (p.y != 0)
+    {
+        r = p.x/p.y;
+        return p.y * sqrt(1+r*r);
+    }
+
+    /* Here p.y == 0 and p.x is not greater than it: p.x is 0 or NaN. */
+    return p.x;
+}
+
+/*
+ * length(float3): Euclidean norm of a 3-component vector.
+ *
+ * The vector is divided by its largest magnitude before squaring, and the
+ * result is multiplied back afterwards.  Every scaled component is at most
+ * 1, so the squares cannot overflow, and the largest one is exactly 1, so
+ * the sum cannot underflow to 0.  The scaling is applied for every finite,
+ * non-zero input: squaring unscaled components of a very small vector
+ * would underflow and report a length of 0.
+ *
+ * A zero or infinite largest magnitude is returned directly.
+ */
+_CLC_OVERLOAD _CLC_DEF float length(float3 p)
+{
+    p = fabs(p);
+    float max_term = max(p.x, max(p.y, p.z));
+    if (max_term == 0 || isinf(max_term)) return max_term;
+    p /= max_term;
+    return max_term * sqrt(dot(p,p));
+}
+
+/*
+ * length(double3): Euclidean norm of a 3-component vector.
+ *
+ * The vector is divided by its largest magnitude before squaring, and the
+ * result is multiplied back afterwards.  Every scaled component is at most
+ * 1, so the squares cannot overflow, and the largest one is exactly 1, so
+ * the sum cannot underflow to 0.  The scaling is applied for every finite,
+ * non-zero input: squaring unscaled components of a very small vector
+ * would underflow and report a length of 0.
+ *
+ * A zero or infinite largest magnitude is returned directly.
+ */
+_CLC_OVERLOAD _CLC_DEF double length(double3 p)
+{
+    p = fabs(p);
+    double max_term = max(p.x, max(p.y, p.z));
+    if (max_term == 0 || isinf(max_term)) return max_term;
+    p /= max_term;
+    return max_term * sqrt(dot(p,p));
+}
+
+/*
+ * length(float4): Euclidean norm of a 4-component vector.
+ *
+ * The vector is divided by its largest magnitude before squaring, and the
+ * result is multiplied back afterwards.  Every scaled component is at most
+ * 1, so the squares cannot overflow, and the largest one is exactly 1, so
+ * the sum cannot underflow to 0.  The scaling is applied for every finite,
+ * non-zero input: squaring unscaled components of a very small vector
+ * would underflow and report a length of 0.
+ *
+ * A zero or infinite largest magnitude is returned directly.
+ */
+_CLC_OVERLOAD _CLC_DEF float length(float4 p)
+{
+    p = fabs(p);
+    float max_term = max(max(p.x, p.y), max(p.z, p.w));
+    if (max_term == 0 || isinf(max_term)) return max_term;
+    p /= max_term;
+    return max_term * sqrt(dot(p,p));
+}
+
+/*
+ * length(double4): Euclidean norm of a 4-component vector.
+ *
+ * The vector is divided by its largest magnitude before squaring, and the
+ * result is multiplied back afterwards.  Every scaled component is at most
+ * 1, so the squares cannot overflow, and the largest one is exactly 1, so
+ * the sum cannot underflow to 0.  The scaling is applied for every finite,
+ * non-zero input: squaring unscaled components of a very small vector
+ * would underflow and report a length of 0.
+ *
+ * A zero or infinite largest magnitude is returned directly.
+ */
+_CLC_OVERLOAD _CLC_DEF double length(double4 p)
+{
+    p = fabs(p);
+    double max_term = max(max(p.x, p.y), max(p.z, p.w));
+    if (max_term == 0 || isinf(max_term)) return max_term;
+    p /= max_term;
+    return max_term * sqrt(dot(p,p));
+}
+
+/*
+ * fast_length(): square root of the vector's dot product with itself.
+ * The components are squared as they are, with no rescaling and no
+ * special-case checks.
+ */
+_CLC_OVERLOAD _CLC_DEF float fast_length(float2 p)
+{
+    float sum_sq = dot(p, p);
+    return sqrt(sum_sq);
+}
+
+_CLC_OVERLOAD _CLC_DEF float fast_length(float3 p)
+{
+    float sum_sq = dot(p, p);
+    return sqrt(sum_sq);
+}
+
+_CLC_OVERLOAD _CLC_DEF float fast_length(float4 p)
+{
+    float sum_sq = dot(p, p);
+    return sqrt(sum_sq);
+}
+
+_CLC_OVERLOAD _CLC_DEF double fast_length(double2 p)
+{
+    double sum_sq = dot(p, p);
+    return sqrt(sum_sq);
+}
+
+_CLC_OVERLOAD _CLC_DEF double fast_length(double3 p)
+{
+    double sum_sq = dot(p, p);
+    return sqrt(sum_sq);
+}
+
+_CLC_OVERLOAD _CLC_DEF double fast_length(double4 p)
+{
+    double sum_sq = dot(p, p);
+    return sqrt(sum_sq);
+}
diff --git a/src/builtins/lgamma_r.cl b/src/builtins/lgamma_r.cl
new file mode 100644
index 0000000..aa3d487
--- /dev/null
+++ b/src/builtins/lgamma_r.cl
@@ -0,0 +1,80 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/* Scalar routines used by the overloads below; only declared here, their
+ * definitions live outside this file. */
+_CLC_PROTECTED float lgammaf_r(float x, int * ptr);
+_CLC_PROTECTED double builtin_lgamma_r(double x, int * ptr);
+
+/*
+ * lgamma_r() overloads for float and double, scalar and 2/3/4/8/16-wide,
+ * with the int output pointer in the global, local or private address
+ * space.
+ *
+ * The function bodies come from SCALAR_BODY / VECTOR_BODY, which this file
+ * does not define (supplied via clc.h); they are handed the element type,
+ * the vector width where applicable, the scalar routine to use and the
+ * pointee element type (int).
+ */
+#define LGAMMA_R_SCALAR(type, impl) \
+_CLC_OVERLOAD _CLC_DEF type lgamma_r(type x, global int * ptr) SCALAR_BODY(type, impl, int) \
+_CLC_OVERLOAD _CLC_DEF type lgamma_r(type x, local int * ptr) SCALAR_BODY(type, impl, int) \
+_CLC_OVERLOAD _CLC_DEF type lgamma_r(type x, private int * ptr) SCALAR_BODY(type, impl, int)
+
+#define LGAMMA_R_VECTOR(type, n, impl) \
+_CLC_OVERLOAD _CLC_DEF type##n lgamma_r(type##n x, global int##n * ptr) VECTOR_BODY(type, n, impl, int) \
+_CLC_OVERLOAD _CLC_DEF type##n lgamma_r(type##n x, local int##n * ptr) VECTOR_BODY(type, n, impl, int) \
+_CLC_OVERLOAD _CLC_DEF type##n lgamma_r(type##n x, private int##n * ptr) VECTOR_BODY(type, n, impl, int)
+
+#define LGAMMA_R_ALL(type, impl) \
+LGAMMA_R_SCALAR(type, impl) \
+LGAMMA_R_VECTOR(type, 2, impl) \
+LGAMMA_R_VECTOR(type, 3, impl) \
+LGAMMA_R_VECTOR(type, 4, impl) \
+LGAMMA_R_VECTOR(type, 8, impl) \
+LGAMMA_R_VECTOR(type, 16, impl)
+
+LGAMMA_R_ALL(float, lgammaf_r)
+LGAMMA_R_ALL(double, builtin_lgamma_r)
+
+#undef LGAMMA_R_ALL
+#undef LGAMMA_R_VECTOR
+#undef LGAMMA_R_SCALAR
+
diff --git a/src/builtins/mad_sat.cl b/src/builtins/mad_sat.cl
new file mode 100644
index 0000000..ac79a86
--- /dev/null
+++ b/src/builtins/mad_sat.cl
@@ -0,0 +1,37 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+/*
+ * mad_sat instantiations for the eight integer element types.
+ *
+ * TERNARY_VEC_DEF is not defined in this file (supplied via cpu.h).  It is
+ * given the same type for its first two arguments and mad_sat for both
+ * name arguments; presumably it emits the vector overloads in terms of the
+ * scalar mad_sat - confirm against cpu.h.
+ */
+#define MAD_SAT_VEC(type) TERNARY_VEC_DEF(type, type, mad_sat, mad_sat)
+
+MAD_SAT_VEC(char)
+MAD_SAT_VEC(uchar)
+MAD_SAT_VEC(short)
+MAD_SAT_VEC(ushort)
+MAD_SAT_VEC(int)
+MAD_SAT_VEC(uint)
+MAD_SAT_VEC(long)
+MAD_SAT_VEC(ulong)
+
+#undef MAD_SAT_VEC
diff --git a/src/builtins/math.cl b/src/builtins/math.cl
new file mode 100644
index 0000000..02db08b
--- /dev/null
+++ b/src/builtins/math.cl
@@ -0,0 +1,151 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+/*
+ * Generator macros used by the instantiation list further down.
+ *
+ * UNARY(function)
+ *     Defines two scalar wrappers, function##f(float) and
+ *     function##d(double), that both call __builtin_<function>; the float
+ *     wrapper casts the result to float.  It then invokes UNARY_VEC_DEF
+ *     once for float and once for double, passing the public name and the
+ *     matching wrapper.
+ * UNARY_ALT(utype, function)
+ *     Same as UNARY, except the wrappers return utype (no cast) and utype
+ *     is passed as the second UNARY_VEC_DEF argument.
+ * UNARY_NO_BUILTIN / BINARY_NO_BUILTIN / TERNARY_NO_BUILTIN (function)
+ *     Emit only the *_VEC_DEF invocations for float and double, passing
+ *     the function's own name for both name arguments; no wrapper is
+ *     defined.
+ * BINARY(function) / TERNARY(function)
+ *     Define function##f / function##d wrappers taking two / three
+ *     arguments, then invoke BINARY_VEC_DEF / TERNARY_VEC_DEF.
+ *     NOTE(review): unlike UNARY, these pass `function` rather than the
+ *     suffixed wrapper as the last argument, so the wrappers defined here
+ *     are not referenced by this macro - confirm whether that is intended.
+ *
+ * The *_VEC_DEF macros are not defined in this file (supplied via clc.h).
+ * Their arguments appear to be (argument element type, result element
+ * type, public name, scalar implementation), judging by how UNARY_ALT
+ * passes utype - confirm against their definitions.
+ *
+ * Every macro body ends with a trailing backslash, so the blank line that
+ * follows it is still part of the definition.
+ */
+#define UNARY(function) \
+_CLC_PROTECTED _CLC_INLINE float function##f(float x) { return (float)__builtin_##function(x); } \
+_CLC_PROTECTED _CLC_INLINE double function##d(double x) { return __builtin_##function(x); } \
+UNARY_VEC_DEF(float, float, function, function##f) \
+UNARY_VEC_DEF(double, double, function, function##d) \
+
+#define UNARY_ALT(utype, function) \
+_CLC_PROTECTED _CLC_INLINE utype function##f(float x) { return __builtin_##function(x); } \
+_CLC_PROTECTED _CLC_INLINE utype function##d(double x) { return __builtin_##function(x); } \
+UNARY_VEC_DEF(float, utype, function, function##f) \
+UNARY_VEC_DEF(double, utype, function, function##d) \
+
+#define UNARY_NO_BUILTIN(function) \
+UNARY_VEC_DEF(float, float, function, function) \
+UNARY_VEC_DEF(double, double, function, function) \
+
+#define BINARY(function) \
+_CLC_PROTECTED _CLC_INLINE float function##f(float x, float y) { return (float)__builtin_##function(x,y); } \
+_CLC_PROTECTED _CLC_INLINE double function##d(double x, double y) { return __builtin_##function(x,y); } \
+BINARY_VEC_DEF(float, float, function, function) \
+BINARY_VEC_DEF(double, double, function, function) \
+
+#define BINARY_NO_BUILTIN(function) \
+BINARY_VEC_DEF(float, float, function, function) \
+BINARY_VEC_DEF(double, double, function, function) \
+
+#define TERNARY(function) \
+_CLC_PROTECTED _CLC_INLINE float function##f(float x, float y, float z) { return (float)__builtin_##function(x,y,z); } \
+_CLC_PROTECTED _CLC_INLINE double function##d(double x, double y, double z) { return __builtin_##function(x,y,z); } \
+TERNARY_VEC_DEF(float, float, function, function) \
+TERNARY_VEC_DEF(double, double, function, function) \
+
+#define TERNARY_NO_BUILTIN(function) \
+TERNARY_VEC_DEF(float, float, function, function) \
+TERNARY_VEC_DEF(double, double, function, function) \
+
+/*-------------------------------------------------------------------------
+* Math builtin instantiations
+*
+* Each line invokes one of the generator macros defined earlier in this
+* file, so it expands to definitions rather than bare prototypes:
+*   UNARY / BINARY / TERNARY  - scalar wrappers around __builtin_<name>
+*                               plus the *_VEC_DEF expansions;
+*   *_NO_BUILTIN              - the *_VEC_DEF expansions only;
+*   UNARY_ALT                 - as UNARY, with an explicit wrapper return
+*                               type.
+* Direct *_VEC_DEF and BINARY_VEC_DEF_ALT invocations name their element
+* types and scalar routine explicitly; those macros are defined outside
+* this file.
+*------------------------------------------------------------------------*/
+UNARY(acos)
+UNARY(acosh)
+UNARY_NO_BUILTIN(acospi)
+UNARY(asin)
+UNARY(asinh)
+UNARY_NO_BUILTIN(asinpi)
+UNARY(atan)
+BINARY_NO_BUILTIN(atan2pi)
+UNARY(atanh)
+UNARY_NO_BUILTIN(atanpi)
+BINARY(atan2)
+UNARY(cbrt)
+UNARY(ceil)
+UNARY(cos)
+BINARY(copysign)
+UNARY(cosh)
+UNARY_NO_BUILTIN(cospi)
+UNARY(erf)
+UNARY(erfc)
+UNARY(exp)
+UNARY(exp2)
+UNARY_NO_BUILTIN(exp10)
+UNARY(expm1)
+UNARY(fabs)
+BINARY(fdim)
+UNARY(floor)
+TERNARY(fma)
+BINARY(fmax)
+BINARY(fmin)
+BINARY(fmod)
+BINARY(hypot)
+
+UNARY_ALT(int, ilogb) /* wrappers return int rather than the argument type */
+
+/* Argument order of BINARY_VEC_DEF_ALT is presumably (x type, result type,
+ * y type, public name, scalar routine) - confirm against its definition. */
+BINARY_VEC_DEF_ALT(float, float, int, ldexp, ldexpf)
+BINARY_VEC_DEF_ALT(double, double, int, ldexp, ldexp)
+
+UNARY(lgamma)
+UNARY(log)
+UNARY(log2)
+UNARY(log10)
+UNARY(log1p)
+UNARY(logb)
+TERNARY_NO_BUILTIN(mad)
+BINARY_NO_BUILTIN(maxmag)
+BINARY_NO_BUILTIN(minmag)
+
+/* nan: first two macro arguments differ (uint/float, ulong/double);
+ * presumably unsigned integer input, floating-point result - confirm. */
+UNARY_VEC_DEF(uint, float, nan, nan)
+UNARY_VEC_DEF(ulong, double, nan, nan)
+
+BINARY(nextafter)
+BINARY(pow)
+
+/* pown is routed to powf / builtin_pow, which are defined elsewhere. */
+BINARY_VEC_DEF_ALT(float, float, int, pown, powf)
+BINARY_VEC_DEF_ALT(double, double, int, pown, builtin_pow)
+
+BINARY_NO_BUILTIN(powr)
+BINARY(remainder)
+UNARY(rint)
+
+/* rootn is routed to builtin_rootnf / builtin_rootn, defined elsewhere. */
+BINARY_VEC_DEF_ALT(float, float, int, rootn, builtin_rootnf)
+BINARY_VEC_DEF_ALT(double, double, int, rootn, builtin_rootn)
+
+UNARY(round)
+UNARY_NO_BUILTIN(rsqrt)
+UNARY(sin)
+UNARY(sinh)
+UNARY_NO_BUILTIN(sinpi)
+UNARY(sqrt)
+UNARY(tan)
+UNARY(tanh)
+UNARY_NO_BUILTIN(tanpi)
+UNARY(tgamma)
+UNARY(trunc)
+
+/*-------------------------------------------------------------------------
+* Half functions: only the *_VEC_DEF expansions are generated for these.
+*------------------------------------------------------------------------*/
+
+BINARY_NO_BUILTIN(half_divide)
+UNARY_NO_BUILTIN(half_recip)
+
+
diff --git a/src/builtins/max.cl b/src/builtins/max.cl
new file mode 100644
index 0000000..9605490
--- /dev/null
+++ b/src/builtins/max.cl
@@ -0,0 +1,46 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/*-----------------------------------------------------------------------------
+* EXPAND_SIZES(elem): instantiate IMPLEMENTATION for the 3-, 4-, 8- and
+* 16-wide vectors of the element type elem.
+* NOTE(review): width 2 is not instantiated here; presumably it is provided
+* elsewhere - confirm.
+*----------------------------------------------------------------------------*/
+#define EXPAND_SIZES(elem) \
+    IMPLEMENTATION(_VEC_TYPE(elem, 3),  elem) \
+    IMPLEMENTATION(_VEC_TYPE(elem, 4),  elem) \
+    IMPLEMENTATION(_VEC_TYPE(elem, 8),  elem) \
+    IMPLEMENTATION(_VEC_TYPE(elem, 16), elem)
+
+/*-----------------------------------------------------------------------------
+* IMPLEMENTATION(vec_t, elem_t): min() and max() for one vector type.
+*
+*   min(x, y) yields y where y < x, otherwise x.
+*   max(x, y) yields y where y > x, otherwise x.
+*
+* The (vec_t, elem_t) overloads first cast the scalar y to vec_t and then
+* apply the same comparison.
+*----------------------------------------------------------------------------*/
+#define IMPLEMENTATION(vec_t, elem_t) \
+_CLC_OVERLOAD _CLC_DEF vec_t min(vec_t x, vec_t y) \
+{ \
+    return (y < x) ? y : x; \
+} \
+_CLC_OVERLOAD _CLC_DEF vec_t min(vec_t x, elem_t y) \
+{ \
+    vec_t wide = (vec_t)y; \
+    return (wide < x) ? wide : x; \
+} \
+_CLC_OVERLOAD _CLC_DEF vec_t max(vec_t x, vec_t y) \
+{ \
+    return (y > x) ? y : x; \
+} \
+_CLC_OVERLOAD _CLC_DEF vec_t max(vec_t x, elem_t y) \
+{ \
+    vec_t wide = (vec_t)y; \
+    return (wide > x) ? wide : x; \
+}
+
+/* _EXPAND_TYPES comes from outside this file; it is expected to drive
+ * EXPAND_SIZES above. */
+_EXPAND_TYPES()
diff --git a/src/builtins/misc.cl b/src/builtins/misc.cl
new file mode 100644
index 0000000..aba5efa
--- /dev/null
+++ b/src/builtins/misc.cl
@@ -0,0 +1,36 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Peter Collingbourne <peter@pcc.me.uk>
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+/* Fence primitive used by every function below; only declared here. */
+_CLC_PROTECTED void __mfence(void);
+
+/* barrier() is left commented out in this file: */
+//_CLC_PROTECTED void barrier(cl_mem_fence_flags flags) { }
+
+/* mem_fence: issues __mfence(); the flags argument is not examined. */
+_CLC_PROTECTED void mem_fence(cl_mem_fence_flags flags)
+{
+    __mfence();
+}
+
+/* read_mem_fence: same implementation as mem_fence. */
+_CLC_PROTECTED void read_mem_fence(cl_mem_fence_flags flags)
+{
+    __mfence();
+}
+
+/* write_mem_fence: same implementation as mem_fence. */
+_CLC_PROTECTED void write_mem_fence(cl_mem_fence_flags flags)
+{
+    __mfence();
+}
+
diff --git a/src/builtins/mix.cl b/src/builtins/mix.cl
new file mode 100644
index 0000000..9f339aa
--- /dev/null
+++ b/src/builtins/mix.cl
@@ -0,0 +1,42 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/*-----------------------------------------------------------------------------
+* EXPAND_SIZES(elem): instantiate IMPLEMENTATION for the 3-, 4-, 8- and
+* 16-wide vectors of the element type elem.
+*----------------------------------------------------------------------------*/
+#define EXPAND_SIZES(elem) \
+    IMPLEMENTATION(_VEC_TYPE(elem, 3),  elem) \
+    IMPLEMENTATION(_VEC_TYPE(elem, 4),  elem) \
+    IMPLEMENTATION(_VEC_TYPE(elem, 8),  elem) \
+    IMPLEMENTATION(_VEC_TYPE(elem, 16), elem)
+
+/*-----------------------------------------------------------------------------
+* IMPLEMENTATION(vec_t, elem_t): mix() for one vector type.
+*
+*   mix(x, y, a) = x + (y - x) * a
+*
+* The (vec_t, vec_t, elem_t) overload casts the scalar a to vec_t before
+* evaluating the same expression.
+*----------------------------------------------------------------------------*/
+#define IMPLEMENTATION(vec_t, elem_t) \
+_CLC_OVERLOAD _CLC_DEF vec_t mix(vec_t x, vec_t y, vec_t a) \
+{ \
+    return x + (y - x) * a; \
+} \
+_CLC_OVERLOAD _CLC_DEF vec_t mix(vec_t x, vec_t y, elem_t a) \
+{ \
+    return x + (y - x) * (vec_t)a; \
+}
+
+/* _EXPAND_TYPES comes from outside this file; it is expected to drive
+ * EXPAND_SIZES above. */
+_EXPAND_TYPES()
diff --git a/src/builtins/modf.cl b/src/builtins/modf.cl
new file mode 100644
index 0000000..cf0aae7
--- /dev/null
+++ b/src/builtins/modf.cl
@@ -0,0 +1,81 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/* Scalar workers; only declared in this file. */
+_CLC_PROTECTED float  modff(float x, float * iptr);
+_CLC_PROTECTED double builtin_modf(double x, double * iptr);
+
+/*-----------------------------------------------------------------------------
+* MODF_SCALAR(type, op): scalar modf() overloads for the global, local and
+* private address spaces. The function body is supplied by SCALAR_BODY.
+*----------------------------------------------------------------------------*/
+#define MODF_SCALAR(type, op) \
+_CLC_OVERLOAD _CLC_DEF type modf(type x, global  type * ptr) SCALAR_BODY(type, op, type) \
+_CLC_OVERLOAD _CLC_DEF type modf(type x, local   type * ptr) SCALAR_BODY(type, op, type) \
+_CLC_OVERLOAD _CLC_DEF type modf(type x, private type * ptr) SCALAR_BODY(type, op, type)
+
+/*-----------------------------------------------------------------------------
+* MODF_VECTOR(type, n, op): the same three overloads for the n-wide vector of
+* type. The function body is supplied by VECTOR_BODY.
+*----------------------------------------------------------------------------*/
+#define MODF_VECTOR(type, n, op) \
+_CLC_OVERLOAD _CLC_DEF type##n modf(type##n x, global  type##n * ptr) VECTOR_BODY(type, n, op, type) \
+_CLC_OVERLOAD _CLC_DEF type##n modf(type##n x, local   type##n * ptr) VECTOR_BODY(type, n, op, type) \
+_CLC_OVERLOAD _CLC_DEF type##n modf(type##n x, private type##n * ptr) VECTOR_BODY(type, n, op, type)
+
+/*-----------------------------------------------------------------------------
+* MODF_ALL_WIDTHS(type, op): scalar form followed by widths 2, 3, 4, 8, 16.
+*----------------------------------------------------------------------------*/
+#define MODF_ALL_WIDTHS(type, op) \
+    MODF_SCALAR(type, op) \
+    MODF_VECTOR(type, 2,  op) \
+    MODF_VECTOR(type, 3,  op) \
+    MODF_VECTOR(type, 4,  op) \
+    MODF_VECTOR(type, 8,  op) \
+    MODF_VECTOR(type, 16, op)
+
+MODF_ALL_WIDTHS(float,  modff)
+MODF_ALL_WIDTHS(double, builtin_modf)
+
+#undef MODF_ALL_WIDTHS
+#undef MODF_VECTOR
+#undef MODF_SCALAR
+
+
diff --git a/src/builtins/mul_hi.cl b/src/builtins/mul_hi.cl
new file mode 100644
index 0000000..5b3368e
--- /dev/null
+++ b/src/builtins/mul_hi.cl
@@ -0,0 +1,102 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Peter Collingbourne <peter@pcc.me.uk>
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+//FOIL-based long mul_hi
+//
+// Returns the high 64 bits of the 128-bit signed product x * y.
+//
+// Each operand is split into a signed high half and an unsigned low half:
+//     x = x_hi * 2^32 + x_lo        y = y_hi * 2^32 + y_lo
+// Multiplying out (First, Outer, Inner, Last) gives
+//     x * y = F * 2^64 + (O + I) * 2^32 + L
+// with F = x_hi*y_hi, O = x_hi*y_lo, I = x_lo*y_hi, L = x_lo*y_lo, so the
+// high 64 bits are
+//     F + floor((O + I + (L >> 32)) / 2^32)
+
+_CLC_OVERLOAD _CLC_DEF long mul_hi(long x, long y){
+    //Move the high/low halves of x/y into the lower 32-bits of variables so
+    //that the partial products can be formed in 64 bits. The high halves
+    //keep their sign (arithmetic shift); the low halves are plain unsigned
+    //32-bit values and are therefore held in ulong.
+    long  x_hi = x >> 32;
+    ulong x_lo = (ulong)x & UINT_MAX;
+    long  y_hi = y >> 32;
+    ulong y_lo = (ulong)y & UINT_MAX;
+
+    //Multiply all of the components according to FOIL method.
+    //F, O and I each have magnitude below 2^63 and fit in a signed long.
+    long f = x_hi * y_hi;
+    long o = x_hi * (long)y_lo;
+    long i = (long)x_lo * y_hi;
+
+    //L can be as large as (2^32-1)^2, which exceeds LONG_MAX. It is formed
+    //as an unsigned product so the multiplication cannot overflow a signed
+    //type.
+    ulong l = x_lo * y_lo;
+
+    //I + (L >> 32) still fits in a long, but adding O to it may need one
+    //more bit. hadd(a, b) yields (a + b) >> 1 without overflowing, so a
+    //further shift by 31 (rather than 32) gives (O + I + (L >> 32)) >> 32.
+    return f + (hadd(o, i + (long)(l >> 32)) >> 31);
+}
+
+// Unsigned counterpart: high 64 bits of the 128-bit product x * y.
+//
+// With x = xh * 2^32 + xl and y = yh * 2^32 + yl:
+//     x * y = hh * 2^64 + (hl + lh) * 2^32 + ll
+// so the result is hh + ((hl + lh + (ll >> 32)) >> 32).
+_CLC_OVERLOAD _CLC_DEF ulong mul_hi(ulong x, ulong y)
+{
+    // 32-bit halves of each operand, held in 64-bit variables so that the
+    // partial products below cannot overflow.
+    const ulong xh = x >> 32;
+    const ulong xl = x & UINT_MAX;
+    const ulong yh = y >> 32;
+    const ulong yl = y & UINT_MAX;
+
+    // The four partial products.
+    const ulong hh = xh * yh;
+    const ulong hl = xh * yl;
+    const ulong lh = xl * yh;
+    const ulong ll = xl * yl;
+
+    // hl + lh + (ll >> 32) can need 65 bits. hadd() supplies the sum already
+    // shifted right by one without overflowing, hence the remaining shift is
+    // 31 bits instead of 32.
+    const ulong middle = hadd(hl, lh + (ll >> 32));
+
+    return hh + (middle >> 31);
+}
+
+BINARY_VEC_DEF(char, char, mul_hi, mul_hi)      /* one BINARY_VEC_DEF line per integer element type; the macro is defined outside this file */
+BINARY_VEC_DEF(uchar, uchar, mul_hi, mul_hi)    /* NOTE(review): presumably generates the vector overloads from the scalar mul_hi - confirm */
+BINARY_VEC_DEF(short, short, mul_hi, mul_hi)    /* only the long and ulong scalar mul_hi are defined in this file; */
+BINARY_VEC_DEF(ushort, ushort,mul_hi, mul_hi)   /* the narrower scalar forms have to come from elsewhere */
+BINARY_VEC_DEF(int, int, mul_hi, mul_hi)
+BINARY_VEC_DEF(uint, uint, mul_hi, mul_hi)
+BINARY_VEC_DEF(long, long, mul_hi, mul_hi)      /* uses the long mul_hi defined above */
+BINARY_VEC_DEF(ulong, ulong, mul_hi, mul_hi)    /* uses the ulong mul_hi defined above */
diff --git a/src/builtins/relationals.cl b/src/builtins/relationals.cl
new file mode 100644
index 0000000..a1d6830
--- /dev/null
+++ b/src/builtins/relationals.cl
@@ -0,0 +1,64 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+UNARY_VEC_DEF(float, int, isnan, -isnan)     /* NOTE(review): args look like (argument type, result type, name, per-component expression) - confirm against UNARY_VEC_DEF */
+UNARY_VEC_DEF(double, long, isnan, -isnan)   /* float pairs with int, double pairs with long throughout this file */
+
+UNARY_VEC_DEF(float, int, isfinite, -isfinite)     /* the last argument is the function name prefixed with unary minus; */
+UNARY_VEC_DEF(double, long, isfinite, -isfinite)   /* NOTE(review): presumably so a true component reads as -1 in the vector result - confirm */
+
+UNARY_VEC_DEF(float, int, isinf, -isinf)
+UNARY_VEC_DEF(double, long, isinf, -isinf)
+
+UNARY_VEC_DEF(float, int, isnormal, -isnormal)
+UNARY_VEC_DEF(double, long, isnormal, -isnormal)
+
+UNARY_VEC_DEF(float, int, signbit, -signbit)
+UNARY_VEC_DEF(double, long, signbit, -signbit)
+
+BINARY_VEC_DEF(float, int, isequal, -isequal)     /* two-operand comparisons follow the same pattern via BINARY_VEC_DEF */
+BINARY_VEC_DEF(double, long, isequal, -isequal)
+
+BINARY_VEC_DEF(float, int, isnotequal, -isnotequal)
+BINARY_VEC_DEF(double, long, isnotequal, -isnotequal)
+
+BINARY_VEC_DEF(float, int, isless, -isless)
+BINARY_VEC_DEF(double, long, isless, -isless)
+
+BINARY_VEC_DEF(float, int, islessequal, -islessequal)
+BINARY_VEC_DEF(double, long, islessequal, -islessequal)
+
+BINARY_VEC_DEF(float, int, isgreater, -isgreater)
+BINARY_VEC_DEF(double, long, isgreater, -isgreater)
+
+BINARY_VEC_DEF(float, int, isgreaterequal, -isgreaterequal)
+BINARY_VEC_DEF(double, long, isgreaterequal, -isgreaterequal)
+
+BINARY_VEC_DEF(float, int, islessgreater, -islessgreater)
+BINARY_VEC_DEF(double, long, islessgreater, -islessgreater)
diff --git a/src/builtins/remquo.cl b/src/builtins/remquo.cl
new file mode 100644
index 0000000..1bc5094
--- /dev/null
+++ b/src/builtins/remquo.cl
@@ -0,0 +1,127 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/*-----------------------------------------------------------------------------
+* REMQUO_SCALAR_BODY(type, op, ptr_type): function body for a scalar remquo()
+* overload. It expects the enclosing function to have parameters x, y and ptr.
+*
+* op is called with the address of a local ptr_type variable; that value is
+* then stored through ptr, and op's return value is returned.
+*----------------------------------------------------------------------------*/
+#define REMQUO_SCALAR_BODY(type, op, ptr_type) \
+{ \
+    ptr_type quo_tmp; \
+    type rem_val = op(x, y, &quo_tmp); \
+    *ptr = quo_tmp; \
+    return rem_val; \
+}
+
+/*-----------------------------------------------------------------------------
+* REMQUO_LANE(op, ptr_type, lane, idx): one component of a vector remquo().
+* Calls op on component `lane` of x and y, stores the return value in the same
+* component of rem_vec and lets op write its third-argument output into
+* element idx of quo_vec, addressed as an array of ptr_type.
+*----------------------------------------------------------------------------*/
+#define REMQUO_LANE(op, ptr_type, lane, idx) \
+    rem_vec.lane = op(x.lane, y.lane, &(((ptr_type*)&quo_vec)[idx]));
+
+/* Per-width component lists; each width extends the previous one. */
+#define REMQUO_VECTOR_BODY_2(op, ptr_type) \
+    REMQUO_LANE(op, ptr_type, s0, 0) \
+    REMQUO_LANE(op, ptr_type, s1, 1)
+
+#define REMQUO_VECTOR_BODY_3(op, ptr_type) \
+    REMQUO_VECTOR_BODY_2(op, ptr_type) \
+    REMQUO_LANE(op, ptr_type, s2, 2)
+
+#define REMQUO_VECTOR_BODY_4(op, ptr_type) \
+    REMQUO_VECTOR_BODY_3(op, ptr_type) \
+    REMQUO_LANE(op, ptr_type, s3, 3)
+
+#define REMQUO_VECTOR_BODY_8(op, ptr_type) \
+    REMQUO_VECTOR_BODY_4(op, ptr_type) \
+    REMQUO_LANE(op, ptr_type, s4, 4) \
+    REMQUO_LANE(op, ptr_type, s5, 5) \
+    REMQUO_LANE(op, ptr_type, s6, 6) \
+    REMQUO_LANE(op, ptr_type, s7, 7)
+
+#define REMQUO_VECTOR_BODY_16(op, ptr_type) \
+    REMQUO_VECTOR_BODY_8(op, ptr_type) \
+    REMQUO_LANE(op, ptr_type, s8, 8) \
+    REMQUO_LANE(op, ptr_type, s9, 9) \
+    REMQUO_LANE(op, ptr_type, sa, 10) \
+    REMQUO_LANE(op, ptr_type, sb, 11) \
+    REMQUO_LANE(op, ptr_type, sc, 12) \
+    REMQUO_LANE(op, ptr_type, sd, 13) \
+    REMQUO_LANE(op, ptr_type, se, 14) \
+    REMQUO_LANE(op, ptr_type, sf, 15)
+
+/*-----------------------------------------------------------------------------
+* REMQUO_VECTOR_BODY(prim_type, num, op, ptr_type): function body for a vector
+* remquo() overload of width num. It expects the enclosing function to have
+* parameters x, y and ptr. The values op wrote into quo_vec are stored through
+* ptr as one vector, and rem_vec is returned.
+*----------------------------------------------------------------------------*/
+#define REMQUO_VECTOR_BODY(prim_type, num, op, ptr_type) \
+{ \
+    prim_type##num rem_vec; \
+    ptr_type##num quo_vec; \
+    REMQUO_VECTOR_BODY_##num(op, ptr_type) \
+    *ptr = quo_vec; \
+    return rem_vec; \
+}
+
+
+/* Scalar workers; only declared in this file. */
+_CLC_PROTECTED float  remquof(float x, float y, int * ptr);
+_CLC_PROTECTED double builtin_remquo(double x, double y, int * ptr);
+
+/*-----------------------------------------------------------------------------
+* REMQUO_SCALAR(type, op): scalar remquo() overloads for the global, local and
+* private address spaces of the int output pointer.
+*----------------------------------------------------------------------------*/
+#define REMQUO_SCALAR(type, op) \
+_CLC_OVERLOAD _CLC_DEF type remquo(type x, type y, global  int * ptr) REMQUO_SCALAR_BODY(type, op, int) \
+_CLC_OVERLOAD _CLC_DEF type remquo(type x, type y, local   int * ptr) REMQUO_SCALAR_BODY(type, op, int) \
+_CLC_OVERLOAD _CLC_DEF type remquo(type x, type y, private int * ptr) REMQUO_SCALAR_BODY(type, op, int)
+
+/*-----------------------------------------------------------------------------
+* REMQUO_VECTOR(type, n, op): the same three overloads for n-wide vectors;
+* the output pointer refers to the matching n-wide int vector.
+*----------------------------------------------------------------------------*/
+#define REMQUO_VECTOR(type, n, op) \
+_CLC_OVERLOAD _CLC_DEF type##n remquo(type##n x, type##n y, global  int##n * ptr) REMQUO_VECTOR_BODY(type, n, op, int) \
+_CLC_OVERLOAD _CLC_DEF type##n remquo(type##n x, type##n y, local   int##n * ptr) REMQUO_VECTOR_BODY(type, n, op, int) \
+_CLC_OVERLOAD _CLC_DEF type##n remquo(type##n x, type##n y, private int##n * ptr) REMQUO_VECTOR_BODY(type, n, op, int)
+
+/*-----------------------------------------------------------------------------
+* REMQUO_ALL_WIDTHS(type, op): scalar form followed by widths 2, 3, 4, 8, 16.
+*----------------------------------------------------------------------------*/
+#define REMQUO_ALL_WIDTHS(type, op) \
+    REMQUO_SCALAR(type, op) \
+    REMQUO_VECTOR(type, 2,  op) \
+    REMQUO_VECTOR(type, 3,  op) \
+    REMQUO_VECTOR(type, 4,  op) \
+    REMQUO_VECTOR(type, 8,  op) \
+    REMQUO_VECTOR(type, 16, op)
+
+REMQUO_ALL_WIDTHS(float,  remquof)
+REMQUO_ALL_WIDTHS(double, builtin_remquo)
+
+#undef REMQUO_ALL_WIDTHS
+#undef REMQUO_VECTOR
+#undef REMQUO_SCALAR
diff --git a/src/builtins/rotate.cl b/src/builtins/rotate.cl
new file mode 100644
index 0000000..fc894b0
--- /dev/null
+++ b/src/builtins/rotate.cl
@@ -0,0 +1,58 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+/*-----------------------------------------------------------------------------
+* The template for non rotl applicable scalar types
+*
+* SCALAR(type, utype) defines rotate(v, i) for one scalar type: v rotated left
+* by i bit positions, with i first reduced to the range 0 .. width-1.
+* utype is the type v is cast to before the right shift.
+*----------------------------------------------------------------------------*/
+#define SCALAR(type, utype) \
+_CLC_OVERLOAD _CLC_DEF type rotate(type v, type i) \
+{ \
+    uint width = sizeof(v) << 3; \
+    uint wrap  = width - 1; \
+    i &= wrap; \
+    if (i != 0) \
+        v = (v << i) | ((utype)v >> (width - i)); \
+    return v; \
+}
+
+SCALAR(uchar, uchar)     /* scalar rotate() for each listed type: (type, type used for the right shift) */
+SCALAR(char, uchar)      /* signed types are paired with the unsigned type of the same name family */
+SCALAR(ushort, ushort)
+SCALAR(short, ushort)
+SCALAR(ulong, ulong)
+SCALAR(long, ulong)
+SCALAR(int, uint)        /* NOTE(review): there is no SCALAR(uint, uint) in this file; scalar rotate(uint, uint) is presumably provided elsewhere - confirm */
+
+BINARY_VEC_DEF(char, char, rotate, rotate)      /* one BINARY_VEC_DEF line per integer element type; the macro is defined outside this file */
+BINARY_VEC_DEF(uchar, uchar, rotate, rotate)    /* NOTE(review): presumably generates the vector overloads from the scalar rotate - confirm */
+BINARY_VEC_DEF(short, short, rotate, rotate)
+BINARY_VEC_DEF(ushort, ushort,rotate, rotate)
+BINARY_VEC_DEF(int, int, rotate, rotate)
+BINARY_VEC_DEF(uint, uint, rotate, rotate)
+BINARY_VEC_DEF(long, long, rotate, rotate)
+BINARY_VEC_DEF(ulong, ulong, rotate, rotate)
diff --git a/src/builtins/select.cl b/src/builtins/select.cl
new file mode 100644
index 0000000..52a078c
--- /dev/null
+++ b/src/builtins/select.cl
@@ -0,0 +1,53 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+/* Emit the two select() overloads of one type: one with a signed selector, one with an unsigned selector. */
+#define DECLARATION(vec_t, smask_t, umask_t) \
+_CLC_OVERLOAD _CLC_DEF vec_t select(vec_t a, vec_t b, smask_t c) { return c ? b : a; }\
+_CLC_OVERLOAD _CLC_DEF vec_t select(vec_t a, vec_t b, umask_t c) { return c ? b : a; }
+/* Instantiate DECLARATION for sizes 2, 3, 4, 8 and 16 of one (value, signed selector, unsigned selector) triple. */
+#define SELECT_EXPAND_SIZES(elem, selem, uelem) \
+ DECLARATION(_VEC_TYPE(elem,2), _VEC_TYPE(selem,2), _VEC_TYPE(uelem,2)) \
+ DECLARATION(_VEC_TYPE(elem,3), _VEC_TYPE(selem,3), _VEC_TYPE(uelem,3)) \
+ DECLARATION(_VEC_TYPE(elem,4), _VEC_TYPE(selem,4), _VEC_TYPE(uelem,4)) \
+ DECLARATION(_VEC_TYPE(elem,8), _VEC_TYPE(selem,8), _VEC_TYPE(uelem,8)) \
+ DECLARATION(_VEC_TYPE(elem,16), _VEC_TYPE(selem,16), _VEC_TYPE(uelem,16)) \
+
+#define SELECT_EXPAND_TYPES /* one row per value type: (value type, signed selector, unsigned selector) */ \
+ SELECT_EXPAND_SIZES(char, char, uchar) \
+ SELECT_EXPAND_SIZES(uchar, char, uchar) \
+ SELECT_EXPAND_SIZES(short, short, ushort) \
+ SELECT_EXPAND_SIZES(ushort, short, ushort) \
+ SELECT_EXPAND_SIZES(int, int, uint) \
+ SELECT_EXPAND_SIZES(uint, int, uint) \
+ SELECT_EXPAND_SIZES(long, long, ulong) \
+ SELECT_EXPAND_SIZES(ulong, long, ulong) \
+ SELECT_EXPAND_SIZES(float, int, uint) /* float values are selected with int/uint selectors */ \
+ SELECT_EXPAND_SIZES(double, long, ulong) /* double values are selected with long/ulong selectors */
+/* Expand the table above into the actual select() definitions. */
+SELECT_EXPAND_TYPES
diff --git a/src/builtins/shuffle.cl b/src/builtins/shuffle.cl
new file mode 100644
index 0000000..3ec3b56
--- /dev/null
+++ b/src/builtins/shuffle.cl
@@ -0,0 +1,215 @@
+/******************************************************************************
+ * Copyright (c) 2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "clc.h"
+/* 2-element shuffle()/shuffle2(): index = mask & (n-1); in shuffle2 the bit (mask & n) picks val2 over val1, n = vec_step(source). */
+#define TEMPLATE2(elem_t, src_n, mask_t) \
+_CLC_OVERLOAD _CLC_DEF elem_t##2 shuffle(elem_t##src_n val, mask_t##2 mask) \
+{ \
+ elem_t *src = (elem_t*)&val; \
+ elem_t##2 dst; \
+ dst.s0 = src[mask.s0 & (vec_step(val) - 1)]; \
+ dst.s1 = src[mask.s1 & (vec_step(val) - 1)]; \
+ return dst; \
+} \
+_CLC_OVERLOAD _CLC_DEF elem_t##2 shuffle2(elem_t##src_n val1, elem_t##src_n val2, mask_t##2 mask) \
+{ \
+ elem_t *src1 = (elem_t*)&val1; \
+ elem_t *src2 = (elem_t*)&val2; \
+ elem_t##2 dst; \
+ dst.s0 = (mask.s0 & vec_step(val1)) ? src2[mask.s0 & (vec_step(val1) - 1)] \
+ : src1[mask.s0 & (vec_step(val1) - 1)]; \
+ dst.s1 = (mask.s1 & vec_step(val1)) ? src2[mask.s1 & (vec_step(val1) - 1)] \
+ : src1[mask.s1 & (vec_step(val1) - 1)]; \
+ return dst; \
+}
+/* 4-element shuffle()/shuffle2(): index = mask & (n-1); in shuffle2 the bit (mask & n) picks val2 over val1, n = vec_step(source). */
+#define TEMPLATE4(elem_t, src_n, mask_t) \
+_CLC_OVERLOAD _CLC_DEF elem_t##4 shuffle(elem_t##src_n val, mask_t##4 mask) \
+{ \
+ elem_t *src = (elem_t*)&val; \
+ elem_t##4 dst; \
+ dst.s0 = src[mask.s0 & (vec_step(val) - 1)]; \
+ dst.s1 = src[mask.s1 & (vec_step(val) - 1)]; \
+ dst.s2 = src[mask.s2 & (vec_step(val) - 1)]; \
+ dst.s3 = src[mask.s3 & (vec_step(val) - 1)]; \
+ return dst; \
+} \
+_CLC_OVERLOAD _CLC_DEF elem_t##4 shuffle2(elem_t##src_n val1, elem_t##src_n val2, mask_t##4 mask) \
+{ \
+ elem_t *src1 = (elem_t*)&val1; \
+ elem_t *src2 = (elem_t*)&val2; \
+ elem_t##4 dst; \
+ dst.s0 = (mask.s0 & vec_step(val1)) ? src2[mask.s0 & (vec_step(val1) - 1)] \
+ : src1[mask.s0 & (vec_step(val1) - 1)]; \
+ dst.s1 = (mask.s1 & vec_step(val1)) ? src2[mask.s1 & (vec_step(val1) - 1)] \
+ : src1[mask.s1 & (vec_step(val1) - 1)]; \
+ dst.s2 = (mask.s2 & vec_step(val1)) ? src2[mask.s2 & (vec_step(val1) - 1)] \
+ : src1[mask.s2 & (vec_step(val1) - 1)]; \
+ dst.s3 = (mask.s3 & vec_step(val1)) ? src2[mask.s3 & (vec_step(val1) - 1)] \
+ : src1[mask.s3 & (vec_step(val1) - 1)]; \
+ return dst; \
+}
+/* 8-element shuffle()/shuffle2(): index = mask & (n-1); in shuffle2 the bit (mask & n) picks val2 over val1, n = vec_step(source). */
+#define TEMPLATE8(elem_t, src_n, mask_t) \
+_CLC_OVERLOAD _CLC_DEF elem_t##8 shuffle(elem_t##src_n val, mask_t##8 mask) \
+{ \
+ elem_t *src = (elem_t*)&val; \
+ elem_t##8 dst; \
+ dst.s0 = src[mask.s0 & (vec_step(val) - 1)]; \
+ dst.s1 = src[mask.s1 & (vec_step(val) - 1)]; \
+ dst.s2 = src[mask.s2 & (vec_step(val) - 1)]; \
+ dst.s3 = src[mask.s3 & (vec_step(val) - 1)]; \
+ dst.s4 = src[mask.s4 & (vec_step(val) - 1)]; \
+ dst.s5 = src[mask.s5 & (vec_step(val) - 1)]; \
+ dst.s6 = src[mask.s6 & (vec_step(val) - 1)]; \
+ dst.s7 = src[mask.s7 & (vec_step(val) - 1)]; \
+ return dst; \
+} \
+_CLC_OVERLOAD _CLC_DEF elem_t##8 shuffle2(elem_t##src_n val1, elem_t##src_n val2, mask_t##8 mask) \
+{ \
+ elem_t *src1 = (elem_t*)&val1; \
+ elem_t *src2 = (elem_t*)&val2; \
+ elem_t##8 dst; \
+ dst.s0 = (mask.s0 & vec_step(val1)) ? src2[mask.s0 & (vec_step(val1) - 1)] \
+ : src1[mask.s0 & (vec_step(val1) - 1)]; \
+ dst.s1 = (mask.s1 & vec_step(val1)) ? src2[mask.s1 & (vec_step(val1) - 1)] \
+ : src1[mask.s1 & (vec_step(val1) - 1)]; \
+ dst.s2 = (mask.s2 & vec_step(val1)) ? src2[mask.s2 & (vec_step(val1) - 1)] \
+ : src1[mask.s2 & (vec_step(val1) - 1)]; \
+ dst.s3 = (mask.s3 & vec_step(val1)) ? src2[mask.s3 & (vec_step(val1) - 1)] \
+ : src1[mask.s3 & (vec_step(val1) - 1)]; \
+ dst.s4 = (mask.s4 & vec_step(val1)) ? src2[mask.s4 & (vec_step(val1) - 1)] \
+ : src1[mask.s4 & (vec_step(val1) - 1)]; \
+ dst.s5 = (mask.s5 & vec_step(val1)) ? src2[mask.s5 & (vec_step(val1) - 1)] \
+ : src1[mask.s5 & (vec_step(val1) - 1)]; \
+ dst.s6 = (mask.s6 & vec_step(val1)) ? src2[mask.s6 & (vec_step(val1) - 1)] \
+ : src1[mask.s6 & (vec_step(val1) - 1)]; \
+ dst.s7 = (mask.s7 & vec_step(val1)) ? src2[mask.s7 & (vec_step(val1) - 1)] \
+ : src1[mask.s7 & (vec_step(val1) - 1)]; \
+ return dst; \
+}
+/* 16-element shuffle()/shuffle2(): index = mask & (n-1); in shuffle2 the bit (mask & n) picks val2 over val1, n = vec_step(source). */
+#define TEMPLATE16(elem_t, src_n, mask_t) \
+_CLC_OVERLOAD _CLC_DEF elem_t##16 shuffle(elem_t##src_n val, mask_t##16 mask) \
+{ \
+ elem_t *src = (elem_t*)&val; \
+ elem_t##16 dst; \
+ dst.s0 = src[mask.s0 & (vec_step(val) - 1)]; \
+ dst.s1 = src[mask.s1 & (vec_step(val) - 1)]; \
+ dst.s2 = src[mask.s2 & (vec_step(val) - 1)]; \
+ dst.s3 = src[mask.s3 & (vec_step(val) - 1)]; \
+ dst.s4 = src[mask.s4 & (vec_step(val) - 1)]; \
+ dst.s5 = src[mask.s5 & (vec_step(val) - 1)]; \
+ dst.s6 = src[mask.s6 & (vec_step(val) - 1)]; \
+ dst.s7 = src[mask.s7 & (vec_step(val) - 1)]; \
+ dst.s8 = src[mask.s8 & (vec_step(val) - 1)]; \
+ dst.s9 = src[mask.s9 & (vec_step(val) - 1)]; \
+ dst.sa = src[mask.sa & (vec_step(val) - 1)]; \
+ dst.sb = src[mask.sb & (vec_step(val) - 1)]; \
+ dst.sc = src[mask.sc & (vec_step(val) - 1)]; \
+ dst.sd = src[mask.sd & (vec_step(val) - 1)]; \
+ dst.se = src[mask.se & (vec_step(val) - 1)]; \
+ dst.sf = src[mask.sf & (vec_step(val) - 1)]; \
+ return dst; \
+} \
+_CLC_OVERLOAD _CLC_DEF elem_t##16 shuffle2(elem_t##src_n val1, elem_t##src_n val2, mask_t##16 mask) \
+{ \
+ elem_t *src1 = (elem_t*)&val1; \
+ elem_t *src2 = (elem_t*)&val2; \
+ elem_t##16 dst; \
+ dst.s0 = (mask.s0 & vec_step(val1)) ? src2[mask.s0 & (vec_step(val1) - 1)] \
+ : src1[mask.s0 & (vec_step(val1) - 1)]; \
+ dst.s1 = (mask.s1 & vec_step(val1)) ? src2[mask.s1 & (vec_step(val1) - 1)] \
+ : src1[mask.s1 & (vec_step(val1) - 1)]; \
+ dst.s2 = (mask.s2 & vec_step(val1)) ? src2[mask.s2 & (vec_step(val1) - 1)] \
+ : src1[mask.s2 & (vec_step(val1) - 1)]; \
+ dst.s3 = (mask.s3 & vec_step(val1)) ? src2[mask.s3 & (vec_step(val1) - 1)] \
+ : src1[mask.s3 & (vec_step(val1) - 1)]; \
+ dst.s4 = (mask.s4 & vec_step(val1)) ? src2[mask.s4 & (vec_step(val1) - 1)] \
+ : src1[mask.s4 & (vec_step(val1) - 1)]; \
+ dst.s5 = (mask.s5 & vec_step(val1)) ? src2[mask.s5 & (vec_step(val1) - 1)] \
+ : src1[mask.s5 & (vec_step(val1) - 1)]; \
+ dst.s6 = (mask.s6 & vec_step(val1)) ? src2[mask.s6 & (vec_step(val1) - 1)] \
+ : src1[mask.s6 & (vec_step(val1) - 1)]; \
+ dst.s7 = (mask.s7 & vec_step(val1)) ? src2[mask.s7 & (vec_step(val1) - 1)] \
+ : src1[mask.s7 & (vec_step(val1) - 1)]; \
+ dst.s8 = (mask.s8 & vec_step(val1)) ? src2[mask.s8 & (vec_step(val1) - 1)] \
+ : src1[mask.s8 & (vec_step(val1) - 1)]; \
+ dst.s9 = (mask.s9 & vec_step(val1)) ? src2[mask.s9 & (vec_step(val1) - 1)] \
+ : src1[mask.s9 & (vec_step(val1) - 1)]; \
+ dst.sa = (mask.sa & vec_step(val1)) ? src2[mask.sa & (vec_step(val1) - 1)] \
+ : src1[mask.sa & (vec_step(val1) - 1)]; \
+ dst.sb = (mask.sb & vec_step(val1)) ? src2[mask.sb & (vec_step(val1) - 1)] \
+ : src1[mask.sb & (vec_step(val1) - 1)]; \
+ dst.sc = (mask.sc & vec_step(val1)) ? src2[mask.sc & (vec_step(val1) - 1)] \
+ : src1[mask.sc & (vec_step(val1) - 1)]; \
+ dst.sd = (mask.sd & vec_step(val1)) ? src2[mask.sd & (vec_step(val1) - 1)] \
+ : src1[mask.sd & (vec_step(val1) - 1)]; \
+ dst.se = (mask.se & vec_step(val1)) ? src2[mask.se & (vec_step(val1) - 1)] \
+ : src1[mask.se & (vec_step(val1) - 1)]; \
+ dst.sf = (mask.sf & vec_step(val1)) ? src2[mask.sf & (vec_step(val1) - 1)] \
+ : src1[mask.sf & (vec_step(val1) - 1)]; \
+ return dst; \
+}
+
+/* For one (element type, mask element type) pair, emit every result-size / source-size combination of 2, 4, 8, 16. */
+#define CROSS_SIZE(elem_t, mask_t) \
+TEMPLATE2(elem_t, 2, mask_t) \
+TEMPLATE2(elem_t, 4, mask_t) \
+TEMPLATE2(elem_t, 8, mask_t) \
+TEMPLATE2(elem_t, 16, mask_t) \
+TEMPLATE4(elem_t, 2, mask_t) \
+TEMPLATE4(elem_t, 4, mask_t) \
+TEMPLATE4(elem_t, 8, mask_t) \
+TEMPLATE4(elem_t, 16, mask_t) \
+TEMPLATE8(elem_t, 2, mask_t) \
+TEMPLATE8(elem_t, 4, mask_t) \
+TEMPLATE8(elem_t, 8, mask_t) \
+TEMPLATE8(elem_t, 16, mask_t) \
+TEMPLATE16(elem_t, 2, mask_t) \
+TEMPLATE16(elem_t, 4, mask_t) \
+TEMPLATE16(elem_t, 8, mask_t) \
+TEMPLATE16(elem_t, 16, mask_t) \
+
+#define CROSS_MASKTYPE(elem_t) /* pair one element type with each of the four unsigned mask element types */ \
+CROSS_SIZE(elem_t, uchar) \
+CROSS_SIZE(elem_t, ushort) \
+CROSS_SIZE(elem_t, uint) \
+CROSS_SIZE(elem_t, ulong) \
+
+CROSS_MASKTYPE(char) /* emit shuffle()/shuffle2() for each element type listed here */
+CROSS_MASKTYPE(uchar)
+CROSS_MASKTYPE(short)
+CROSS_MASKTYPE(ushort)
+CROSS_MASKTYPE(int)
+CROSS_MASKTYPE(uint)
+CROSS_MASKTYPE(long)
+CROSS_MASKTYPE(ulong)
+CROSS_MASKTYPE(float)
+CROSS_MASKTYPE(double)
diff --git a/src/builtins/sign.cl b/src/builtins/sign.cl
new file mode 100644
index 0000000..e440f2f
--- /dev/null
+++ b/src/builtins/sign.cl
@@ -0,0 +1,43 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+/* Instantiate IMPLEMENTATION for sizes 3, 4, 8 and 16 of one element type; size 2 and the scalar are not generated here. */
+#define EXPAND_SIZES(elem_t) \
+ IMPLEMENTATION(_VEC_TYPE(elem_t,3)) \
+ IMPLEMENTATION(_VEC_TYPE(elem_t,4)) \
+ IMPLEMENTATION(_VEC_TYPE(elem_t,8)) \
+ IMPLEMENTATION(_VEC_TYPE(elem_t,16)) \
+
+#define IMPLEMENTATION(vec_t) /* sign(): 1.0 if x > 0, -1.0 if x < 0, 0.0 if x is NaN, otherwise x itself */ \
+_CLC_OVERLOAD _CLC_DEF vec_t sign(vec_t x) \
+{ return (x > (vec_t)0.0) ? (vec_t)1.0 \
+ : ((x < (vec_t)0.0) ? (vec_t)-1.0 \
+ : (isnan(x) ? (vec_t)0.0 : x)); }\
+
+EXPAND_SIZES(float) /* float instantiations */
+EXPAND_SIZES(double) /* double instantiations */
diff --git a/src/builtins/sincos.cl b/src/builtins/sincos.cl
new file mode 100644
index 0000000..1552f6b
--- /dev/null
+++ b/src/builtins/sincos.cl
@@ -0,0 +1,128 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+/* Scalar helpers called by the bodies below as op(x, &sin, &cos); they are declared here and not defined in this file. */
+_CLC_PROTECTED void sincosf(float x, float * sinval, float * cosval);
+_CLC_PROTECTED void builtin_sincos(double x, double * sinval, double * cosval);
+/* Scalar body: the helper fills two locals; the cosine is stored through cosval and the sine is returned. */
+#define SINCOS_SCALAR_BODY(real_t, helper) \
+{ \
+ real_t cos_out; \
+ real_t sin_out; \
+ helper(x, &sin_out, &cos_out); \
+ *cosval = cos_out; \
+ return sin_out; \
+} \
+
+#define SINCOS_VECTOR_BODY_2(lane_t, helper) /* element k: helper(x.sk, &sin_val[k], &cos_val[k]); x, sin_val, cos_val come from the expansion site */ \
+ helper(x.s0, (lane_t*)&sin_val + 0, (lane_t*)&cos_val + 0); \
+ helper(x.s1, (lane_t*)&sin_val + 1, (lane_t*)&cos_val + 1); \
+
+#define SINCOS_VECTOR_BODY_3(lane_t, helper) /* elements 0-1 via _2, then element 2 */ \
+ SINCOS_VECTOR_BODY_2(lane_t, helper) \
+ helper(x.s2, (lane_t*)&sin_val + 2, (lane_t*)&cos_val + 2); \
+
+#define SINCOS_VECTOR_BODY_4(lane_t, helper) /* elements 0-2 via _3, then element 3 */ \
+ SINCOS_VECTOR_BODY_3(lane_t, helper) \
+ helper(x.s3, (lane_t*)&sin_val + 3, (lane_t*)&cos_val + 3); \
+
+#define SINCOS_VECTOR_BODY_8(lane_t, helper) /* elements 0-3 via _4, then elements 4-7 */ \
+ SINCOS_VECTOR_BODY_4(lane_t, helper) \
+ helper(x.s4, (lane_t*)&sin_val + 4, (lane_t*)&cos_val + 4); \
+ helper(x.s5, (lane_t*)&sin_val + 5, (lane_t*)&cos_val + 5); \
+ helper(x.s6, (lane_t*)&sin_val + 6, (lane_t*)&cos_val + 6); \
+ helper(x.s7, (lane_t*)&sin_val + 7, (lane_t*)&cos_val + 7); \
+
+#define SINCOS_VECTOR_BODY_16(lane_t, helper) /* elements 0-7 via _8, then elements 8-15 */ \
+ SINCOS_VECTOR_BODY_8(lane_t, helper) \
+ helper(x.s8, (lane_t*)&sin_val + 8, (lane_t*)&cos_val + 8); \
+ helper(x.s9, (lane_t*)&sin_val + 9, (lane_t*)&cos_val + 9); \
+ helper(x.sa, (lane_t*)&sin_val + 10, (lane_t*)&cos_val + 10); \
+ helper(x.sb, (lane_t*)&sin_val + 11, (lane_t*)&cos_val + 11); \
+ helper(x.sc, (lane_t*)&sin_val + 12, (lane_t*)&cos_val + 12); \
+ helper(x.sd, (lane_t*)&sin_val + 13, (lane_t*)&cos_val + 13); \
+ helper(x.se, (lane_t*)&sin_val + 14, (lane_t*)&cos_val + 14); \
+ helper(x.sf, (lane_t*)&sin_val + 15, (lane_t*)&cos_val + 15); \
+
+#define SINCOS_VECTOR_BODY(lane_t, width, helper) /* the locals must be named sin_val/cos_val: the per-element macros refer to them */ \
+{ \
+ lane_t##width cos_val; \
+ lane_t##width sin_val; \
+ SINCOS_VECTOR_BODY_##width(lane_t, helper) \
+ *cosval = cos_val; \
+ return sin_val; \
+} \
+
+_CLC_OVERLOAD _CLC_INLINE float sincos(float x, global float * cosval) SINCOS_SCALAR_BODY(float, sincosf) /* NOTE(review): _CLC_INLINE only on the scalar float overloads -- confirm intended */
+_CLC_OVERLOAD _CLC_INLINE float sincos(float x, local float * cosval) SINCOS_SCALAR_BODY(float, sincosf)
+_CLC_OVERLOAD _CLC_INLINE float sincos(float x, private float * cosval) SINCOS_SCALAR_BODY(float, sincosf)
+/* float vector overloads: one group per size (2, 3, 4, 8, 16), one overload per address space of cosval */
+_CLC_OVERLOAD _CLC_DEF float2 sincos(float2 x, global float2 * cosval) SINCOS_VECTOR_BODY(float, 2, sincosf)
+_CLC_OVERLOAD _CLC_DEF float2 sincos(float2 x, local float2 * cosval) SINCOS_VECTOR_BODY(float, 2, sincosf)
+_CLC_OVERLOAD _CLC_DEF float2 sincos(float2 x, private float2 * cosval) SINCOS_VECTOR_BODY(float, 2, sincosf)
+
+_CLC_OVERLOAD _CLC_DEF float3 sincos(float3 x, global float3 * cosval) SINCOS_VECTOR_BODY(float, 3, sincosf)
+_CLC_OVERLOAD _CLC_DEF float3 sincos(float3 x, local float3 * cosval) SINCOS_VECTOR_BODY(float, 3, sincosf)
+_CLC_OVERLOAD _CLC_DEF float3 sincos(float3 x, private float3 * cosval) SINCOS_VECTOR_BODY(float, 3, sincosf)
+
+_CLC_OVERLOAD _CLC_DEF float4 sincos(float4 x, global float4 * cosval) SINCOS_VECTOR_BODY(float, 4, sincosf)
+_CLC_OVERLOAD _CLC_DEF float4 sincos(float4 x, local float4 * cosval) SINCOS_VECTOR_BODY(float, 4, sincosf)
+_CLC_OVERLOAD _CLC_DEF float4 sincos(float4 x, private float4 * cosval) SINCOS_VECTOR_BODY(float, 4, sincosf)
+
+_CLC_OVERLOAD _CLC_DEF float8 sincos(float8 x, global float8 * cosval) SINCOS_VECTOR_BODY(float, 8, sincosf)
+_CLC_OVERLOAD _CLC_DEF float8 sincos(float8 x, local float8 * cosval) SINCOS_VECTOR_BODY(float, 8, sincosf)
+_CLC_OVERLOAD _CLC_DEF float8 sincos(float8 x, private float8 * cosval) SINCOS_VECTOR_BODY(float, 8, sincosf)
+
+_CLC_OVERLOAD _CLC_DEF float16 sincos(float16 x, global float16 * cosval) SINCOS_VECTOR_BODY(float, 16, sincosf)
+_CLC_OVERLOAD _CLC_DEF float16 sincos(float16 x, local float16 * cosval) SINCOS_VECTOR_BODY(float, 16, sincosf)
+_CLC_OVERLOAD _CLC_DEF float16 sincos(float16 x, private float16 * cosval) SINCOS_VECTOR_BODY(float, 16, sincosf)
+/* double overloads: same layout as the float ones, with builtin_sincos as the scalar helper */
+_CLC_OVERLOAD _CLC_DEF double sincos(double x, global double * cosval) SINCOS_SCALAR_BODY(double, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double sincos(double x, local double * cosval) SINCOS_SCALAR_BODY(double, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double sincos(double x, private double * cosval) SINCOS_SCALAR_BODY(double, builtin_sincos)
+
+_CLC_OVERLOAD _CLC_DEF double2 sincos(double2 x, global double2 * cosval) SINCOS_VECTOR_BODY(double, 2, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double2 sincos(double2 x, local double2 * cosval) SINCOS_VECTOR_BODY(double, 2, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double2 sincos(double2 x, private double2 * cosval) SINCOS_VECTOR_BODY(double, 2, builtin_sincos)
+
+_CLC_OVERLOAD _CLC_DEF double3 sincos(double3 x, global double3 * cosval) SINCOS_VECTOR_BODY(double, 3, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double3 sincos(double3 x, local double3 * cosval) SINCOS_VECTOR_BODY(double, 3, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double3 sincos(double3 x, private double3 * cosval) SINCOS_VECTOR_BODY(double, 3, builtin_sincos)
+
+_CLC_OVERLOAD _CLC_DEF double4 sincos(double4 x, global double4 * cosval) SINCOS_VECTOR_BODY(double, 4, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double4 sincos(double4 x, local double4 * cosval) SINCOS_VECTOR_BODY(double, 4, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double4 sincos(double4 x, private double4 * cosval) SINCOS_VECTOR_BODY(double, 4, builtin_sincos)
+
+_CLC_OVERLOAD _CLC_DEF double8 sincos(double8 x, global double8 * cosval) SINCOS_VECTOR_BODY(double, 8, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double8 sincos(double8 x, local double8 * cosval) SINCOS_VECTOR_BODY(double, 8, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double8 sincos(double8 x, private double8 * cosval) SINCOS_VECTOR_BODY(double, 8, builtin_sincos)
+
+_CLC_OVERLOAD _CLC_DEF double16 sincos(double16 x, global double16 * cosval) SINCOS_VECTOR_BODY(double, 16, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double16 sincos(double16 x, local double16 * cosval) SINCOS_VECTOR_BODY(double, 16, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double16 sincos(double16 x, private double16 * cosval) SINCOS_VECTOR_BODY(double, 16, builtin_sincos)
+
diff --git a/src/builtins/smoothstep.cl b/src/builtins/smoothstep.cl
new file mode 100644
index 0000000..96e3d2a
--- /dev/null
+++ b/src/builtins/smoothstep.cl
@@ -0,0 +1,77 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+/* Scalar float: t = clamp((x - edge0) / (edge1 - edge0), 0, 1); returns t*t*(3 - 2t). edge0 == edge1 is not special-cased. */
+_CLC_OVERLOAD _CLC_DEF float smoothstep(float edge0, float edge1, float x)
+{
+ const float t = clamp((float)((x - edge0) / (edge1 - edge0)), 0.0f, 1.0f);
+ return (t * t) * (3.0f - (2.0f * t));
+}
+/* Scalar double: t = clamp((x - edge0) / (edge1 - edge0), 0, 1); returns t*t*(3 - 2t). edge0 == edge1 is not special-cased. */
+_CLC_OVERLOAD _CLC_DEF double smoothstep(double edge0, double edge1, double x)
+{
+ const double t = clamp((double)((x - edge0) / (edge1 - edge0)), 0.0, 1.0);
+ return (t * t) * (3.0 - (2.0 * t));
+}
+/* float vector smoothstep for one size: first overload takes vector edges, second takes scalar edges. */
+#define FLOAT_TEMPLATE(width) \
+_CLC_OVERLOAD _CLC_DEF float##width smoothstep(float##width edge0, float##width edge1, float##width x) \
+{ \
+ const float##width t = clamp((x - edge0) / (edge1 - edge0), 0.0f, 1.0f); \
+ return (t * t) * (3.0f - (2.0f * t)); \
+} \
+_CLC_OVERLOAD _CLC_DEF float##width smoothstep(float edge0, float edge1, float##width x) \
+{ \
+ const float##width t = clamp((x - edge0) / (edge1 - edge0), 0.0f, 1.0f); \
+ return (t * t) * (3.0f - (2.0f * t)); \
+} \
+
+/* double vector smoothstep for one size: first overload takes vector edges, second takes scalar edges. */
+#define DOUBLE_TEMPLATE(width) \
+_CLC_OVERLOAD _CLC_DEF double##width smoothstep(double##width edge0, double##width edge1, double##width x) \
+{ \
+ const double##width t = clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0); \
+ return (t * t) * (3.0 - (2.0 * t)); \
+} \
+_CLC_OVERLOAD _CLC_DEF double##width smoothstep(double edge0, double edge1, double##width x) \
+{ \
+ const double##width t = clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0); \
+ return (t * t) * (3.0 - (2.0 * t)); \
+}
+/* float vector overloads for sizes 2, 3, 4, 8 and 16 */
+FLOAT_TEMPLATE(2)
+FLOAT_TEMPLATE(3)
+FLOAT_TEMPLATE(4)
+FLOAT_TEMPLATE(8)
+FLOAT_TEMPLATE(16)
+/* double vector overloads for the same sizes */
+DOUBLE_TEMPLATE(2)
+DOUBLE_TEMPLATE(3)
+DOUBLE_TEMPLATE(4)
+DOUBLE_TEMPLATE(8)
+DOUBLE_TEMPLATE(16)
diff --git a/src/builtins/step.cl b/src/builtins/step.cl
new file mode 100644
index 0000000..daecefd
--- /dev/null
+++ b/src/builtins/step.cl
@@ -0,0 +1,43 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+/* Instantiate IMPLEMENTATION for sizes 3, 4, 8 and 16 of one element type; size 2 and the scalar are not generated here. */
+#define EXPAND_SIZES(elem_t) \
+ IMPLEMENTATION(_VEC_TYPE(elem_t,3), elem_t) \
+ IMPLEMENTATION(_VEC_TYPE(elem_t,4), elem_t) \
+ IMPLEMENTATION(_VEC_TYPE(elem_t,8), elem_t) \
+ IMPLEMENTATION(_VEC_TYPE(elem_t,16), elem_t) \
+
+#define IMPLEMENTATION(vec_t, scalar_t) /* step(): 0.0 where x < edge, else 1.0; vector-edge and scalar-edge overloads */ \
+_CLC_OVERLOAD _CLC_DEF vec_t step(vec_t edge, vec_t x) \
+ { return (x < edge) ? (vec_t)0.0 : (vec_t)1.0; } \
+_CLC_OVERLOAD _CLC_DEF vec_t step(scalar_t edge, vec_t x) \
+ { return (x < (vec_t)edge) ? (vec_t)0.0 : (vec_t)1.0; } \
+
+EXPAND_SIZES(float) /* float instantiations */
+EXPAND_SIZES(double) /* double instantiations */
diff --git a/src/builtins/sub_sat.cl b/src/builtins/sub_sat.cl
new file mode 100644
index 0000000..78442f0
--- /dev/null
+++ b/src/builtins/sub_sat.cl
@@ -0,0 +1,37 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+/* sub_sat for every integer element type; BINARY_VEC_DEF is not defined in this file (presumably in cpu.h -- confirm). */
+BINARY_VEC_DEF(char, char, sub_sat, sub_sat)
+BINARY_VEC_DEF(uchar, uchar, sub_sat, sub_sat)
+BINARY_VEC_DEF(short, short, sub_sat, sub_sat)
+BINARY_VEC_DEF(ushort, ushort,sub_sat, sub_sat)
+BINARY_VEC_DEF(int, int, sub_sat, sub_sat)
+BINARY_VEC_DEF(uint, uint, sub_sat, sub_sat)
+BINARY_VEC_DEF(long, long, sub_sat, sub_sat)
+BINARY_VEC_DEF(ulong, ulong, sub_sat, sub_sat)
diff --git a/src/builtins/upsample.cl b/src/builtins/upsample.cl
new file mode 100644
index 0000000..8415a33
--- /dev/null
+++ b/src/builtins/upsample.cl
@@ -0,0 +1,56 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+/*-----------------------------------------------------------------------------
+* Expand vector type implementations
+*----------------------------------------------------------------------------*/
+/* TEMPLATE(xtype, ytype, restype): define one upsample() overload that calls
+ * upsample() on the .lo halves and on the .hi halves of both operands and
+ * joins the two results in a vector literal of type restype. */
+#define TEMPLATE(xtype,ytype,restype) \
+_CLC_OVERLOAD _CLC_DEF restype upsample(xtype x, ytype y) \
+{ return (restype)(upsample(x.lo,y.lo), upsample(x.hi,y.hi)); }
+
+/* TEMPLATE3(xtype, ytype, restype): same idea as TEMPLATE, but built from the
+ * three components s0, s1 and s2 individually instead of .lo/.hi halves.
+ * EXPAND_SIZES uses it for the 3-component vector types. */
+#define TEMPLATE3(xtype,ytype,restype) \
+_CLC_OVERLOAD _CLC_DEF restype upsample(xtype x, ytype y) \
+{ return (restype)(upsample(x.s0,y.s0), upsample(x.s1,y.s1), upsample(x.s2,y.s2)); }
+
+/* EXPAND_SIZES(xtype, ytype, restype): emit the upsample() overloads for
+ * vector widths 2, 3, 4, 8 and 16 of one element-type triple. Width 3 goes
+ * through TEMPLATE3, every other width through TEMPLATE.
+ * NOTE(review): _VEC_TYPE is defined elsewhere; presumably it pastes the
+ * element type and the width into a vector type name -- confirm. */
+#define EXPAND_SIZES(xtype, ytype, restype)\
+ TEMPLATE(_VEC_TYPE(xtype,2), _VEC_TYPE(ytype,2), _VEC_TYPE(restype,2))\
+ TEMPLATE3(_VEC_TYPE(xtype,3), _VEC_TYPE(ytype,3), _VEC_TYPE(restype,3))\
+ TEMPLATE(_VEC_TYPE(xtype,4), _VEC_TYPE(ytype,4), _VEC_TYPE(restype,4))\
+ TEMPLATE(_VEC_TYPE(xtype,8), _VEC_TYPE(ytype,8), _VEC_TYPE(restype,8))\
+ TEMPLATE(_VEC_TYPE(xtype,16), _VEC_TYPE(ytype,16), _VEC_TYPE(restype,16))\
+
+/* _EXPAND_UPSAMPLE_TYPES(): list of (x element, y element, result element)
+ * triples for which vector upsample() overloads are generated. In every
+ * triple the y operand is unsigned and the result is the next wider integer
+ * type with the signedness of x. The macro is expanded once, right below. */
+#define _EXPAND_UPSAMPLE_TYPES() \
+ EXPAND_SIZES(char, uchar, short) \
+ EXPAND_SIZES(uchar, uchar, ushort) \
+ EXPAND_SIZES(short, ushort, int) \
+ EXPAND_SIZES(ushort, ushort, uint) \
+ EXPAND_SIZES(int, uint, long) \
+ EXPAND_SIZES(uint, uint, ulong) \
+
+_EXPAND_UPSAMPLE_TYPES()
diff --git a/src/builtins/vload.cl b/src/builtins/vload.cl
new file mode 100644
index 0000000..2cd9a3a
--- /dev/null
+++ b/src/builtins/vload.cl
@@ -0,0 +1,127 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Peter Collingbourne <peter@pcc.me.uk>
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/* VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE): define vload3, vload4, vload8 and
+ * vload16 for one element type and one address space. Each function reads
+ * N consecutive elements starting at element index N*offset of x (written as
+ * 3*offset for N == 3 and as a left shift for the power-of-two widths) and
+ * returns them as a PRIM_TYPE##N vector.
+ * NOTE(review): no vload2 is generated here; presumably it is provided
+ * elsewhere -- confirm. */
+#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ return (PRIM_TYPE##3)(x[3*offset] , x[3*offset+1], x[3*offset+2]); \
+ } \
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ return (PRIM_TYPE##4)(x[(offset<<2)], x[1+(offset<<2)], x[2+(offset<<2)], x[3+(offset<<2)]); \
+ } \
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ return (PRIM_TYPE##8)(x[(offset<<3)], x[1+(offset<<3)], x[2+(offset<<3)], x[3+(offset<<3)],\
+ x[4+(offset<<3)], x[5+(offset<<3)], x[6+(offset<<3)], x[7+(offset<<3)]); \
+ } \
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ return (PRIM_TYPE##16)(x[(offset<<4)], x[1+(offset<<4)], x[2+(offset<<4)], x[3+(offset<<4)],\
+ x[4+(offset<<4)], x[5+(offset<<4)], x[6+(offset<<4)], x[7+(offset<<4)], \
+ x[8+(offset<<4)], x[9+(offset<<4)], x[10+(offset<<4)], x[11+(offset<<4)], \
+ x[12+(offset<<4)], x[13+(offset<<4)], x[14+(offset<<4)], x[15+(offset<<4)]); \
+ } \
+
+/* VLOAD_ADDR_SPACES(type): expand VLOAD_VECTORIZE for one element type in
+ * each of the four address spaces __private, __local, __constant, __global. */
+#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) \
+
+/* VLOAD_TYPES(): element types for which the vloadN functions are generated
+ * (all integer types plus float and double). Expanded once, right below. */
+#define VLOAD_TYPES() \
+ VLOAD_ADDR_SPACES(char) \
+ VLOAD_ADDR_SPACES(uchar) \
+ VLOAD_ADDR_SPACES(short) \
+ VLOAD_ADDR_SPACES(ushort) \
+ VLOAD_ADDR_SPACES(int) \
+ VLOAD_ADDR_SPACES(uint) \
+ VLOAD_ADDR_SPACES(long) \
+ VLOAD_ADDR_SPACES(ulong) \
+ VLOAD_ADDR_SPACES(float) \
+ VLOAD_ADDR_SPACES(double)\
+
+VLOAD_TYPES()
+
+/* VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE): define vstore3, vstore4, vstore8
+ * and vstore16 for one element type and one address space. Each function
+ * writes the N components of vec, in order, to the N consecutive elements of
+ * mem that start at element index N*offset.
+ * NOTE(review): vstore3/vstore4 are declared _CLC_INLINE while
+ * vstore8/vstore16 are _CLC_DEF, and no vstore2 is generated here --
+ * confirm both are intentional. */
+#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
+ _CLC_OVERLOAD _CLC_INLINE void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+ mem[3*offset] = vec.s0; \
+ mem[(3*offset)+1] = vec.s1; \
+ mem[(3*offset)+2] = vec.s2; \
+ } \
+ _CLC_OVERLOAD _CLC_INLINE void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+ mem[offset<<2] = vec.s0; \
+ mem[1+(offset<<2)] = vec.s1; \
+ mem[2+(offset<<2)] = vec.s2; \
+ mem[3+(offset<<2)] = vec.s3; \
+ } \
+ _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+ mem[(offset<<3)] = vec.s0; \
+ mem[1+(offset<<3)] = vec.s1; \
+ mem[2+(offset<<3)] = vec.s2; \
+ mem[3+(offset<<3)] = vec.s3; \
+ mem[4+(offset<<3)] = vec.s4; \
+ mem[5+(offset<<3)] = vec.s5; \
+ mem[6+(offset<<3)] = vec.s6; \
+ mem[7+(offset<<3)] = vec.s7; \
+ } \
+ _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+ mem[(offset<<4)] = vec.s0; \
+ mem[1+(offset<<4)] = vec.s1; \
+ mem[2+(offset<<4)] = vec.s2; \
+ mem[3+(offset<<4)] = vec.s3; \
+ mem[4+(offset<<4)] = vec.s4; \
+ mem[5+(offset<<4)] = vec.s5; \
+ mem[6+(offset<<4)] = vec.s6; \
+ mem[7+(offset<<4)] = vec.s7; \
+ mem[8+(offset<<4)] = vec.s8; \
+ mem[9+(offset<<4)] = vec.s9; \
+ mem[10+(offset<<4)] = vec.sa; \
+ mem[11+(offset<<4)] = vec.sb; \
+ mem[12+(offset<<4)] = vec.sc; \
+ mem[13+(offset<<4)] = vec.sd; \
+ mem[14+(offset<<4)] = vec.se; \
+ mem[15+(offset<<4)] = vec.sf; \
+ } \
+
+/* VSTORE_ADDR_SPACES(type): expand VSTORE_VECTORIZE for one element type in
+ * the __private, __local and __global address spaces. Unlike the load side,
+ * __constant is not in the list. */
+#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \
+ VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \
+ VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \
+ VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) \
+
+/* VSTORE_TYPES(): element types for which the vstoreN functions are generated
+ * (all integer types plus float and double). Expanded once, right below. */
+#define VSTORE_TYPES() \
+ VSTORE_ADDR_SPACES(char) \
+ VSTORE_ADDR_SPACES(uchar) \
+ VSTORE_ADDR_SPACES(short) \
+ VSTORE_ADDR_SPACES(ushort) \
+ VSTORE_ADDR_SPACES(int) \
+ VSTORE_ADDR_SPACES(uint) \
+ VSTORE_ADDR_SPACES(long) \
+ VSTORE_ADDR_SPACES(ulong) \
+ VSTORE_ADDR_SPACES(float) \
+ VSTORE_ADDR_SPACES(double) \
+
+VSTORE_TYPES()
diff --git a/src/core/commandqueue.cpp b/src/core/commandqueue.cpp
new file mode 100644
index 0000000..662dad1
--- /dev/null
+++ b/src/core/commandqueue.cpp
@@ -0,0 +1,1018 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file commandqueue.cpp
+ * \brief Command queue
+ */
+
+#include "commandqueue.h"
+#include "context.h"
+#include "deviceinterface.h"
+#include "propertylist.h"
+#include "events.h"
+
+#include <cstring>
+#include <cstdlib>
+#include <ctime>
+#include <iostream>
+#include <stdio.h>
+
+using namespace Coal;
+
+// Upper bound on the number of non-completed events pushEventsOnDevice()
+// examines in one pass when it was not called for a specific ready event.
+#define OOO_QUEUE_PUSH_EVENTS_THRESHOLD 64
+
+/******************************************************************************
+* CommandQueue::CommandQueue
+*
+* Creates a command queue for `device` inside context `ctx`. The outcome is
+* written to *errcode_ret: CL_INVALID_DEVICE if the context does not contain
+* the device, otherwise the result of checkProperties().
+******************************************************************************/
+CommandQueue::CommandQueue(Context *ctx,
+                           DeviceInterface *device,
+                           cl_command_queue_properties properties,
+                           cl_int *errcode_ret)
+: Object(Object::T_CommandQueue, ctx), p_device(device),
+  p_num_events_in_queue(0), p_num_events_on_device(0),
+  p_num_events_completed(0),
+  p_properties(properties), p_flushed(true)
+{
+    // The event list is shared with worker threads: set up its mutex and
+    // condition variable before anything else.
+    pthread_mutex_init(&p_event_list_mutex, 0);
+    pthread_cond_init(&p_event_list_cond, 0);
+
+    if (ctx->hasDevice(device))
+    {
+        // The device is part of the context: bring it up and validate the
+        // requested queue properties against it.
+        p_device->init();
+        *errcode_ret = checkProperties();
+    }
+    else
+    {
+        *errcode_ret = CL_INVALID_DEVICE;
+    }
+}
+
+/******************************************************************************
+* CommandQueue::~CommandQueue()
+*
+* Releases the events still parked in the released-events list, then tears
+* down the synchronisation objects.
+******************************************************************************/
+CommandQueue::~CommandQueue()
+{
+    // Must run first: it takes p_event_list_mutex.
+    cleanReleasedEvents();
+
+    pthread_mutex_destroy(&p_event_list_mutex);
+    pthread_cond_destroy(&p_event_list_cond);
+}
+
+/******************************************************************************
+* cl_int CommandQueue::info
+*
+* Answers a queue information query. Supported parameters are
+* CL_QUEUE_CONTEXT, CL_QUEUE_DEVICE, CL_QUEUE_REFERENCE_COUNT and
+* CL_QUEUE_PROPERTIES; any other param_name returns CL_INVALID_VALUE.
+* Returns CL_INVALID_VALUE as well when param_value is given but
+* param_value_size is smaller than the value. param_value_size_ret, if not
+* null, receives the size of the value.
+******************************************************************************/
+cl_int CommandQueue::info(cl_command_queue_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const
+{
+ void *value = 0;
+ size_t value_length = 0;
+
+ // Scratch storage for the queried value.
+ // NOTE(review): SIMPLE_ASSIGN is defined elsewhere; presumably it stores
+ // into the matching <type>_var member and points value/value_length at
+ // it -- confirm against the macro definition.
+ union {
+ cl_uint cl_uint_var;
+ cl_device_id cl_device_id_var;
+ cl_context cl_context_var;
+ cl_command_queue_properties cl_command_queue_properties_var;
+ };
+
+ switch (param_name)
+ {
+ case CL_QUEUE_CONTEXT:
+ SIMPLE_ASSIGN(cl_context, parent());
+ break;
+
+ case CL_QUEUE_DEVICE:
+ SIMPLE_ASSIGN(cl_device_id, p_device);
+ break;
+
+ case CL_QUEUE_REFERENCE_COUNT:
+ SIMPLE_ASSIGN(cl_uint, references());
+ break;
+
+ case CL_QUEUE_PROPERTIES:
+ SIMPLE_ASSIGN(cl_command_queue_properties, p_properties);
+ break;
+
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ // The caller's buffer must be large enough for the value
+ if (param_value && param_value_size < value_length)
+ return CL_INVALID_VALUE;
+
+ if (param_value_size_ret)
+ *param_value_size_ret = value_length;
+
+ if (param_value)
+ std::memcpy(param_value, value, value_length);
+
+ return CL_SUCCESS;
+}
+
+/******************************************************************************
+* cl_int CommandQueue::setProperty
+*
+* Enables (enable != 0) or disables the bits in `properties` on this queue.
+* If old_properties is not null it receives the property mask as it was
+* before the call. Returns the result of checkProperties() on the new mask;
+* when that is not CL_SUCCESS the previous mask is restored, so a rejected
+* request leaves the queue unchanged.
+******************************************************************************/
+cl_int CommandQueue::setProperty(cl_command_queue_properties properties,
+                                 cl_bool enable,
+                                 cl_command_queue_properties *old_properties)
+{
+    const cl_command_queue_properties previous = p_properties;
+
+    if (old_properties)
+        *old_properties = previous;
+
+    if (enable)
+        p_properties |= properties;
+    else
+        p_properties &= ~properties;
+
+    const cl_int result = checkProperties();
+
+    // Do not leave an invalid or unsupported mask behind on failure.
+    if (result != CL_SUCCESS)
+        p_properties = previous;
+
+    return result;
+}
+
+/******************************************************************************
+* cl_int CommandQueue::checkProperties
+*
+* Validates p_properties. Returns CL_INVALID_VALUE if a bit other than
+* CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE / CL_QUEUE_PROFILING_ENABLE is set,
+* the device's error code if the device query fails,
+* CL_INVALID_QUEUE_PROPERTIES if the device does not report support for every
+* requested bit, and CL_SUCCESS otherwise.
+******************************************************************************/
+cl_int CommandQueue::checkProperties() const
+{
+    // Bits this implementation knows about
+    const cl_command_queue_properties known =
+        CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+        CL_QUEUE_PROFILING_ENABLE;
+
+    if ((p_properties & ~known) != 0)
+        return CL_INVALID_VALUE;
+
+    // Ask the device which of them it supports
+    cl_command_queue_properties supported = known;
+    const cl_int query = p_device->info(CL_DEVICE_QUEUE_PROPERTIES,
+                                        sizeof(cl_command_queue_properties),
+                                        &supported,
+                                        0);
+
+    if (query != CL_SUCCESS)
+        return query;
+
+    if ((p_properties & ~supported) != 0)
+        return CL_INVALID_QUEUE_PROPERTIES;
+
+    return CL_SUCCESS;
+}
+
+/******************************************************************************
+* void CommandQueue::flush()
+*
+* Blocks until the queue reports the "flushed" state (p_flushed), then
+* releases the events waiting in the released-events list.
+******************************************************************************/
+void CommandQueue::flush()
+{
+    pthread_mutex_lock(&p_event_list_mutex);
+
+    // p_flushed is updated under the same mutex; sleep on the condition
+    // variable until it becomes true.
+    for (;;)
+    {
+        if (p_flushed)
+            break;
+        pthread_cond_wait(&p_event_list_cond, &p_event_list_mutex);
+    }
+
+    pthread_mutex_unlock(&p_event_list_mutex);
+
+    cleanReleasedEvents();
+}
+
+/******************************************************************************
+* void CommandQueue::finish()
+*
+* Blocks until no event is left in the queue, then releases the events
+* waiting in the released-events list.
+******************************************************************************/
+void CommandQueue::finish()
+{
+    // pushEventsOnDevice does not necessarily remove every completed event,
+    // so purge them here first; otherwise the wait below could never end.
+    cleanEvents();
+
+    // Completed events are removed from the queue, so "everything finished"
+    // is the same as "the in-queue counter dropped to zero".
+    pthread_mutex_lock(&p_event_list_mutex);
+
+    for (;;)
+    {
+        if (p_num_events_in_queue == 0)
+            break;
+        pthread_cond_wait(&p_event_list_cond, &p_event_list_mutex);
+    }
+
+    pthread_mutex_unlock(&p_event_list_mutex);
+
+    cleanReleasedEvents();
+}
+
+/******************************************************************************
+* cl_int CommandQueue::queueEvent(Event *event)
+*
+* Appends `event` to the queue and tries to push queued events to the device.
+* Returns the device's error code if it fails to initialise the event's
+* device data (the event is then not queued), CL_SUCCESS otherwise.
+******************************************************************************/
+cl_int CommandQueue::queueEvent(Event *event)
+{
+    // Give the device a chance to attach its own data to the event (for
+    // instance, a pointer at which memory would be mapped).
+    const cl_int init_status = p_device->initEventDeviceData(event);
+    if (init_status != CL_SUCCESS)
+        return init_status;
+
+    // Add the event to the tail of the list; the queue is no longer flushed.
+    pthread_mutex_lock(&p_event_list_mutex);
+    p_events.push_back(event);
+    p_num_events_in_queue += 1;
+    p_flushed = false;
+    pthread_mutex_unlock(&p_event_list_mutex);
+
+    // Record the queue timestamp when profiling is enabled
+    const bool profiling = (p_properties & CL_QUEUE_PROFILING_ENABLE) != 0;
+    if (profiling)
+        event->updateTiming(Event::Queue);
+
+    // Look for events that can now go to the device, then release the
+    // events that were handed over for release.
+    pushEventsOnDevice();
+    cleanReleasedEvents();
+
+    return CL_SUCCESS;
+}
+
+/******************************************************************************
+* void CommandQueue::releaseEvent()
+*
+* Appends `e` to p_released_events under the event-list mutex. The event is
+* not released here; cleanReleasedEvents() does that later.
+******************************************************************************/
+void CommandQueue::releaseEvent(Event *e)
+{
+ pthread_mutex_lock(&p_event_list_mutex);
+ p_released_events.push_back(e);
+ pthread_mutex_unlock(&p_event_list_mutex);
+}
+
+/******************************************************************************
+* void CommandQueue::cleanEvents()
+*
+* Moves completed events from p_events to p_released_events, updating the
+* in-queue and completed counters, and wakes waiting threads once the queue
+* is empty. For an in-order queue the scan stops at the first event that is
+* not complete. May delete the queue itself if its reference count is zero.
+******************************************************************************/
+void CommandQueue::cleanEvents()
+{
+ bool is_inorder =
+ (p_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) == 0;
+
+ pthread_mutex_lock(&p_event_list_mutex);
+
+ // No need to cleanEvents() every time an event finishes, so that we can
+ // save on the event traversal time. 16 is a number that can be tuned
+ // (e.g. using ooo example).
+ // The scan is skipped only while fewer than 16 events are completed, the
+ // device still has work, and non-completed events remain in the queue.
+ if (p_num_events_completed < 16 && p_num_events_on_device > 0 &&
+ p_num_events_in_queue - p_num_events_completed > 0)
+ {
+ pthread_mutex_unlock(&p_event_list_mutex);
+ return;
+ }
+
+ std::list<Event *>::iterator it = p_events.begin(), oldit;
+
+ while (it != p_events.end())
+ {
+ Event *event = *it;
+
+ if (event->status() == Event::Complete)
+ {
+ // We cannot be deleted from inside us
+ event->setReleaseParent(false);
+ // Advance before erasing so `it` stays valid
+ oldit = it;
+ ++it;
+
+ p_num_events_in_queue -= 1;
+ p_num_events_completed -= 1;
+ p_events.erase(oldit);
+ // put Completed events into another list
+ // let main thread release/delete them
+ p_released_events.push_back(event);
+ }
+ else if (is_inorder)
+ {
+ // In Order Queue events are dispatched and completed in Order
+ break;
+ }
+ else
+ {
+ ++it;
+ }
+ }
+
+ // We have cleared the list, so wake up the sleeping threads
+ if (p_num_events_in_queue == 0)
+ pthread_cond_broadcast(&p_event_list_cond);
+
+ pthread_mutex_unlock(&p_event_list_mutex);
+
+ // Check now if we have to be deleted
+ // NOTE(review): after `delete this` no member may be touched; callers
+ // such as finish() keep using the object afterwards -- confirm that
+ // references() cannot be 0 on those paths.
+ if (references() == 0)
+ {
+ delete this;
+ }
+}
+
+/******************************************************************************
+* void CommandQueue::cleanReleasedEvents()
+* !!! Can only be called by the main thread!!! new/delete, malloc/free are not
+* thread safe on ARM, so let main thread handle them SOLELY!
+*
+* Calls clReleaseEvent() on every event in p_released_events, front to back,
+* emptying the list. The event-list mutex is held for the whole drain.
+******************************************************************************/
+void CommandQueue::cleanReleasedEvents()
+{
+    pthread_mutex_lock(&p_event_list_mutex);
+
+    // Release the head first, then drop it from the list
+    for (; !p_released_events.empty(); p_released_events.pop_front())
+    {
+        Event *head = p_released_events.front();
+        clReleaseEvent((cl_event)head);
+    }
+
+    pthread_mutex_unlock(&p_event_list_mutex);
+}
+
+/******************************************************************************
+* void CommandQueue::pushEventsOnDevice()
+* Who is calling this function:
+* (ready_event, one_event_completed_on_device)
+* (not NULL, * ): worker thread, push till this one ready event
+* ( NULL, true ): worker thread, one completes, push rest on this queue
+* ( NULL, false): main thread, queued a new event, push this queue
+*
+* Walks p_events under the event-list mutex, moves completed events to
+* p_released_events, submits eligible queued events to the device, and
+* updates p_flushed (broadcasting the condition variable when it is true).
+******************************************************************************/
+void CommandQueue::pushEventsOnDevice(Event *ready_event,
+ bool one_event_completed_on_device)
+{
+ int non_complete_events_traversed = 0;
+ bool is_ooo = (p_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0;
+ bool do_profile = (p_properties & CL_QUEUE_PROFILING_ENABLE) != 0;
+
+ pthread_mutex_lock(&p_event_list_mutex);
+
+ // Account for the event that just finished on the device
+ if (one_event_completed_on_device)
+ {
+ p_num_events_on_device -= 1;
+ p_num_events_completed += 1;
+ }
+
+ // No need to push more events on Device if 1) device has already got
+ // enough to work on, and 2) not pushing won't cause starvation of this
+ // commandqueue. Not pushing can save p_event_list traversal time.
+ // 2 is a QoS number, set to 2 for the time being
+ // imagine there are multiple commandqueues on same device
+ if(is_ooo && ready_event == NULL &&
+ p_num_events_on_device > 2 && p_device->gotEnoughToWorkOn())
+ {
+ pthread_mutex_unlock(&p_event_list_mutex);
+ return;
+ }
+
+ // Explore the events in p_events and push on the device all of them that
+ // are :
+ //
+ // - Not already pushed (in Event::Queued state)
+ // - Not after a barrier, except if we begin with a barrier
+ // - If we are in-order, only the first event in Event::Queued state can
+ // be pushed
+
+ std::list<Event *>::iterator it = p_events.begin();
+ std::list<Event *>::iterator oldit;
+ bool first = true;
+
+ // We assume that we will flush the command queue (submit all the events)
+ // This will be changed in the while() when we know that not all events
+ // are submitted.
+ p_flushed = true;
+
+ while (it != p_events.end())
+ {
+ Event *event = *it;
+
+ // If the event is completed, remove it
+ if (event->status() == Event::Complete)
+ {
+ event->setReleaseParent(false);
+ // Advance before erasing so `it` stays valid
+ oldit = it;
+ ++it;
+
+ p_num_events_completed -= 1;
+ p_num_events_in_queue -= 1;
+ p_events.erase(oldit);
+ // put Completed events into another list
+ // let main thread release/delete them
+ p_released_events.push_back(event);
+ continue;
+ }
+
+ // If OOO queue threshold is met, skip examining the rest of events
+ // (the check itself does not look at is_ooo; in-order queues stop
+ // earlier through the !is_ooo && !first test below)
+ if(ready_event == NULL &&
+ non_complete_events_traversed > OOO_QUEUE_PUSH_EVENTS_THRESHOLD)
+ break;
+ non_complete_events_traversed += 1;
+
+ // We cannot do out-of-order, so we can only push the first event.
+ if (!is_ooo && !first)
+ {
+ p_flushed = false; // There are remaining events.
+ break;
+ }
+
+ // Stop if we encounter a barrier that isn't the first event in the list.
+ if (event->type() == Event::Barrier && !first)
+ {
+ // We have events to wait, stop
+ p_flushed = false;
+ break;
+ }
+
+ // Completed events and first barriers are out, it remains real events
+ // that have to block in-order execution.
+ first = false;
+
+ // If the event is not "pushable" (in Event::Queued state), skip it
+ // It is either Submitted or Running.
+ if (event->status() != Event::Queued)
+ {
+ // Intended event is scheduled, skip the rest in queue
+ if (event == ready_event) break;
+
+ ++it;
+ continue;
+ }
+
+ // Check that all the waiting-on events of this event are finished
+ if (! event->waitEventsAllCompleted())
+ {
+ p_flushed = false;
+ // If we encounter a WaitForEvents event that is not "finished",
+ // don't push events after it.
+ if (event->type() == Event::WaitForEvents)
+ break;
+
+ // The event has its dependencies not already met.
+ ++it;
+ continue;
+ }
+
+ if (event->isInstantaneous())
+ {
+ // Set the event as completed. This will call pushEventsOnDevice,
+ // again, so release the lock to avoid a deadlock. We also return
+ // because the recursive call will continue our work.
+ pthread_mutex_unlock(&p_event_list_mutex);
+ event->setStatus(Event::Complete);
+ return;
+ }
+
+ // The event can be pushed, if we need to
+ if (do_profile) event->updateTiming(Event::Submit);
+
+ event->setStatus(Event::Submitted);
+ p_num_events_on_device += 1;
+ p_device->pushEvent(event);
+ // `it` is intentionally not advanced: the event is now Submitted, so
+ // the next iteration takes the "not Queued" branch above and moves on.
+ }
+
+ // When pushing for a specific ready event the walk may have stopped early,
+ // so "flushed" additionally requires an empty queue.
+ if (ready_event != NULL && p_flushed)
+ p_flushed = (p_num_events_in_queue == 0);
+
+ // Wake threads blocked in flush()
+ if (p_flushed)
+ pthread_cond_broadcast(&p_event_list_cond);
+
+ pthread_mutex_unlock(&p_event_list_mutex);
+}
+
+/******************************************************************************
+* Event **CommandQueue::events(unsigned int &count,
+*                              bool include_completed_events)
+*
+* Returns a malloc()ed snapshot of the events currently in the queue, each
+* one retained with reference(). `count` receives the number of entries
+* written. Completed events are left out unless include_completed_events is
+* true. The caller owns the array and the references.
+*
+* Returns NULL with count == 0 when the queue is empty or when the array
+* cannot be allocated.
+******************************************************************************/
+Event **CommandQueue::events(unsigned int &count,
+                             bool include_completed_events)
+{
+    Event **result = NULL;
+    unsigned int index = 0;
+
+    pthread_mutex_lock(&p_event_list_mutex);
+
+    // One slot per event in the queue is always enough.
+    const unsigned int capacity = p_num_events_in_queue;
+    if (capacity > 0)
+        result = (Event **)std::malloc(capacity * sizeof(Event *));
+
+    // Only walk the list if there is somewhere to store the events; never
+    // write past the allocated capacity.
+    if (result != NULL)
+    {
+        std::list<Event *>::iterator it = p_events.begin();
+
+        for (; it != p_events.end() && index < capacity; ++it)
+        {
+            Event *e = *it;
+
+            if (! include_completed_events && e->status() == Event::Complete)
+                continue;
+
+            // Retain the event so it stays valid after the lock is dropped
+            result[index] = e;
+            result[index]->reference();
+            ++index;
+        }
+    }
+    count = index;
+
+    // Now result contains an immutable list of events. Even if the events
+    // become completed in another thread while result is used, the events
+    // are retained and so guaranteed to remain valid.
+    pthread_mutex_unlock(&p_event_list_mutex);
+
+    return result;
+}
+
+/******************************************************************************
+* Event::Event
+*
+* Creates an event owned by `parent` in the given initial status and
+* registers it as a dependent of the events in event_wait_list.
+* On a bad wait list *errcode_ret is set to CL_INVALID_EVENT_WAIT_LIST (list
+* and count disagree, or a null entry) or to
+* CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST (an entry has a negative
+* status). *errcode_ret is not written on success.
+******************************************************************************/
+Event::Event(CommandQueue *parent,
+             Status status,
+             cl_uint num_events_in_wait_list,
+             const Event **event_wait_list,
+             cl_int *errcode_ret)
+: Object(Object::T_Event, parent),
+  p_status(status), p_device_data(0)
+{
+    // Set up the state lock and its condition variable
+    pthread_cond_init(&p_state_change_cond, 0);
+    pthread_mutex_init(&p_state_mutex, 0);
+
+    std::memset(&p_timing, 0, sizeof(p_timing));
+
+    // A list must come with a non-zero count and vice versa
+    const bool have_list  = (event_wait_list != 0);
+    const bool have_count = (num_events_in_wait_list != 0);
+    if (have_list != have_count)
+    {
+        *errcode_ret = CL_INVALID_EVENT_WAIT_LIST;
+        return;
+    }
+
+    // Reject null entries and events that are in an error state
+    for (cl_uint i = 0; i < num_events_in_wait_list; ++i)
+    {
+        const Event *waited = event_wait_list[i];
+
+        if (waited == 0)
+        {
+            *errcode_ret = CL_INVALID_EVENT_WAIT_LIST;
+            return;
+        }
+        if (waited->status() < 0)
+        {
+            *errcode_ret = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+            return;
+        }
+    }
+
+    if (!parent || num_events_in_wait_list == 0)
+        return;
+
+    pthread_mutex_lock(&p_state_mutex);
+    for (cl_uint i = 0; i < num_events_in_wait_list; ++i)
+    {
+        // addDependentEvent() refuses events that are already complete;
+        // only the ones that accepted us have to be waited for.
+        if (event_wait_list[i]->addDependentEvent(this))
+            p_wait_events.push_back(event_wait_list[i]);
+    }
+    pthread_mutex_unlock(&p_state_mutex);
+}
+
+/******************************************************************************
+* void Event::freeDeviceData()
+*
+* Asks the device of the parent queue to free this event's device data.
+* Does nothing for events without a parent or without device data.
+******************************************************************************/
+void Event::freeDeviceData()
+{
+    if (!parent() || !p_device_data)
+        return;
+
+    // Look up the device through the queue's info interface
+    CommandQueue *queue = (CommandQueue *)parent();
+    DeviceInterface *device = 0;
+    queue->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *), &device, 0);
+
+    device->freeEventDeviceData(this);
+}
+
+/******************************************************************************
+* Event::~Event()
+*
+* Destroys the state mutex and the state-change condition variable created
+* by the constructor.
+******************************************************************************/
+Event::~Event()
+{
+ pthread_mutex_destroy(&p_state_mutex);
+ pthread_cond_destroy(&p_state_change_cond);
+}
+
+/******************************************************************************
+* bool Event::isInstantaneous()
+*
+* True for Marker, User, Barrier and WaitForEvents events. Such dummy events
+* have nothing to run on an execution device and are completed directly after
+* being "submitted".
+******************************************************************************/
+bool Event::isInstantaneous() const
+{
+    bool instantaneous = false;
+
+    switch (type())
+    {
+        case Marker:
+        case User:
+        case Barrier:
+        case WaitForEvents:
+            instantaneous = true;
+            break;
+
+        default:
+            break;
+    }
+
+    return instantaneous;
+}
+
+/******************************************************************************
+* int Event::setStatusHelper
+*
+* Stores the new status under the state mutex, wakes every thread waiting on
+* the state-change condition, and invokes the callbacks registered for that
+* status (callbacks registered for Complete when status is not positive).
+* Returns the number of dependent events recorded at the time of the call.
+******************************************************************************/
+int Event::setStatusHelper(Status status)
+{
+ int num_dependent_events;
+
+ // TODO: If status < 0, terminate all the events depending on us.
+ pthread_mutex_lock(&p_state_mutex);
+ p_status = status;
+ num_dependent_events = p_dependent_events.size();
+
+ pthread_cond_broadcast(&p_state_change_cond);
+
+ // Call the callbacks
+ // NOTE(review): they run while p_state_mutex is held, so a callback that
+ // calls back into a locking member of this event (e.g. status()) would
+ // block unless the mutex is recursive -- confirm.
+ std::multimap<Status, CallbackData>::const_iterator it;
+ std::pair<std::multimap<Status, CallbackData>::const_iterator,
+ std::multimap<Status, CallbackData>::const_iterator> ret;
+
+ // Non-positive statuses are looked up under the Complete key
+ ret = p_callbacks.equal_range(status > 0 ? status : Complete);
+
+ for (it=ret.first; it!=ret.second; ++it)
+ {
+ const CallbackData &data = (*it).second;
+ data.callback((cl_event)this, p_status, data.user_data);
+ }
+
+ pthread_mutex_unlock(&p_state_mutex);
+
+ return num_dependent_events;
+}
+
+/******************************************************************************
+* void Event::setStatus
+*
+* Sets the event status. For user events, and for events with a parent queue
+* that become Complete, it also detaches this event from the wait lists of
+* its dependent events and asks the affected queues to push more work to the
+* device. In every other case only the status is updated.
+******************************************************************************/
+void Event::setStatus(Status status)
+{
+ if (type() == Event::User || (parent() && status == Complete))
+ {
+ // Saved up front; set to NULL below once this queue has been notified
+ CommandQueue *cq = (CommandQueue *) parent();
+
+ int num_dependent_events = setStatusHelper(status);
+ /*---------------------------------------------------------------------
+ * From this point on, the event could be dereferenced to 0 and deleted!
+ * Thus we cannot call flushQueues(). Need to save these queues.
+ *--------------------------------------------------------------------*/
+
+ /*---------------------------------------------------------------------
+ * Notify dependent events, remove dependence, and push them if possible
+ *--------------------------------------------------------------------*/
+ for (int i = 0; i < num_dependent_events; i += 1)
+ {
+ Event *d_event = p_dependent_events[i];
+ CommandQueue *q = (CommandQueue *) d_event->parent();
+ // removeWaitEvent() must run even when q is NULL, hence the order
+ if (d_event->removeWaitEvent(this) && q != NULL) // order!
+ {
+ // (cq == q) tells our own queue that one of its events completed
+ q->pushEventsOnDevice(d_event, (cq == q));
+ if (cq == q) cq = NULL;
+ }
+ }
+
+ /*---------------------------------------------------------------------
+ * Inform our parent to push other events to the device if haven't done
+ * so already. UserEvent's parent is NULL.
+ *--------------------------------------------------------------------*/
+ if (cq != NULL) cq->pushEventsOnDevice(NULL, true);
+ }
+ else
+ setStatusHelper(status);
+}
+
+/******************************************************************************
+* bool Event::addDependentEvent
+*
+* Registers `event` as depending on this event and retains this event on its
+* behalf. Returns false, registering nothing, when this event is already
+* Complete; true otherwise.
+******************************************************************************/
+bool Event::addDependentEvent(Event *event)
+{
+    pthread_mutex_lock(&p_state_mutex);
+
+    const bool still_pending = (p_status != Event::Complete);
+    if (still_pending)
+    {
+        p_dependent_events.push_back(event);
+        Object::reference(); // retain this event
+    }
+
+    pthread_mutex_unlock(&p_state_mutex);
+    return still_pending;
+}
+
+/******************************************************************************
+* bool Event::removeWaitEvent
+*
+* Removes `event` from the list of events this event waits on and hands it
+* to its parent queue (if any) for release. Returns true when no waited-on
+* event is left.
+******************************************************************************/
+bool Event::removeWaitEvent(Event *event)
+{
+    pthread_mutex_lock(&p_state_mutex);
+    p_wait_events.remove(event);
+    const bool none_left = p_wait_events.empty();
+    pthread_mutex_unlock(&p_state_mutex);
+
+    // The release itself is deferred to the queue owning `event`
+    CommandQueue *owner = (CommandQueue *) event->parent();
+    if (owner != NULL)
+        owner->releaseEvent(event);
+
+    return none_left;
+}
+
+/******************************************************************************
+* bool Event::waitEventsAllCompleted
+*
+* True when the list of events this event waits on is empty.
+******************************************************************************/
+bool Event::waitEventsAllCompleted()
+{
+    // YUAN TODO: p_wait_events is always shrinking, is lock necessary?
+    // Reading it without the lock is a little bit faster; the locked read
+    // is the variant currently in use.
+    pthread_mutex_lock(&p_state_mutex);
+    const bool nothing_pending = p_wait_events.empty();
+    pthread_mutex_unlock(&p_state_mutex);
+
+    return nothing_pending;
+}
+
+/******************************************************************************
+* void Event::reference, dereference
+* This should be protected, since main thread and worker threads could all
+* updating the event reference count
+******************************************************************************/
+void Event::reference()
+{
+    // Take the state mutex around the base-class counter update so that it
+    // is serialised with every other section guarded by p_state_mutex.
+    pthread_mutex_lock(&p_state_mutex);
+    this->Object::reference();
+    pthread_mutex_unlock(&p_state_mutex);
+}
+
+bool Event::dereference()
+{
+    // Same locking discipline as reference(); the result of the base-class
+    // call is captured while the mutex is held and returned unchanged.
+    pthread_mutex_lock(&p_state_mutex);
+    const bool base_result = Object::dereference();
+    pthread_mutex_unlock(&p_state_mutex);
+
+    return base_result;
+}
+
+/******************************************************************************
+* void Event::setDeviceData
+*
+* Remember the opaque, device-specific pointer attached to this event. The
+* pointer is stored as-is in p_device_data; no locking is performed here.
+******************************************************************************/
+void Event::setDeviceData(void *data)
+{
+    this->p_device_data = data;
+}
+
+/******************************************************************************
+* void Event::updateTiming
+*
+* Record the current time, in microseconds, in the p_timing slot selected by
+* 'timing'. CLOCK_MONOTONIC is used, falling back to CLOCK_REALTIME if it is
+* unavailable. A slot that already holds a non-zero value is left untouched,
+* so the first timestamp wins. Out-of-range 'timing' values are ignored.
+*
+* The seconds field is widened to cl_ulong BEFORE being scaled: where time_t
+* is 32 bits wide, 'tv_sec * 1000000' would otherwise overflow after ~35
+* minutes of clock time. If the p_timing element type is narrower than
+* cl_ulong, the final store still truncates.
+******************************************************************************/
+void Event::updateTiming(Timing timing)
+{
+    if (timing >= Max)
+        return;
+
+    pthread_mutex_lock(&p_state_mutex);
+
+    // Don't update more than one time (NDRangeKernel for example)
+    if (p_timing[timing] == 0)
+    {
+        // Zero-initialised so that a failure of both clock calls yields a
+        // defined (zero) timestamp instead of reading indeterminate memory.
+        struct timespec tp;
+        tp.tv_sec  = 0;
+        tp.tv_nsec = 0;
+
+        if (clock_gettime(CLOCK_MONOTONIC, &tp) != 0)
+            clock_gettime(CLOCK_REALTIME, &tp);
+
+        cl_ulong usecs = (cl_ulong)tp.tv_sec * 1000000; // seconds -> us
+        usecs += (cl_ulong)tp.tv_nsec / 1000;           // nanoseconds -> us
+
+        p_timing[timing] = usecs;
+    }
+
+    pthread_mutex_unlock(&p_state_mutex);
+}
+
+/******************************************************************************
+* Event::Status Event::status() const
+*
+* Return a snapshot of p_status taken while holding p_state_mutex.
+******************************************************************************/
+Event::Status Event::status() const
+{
+    // HACK : this accessor is const but the mutex member is not declared
+    // mutable, so constness is cast away in order to lock it.
+    pthread_mutex_t *state_lock = &const_cast<Event *>(this)->p_state_mutex;
+
+    pthread_mutex_lock(state_lock);
+    const Status snapshot = p_status;
+    pthread_mutex_unlock(state_lock);
+
+    return snapshot;
+}
+
+/******************************************************************************
+* void Event::waitForStatus(Status status)
+*
+* Block on p_state_change_cond until p_status equals 'status', or until
+* p_status is no longer greater than zero, whichever happens first. The
+* condition is re-checked after every wake-up.
+******************************************************************************/
+void Event::waitForStatus(Status status)
+{
+    pthread_mutex_lock(&p_state_mutex);
+
+    for (;;)
+    {
+        const bool reached  = (p_status == status);
+        const bool finished = (p_status <= 0);
+        if (reached || finished)
+            break;
+
+        pthread_cond_wait(&p_state_change_cond, &p_state_mutex);
+    }
+
+    pthread_mutex_unlock(&p_state_mutex);
+}
+
+/******************************************************************************
+* void *Event::deviceData()
+*
+* Hand back the opaque pointer currently stored in p_device_data. No locking
+* is performed here.
+******************************************************************************/
+void *Event::deviceData()
+{
+    return this->p_device_data;
+}
+
+/******************************************************************************
+* void Event::setCallback
+*
+* Record 'callback' and its 'user_data' in p_callbacks, keyed by the status
+* value given in 'command_exec_callback_type'. Several callbacks may be stored
+* for the same status (p_callbacks is a multimap). The insertion is done under
+* p_state_mutex.
+******************************************************************************/
+void Event::setCallback(cl_int command_exec_callback_type,
+                        event_callback callback,
+                        void *user_data)
+{
+    const Status trigger = (Status)command_exec_callback_type;
+
+    CallbackData entry;
+    entry.callback  = callback;
+    entry.user_data = user_data;
+
+    pthread_mutex_lock(&p_state_mutex);
+    p_callbacks.insert(std::pair<Status, CallbackData>(trigger, entry));
+    pthread_mutex_unlock(&p_state_mutex);
+}
+
+/******************************************************************************
+* cl_int Event::info
+*
+* Answer an event information query.
+*
+* param_name            which property is requested
+* param_value_size      size in bytes of the buffer behind param_value
+* param_value           destination buffer, may be NULL (nothing is copied)
+* param_value_size_ret  if not NULL, receives the size of the answer
+*
+* Returns CL_SUCCESS, or CL_INVALID_VALUE when param_name is not handled or
+* when param_value is given but param_value_size is smaller than the answer.
+*
+* NOTE(review): SIMPLE_ASSIGN is a macro defined outside this section;
+* presumably it stores its second argument in the <type>_var member of the
+* anonymous union and points 'value' / 'value_length' at it -- confirm
+* against its definition. The missing ';' after the first SIMPLE_ASSIGN in
+* the CL_EVENT_CONTEXT case compiles only because of how that macro expands;
+* do not "fix" it without checking the macro.
+******************************************************************************/
+cl_int Event::info(cl_event_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const
+{
+ void *value = 0; // source address for the final memcpy
+ size_t value_length = 0; // number of bytes of the answer
+
+ union { // one scratch slot per answer type
+ cl_command_queue cl_command_queue_var;
+ cl_context cl_context_var;
+ cl_command_type cl_command_type_var;
+ cl_int cl_int_var;
+ cl_uint cl_uint_var;
+ };
+
+ switch (param_name)
+ {
+ case CL_EVENT_COMMAND_QUEUE:
+ SIMPLE_ASSIGN(cl_command_queue, parent());
+ break;
+
+ case CL_EVENT_CONTEXT:
+ if (parent()) // event has a parent: answer with the parent's parent
+ {
+ SIMPLE_ASSIGN(cl_context, parent()->parent());
+ }
+ else // no parent: only a User event can supply a context
+ {
+ if (type() == User)
+ SIMPLE_ASSIGN(cl_context, ((UserEvent *)this)->context())
+ else
+ SIMPLE_ASSIGN(cl_context, 0);
+ }
+ break;
+
+ case CL_EVENT_COMMAND_TYPE:
+ SIMPLE_ASSIGN(cl_command_type, type());
+ break;
+
+ // avoid status() call, if called from callbacks, we deadlock on mutex
+ case CL_EVENT_COMMAND_EXECUTION_STATUS:
+ SIMPLE_ASSIGN(cl_int, p_status); // read without taking p_state_mutex
+ break;
+
+ case CL_EVENT_REFERENCE_COUNT:
+ SIMPLE_ASSIGN(cl_uint, references());
+ break;
+
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ if (param_value && param_value_size < value_length) // caller buffer too small
+ return CL_INVALID_VALUE;
+
+ if (param_value_size_ret)
+ *param_value_size_ret = value_length;
+
+ if (param_value)
+ std::memcpy(param_value, value, value_length);
+
+ return CL_SUCCESS;
+}
+
+/******************************************************************************
+* cl_int Event::profilingInfo(
+*
+* Answer a profiling query with one of the timestamps recorded by
+* updateTiming(), converted from microseconds to nanoseconds.
+*
+* param_name            which timestamp is requested (QUEUED, SUBMIT, START
+*                       or END)
+* param_value_size      size in bytes of the buffer behind param_value
+* param_value           destination buffer, may be NULL (nothing is copied)
+* param_value_size_ret  if not NULL, receives sizeof(cl_ulong)
+*
+* Returns CL_PROFILING_INFO_NOT_AVAILABLE for user events, for events without
+* a parent queue, when the queue has no CL_QUEUE_PROFILING_ENABLE property or
+* when the event is not Complete; the error of the queue query if it fails;
+* CL_INVALID_VALUE for an unknown param_name or a too small buffer;
+* CL_SUCCESS otherwise.
+******************************************************************************/
+cl_int Event::profilingInfo(cl_profiling_info param_name,
+                            size_t param_value_size,
+                            void *param_value,
+                            size_t *param_value_size_ret) const
+{
+    if (type() == Event::User)
+        return CL_PROFILING_INFO_NOT_AVAILABLE;
+
+    // Without a parent queue there is nothing that could have enabled
+    // profiling; do not dereference a NULL queue.
+    CommandQueue *queue = (CommandQueue *)parent();
+    if (queue == NULL)
+        return CL_PROFILING_INFO_NOT_AVAILABLE;
+
+    // Check that the Command Queue has profiling enabled
+    cl_command_queue_properties queue_props = 0;
+    cl_int rs = queue->info(CL_QUEUE_PROPERTIES,
+                            sizeof(cl_command_queue_properties),
+                            &queue_props, 0);
+
+    if (rs != CL_SUCCESS)
+        return rs;
+
+    if ((queue_props & CL_QUEUE_PROFILING_ENABLE) == 0)
+        return CL_PROFILING_INFO_NOT_AVAILABLE;
+
+    // avoid status() call, if called from callbacks, we deadlock on mutex
+    if (p_status != Event::Complete)
+        return CL_PROFILING_INFO_NOT_AVAILABLE;
+
+    // Select the timing slot that matches the query
+    Timing counter;
+
+    switch (param_name)
+    {
+        case CL_PROFILING_COMMAND_QUEUED:
+            counter = Queue;
+            break;
+
+        case CL_PROFILING_COMMAND_SUBMIT:
+            counter = Submit;
+            break;
+
+        case CL_PROFILING_COMMAND_START:
+            counter = Start;
+            break;
+
+        case CL_PROFILING_COMMAND_END:
+            counter = End;
+            break;
+
+        default:
+            return CL_INVALID_VALUE;
+    }
+
+    // Widen explicitly before scaling so the microsecond -> nanosecond
+    // conversion is always carried out in 64 bits, whatever the element type
+    // of p_timing is.
+    const cl_ulong nanoseconds = (cl_ulong)p_timing[counter] * 1000;
+    const size_t value_length = sizeof(nanoseconds);
+
+    if (param_value && param_value_size < value_length)
+        return CL_INVALID_VALUE;
+
+    if (param_value_size_ret)
+        *param_value_size_ret = value_length;
+
+    if (param_value)
+        std::memcpy(param_value, &nanoseconds, value_length);
+
+    return CL_SUCCESS;
+}
+
diff --git a/src/core/commandqueue.h b/src/core/commandqueue.h
new file mode 100644
index 0000000..7d2c65e
--- /dev/null
+++ b/src/core/commandqueue.h
@@ -0,0 +1,494 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file commandqueue.h
+ * \brief Command queue and base class for events
+ */
+
+#ifndef __COMMANDQUEUE_H__
+#define __COMMANDQUEUE_H__
+
+#include "object.h"
+
+#include <CL/cl.h>
+#include <pthread.h>
+
+#include <map>
+#include <list>
+#include <vector>
+
+namespace Coal
+{
+
+class Context;
+class DeviceInterface;
+class Event;
+
+/**
+ * \brief Command queue
+ *
+ * This class holds a list of events that will be pushed on a given device.
+ *
+ * More details are given on the \ref events page.
+ */
+class CommandQueue : public Object
+{
+ public:
+ CommandQueue(Context *ctx,
+ DeviceInterface *device,
+ cl_command_queue_properties properties,
+ cl_int *errcode_ret);
+ ~CommandQueue();
+
+ /**
+ * \brief Queue an event
+ * \param event event to be queued
+ * \return \c CL_SUCCESS if success, otherwise an error code
+ */
+ cl_int queueEvent(Event *event);
+
+ /**
+ * \brief Information about the command queue
+ * \copydetails Coal::DeviceInterface::info
+ */
+ cl_int info(cl_command_queue_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ /**
+ * \brief Set properties of the command queue
+ * \note This function is deprecated and only there for OpenCL 1.0
+ * compatibility
+ * \param properties property to enable or disable
+ * \param enable true to enable the property, false to disable it
+ * \param old_properties old value of the properties, ignored if NULL
+ * \return \c CL_SUCCESS if all is good, an error code if \p properties is
+ * invalid
+ */
+ cl_int setProperty(cl_command_queue_properties properties,
+ cl_bool enable,
+ cl_command_queue_properties *old_properties);
+
+ /**
+ * \brief Check the properties given
+ * \return \c CL_SUCCESS if they are valid, an error code otherwise
+ */
+ cl_int checkProperties() const;
+
+ /**
+ * \brief Push events on the device
+ *
+ * This function implements a big part of what is described in
+ * \ref events .
+ *
+ * It is called by \c Coal::Event::setStatus() when an event is
+ * completed, or by \c queueEvent(). Its purpose is to explore the list
+ * of queued events (\c p_events) and to call
+ * \c Coal::DeviceInterface::pushEvent() for each event meeting its push
+ * conditions.
+ *
+ * \param ready_event is known to be pushable: push the events in the
+ * queue up to this point and skip the events after this one.
+ *
+ * \param one_event_completed_on_device can be used to differentiate
+ * whether this function is called by a worker thread when an event is
+ * completed, or by the main thread's queueEvent().
+ *
+ * \section conditions Conditions
+ *
+ * If the command queue has the \c CL_OUT_OF_ORDER_EXEC_MODE_ENABLE
+ * property disabled, an event can be pushed only if all the previous
+ * ones in the list are completed with success. This way, an event
+ * must be completed before any other can be pushed. This ensures
+ * in-order execution.
+ *
+ * If this property is enabled, more complex heuristics are used.
+ *
+ * The event list \c p_events is explored from top to bottom. At each
+ * loop iteration, checks are performed to see if the event can be pushed.
+ *
+ * - When a \c Coal::BarrierEvent is encountered, no more events can be
+ * pushed, except if the \c Coal::BarrierEvent is the first in the list,
+ * as that means there are no other events that can be pushed, so the
+ * barrier can go away
+ * - All events that are already pushed or finished are skipped
+ * - The wait list of the event is then explored to ensure that all its
+ * dependencies are met.
+ * - Finally, if the event passes all the tests, it is either pushed on
+ * the device, or simply set to \c Coal::Event::Complete if it's a
+ * dummy event (see \c Coal::Event::isInstantaneous()).
+ */
+ void pushEventsOnDevice(Event *ready_event = NULL,
+ bool one_event_completed_on_device = false);
+
+ /**
+ * \brief Push an event onto the \c p_released_events list
+ *
+ * Later the main thread will perform the release event action.
+ */
+ void releaseEvent(Event *e);
+
+ /**
+ * \brief Remove from the event list completed events
+ *
+ * This function is called periodically to clean the event list from
+ * completed events.
+ *
+ * It is needed to do that out of \c pushEventsOnDevice() as deleting
+ * an event may \c dereference() this command queue, and also delete it.
+ * It would produce crashes.
+ */
+ void cleanEvents();
+
+ /**
+ * \brief Release events on the released event list
+ *
+ * This function is called periodically to release the events on the
+ * released events list. This is only performed on the main thread
+ * because deleting/freeing memory from a worker thread has caused
+ * weird memory problems on ARM.
+ *
+ */
+ void cleanReleasedEvents();
+
+ /**
+ * \brief Flush the command queue
+ *
+ * Pushes all the events on the device, and then returns. The events
+ * don't need to be completed after this call.
+ */
+ void flush();
+
+ /**
+ * \brief Finish the command queue
+ *
+ * Pushes the events like \c flush() but also waits for them to be
+ * completed before returning.
+ */
+ void finish();
+
+ /**
+ * \brief Return all the events in the command queue
+ * \note Retains all the events
+ * \param count number of events in the event queue
+ * \param include_completed_events default to true
+ * \return events currently in the event queue
+ */
+ Event **events(unsigned int &count,
+ bool include_completed_events = true);
+
+ private:
+ DeviceInterface *p_device; /*!< Device the events are pushed on; presumably the constructor's \c device argument (TODO confirm) */
+ cl_int p_num_events_in_queue; /*!< NOTE(review): bookkeeping counter, maintained outside this header */
+ cl_int p_num_events_on_device; /*!< NOTE(review): bookkeeping counter, maintained outside this header */
+ cl_int p_num_events_completed; /*!< NOTE(review): bookkeeping counter, maintained outside this header */
+ cl_command_queue_properties p_properties; /*!< Queue property bits; presumably what setProperty()/checkProperties() operate on (TODO confirm) */
+
+ std::list<Event *> p_events; /*!< Queued events explored by pushEventsOnDevice() */
+ std::list<Event *> p_released_events; /*!< Events handed to releaseEvent(), released later by cleanReleasedEvents() */
+ pthread_mutex_t p_event_list_mutex; /*!< presumably guards the event lists (TODO confirm in commandqueue.cpp) */
+ pthread_cond_t p_event_list_cond; /*!< presumably signalled on event list changes (TODO confirm) */
+ bool p_flushed; /*!< NOTE(review): looks related to flush(); confirm in commandqueue.cpp */
+};
+
+/**
+ * \brief Base class for all events
+ *
+ * This class contains logic common to all the events.
+ *
+ * Beside handling OpenCL-specific stuff, \c Coal::Event objects do nothing
+ * implementation-wise. They do not compile kernels, copy data around, etc.
+ * They only contain static and immutable data that is then used by the devices
+ * to actually implement the event.
+ */
+class Event : public Object
+{
+    public:
+        /**
+         * \brief Event type
+         *
+         * This allows objects using \c Coal::Event to know which event it is,
+         * and to cast it to the correct sub-class.
+         */
+        enum Type
+        {
+            NDRangeKernel = CL_COMMAND_NDRANGE_KERNEL,
+            TaskKernel = CL_COMMAND_TASK,
+            NativeKernel = CL_COMMAND_NATIVE_KERNEL,
+            ReadBuffer = CL_COMMAND_READ_BUFFER,
+            WriteBuffer = CL_COMMAND_WRITE_BUFFER,
+            CopyBuffer = CL_COMMAND_COPY_BUFFER,
+            ReadImage = CL_COMMAND_READ_IMAGE,
+            WriteImage = CL_COMMAND_WRITE_IMAGE,
+            CopyImage = CL_COMMAND_COPY_IMAGE,
+            CopyImageToBuffer = CL_COMMAND_COPY_IMAGE_TO_BUFFER,
+            CopyBufferToImage = CL_COMMAND_COPY_BUFFER_TO_IMAGE,
+            MapBuffer = CL_COMMAND_MAP_BUFFER,
+            MapImage = CL_COMMAND_MAP_IMAGE,
+            UnmapMemObject = CL_COMMAND_UNMAP_MEM_OBJECT,
+            Marker = CL_COMMAND_MARKER,
+            AcquireGLObjects = CL_COMMAND_ACQUIRE_GL_OBJECTS,
+            ReleaseGLObjects = CL_COMMAND_RELEASE_GL_OBJECTS,
+            ReadBufferRect = CL_COMMAND_READ_BUFFER_RECT,
+            WriteBufferRect = CL_COMMAND_WRITE_BUFFER_RECT,
+            CopyBufferRect = CL_COMMAND_COPY_BUFFER_RECT,
+            User = CL_COMMAND_USER,
+            Barrier,
+            WaitForEvents
+        };
+
+        /**
+         * \brief Event status
+         */
+        enum Status
+        {
+            Queued = CL_QUEUED,       /*!< \brief Simply queued in a command queue */
+            Submitted = CL_SUBMITTED, /*!< \brief Submitted to a device */
+            Running = CL_RUNNING,     /*!< \brief Running on the device */
+            Complete = CL_COMPLETE    /*!< \brief Completed */
+        };
+
+        /**
+         * \brief Function that can be called when an event changes status
+         */
+        typedef void (CL_CALLBACK *event_callback)(cl_event, cl_int, void *);
+
+        /**
+         * Structure used internally by \c Coal::Event to store for each event
+         * status the callbacks to call with the corresponding \c user_data.
+         */
+        struct CallbackData
+        {
+            event_callback callback; /*!< Function to call */
+            void *user_data;         /*!< Pointer to pass as its third argument */
+        };
+
+        /**
+         * \brief Timing counters of an event
+         */
+        enum Timing
+        {
+            Queue,  /*!< Time when the event was queued */
+            Submit, /*!< Time when the event was submitted to the device */
+            Start,  /*!< Time when its execution began on the device */
+            End,    /*!< Time when its execution finished */
+            Max     /*!< Number of items in this enum */
+        };
+
+    public:
+        /**
+         * \brief Constructor
+         * \param parent parent \c Coal::CommandQueue
+         * \param status \c Status the event has when it is created
+         * \param num_events_in_wait_list number of events to wait on
+         * \param event_wait_list list of events to wait on
+         * \param errcode_ret return value
+         */
+        Event(CommandQueue *parent,
+              Status status,
+              cl_uint num_events_in_wait_list,
+              const Event **event_wait_list,
+              cl_int *errcode_ret);
+
+        void freeDeviceData(); /*!< \brief Call \c Coal::DeviceInterface::freeEventDeviceData() */
+        virtual ~Event();      /*!< \brief Destructor */
+
+        /**
+         * \brief Type of the event
+         * \return type of the event
+         */
+        virtual Type type() const = 0;
+
+        /**
+         * \brief Dummy event
+         *
+         * A dummy event is an event that doesn't have to be pushed on a device,
+         * it is only a hint for \c Coal::CommandQueue
+         *
+         * \return true if the event is dummy
+         */
+        bool isInstantaneous() const;
+
+        /**
+         * \brief Set the event status
+         *
+         * This function calls the event callbacks, and
+         * \c Coal::CommandQueue::pushEventsOnDevice() if \p status is
+         * \c Complete .
+         *
+         * \param status new status of the event
+         */
+        void setStatus(Status status);
+
+        /**
+         * \brief Increase Event reference count
+         *
+         * This function uses a mutex to protect the reference count
+         * update in the underlying object.
+         */
+        void reference();
+
+        /**
+         * \brief Decrease Event reference count
+         *
+         * This function uses a mutex to protect the reference count
+         * update in the underlying object.
+         *
+         * \return true if the reference count is decreased to 0
+         */
+        bool dereference();
+
+        /**
+         * \brief Set device-specific data
+         * \param data device-specific data
+         */
+        void setDeviceData(void *data);
+
+        /**
+         * \brief Update timing info
+         *
+         * This function reads current system time and puts it in \c p_timing
+         *
+         * \param timing timing event having just finished
+         */
+        void updateTiming(Timing timing);
+
+        /**
+         * \brief Status
+         * \return status of the event
+         */
+        Status status() const;
+
+        /**
+         * \brief Wait for a specified status
+         *
+         * This function blocks until the event's status is set to \p status
+         * by another thread.
+         *
+         * \param status the status the event must have for the function to return
+         */
+        void waitForStatus(Status status);
+
+        /**
+         * \brief Device-specific data
+         * \return data set using \c setDeviceData()
+         */
+        void *deviceData();
+
+        /**
+         * \brief Add a callback for this event
+         * \param command_exec_callback_type status the event must have in order
+         *        to have the callback called
+         * \param callback callback function
+         * \param user_data user data given to the callback
+         */
+        void setCallback(cl_int command_exec_callback_type,
+                         event_callback callback,
+                         void *user_data);
+
+        /**
+         * \brief Info about the event
+         * \copydetails Coal::DeviceInterface::info
+         */
+        cl_int info(cl_event_info param_name,
+                    size_t param_value_size,
+                    void *param_value,
+                    size_t *param_value_size_ret) const;
+
+        /**
+         * \brief Profiling info
+         * \copydetails Coal::DeviceInterface::info
+         */
+        cl_int profilingInfo(cl_profiling_info param_name,
+                             size_t param_value_size,
+                             void *param_value,
+                             size_t *param_value_size_ret) const;
+
+        /**
+         * \brief Call \c Coal::CommandQueue::pushEventsOnDevice() for each command queue
+         * in which this event is queued or each queue with an event waiting on this event
+         */
+        void flushQueues();
+
+
+        /**
+         * \brief Add event to p_dependent_events, which will be notified when
+         * current event completes. If current event is already complete,
+         * no need to add and return false.
+         * \param event the event to be notified
+         */
+        bool addDependentEvent(Event *event);
+
+        /**
+         * \brief Remove event from p_wait_events, which should be waited on
+         * before current event can start. When p_wait_events becomes empty,
+         * return true to indicate that current event is ready to be pushed.
+         * \param event the event to be removed from p_wait_events
+         */
+        bool removeWaitEvent(Event *event);
+
+        /**
+         * \brief Check if there are no more events to wait on before current
+         * event can start.
+         */
+        bool waitEventsAllCompleted();
+
+    private:
+        /**
+         * \brief Helper function for setStatus()
+         * return number of dependent events
+         */
+        int setStatusHelper(Status status);
+
+    private:
+        pthread_cond_t p_state_change_cond; /*!< Waited on by waitForStatus() */
+        pthread_mutex_t p_state_mutex;      /*!< Guards the state members below */
+
+        Status p_status;
+        void *p_device_data;
+        std::multimap<Status, CallbackData> p_callbacks;
+
+        // Timestamps in microseconds, indexed by Timing; 0 means "not
+        // recorded yet". Kept 64 bits wide: updateTiming() computes a
+        // cl_ulong value, and a 32-bit slot would silently truncate it and
+        // wrap after roughly 71 minutes of clock time.
+        cl_ulong p_timing[Max];
+
+        // p_wait_events: I should wait after these events complete
+        // p_dependent_events: when I complete, I should notify these events
+        std::list<const Event *> p_wait_events;
+        std::vector<Event *> p_dependent_events;
+};
+
+}
+
+struct _cl_command_queue : public Coal::CommandQueue // struct tag named by the cl_command_queue handle typedef in CL/cl.h; adds no members of its own
+{};
+
+struct _cl_event : public Coal::Event // struct tag named by the cl_event handle typedef in CL/cl.h; adds no members of its own
+{};
+
+#endif
diff --git a/src/core/compiler.cpp b/src/core/compiler.cpp
new file mode 100644
index 0000000..d4d5240
--- /dev/null
+++ b/src/core/compiler.cpp
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file compiler.cpp
+ * \brief Compiler wrapper around Clang
+ */
+
+#include "compiler.h"
+#include "deviceinterface.h"
+
+#include <cstring>
+#include <cstdio>
+#include <string>
+#include <sstream>
+#include <iostream>
+#include <clang/Frontend/CompilerInvocation.h>
+#include <clang/Frontend/TextDiagnosticPrinter.h>
+#include <clang/Frontend/LangStandard.h>
+#include <clang/Basic/Diagnostic.h>
+#include <clang/CodeGen/CodeGenAction.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Support/Host.h>
+#include <llvm/Support/MemoryBuffer.h> // ASW
+#include <llvm/IR/Module.h>
+#include <llvm/IR/LLVMContext.h>
+#include <sys/stat.h>
+
+std::string get_ocl_dsp();
+
+using namespace Coal;
+
+/**
+ * Build a compiler wrapper bound to \p device.
+ *
+ * No module and no diagnostic printer exist yet, optimisation is on by
+ * default, and the log stream is attached to the p_log string.
+ */
+Compiler::Compiler(DeviceInterface *device)
+    : p_device(device),
+      p_module(0),
+      p_optimize(true),
+      p_log_stream(p_log),
+      p_log_printer(0)
+{
+}
+
+Compiler::~Compiler()
+{
+    // Empty: no member is released explicitly here.
+}
+
+/**
+ * Compile the OpenCL C program held in \p source into an LLVM module.
+ *
+ * The function configures the Clang instance (code generation, diagnostics,
+ * front end, header search, preprocessor, language and target options),
+ * applies the user-supplied build \p options, then runs an "emit LLVM only"
+ * action. On success the produced module is available through module() and
+ * the diagnostics through log().
+ *
+ * \param options space-separated build options; -I, -D, the -cl-* math and
+ *        optimisation switches handled below, -w, -Werror and -cl-std=CL1.1
+ *        are recognised
+ * \param source buffer holding the program text; its contents are exposed to
+ *        Clang as "program.cl"
+ * \return 0 (false) on success; \c CL_INVALID_BUILD_OPTIONS if an option is
+ *         unknown or if -I / -D lacks its argument; non-zero (true) if the
+ *         diagnostics engine cannot be created or if compilation fails
+ */
+int Compiler::compile(const std::string &options,
+                      llvm::MemoryBuffer *source)
+{
+    /* Set options */
+    p_options = options;
+
+    clang::CodeGenOptions &codegen_opts = p_compiler.getCodeGenOpts();
+    clang::DiagnosticOptions &diag_opts = p_compiler.getDiagnosticOpts();
+    clang::FrontendOptions &frontend_opts = p_compiler.getFrontendOpts();
+    clang::HeaderSearchOptions &header_opts = p_compiler.getHeaderSearchOpts();
+    clang::LangOptions &lang_opts = p_compiler.getLangOpts();
+    clang::TargetOptions &target_opts = p_compiler.getTargetOpts();
+    clang::PreprocessorOptions &prep_opts = p_compiler.getPreprocessorOpts();
+    clang::CompilerInvocation &invocation = p_compiler.getInvocation();
+
+    // Set codegen options
+    codegen_opts.setDebugInfo(clang::CodeGenOptions::NoDebugInfo);
+    codegen_opts.AsmVerbose = true;
+    codegen_opts.CodeModel = "default";
+
+    // level 3 is too much for the pocl transformations.
+    codegen_opts.OptimizationLevel = 2;
+
+    // Set diagnostic options
+    diag_opts.Pedantic = true;
+    diag_opts.ShowColumn = true;
+    diag_opts.ShowLocation = true;
+    diag_opts.ShowCarets = false;
+    diag_opts.ShowFixits = true;
+    diag_opts.ShowColors = false;
+    diag_opts.ErrorLimit = 19;
+    diag_opts.MessageLength = 0;
+
+    // Set frontend options
+    frontend_opts.ProgramAction = clang::frontend::EmitLLVMOnly;
+    frontend_opts.DisableFree = true;
+
+    // Set header search options
+    header_opts.Verbose = false;
+    header_opts.UseBuiltinIncludes = false;
+    header_opts.UseStandardSystemIncludes = false;
+    header_opts.UseStandardCXXIncludes = false;
+
+    // Set preprocessor options
+    prep_opts.RetainRemappedFileBuffers = true;
+    //prep_opts.ImplicitPCHInclude = "/usr/share/ti/opencl/clc.h";
+    prep_opts.Includes.push_back("clc.h");
+    prep_opts.Includes.push_back(p_device->builtinsHeader());
+
+    // Set lang options
+    lang_opts.NoBuiltin = true;
+    lang_opts.OpenCL = true;
+    lang_opts.CPlusPlus = false;
+
+    // Set target options. devtype is zero-initialised so that a failing
+    // query takes the non-CPU branch deterministically instead of reading an
+    // indeterminate value.
+    cl_device_type devtype = 0;
+    p_device->info(CL_DEVICE_TYPE, sizeof(devtype), &devtype, 0);
+
+    if (devtype == CL_DEVICE_TYPE_CPU)
+    {
+        // Originally: target_opts.Triple = llvm::sys::getHostTriple();
+        target_opts.Triple = llvm::sys::getDefaultTargetTriple();
+    }
+    else // devtype != CL_DEVICE_TYPE_CPU
+    {
+        // For 6X, use the 'spir' target, since it implements opencl specs
+        target_opts.Triple = "spir-unknown-unknown-unknown";
+
+        // Currently, llp6x does not handle fused multiply and add
+        // llvm intrinsics (llvm.fmuladd.*). Disable generating these
+        // intrinsics using clang -ffp-contract=off option
+        codegen_opts.setFPContractMode(clang::CodeGenOptions::FPC_Off);
+    }
+
+    // Parse the user options
+    std::istringstream options_stream(options);
+    std::string token;
+    bool Werror = false, inI = false, inD = false;
+
+#ifndef SHAMROCK_BUILD
+    // Add opencl-headers' package default install include path as location to search
+    std::string header_path(get_ocl_dsp());
+#else // TODO: /usr/include/CL is where opencl headers go, but use ENV vars?
+    std::string header_path("/usr/include/CL");
+#endif
+    header_opts.AddPath(header_path, clang::frontend::Angled, false, false);
+
+    while (options_stream >> token)
+    {
+        if (inI)
+        {
+            // token is an include path
+            header_opts.AddPath(token, clang::frontend::Angled, false, false);
+            inI = false;
+            continue;
+        }
+        else if (inD)
+        {
+            // token is name or name=value
+            prep_opts.addMacroDef(token);
+            inD = false;
+            continue;
+        }
+
+        //Handle -I xxx or -Ixxx. Assuming no other -I option prefix
+        if (token == "-I")
+        {
+            inI = true;
+        }
+        else if (token.compare(0,2,"-I") == 0)
+        {
+            header_opts.AddPath(token.substr(2), clang::frontend::Angled, false,
+                                false);
+        }
+        //Handle -D xxx or -Dxxx. Assuming no other -D option prefix
+        else if (token == "-D")
+        {
+            inD = true;
+        }
+        else if (token.compare(0,2,"-D") == 0) //Handle -Dxxx (no space between)
+        {
+            prep_opts.addMacroDef(token.substr(2));
+        }
+        else if (token == "-cl-single-precision-constant")
+        {
+            lang_opts.SinglePrecisionConstants = true;
+        }
+        else if (token == "-cl-opt-disable")
+        {
+            p_optimize = false;
+            codegen_opts.OptimizationLevel = 0;
+        }
+        else if (token == "-cl-mad-enable")
+        {
+            codegen_opts.LessPreciseFPMAD = true;
+        }
+        else if (token == "-cl-unsafe-math-optimizations")
+        {
+            codegen_opts.UnsafeFPMath = true;
+        }
+        else if (token == "-cl-finite-math-only")
+        {
+            codegen_opts.NoInfsFPMath = true;
+            codegen_opts.NoNaNsFPMath = true;
+        }
+        else if (token == "-cl-fast-relaxed-math")
+        {
+            codegen_opts.UnsafeFPMath = true;
+            codegen_opts.NoInfsFPMath = true;
+            codegen_opts.NoNaNsFPMath = true;
+            lang_opts.FastRelaxedMath = true;
+        }
+        else if (token == "-w")
+        {
+            diag_opts.IgnoreWarnings = true;
+        }
+        else if (token == "-Werror")
+        {
+            Werror = true;
+        }
+        else if (token == "-cl-std=CL1.1")
+        {
+        }
+        else
+        {
+            return CL_INVALID_BUILD_OPTIONS;
+        }
+    }
+
+    // A trailing "-I" or "-D" without its argument is a malformed option
+    // string; report it instead of silently dropping it.
+    if (inI || inD)
+        return CL_INVALID_BUILD_OPTIONS;
+
+    add_macrodefs_for_supported_opencl_extensions(prep_opts);
+
+    // Set invocation options
+    //invocation.setLangDefaults(lang_opts,clang::IK_OpenCL);
+    invocation.setLangDefaults(lang_opts,clang::IK_OpenCL, clang::LangStandard::lang_opencl12);
+
+    // Create the diagnostics engine
+    p_log_printer = new clang::TextDiagnosticPrinter(p_log_stream, &diag_opts);
+    p_compiler.createDiagnostics(p_log_printer);
+
+    // This function returns 0 on success, so a missing diagnostics engine
+    // must be reported with a non-zero value (it used to return false, which
+    // callers read as "compiled fine" while no module had been produced).
+    if (!p_compiler.hasDiagnostics())
+        return true;
+
+    p_compiler.getDiagnostics().setWarningsAsErrors(Werror);
+
+    // Feed the compiler with source
+    frontend_opts.Inputs.push_back(clang::FrontendInputFile("program.cl", clang::IK_OpenCL));
+
+    //ASW TODO cleanup: the source is wrapped in a fresh MemoryBuffer rather
+    //than remapped directly (prep_opts.addRemappedFile("program.cl", source))
+    const llvm::StringRef s_data(source->getBuffer());
+    const llvm::StringRef s_name("<source>");
+    llvm::MemoryBuffer *buffer =
+        llvm::MemoryBuffer::getMemBuffer(s_data, s_name);
+
+    prep_opts.addRemappedFile("program.cl", buffer);
+
+    // Compile. The action lives on the stack so that it is destroyed on
+    // every exit path (it used to be heap-allocated and never freed); the
+    // module is taken out of it before it goes out of scope.
+    clang::EmitLLVMOnlyAction action(&llvm::getGlobalContext());
+    if (!p_compiler.ExecuteAction(action))
+    {
+        // Make sure everything written to the stream has reached p_log
+        // before the log is read.
+        p_log_stream.flush();
+
+        // DEBUG
+        std::cout << log() << std::endl;
+        return true;
+    }
+
+    p_log_stream.flush();
+    p_module = action.takeModule();
+
+    // uncomment to debug the llvm IR
+    // p_module->dump();
+
+    return false;
+}
+
+// Query the device to get list of supported OpenCL extensions. Standard
+// requires that each supported extension has a macro definition with the
+// same name as the extension
+/**
+ * Define one preprocessor macro per extension name reported by the device.
+ *
+ * The device is queried twice: once for the size of its
+ * \c CL_DEVICE_EXTENSIONS string and once for the string itself. The string
+ * is then split on whitespace and every name is added to \p prep_opts.
+ *
+ * The receiving buffer is zero-filled and one byte larger than the reported
+ * size, so it is always NUL-terminated even if the device writes nothing or
+ * omits the terminator (the previous CHAR_MIN fill is non-zero wherever char
+ * is signed). Nothing is defined when the reported size is zero.
+ *
+ * \param prep_opts preprocessor options receiving the macro definitions
+ */
+void Compiler::add_macrodefs_for_supported_opencl_extensions
+     (clang::PreprocessorOptions &prep_opts)
+{
+    // Get the size of the extensions string for the device
+    size_t size = 0;
+    p_device->info(CL_DEVICE_EXTENSIONS, 0, NULL, &size);
+
+    if (size == 0)
+        return;
+
+    // std::string owns the storage, so it is released on every exit path
+    std::string extensions(size + 1, '\0');
+    p_device->info(CL_DEVICE_EXTENSIONS, sizeof(char)*size, &extensions[0],
+                   NULL);
+
+    // Create macro definitions from the extension names; c_str() stops at
+    // the first NUL, which discards the zero padding.
+    std::istringstream extensions_stream(extensions.c_str());
+    std::string token;
+
+    while (extensions_stream >> token)
+        prep_opts.addMacroDef(token);
+}
+
+/**
+ * \return the log string: the text the diagnostics stream writes into, plus
+ *         anything added with appendLog()
+ */
+const std::string &Compiler::log() const
+{
+    return this->p_log;
+}
+
+/**
+ * \return the option string stored by the last call to compile()
+ */
+const std::string &Compiler::options() const
+{
+    return this->p_options;
+}
+
+/**
+ * \return false once compile() has seen "-cl-opt-disable", true otherwise
+ */
+bool Compiler::optimize() const
+{
+    return this->p_optimize;
+}
+
+/**
+ * \return the module produced by a successful compile(), or 0 if there is
+ *         none yet
+ */
+llvm::Module *Compiler::module() const
+{
+    return this->p_module;
+}
+
+/**
+ * Add \p log at the end of the text returned by log().
+ */
+void Compiler::appendLog(const std::string &log)
+{
+    p_log.append(log);
+}
diff --git a/src/core/compiler.h b/src/core/compiler.h
new file mode 100644
index 0000000..58788e6
--- /dev/null
+++ b/src/core/compiler.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file compiler.h
+ * \brief Compiler wrapped around Clang
+ */
+
+#ifndef __COMPILER_H__
+#define __COMPILER_H__
+
+#include <string>
+
+#include <clang/Frontend/CompilerInstance.h>
+#include <llvm/Support/raw_ostream.h>
+
+namespace llvm
+{
+ class MemoryBuffer;
+ class Module;
+}
+
+namespace clang
+{
+ class TextDiagnosticPrinter;
+}
+
+namespace Coal
+{
+
+class DeviceInterface;
+
+/**
+ * \brief Compiler using Clang
+ *
+ * This class builds a Clang instance, runs it and then retains compilation logs
+ * and produced data.
+ */
+class Compiler
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param device \c Coal::DeviceInterface for which code will be compiled
+ */
+ Compiler(DeviceInterface *device);
+ ~Compiler();
+
+ /**
+ * \brief Compile \p source to produce a LLVM module
+ * \param options options given to the compiler, described in the OpenCL spec
+ * \param source source to be compiled
+ * \return 0 (false) if the compilation is successful, non-zero otherwise:
+ * the implementation returns true when the Clang action fails.
+ * The original documentation also lists 2 for illegal options.
+ * \note The return value is an error indicator, not a success flag.
+ * \sa module()
+ * \sa log()
+ */
+ int compile(const std::string &options, llvm::MemoryBuffer *source);
+
+ /**
+ * \brief Compilation log
+ * \note \c appendLog() can also be used to append custom info at the end
+ * of the log, for instance to keep compilation and linking logs
+ * in the same place
+ * \return log
+ */
+ const std::string &log() const;
+
+ /**
+ * \brief Options given at \c compile()
+ * \return options used during compilation
+ */
+ const std::string &options() const;
+
+ /**
+ * \brief Optimization enabled
+ * \return value of the internal optimization flag. The original
+ * documentation says "true if -cl-opt-disable was given in the
+ * options, false otherwise".
+ * NOTE(review): that wording looks inverted for a flag named
+ * "optimize" - confirm against the option parsing in compile().
+ */
+ bool optimize() const;
+
+ /**
+ * \brief LLVM module generated
+ * \return LLVM module generated by the compilation, 0 if an error occurred
+ */
+ llvm::Module *module() const;
+
+ /**
+ * \brief Append a string to the log
+ *
+ * This function can be used to append linking or code-gen logs to the
+ * internal compilation log kept by this class
+ *
+ * \param log log to be appended
+ */
+ void appendLog(const std::string &log);
+
+ private:
+ DeviceInterface *p_device; /*!< \brief Device queried for its extension list */
+ clang::CompilerInstance p_compiler; /*!< \brief Clang instance executing the front-end action */
+ llvm::Module *p_module; /*!< \brief Module taken from the front-end action after a successful compile */
+ bool p_optimize; /*!< \brief Flag returned by \c optimize() */
+
+ std::string p_log, p_options; /*!< \brief Strings returned by \c log() and \c options() */
+ llvm::raw_string_ostream p_log_stream; /*!< \brief Stream flushed at the end of \c compile() */
+ clang::TextDiagnosticPrinter *p_log_printer;
+
+ /**
+ * \brief Define one macro per extension name reported by the device
+ * \param prep_opts preprocessor options receiving the macro definitions
+ */
+ void add_macrodefs_for_supported_opencl_extensions
+ (clang::PreprocessorOptions &prep_opts);
+
+};
+
+}
+
+#endif
diff --git a/src/core/config.h b/src/core/config.h
new file mode 100644
index 0000000..e1e401b
--- /dev/null
+++ b/src/core/config.h
@@ -0,0 +1,9 @@
+#ifndef __CONFIG_H__
+#define __CONFIG_H__
+
+#define LLVM_VERSION "3.5.0svn"
+#define COAL_VERSION ""
+
+#define MAX_WORK_DIMS 3
+
+#endif
diff --git a/src/core/config.h.cmake b/src/core/config.h.cmake
new file mode 100644
index 0000000..ccf87b7
--- /dev/null
+++ b/src/core/config.h.cmake
@@ -0,0 +1,9 @@
+#ifndef __CONFIG_H__
+#define __CONFIG_H__
+
+#define LLVM_VERSION "@LLVM_VERSION@"
+#define COAL_VERSION "@Coal_VERSION@"
+
+#define MAX_WORK_DIMS 3
+
+#endif
diff --git a/src/core/context.cpp b/src/core/context.cpp
new file mode 100644
index 0000000..e9129ff
--- /dev/null
+++ b/src/core/context.cpp
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file context.cpp
+ * \brief Context
+ */
+
+#include "context.h"
+#include "deviceinterface.h"
+#include "propertylist.h"
+#include "platform.h"
+
+#include <cstring>
+#include <cstdlib>
+
+#include <llvm/Support/TargetSelect.h>
+
+using namespace Coal;
+
+/* No-op notification callback: ignores all four arguments. The Context
+ * constructor installs it when the caller supplies no callback. */
+static void default_pfn_notify(const char *, const void *, size_t, void *)
+{
+}
+
+/*
+ * Builds a context from a zero-terminated property list and a device list.
+ *
+ * properties  : optional list of (name, value) pairs ended by a 0 name; only
+ *               CL_CONTEXT_PLATFORM is accepted, and only once.
+ * num_devices : number of entries in devices.
+ * devices     : devices to attach; every entry must be non-null and report
+ *               CL_DEVICE_AVAILABLE.
+ * pfn_notify  : optional callback, replaced by a no-op when null.
+ * user_data   : stored for the callback.
+ * errcode_ret : receives the error code of every failure path. It is
+ *               dereferenced unconditionally, so it must not be null.
+ *
+ * On failure the constructor returns early; the object is then only
+ * partially initialised, but every member stays in a state the destructor
+ * and the accessors can handle.
+ */
+Context::Context(const cl_context_properties *properties,
+                 cl_uint num_devices,
+                 const cl_device_id *devices,
+                 void (CL_CALLBACK *pfn_notify)(const char *, const void *,
+                                                size_t, void *),
+                 void *user_data,
+                 cl_int *errcode_ret)
+: Object(Object::T_Context, 0), p_properties(0), p_pfn_notify(pfn_notify),
+  p_user_data(user_data), p_devices(0), p_num_devices(0), p_props_len(0),
+  p_platform(&the_platform)
+{
+    if (!p_pfn_notify)
+        p_pfn_notify = &default_pfn_notify;
+
+    // Initialize LLVM, this can be done more than one time per program
+    llvm::InitializeNativeTarget();
+    llvm::InitializeNativeTargetAsmPrinter();
+    llvm::InitializeNativeTargetAsmParser();
+
+    // Explore the properties
+    if (properties)
+    {
+        const unsigned char *props = (const unsigned char *)properties;
+        cl_context_properties prop;
+        size_t props_len = 0;
+
+// Reads one value of the given type at the cursor, then advances the cursor
+// and the running byte count of the property list.
+#define GET_PROP(type, var) \
+    var = *(const type *)props; \
+    props += sizeof(type); \
+    props_len += sizeof(type);
+
+        int propset = 0;
+        while (true)
+        {
+            GET_PROP(cl_context_properties, prop)
+
+            // A zero property name terminates the list
+            if (!prop)
+                break;
+
+            switch (prop)
+            {
+                case CL_CONTEXT_PLATFORM:
+                    if (!propset)
+                    {
+                        GET_PROP(cl_platform_id, p_platform);
+                        propset = 1;
+                    }
+                    else
+                    {
+                        // The same property name must not appear twice
+                        *errcode_ret = CL_INVALID_PROPERTY;
+                        return;
+                    }
+                    break;
+
+                default:
+                    *errcode_ret = CL_INVALID_PROPERTY;
+                    return;
+            }
+        }
+
+// Keep the helper macro local to this constructor
+#undef GET_PROP
+
+        // properties may be allocated on the stack of the client application
+        // copy it into a real buffer
+        p_properties = (cl_context_properties *)std::malloc(props_len);
+        p_props_len = props_len;
+
+        if (!p_properties)
+        {
+            *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+            return;
+        }
+
+        std::memcpy((void *)p_properties, (const void *)properties, props_len);
+    }
+
+    // Verify that the platform is good
+    if (p_platform != &the_platform)
+    {
+        *errcode_ret = CL_INVALID_PLATFORM;
+        return;
+    }
+
+    // Explore the devices. calloc() checks the size multiplication and zeroes
+    // the array, so the slots not reached before an early error return below
+    // hold null pointers instead of garbage that hasDevice()/info() would read.
+    p_devices = (DeviceInterface **)std::calloc(num_devices, sizeof(DeviceInterface *));
+    p_num_devices = num_devices;
+
+    if (!p_devices)
+    {
+        *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+        return;
+    }
+
+    for (cl_uint i=0; i<num_devices; ++i)
+    {
+        cl_device_id device = devices[i];
+
+        if (device == 0)
+        {
+            *errcode_ret = CL_INVALID_DEVICE;
+            return;
+        }
+
+        // Verify that the device is available
+        cl_bool device_available;
+
+        *errcode_ret = device->info(CL_DEVICE_AVAILABLE,
+                                    sizeof(device_available),
+                                    &device_available,
+                                    0);
+
+        if (*errcode_ret != CL_SUCCESS)
+            return;
+
+        if (!device_available)
+        {
+            *errcode_ret = CL_DEVICE_NOT_AVAILABLE;
+            return;
+        }
+
+        // Add the device to the list
+        p_devices[i] = (DeviceInterface *)device;
+    }
+}
+
+/* Releases the copied property list and the device array. free() accepts
+ * null pointers, so members that were never allocated need no guard. */
+Context::~Context()
+{
+    std::free((void *)p_properties);
+    std::free((void *)p_devices);
+}
+
+/*
+ * Copies the value of one context attribute into a caller-supplied buffer.
+ *
+ * param_name           : attribute to query (reference count, number of
+ *                        devices, device list or property list).
+ * param_value_size     : size in bytes of the buffer behind param_value.
+ * param_value          : destination buffer, may be null to query only the size.
+ * param_value_size_ret : if non-null, receives the size in bytes of the value.
+ *
+ * Returns CL_SUCCESS, or CL_INVALID_VALUE for an unknown param_name or a
+ * destination buffer that is too small.
+ */
+cl_int Context::info(cl_context_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const
+{
+ // Source pointer and byte length of the answer. The SIMPLE_ASSIGN and
+ // MEM_ASSIGN macros are defined outside this file; presumably they fill in
+ // these two locals (and the union member below) by name - verify against
+ // their definition before renaming anything here.
+ void *value = 0;
+ size_t value_length = 0;
+
+ union {
+ cl_uint cl_uint_var;
+ };
+
+ switch (param_name)
+ {
+ case CL_CONTEXT_REFERENCE_COUNT:
+ SIMPLE_ASSIGN(cl_uint, references());
+ break;
+
+ case CL_CONTEXT_NUM_DEVICES:
+ SIMPLE_ASSIGN(cl_uint, p_num_devices);
+ break;
+
+ case CL_CONTEXT_DEVICES:
+ MEM_ASSIGN(p_num_devices * sizeof(DeviceInterface *), p_devices);
+ break;
+
+ case CL_CONTEXT_PROPERTIES:
+ MEM_ASSIGN(p_props_len, p_properties);
+ break;
+
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ // The size check only applies when the caller actually wants the data
+ if (param_value && param_value_size < value_length)
+ return CL_INVALID_VALUE;
+
+ if (param_value_size_ret)
+ *param_value_size_ret = value_length;
+
+ // Skip the copy for empty values so memcpy never sees a null source
+ if (param_value && value_length /* CONTEXT_PROPERTIES can be of length 0 */)
+ std::memcpy(param_value, value, value_length);
+
+ return CL_SUCCESS;
+}
+
+/* Tells whether the given device pointer is stored in this context's device
+ * array. The comparison is by pointer identity. */
+bool Context::hasDevice(DeviceInterface *device) const
+{
+    unsigned int remaining = p_num_devices;
+
+    // Scan order does not matter for a membership test
+    while (remaining > 0)
+    {
+        --remaining;
+
+        if (p_devices[remaining] == device)
+            return true;
+    }
+
+    return false;
+}
diff --git a/src/core/context.h b/src/core/context.h
new file mode 100644
index 0000000..4712d25
--- /dev/null
+++ b/src/core/context.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file context.h
+ * \brief OpenCL context
+ */
+
+#ifndef __CONTEXT_H__
+#define __CONTEXT_H__
+
+#include "object.h"
+
+#include <CL/cl.h>
+
+namespace Coal
+{
+
+class DeviceInterface;
+
+/**
+ * \brief OpenCL context
+ *
+ * This class is the root of all OpenCL objects, except \c Coal::DeviceInterface.
+ */
+class Context : public Object
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param properties properties of the context, a zero-terminated list;
+ * the implementation only accepts \c CL_CONTEXT_PLATFORM
+ * \param num_devices number of devices that will be used
+ * \param devices \c Coal::DeviceInterface to be used
+ * \param pfn_notify function to call when an error arises, to give
+ * more detail; a no-op is installed when it is null
+ * \param user_data user data to pass to \p pfn_notify
+ * \param errcode_ret return code; written on every failure path of the
+ * constructor and dereferenced without a null check
+ */
+ Context(const cl_context_properties *properties,
+ cl_uint num_devices,
+ const cl_device_id *devices,
+ void (CL_CALLBACK *pfn_notify)(const char *, const void *,
+ size_t, void *),
+ void *user_data,
+ cl_int *errcode_ret);
+ ~Context();
+
+ /**
+ * \brief Info about the context
+ * \copydetails Coal::DeviceInterface::info
+ */
+ cl_int info(cl_context_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ /**
+ * \brief Check that this context contains a given \p device
+ * \param device device to check
+ * \return whether this context contains \p device
+ */
+ bool hasDevice(DeviceInterface *device) const;
+
+ private:
+ cl_context_properties *p_properties; /*!< \brief Heap copy of the property list, freed by the destructor */
+ void (CL_CALLBACK *p_pfn_notify)(const char *, const void *,
+ size_t, void *); /*!< \brief Notification callback, never null after construction */
+ void *p_user_data; /*!< \brief User data given at construction */
+
+ DeviceInterface **p_devices; /*!< \brief Heap array of \c p_num_devices device pointers */
+ unsigned int p_num_devices, p_props_len; /*!< \brief Device count, and size in bytes of \c p_properties */
+ cl_platform_id p_platform; /*!< \brief Platform taken from the properties, or the default platform */
+};
+
+}
+
+/* Gives the _cl_context tag a definition that derives from Coal::Context
+ * and adds no members of its own. */
+struct _cl_context : public Coal::Context
+{};
+
+#endif
diff --git a/src/core/cpu/buffer.cpp b/src/core/cpu/buffer.cpp
new file mode 100644
index 0000000..9125872
--- /dev/null
+++ b/src/core/cpu/buffer.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/buffer.cpp
+ * \brief CPU buffer
+ */
+
+#include "buffer.h"
+#include "device.h"
+
+#include "../memobject.h"
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+using namespace Coal;
+
+/*
+ * Binds a CPU-side buffer to the given memory object.
+ *
+ * Storage is only resolved here when it already exists: a sub-buffer points
+ * into its parent's storage, and a CL_MEM_USE_HOST_PTR buffer adopts the
+ * host pointer. In every other case p_data stays null.
+ *
+ * NOTE: this constructor never writes to rs; it could reject Image buffers
+ * by storing a value != CL_SUCCESS there.
+ */
+CPUBuffer::CPUBuffer(CPUDevice *device, MemObject *buffer, cl_int *rs)
+: DeviceBuffer(), p_device(device), p_buffer(buffer), p_data(0),
+  p_data_malloced(false)
+{
+    if (buffer->type() != MemObject::SubBuffer)
+    {
+        // Plain buffer: reuse the application's memory when asked to
+        if (buffer->flags() & CL_MEM_USE_HOST_PTR)
+            p_data = buffer->host_ptr();
+
+        return;
+    }
+
+    // Sub-buffer: locate the CPUBuffer of the parent on the same device
+    SubBuffer *sub = (SubBuffer *)buffer;
+    MemObject *owner = sub->parent();
+    CPUBuffer *owner_storage = (CPUBuffer *)owner->deviceBuffer(device);
+
+    // Byte arithmetic on the parent's storage
+    char *start = (char *)owner_storage->data();
+    start += sub->offset();
+
+    p_data = (void *)start;
+}
+
+/* Frees the storage only when allocate() obtained it with malloc(). */
+CPUBuffer::~CPUBuffer()
+{
+    if (!p_data_malloced)
+        return;
+
+    std::free((void *)p_data);
+}
+
+// Returns the storage pointer p_data; it is null until the constructor or
+// allocate() has set it.
+void *CPUBuffer::data() const
+{
+ return p_data;
+}
+
+// Returns the same pointer as data().
+void *CPUBuffer::nativeGlobalPointer() const
+{
+ return data();
+}
+
+/*
+ * Makes sure the buffer has backing storage and reports it to the MemObject.
+ *
+ * Returns false when the memory object reports a size of 0 or when malloc()
+ * fails, true otherwise.
+ */
+bool CPUBuffer::allocate()
+{
+    const size_t bytes = p_buffer->size();
+
+    // A zero-sized buffer means something went wrong earlier
+    if (bytes == 0)
+        return false;
+
+    // No storage was resolved by the constructor: allocate our own
+    if (p_data == 0)
+    {
+        void *fresh = std::malloc(bytes);
+
+        if (fresh == 0)
+            return false;
+
+        p_data = fresh;
+        p_data_malloced = true;
+    }
+
+    // Initial contents are copied from the host pointer, except for sub-buffers
+    if (p_buffer->type() != MemObject::SubBuffer)
+    {
+        if (p_buffer->flags() & CL_MEM_COPY_HOST_PTR)
+            std::memcpy(p_data, p_buffer->host_ptr(), bytes);
+    }
+
+    // Say to the memobject that we are allocated
+    p_buffer->deviceAllocated(this);
+
+    return true;
+}
+
+// Returns the device given at construction.
+DeviceInterface *CPUBuffer::device() const
+{
+ return p_device;
+}
+
+// True as soon as p_data is non-null. That includes storage adopted by the
+// constructor (host pointer or parent buffer) before allocate() was called.
+bool CPUBuffer::allocated() const
+{
+ return p_data != 0;
+}
diff --git a/src/core/cpu/buffer.h b/src/core/cpu/buffer.h
new file mode 100644
index 0000000..d88c9e5
--- /dev/null
+++ b/src/core/cpu/buffer.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file buffer.h
+ * \brief CPU buffer
+ */
+
+#ifndef __CPU_BUFFER_H__
+#define __CPU_BUFFER_H__
+
+#include "../deviceinterface.h"
+
+namespace Coal
+{
+
+class CPUDevice;
+class MemObject;
+
+/**
+ * \brief CPU implementation of \c Coal::MemObject
+ *
+ * This class is responsible of the actual allocation of buffer objects, using
+ * \c malloc() or by reusing a given \c host_ptr.
+ */
+class CPUBuffer : public DeviceBuffer
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param device Device for which the buffer is allocated
+ * \param buffer \c Coal::MemObject holding information about the buffer
+ * \param rs return code (\c CL_SUCCESS if all is good)
+ * \note NOTE(review): the constructor in cpu/buffer.cpp never writes
+ * to \p rs, so callers must initialise it themselves.
+ */
+ CPUBuffer(CPUDevice *device, MemObject *buffer, cl_int *rs);
+ ~CPUBuffer();
+
+ bool allocate(); /*!< \brief Obtain storage if needed; false on a zero size or failed \c malloc() */
+ DeviceInterface *device() const; /*!< \brief Device given at construction */
+ void *data() const; /*!< \brief Pointer to the buffer's data */
+ void *nativeGlobalPointer() const; /*!< \brief Same pointer as \c data() */
+ bool allocated() const; /*!< \brief Whether the data pointer is non-null */
+
+ private:
+ CPUDevice *p_device; /*!< \brief Device given at construction */
+ MemObject *p_buffer; /*!< \brief Memory object this buffer backs */
+ void *p_data; /*!< \brief Storage: host pointer, parent storage plus offset, or \c malloc() block */
+ bool p_data_malloced; /*!< \brief True when \c p_data came from \c malloc() and must be freed */
+};
+
+}
+
+#endif
diff --git a/src/core/cpu/builtins.cpp b/src/core/cpu/builtins.cpp
new file mode 100644
index 0000000..137d34e
--- /dev/null
+++ b/src/core/cpu/builtins.cpp
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/builtins.cpp
+ * \brief Native OpenCL C built-in functions
+ *
+ * All these built-ins are directly called by kernels. When the LLVM JIT
+ * sees a function name it doesn't know, it calls \c getBuiltin() with this
+ * name as parameter. This function then returns the address of an actual
+ * function implementation, that finally gets called by the kernel when
+ * it is run.
+ */
+
+#include "builtins.h"
+#include "kernel.h"
+#include "buffer.h"
+
+#include "../events.h"
+#include "../memobject.h"
+
+#include <sys/mman.h>
+#include <signal.h>
+
+#include <llvm/IR/Function.h>
+
+#include <iostream>
+#include <cstring>
+#include <cmath>
+#include <boost/math/special_functions.hpp>
+
+#include <stdio.h>
+
+using namespace Coal;
+
+/*
+ * Address of a pixel inside image storage.
+ *
+ * base            : first byte of the image data
+ * x, y, z         : pixel coordinates
+ * row_pitch       : byte distance between two consecutive rows
+ * slice_pitch     : byte distance between two consecutive slices
+ * bytes_per_pixel : byte distance between two consecutive pixels of a row
+ *
+ * Returns base advanced by z slices, y rows and x pixels. No bounds checking
+ * is performed.
+ */
+unsigned char *imageData(unsigned char *base, size_t x, size_t y, size_t z,
+                         size_t row_pitch, size_t slice_pitch,
+                         unsigned int bytes_per_pixel)
+{
+    const size_t byte_offset = (z * slice_pitch) +
+                               (y * row_pitch) +
+                               (x * bytes_per_pixel);
+
+    return base + byte_offset;
+}
+
+/*
+ * TLS-related functions
+ */
+__thread Coal::CPUKernelWorkGroup *g_work_group; /*!< \brief \c Coal::CPUKernelWorkGroup currently running on this thread */
+__thread void *work_items_data; /*!< \brief Space allocated for work-items stacks, see \ref barrier */
+__thread size_t work_items_size; /*!< \brief Size of \c work_items_data, see \ref barrier */
+
+// Stores the work-group that the built-ins of the calling thread operate on.
+void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current)
+{
+ g_work_group = current;
+}
+
+// Returns the calling thread's cached work-item memory and writes its
+// recorded size into \p size. Both are zero until setWorkItemsData() is called.
+void *getWorkItemsData(size_t &size)
+{
+ size = work_items_size;
+ return work_items_data;
+}
+
+// Records the calling thread's work-item memory and its size. The previous
+// values are overwritten, nothing is freed here.
+void setWorkItemsData(void *ptr, size_t size)
+{
+ work_items_data = ptr;
+ work_items_size = size;
+}
+
+/*
+ * Actual built-ins implementations
+ */
+// Returns p_work_dim, the upper bound the other work-item queries use to
+// validate their dimension index.
+cl_uint CPUKernelWorkGroup::getWorkDim() const
+{
+ return p_work_dim;
+}
+
+// Global ID of the current work-item in dimension dimindx: the work-group's
+// start offset plus the local ID of the current context.
+// Valid dimensions are 0 .. p_work_dim - 1; any other value yields 0.
+size_t CPUKernelWorkGroup::getGlobalId(cl_uint dimindx) const
+{
+    // >= and not >: dimindx == p_work_dim is already one past the last
+    // valid dimension and must not be used as an array index
+    if (dimindx >= p_work_dim)
+        return 0;
+
+    return p_global_id_start_offset[dimindx] + p_current_context->local_id[dimindx];
+}
+
+// Global work size of the event in dimension dimindx.
+// Valid dimensions are 0 .. p_work_dim - 1; any other value yields 1.
+size_t CPUKernelWorkGroup::getGlobalSize(cl_uint dimindx) const
+{
+    // >= and not >: dimindx == p_work_dim is already out of range
+    if (dimindx >= p_work_dim)
+        return 1;
+
+    return p_event->global_work_size(dimindx);
+}
+
+// Local work size of the event in dimension dimindx.
+// Valid dimensions are 0 .. p_work_dim - 1; any other value yields 1.
+size_t CPUKernelWorkGroup::getLocalSize(cl_uint dimindx) const
+{
+    // >= and not >: dimindx == p_work_dim is already out of range
+    if (dimindx >= p_work_dim)
+        return 1;
+
+    return p_event->local_work_size(dimindx);
+}
+
+// Local ID of the current work-item in dimension dimindx, read from the
+// current context.
+// Valid dimensions are 0 .. p_work_dim - 1; any other value yields 0.
+size_t CPUKernelWorkGroup::getLocalID(cl_uint dimindx) const
+{
+    // >= and not >: dimindx == p_work_dim is already one past the last
+    // valid dimension and must not be used as an array index
+    if (dimindx >= p_work_dim)
+        return 0;
+
+    return p_current_context->local_id[dimindx];
+}
+
+// Number of work-groups in dimension dimindx: global work size divided by
+// local work size (integer division).
+// Valid dimensions are 0 .. p_work_dim - 1; any other value yields 1.
+size_t CPUKernelWorkGroup::getNumGroups(cl_uint dimindx) const
+{
+    // >= and not >: dimindx == p_work_dim is already out of range, and the
+    // sizes of an unused dimension must not reach the division below
+    if (dimindx >= p_work_dim)
+        return 1;
+
+    return (p_event->global_work_size(dimindx) /
+            p_event->local_work_size(dimindx));
+}
+
+// Index of this work-group in dimension dimindx, read from p_index.
+// Valid dimensions are 0 .. p_work_dim - 1; any other value yields 0.
+size_t CPUKernelWorkGroup::getGroupID(cl_uint dimindx) const
+{
+    // >= and not >: dimindx == p_work_dim is already one past the last
+    // valid dimension and must not be used as an array index
+    if (dimindx >= p_work_dim)
+        return 0;
+
+    return p_index[dimindx];
+}
+
+// Global work offset of the event in dimension dimindx.
+// Valid dimensions are 0 .. p_work_dim - 1; any other value yields 0.
+size_t CPUKernelWorkGroup::getGlobalOffset(cl_uint dimindx) const
+{
+    // >= and not >: dimindx == p_work_dim is already out of range
+    if (dimindx >= p_work_dim)
+        return 0;
+
+    return p_event->global_work_offset(dimindx);
+}
+
+/*
+ * Work-group barrier, implemented by switching between one ucontext per
+ * work-item.
+ *
+ * The first call of a work-group obtains a memory area holding, for every
+ * work-item, a Context record immediately followed by its stack. Each call
+ * then switches to the next work-item, creating its context on first use so
+ * that it starts executing the kernel function. Control comes back to the
+ * caller once the other work-items have switched away in turn.
+ *
+ * flags: not used by this implementation.
+ */
+void CPUKernelWorkGroup::barrier(unsigned int flags)
+{
+    p_had_barrier = true;
+
+    // Allocate or reuse TLS memory for the stacks (it isn't freed between
+    // the work groups, and even the kernels, so if we need less space than
+    // allocated, it's good)
+    if (!p_contexts)
+    {
+        if (p_current_work_item != 0)
+        {
+            // Completely abnormal, it means that not every work-items
+            // encounter the barrier
+            std::cerr << "*** Not every work-items of "
+                      << p_kernel->function()->getName().str()
+                      << " calls barrier(); !" << std::endl;
+            return;
+        }
+
+        // Allocate or reuse the stacks
+        size_t contexts_size;
+        p_contexts = getWorkItemsData(contexts_size);
+        size_t needed_size = p_num_work_items * (p_stack_size + sizeof(Context));
+
+        if (!p_contexts || contexts_size < needed_size)
+        {
+            // We must allocate a new space
+            if (p_contexts)
+                munmap(p_contexts, contexts_size);
+
+            p_contexts = mmap(0, needed_size, PROT_EXEC | PROT_READ | PROT_WRITE, /* People say a stack must be executable */
+                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+            if (p_contexts == MAP_FAILED)
+            {
+                // The old mapping is gone and there is no new one: forget
+                // both, and bail out like the abnormal case above instead of
+                // dereferencing MAP_FAILED
+                p_contexts = 0;
+                setWorkItemsData(0, 0);
+
+                std::cerr << "*** Unable to map the work-item stacks of "
+                          << p_kernel->function()->getName().str()
+                          << std::endl;
+                return;
+            }
+
+            // Remember the size of the NEW mapping, so that the next reuse
+            // check and the next munmap() see the right length
+            setWorkItemsData(p_contexts, needed_size);
+        }
+
+        // Now that we have a real main context, initialize it
+        p_current_context = getContextAddr(0);
+        p_current_context->initialized = 1;
+        std::memset(p_current_context->local_id, 0, p_work_dim * sizeof(size_t));
+
+        getcontext(&p_current_context->context);
+    }
+
+    // Take the next context, wrapping around after the last work-item
+    p_current_work_item++;
+    if (p_current_work_item == p_num_work_items) p_current_work_item = 0;
+
+    Context *next = getContextAddr(p_current_work_item);
+    Context *main = getContextAddr(0); // The context not created with makecontext
+
+    // If the next context isn't initialized, initialize it.
+    // Note: mmap zeroes the memory, so next->initialized == 0 if it isn't initialized
+    if (next->initialized == 0)
+    {
+        next->initialized = 1;
+
+        // local-id of next is the one of the current context, but incVec'ed
+        std::memcpy(next->local_id, p_current_context->local_id,
+                    MAX_WORK_DIMS * sizeof(size_t));
+
+        incVec(p_work_dim, next->local_id, p_max_local_id);
+
+        // Initialize the next context
+        if (getcontext(&next->context) != 0)
+            return;
+
+        // Get its stack. It is located at next + sizeof(Context)
+        char *stack = (char *)next;
+        stack += sizeof(Context);
+
+        // When the kernel function returns in this context, resume the main one
+        next->context.uc_link = &main->context;
+        next->context.uc_stack.ss_sp = stack;
+        next->context.uc_stack.ss_size = p_stack_size;
+
+        // Tell it to run the kernel function
+        makecontext(&next->context, (void (*)())p_kernel_func_addr, 1, p_args);
+    }
+
+    // Switch to the next context
+    ucontext_t *cur = &p_current_context->context;
+    p_current_context = next;
+
+    swapcontext(cur, &next->context);
+
+    // When we return here, it means that all the other work items encountered
+    // a barrier and that we returned to this one. We can continue.
+}
+
+// Prints, on standard output, the name of an unresolved built-in followed by
+// the name of the kernel function of this work-group.
+void CPUKernelWorkGroup::builtinNotFound(const std::string &name) const
+{
+    std::cout << "OpenCL: Non-existant builtin function " << name << std::endl;
+
+    const std::string kernel_name = p_kernel->function()->getName().str();
+    std::cout << " found in " << kernel_name << '.' << std::endl;
+}
+
+/*
+ * Built-in functions
+ */
+
+// The functions below are the entry points handed out by getBuiltin(). Each
+// one forwards to the work-group stored in the thread-local g_work_group,
+// which must have been set with setThreadLocalWorkGroup() beforehand.
+// All of them take cl_uint, matching the CPUKernelWorkGroup member functions
+// (the non-standard "uint" typedef is not available on every platform).
+
+static size_t get_global_id(cl_uint dimindx)
+{
+    return g_work_group->getGlobalId(dimindx);
+}
+
+static cl_uint get_work_dim()
+{
+    return g_work_group->getWorkDim();
+}
+
+static size_t get_global_size(cl_uint dimindx)
+{
+    return g_work_group->getGlobalSize(dimindx);
+}
+
+static size_t get_local_size(cl_uint dimindx)
+{
+    return g_work_group->getLocalSize(dimindx);
+}
+
+static size_t get_local_id(cl_uint dimindx)
+{
+    return g_work_group->getLocalID(dimindx);
+}
+
+static size_t get_num_groups(cl_uint dimindx)
+{
+    return g_work_group->getNumGroups(dimindx);
+}
+
+static size_t get_group_id(cl_uint dimindx)
+{
+    return g_work_group->getGroupID(dimindx);
+}
+
+static size_t get_global_offset(cl_uint dimindx)
+{
+    return g_work_group->getGlobalOffset(dimindx);
+}
+
+static void barrier(unsigned int flags)
+{
+    g_work_group->barrier(flags);
+}
+
+// Images
+
+// Width of the image, converted to int.
+static int get_image_width(Image2D *image)
+{
+    return image->width();
+}
+
+// Height of the image, converted to int.
+static int get_image_height(Image2D *image)
+{
+    return image->height();
+}
+
+// Depth of the image; anything that is not an Image3D reports a depth of 1.
+static int get_image_depth(Image3D *image)
+{
+    if (image->type() == MemObject::Image3D)
+        return image->depth();
+
+    return 1;
+}
+
+// Channel data type field of the image format.
+static int get_image_channel_data_type(Image2D *image)
+{
+    return image->format().image_channel_data_type;
+}
+
+// Channel order field of the image format.
+static int get_image_channel_order(Image2D *image)
+{
+    return image->format().image_channel_order;
+}
+
+// Writes the channel order and data type of the image through the two output
+// pointers, then returns what the current work-group's getImageData() gives
+// for the requested coordinates.
+static void *image_data(Image2D *image, int x, int y, int z, int *order, int *type)
+{
+    *order = image->format().image_channel_order;
+    *type = image->format().image_channel_data_type;
+
+    void *pixel = g_work_group->getImageData(image, x, y, z);
+    return pixel;
+}
+
+// Whether the memory object is an Image3D.
+static bool is_image_3d(Image3D *image)
+{
+    return image->type() == MemObject::Image3D;
+}
+
+// Image write entry points: one per colour element type (float, int32_t,
+// uint32_t). Each forwards unchanged to writeImage() of the current
+// work-group.
+static void write_imagef(Image2D *image, int x, int y, int z, float *color)
+{
+ g_work_group->writeImage(image, x, y, z, color);
+}
+
+static void write_imagei(Image2D *image, int x, int y, int z, int32_t *color)
+{
+ g_work_group->writeImage(image, x, y, z, color);
+}
+
+static void write_imageui(Image2D *image, int x, int y, int z, uint32_t *color)
+{
+ g_work_group->writeImage(image, x, y, z, color);
+}
+
+// Image read entry points taking integer coordinates: one per result element
+// type. Each forwards unchanged to readImage() of the current work-group,
+// which stores the pixel through result.
+static void read_imagefi(float *result, Image2D *image, int x, int y, int z,
+ int32_t sampler)
+{
+ g_work_group->readImage(result, image, x, y, z, sampler);
+}
+
+static void read_imageii(int32_t *result, Image2D *image, int x, int y, int z,
+ int32_t sampler)
+{
+ g_work_group->readImage(result, image, x, y, z, sampler);
+}
+
+static void read_imageuii(uint32_t *result, Image2D *image, int x, int y, int z,
+ int32_t sampler)
+{
+ g_work_group->readImage(result, image, x, y, z, sampler);
+}
+
+// Image read entry points taking float coordinates: same forwarding as above.
+static void read_imageff(float *result, Image2D *image, float x, float y,
+ float z, int32_t sampler)
+{
+ g_work_group->readImage(result, image, x, y, z, sampler);
+}
+
+static void read_imageif(int32_t *result, Image2D *image, float x, float y,
+ float z, int32_t sampler)
+{
+ g_work_group->readImage(result, image, x, y, z, sampler);
+}
+
+static void read_imageuif(uint32_t *result, Image2D *image, float x, float y,
+ float z, int32_t sampler)
+{
+ g_work_group->readImage(result, image, x, y, z, sampler);
+}
+
+/* Dummy function to plug missing ARM ABI EH fxns: getBuiltin() returns its
+ * address for the __aeabi_unwind_cpp_pr0/1/2 symbols. It does nothing. */
+static void dummy_fxn(void)
+{
+}
+
+
+/*
+ * Bridge between LLVM and us
+ */
+// Empty function whose address getBuiltin() returns for names it does not know.
+static void unimplemented_stub()
+{
+}
+
+/*
+ * Resolves the name of a built-in to the address of its native implementation.
+ *
+ * name: symbol name to resolve.
+ *
+ * Returns the address of the matching function. Unknown names are reported
+ * and resolved to an empty stub, so the result is never null.
+ */
+void *getBuiltin(const std::string &name)
+{
+    struct BuiltinEntry
+    {
+        const char *name;
+        void *address;
+    };
+
+    static const BuiltinEntry builtins[] =
+    {
+        // Work-item functions
+        { "get_global_id", (void *)&get_global_id },
+        { "get_work_dim", (void *)&get_work_dim },
+        { "get_global_size", (void *)&get_global_size },
+        { "get_local_size", (void *)&get_local_size },
+        { "get_local_id", (void *)&get_local_id },
+        { "get_num_groups", (void *)&get_num_groups },
+        { "get_group_id", (void *)&get_group_id },
+        { "get_global_offset", (void *)&get_global_offset },
+        { "barrier", (void *)&barrier },
+
+        // Images
+        { "__cpu_get_image_width", (void *)&get_image_width },
+        { "__cpu_get_image_height", (void *)&get_image_height },
+        { "__cpu_get_image_depth", (void *)&get_image_depth },
+        { "__cpu_get_image_channel_data_type", (void *)&get_image_channel_data_type },
+        { "__cpu_get_image_channel_order", (void *)&get_image_channel_order },
+        { "__cpu_image_data", (void *)&image_data },
+        { "__cpu_is_image_3d", (void *)&is_image_3d },
+        { "__cpu_write_imagef", (void *)&write_imagef },
+        { "__cpu_write_imagei", (void *)&write_imagei },
+        { "__cpu_write_imageui", (void *)&write_imageui },
+        { "__cpu_read_imagefi", (void *)&read_imagefi },
+        { "__cpu_read_imageii", (void *)&read_imageii },
+        { "__cpu_read_imageuii", (void *)&read_imageuii },
+        { "__cpu_read_imageff", (void *)&read_imageff },
+        { "__cpu_read_imageif", (void *)&read_imageif },
+        { "__cpu_read_imageuif", (void *)&read_imageuif },
+
+        // Debugging aid and ARM ABI exception-handling placeholders
+        { "debug", (void *)&printf },
+        { "__aeabi_unwind_cpp_pr0", (void *)&dummy_fxn },
+        { "__aeabi_unwind_cpp_pr1", (void *)&dummy_fxn },
+        { "__aeabi_unwind_cpp_pr2", (void *)&dummy_fxn },
+
+        // Math library disambiguation for OpenCL double functions of the same name.
+        { "builtin_sincos", (void *)&sincos },
+        { "builtin_lgamma_r", (void *)&lgamma_r },
+        { "builtin_modf", (void *)&modf },
+        { "builtin_remquo", (void *)&remquo },
+        { "builtin_pow", (void *)&pow },
+        { "builtin_exp10f", (void *)&exp10f },
+        { "builtin_exp10", (void *)&exp10 },
+
+#if 0
+        // Other misc functions Khronos tests say are builtins, though not in the spec!
+        { "memcpy", (void *)&memcpy },
+#endif
+    };
+
+    const size_t count = sizeof(builtins) / sizeof(builtins[0]);
+
+    for (size_t i = 0; i < count; ++i)
+    {
+        if (name == builtins[i].name)
+            return builtins[i].address;
+    }
+
+    // Function not found. g_work_group is thread-local and stays null on a
+    // thread that never called setThreadLocalWorkGroup(), so guard the report.
+    if (g_work_group)
+        g_work_group->builtinNotFound(name);
+    else
+        std::cerr << "OpenCL: unknown builtin function " << name << std::endl;
+
+    return (void *)&unimplemented_stub;
+}
diff --git a/src/core/cpu/builtins.h b/src/core/cpu/builtins.h
new file mode 100644
index 0000000..69143ea
--- /dev/null
+++ b/src/core/cpu/builtins.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file builtins.h
+ * \brief CPU built-in functions
+ */
+#ifndef __BUILTINS_H__
+#define __BUILTINS_H__
+
+#include <string>
+
+namespace Coal {
+ class CPUKernelWorkGroup;
+}
+
+/**
+ * \brief Set the current kernel work-group of this thread
+ * \param current \c Coal::CPUKernelWorkGroup to be set in \c g_work_group.
+ */
+void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current);
+
+/**
+ * \brief Return the address of a built-in function given its name
+ * \param name name of the built-in whose address is requested
+ */
+void *getBuiltin(const std::string &name);
+
+/**
+ * \brief Work-item stacks
+ * \see \ref barrier
+ * \param size size of the allocated space for stacks
+ * \return address of the allocated space for stacks
+ */
+void *getWorkItemsData(size_t &size);
+
+/**
+ * \brief Set work-item stacks
+ * \see \ref barrier
+ * \param ptr address of allocated space for stacks
+ * \param size size of the allocated space for stacks
+ */
+void setWorkItemsData(void *ptr, size_t size);
+
+/**
+ * \brief Increment a n-component vector given a maximum value
+ *
+ * This function is used to increment a vector for which a set of maximum values
+ * each of its element can reach before the next is incremented.
+ *
+ * For example, if \p dims is \c 3, \p vec starts at <tt>{0, 0, 0}</tt> and
+ * \p maxs is <tt>{2, 3, 1}</tt>, repeatedly calling this function with the
+ * same vector will produce the following results (element 0 is the one that
+ * varies fastest) :
+ *
+ * \code
+ * {1, 0, 0}
+ * {2, 0, 0}
+ * {0, 1, 0}
+ * {1, 1, 0}
+ * {2, 1, 0}
+ * {0, 2, 0}
+ * {1, 2, 0}
+ * {2, 2, 0}
+ * ...
+ * \endcode
+ *
+ * Until \p vec reaches <tt>{2, 3, 1}</tt>.
+ *
+ * \param dims number of elements in the vectors
+ * \param vec vector whose elements will be incremented
+ * \param maxs vector containing a maximum value above which each corresponding
+ * element of \p vec cannot go.
+ * \return false if the increment was ok, true if \p vec was already at its
+ * maximum value and couldn't be further incremented.
+ */
+template<typename T>
+bool incVec(unsigned long dims, T *vec, T *maxs)
+{
+    // Works like an odometer whose fastest-moving digit is element 0: bump
+    // one element, and only move on to the next when that one wrapped.
+    bool carried = false;
+    unsigned int pos = 0;
+
+    while (pos < dims)
+    {
+        vec[pos] += 1;
+
+        // Still within its bound: nothing to carry, the increment is done.
+        if (!(vec[pos] > maxs[pos]))
+            return false;
+
+        // Went past its bound: wrap to zero and carry into the next element.
+        vec[pos] = 0;
+        carried = true;
+        ++pos;
+    }
+
+    // True only when every element wrapped (false when dims is 0).
+    return carried;
+}
+
+/**
+ * \brief Address of a pixel in an image
+ *
+ * This function is heavily used when Clover needs to address a pixel or a byte
+ * in a rectangular or three-dimensional image or buffer.
+ *
+ * \param base address of the first pixel in the image (address of the image itself)
+ * \param x X coordinate, cannot be bigger or equal to \c width
+ * \param y Y coordinate, cannot be bigger or equal to \c height
+ * \param z Z coordinate, cannot be bigger or equal to \c depth (1 for 2D arrays)
+ * \param row_pitch size in bytes of a row of pixels in the image
+ * \param slice_pitch size in bytes of a slice in a 3D array
+ * \param bytes_per_pixel bytes per pixel (1 for simple buffers), used when
+ * coordinates are in pixels and not in bytes.
+ */
+unsigned char *imageData(unsigned char *base, size_t x, size_t y, size_t z,
+ size_t row_pitch, size_t slice_pitch,
+ unsigned int bytes_per_pixel);
+
+#endif
+
diff --git a/src/core/cpu/device.cpp b/src/core/cpu/device.cpp
new file mode 100644
index 0000000..eb3fcb1
--- /dev/null
+++ b/src/core/cpu/device.cpp
@@ -0,0 +1,675 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/device.cpp
+ * \brief CPU Device
+ */
+
+#include "device.h"
+#include "buffer.h"
+#include "kernel.h"
+#include "program.h"
+#include "worker.h"
+#include "builtins.h"
+
+#include <core/config.h>
+#include "../propertylist.h"
+#include "../commandqueue.h"
+#include "../events.h"
+#include "../memobject.h"
+#include "../kernel.h"
+#include "../program.h"
+#include "../util.h"
+
+#include <cstring>
+#include <cstdlib>
+#include <unistd.h>
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+using namespace Coal;
+
+#if !(defined(DSPC868X) || defined(SHAMROCK_BUILD))
+#include "../dsp/shmem.h"
+// unsigned arm_speed();
+#endif
+
+#define ONE_GIGABYTE (1 << 30)
+
+/**
+ * \brief Gather static information about the host CPU
+ *
+ * Stores the number of online processors reported by \c sysconf, then scans
+ * \c /proc/cpuinfo for a clock speed ("cpu MHz") and a device name
+ * ("model name" or "Processor"). When no clock speed was found there, the
+ * cpufreq \c cpuinfo_max_freq file is read and its value divided by 1000.
+ * If that fails too, 1000 MHz is assumed.
+ *
+ * No thread is created and no lock is initialized here, see \c init().
+ */
+CPUDevice::CPUDevice()
+: DeviceInterface(), p_cores(0), p_num_events(0), p_workers(0), p_stop(false),
+  p_initialized(false)
+{
+    // Get info about the system
+    p_cores = sysconf(_SC_NPROCESSORS_ONLN);
+    p_cpu_mhz = 0.0f;
+
+    std::filebuf fb;
+    fb.open("/proc/cpuinfo", std::ios::in);
+    std::istream is(&fb);
+
+    std::string key;
+
+    // The loop is driven by the success of the read itself rather than by
+    // eof(): a stream that goes bad without reaching end-of-file would
+    // otherwise never leave the loop.
+    while (std::getline(is, key, ':'))
+    {
+        // Fresh string on every pass so a failed read cannot leave the
+        // previous line's value behind.
+        std::string value;
+
+        is.ignore(1);               // skip the blank following the colon
+        std::getline(is, value);
+
+        if (key.compare(0, 7, "cpu MHz") == 0)
+        {
+            std::istringstream ss(value);
+            ss >> p_cpu_mhz;
+        }
+
+        if (key.compare(0, 10, "model name") == 0)
+            p_device_name = value;
+
+        if (key.compare(0, 9, "Processor") == 0)
+            p_device_name = value;
+    }
+
+    // Fallback when /proc/cpuinfo carries no "cpu MHz" entry
+    if (p_cpu_mhz == 0.0f)
+    {
+        std::string file("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq");
+        std::ifstream fs(file.c_str());
+        if (fs) { fs >> p_cpu_mhz; p_cpu_mhz /= 1000; }
+    }
+
+    // Last resort: assume 1 GHz
+    if (p_cpu_mhz == 0.0f) p_cpu_mhz = 1000.0;
+
+#if !defined(DSPC868X)
+    // p_cpu_mhz = arm_speed();
+#endif
+}
+
+
+/**
+ * \brief Start the worker threads of the device
+ *
+ * Initializes the condition variable and the mutex protecting the event
+ * list, then creates \c numCPUs() threads running \c worker with this device
+ * as argument. Calling it again once \c p_initialized is set does nothing.
+ */
+void CPUDevice::init()
+{
+    if (p_initialized)
+        return;
+
+    // Locking machinery shared with the worker threads
+    pthread_cond_init(&p_events_cond, 0);
+    pthread_mutex_init(&p_events_mutex, 0);
+
+    // One thread handle per logical core
+    const unsigned int worker_count = numCPUs();
+    p_workers = (pthread_t *)std::malloc(worker_count * sizeof(pthread_t));
+
+    unsigned int started = 0;
+    while (started < worker_count)
+    {
+        pthread_create(&p_workers[started], 0, &worker, this);
+        ++started;
+    }
+
+    p_initialized = true;
+}
+
+/**
+ * \brief Stop the worker threads and release what \c init() set up
+ *
+ * Does nothing when \c init() was never run. Otherwise raises \c p_stop
+ * under the event mutex, wakes every waiting worker, joins them all, and
+ * finally frees the thread array and destroys the mutex and the condition.
+ */
+CPUDevice::~CPUDevice()
+{
+    if (p_initialized)
+    {
+        // Ask the workers to terminate; the flag is changed with the mutex
+        // held and every thread blocked on the condition is woken up.
+        pthread_mutex_lock(&p_events_mutex);
+        p_stop = true;
+        pthread_cond_broadcast(&p_events_cond);
+        pthread_mutex_unlock(&p_events_mutex);
+
+        // Wait for each of them
+        const unsigned int worker_count = numCPUs();
+
+        for (unsigned int w = 0; w != worker_count; ++w)
+            pthread_join(p_workers[w], 0);
+
+        // Release the resources acquired by init()
+        std::free((void *)p_workers);
+        pthread_mutex_destroy(&p_events_mutex);
+        pthread_cond_destroy(&p_events_cond);
+    }
+}
+
+/**
+ * \brief Create the CPU-side object backing a memory object
+ * \param buffer memory object handed to the \c CPUBuffer constructor
+ * \param rs forwarded untouched to the \c CPUBuffer constructor
+ * \return the newly allocated \c CPUBuffer, seen as a \c DeviceBuffer
+ */
+DeviceBuffer *CPUDevice::createDeviceBuffer(MemObject *buffer, cl_int *rs)
+{
+    CPUBuffer *cpu_buffer = new CPUBuffer(this, buffer, rs);
+
+    return (DeviceBuffer *)cpu_buffer;
+}
+
+/**
+ * \brief Create the CPU-side object backing a program
+ * \param program program handed to the \c CPUProgram constructor
+ * \return the newly allocated \c CPUProgram, seen as a \c DeviceProgram
+ */
+DeviceProgram *CPUDevice::createDeviceProgram(Program *program)
+{
+    CPUProgram *cpu_program = new CPUProgram(this, program);
+
+    return (DeviceProgram *)cpu_program;
+}
+
+/**
+ * \brief Create the CPU-side object backing a kernel
+ * \param kernel kernel handed to the \c CPUKernel constructor
+ * \param function LLVM function handed to the \c CPUKernel constructor
+ * \return the newly allocated \c CPUKernel, seen as a \c DeviceKernel
+ */
+DeviceKernel *CPUDevice::createDeviceKernel(Kernel *kernel,
+                                            llvm::Function *function)
+{
+    CPUKernel *cpu_kernel = new CPUKernel(this, kernel, function);
+
+    return (DeviceKernel *)cpu_kernel;
+}
+
+/**
+ * \brief Prepare the device-specific part of an event
+ *
+ * - Map-buffer events receive the address of the buffer data plus the
+ *   requested offset.
+ * - Map-image events receive the address computed by \c imageData for their
+ *   origin, together with the row and slice pitches of the image.
+ * - Kernel events have the JIT of their program initialized and get a new
+ *   \c CPUKernelEvent attached as device data.
+ * - Every other event type is left untouched.
+ *
+ * \param event event to prepare
+ * \return \c CL_INVALID_PROGRAM_EXECUTABLE if \c initJIT() fails for a kernel
+ *         event, \c CL_SUCCESS otherwise
+ */
+cl_int CPUDevice::initEventDeviceData(Event *event)
+{
+    switch (event->type())
+    {
+        case Event::MapBuffer:
+        {
+            MapBufferEvent *map_event = (MapBufferEvent *)event;
+            CPUBuffer *cpu_buffer =
+                (CPUBuffer *)map_event->buffer()->deviceBuffer(this);
+            unsigned char *base = (unsigned char *)cpu_buffer->data();
+
+            // The mapped pointer starts "offset" bytes into the buffer data
+            unsigned char *mapped = base + map_event->offset();
+
+            map_event->setPtr((void *)mapped);
+            break;
+        }
+
+        case Event::MapImage:
+        {
+            MapImageEvent *map_event = (MapImageEvent *)event;
+            Image2D *image = (Image2D *)map_event->buffer();
+            CPUBuffer *cpu_buffer = (CPUBuffer *)image->deviceBuffer(this);
+            unsigned char *base = (unsigned char *)cpu_buffer->data();
+
+            // Address of the pixel at the origin of the mapped region
+            unsigned char *mapped = imageData(base,
+                                              map_event->origin(0),
+                                              map_event->origin(1),
+                                              map_event->origin(2),
+                                              image->row_pitch(),
+                                              image->slice_pitch(),
+                                              image->pixel_size());
+
+            map_event->setPtr((void *)mapped);
+            map_event->setRowPitch(image->row_pitch());
+            map_event->setSlicePitch(image->slice_pitch());
+            break;
+        }
+
+        case Event::NDRangeKernel:
+        case Event::TaskKernel:
+        {
+            KernelEvent *kernel_event = (KernelEvent *)event;
+            Program *program = (Program *)kernel_event->kernel()->parent();
+            CPUProgram *cpu_program =
+                (CPUProgram *)program->deviceDependentProgram(this);
+
+            // The kernel cannot run without a JIT for its program
+            if (!cpu_program->initJIT())
+                return CL_INVALID_PROGRAM_EXECUTABLE;
+
+            // Device-specific data, released by freeEventDeviceData()
+            CPUKernelEvent *device_data = new CPUKernelEvent(this, kernel_event);
+            kernel_event->setDeviceData((void *)device_data);
+            break;
+        }
+
+        case Event::UnmapMemObject:     // Nothing to do
+        default:
+            break;
+    }
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Release the device-specific part of an event
+ *
+ * Only kernel events carry device data (a \c CPUKernelEvent); it is deleted
+ * here. Other event types are ignored.
+ *
+ * \param event event whose device data has to be released
+ */
+void CPUDevice::freeEventDeviceData(Event *event)
+{
+    switch (event->type())
+    {
+        case Event::NDRangeKernel:
+        case Event::TaskKernel:
+            // Deleting a null pointer is a no-op, so no check is needed
+            delete (CPUKernelEvent *)event->deviceData();
+            break;
+
+        default:
+            break;
+    }
+}
+
+/**
+ * \brief Queue an event for the worker threads
+ *
+ * The event is appended to \c p_events with the event mutex held, and every
+ * thread waiting on the condition variable is woken up.
+ *
+ * \param event event to append
+ */
+void CPUDevice::pushEvent(Event *event)
+{
+    pthread_mutex_lock(&p_events_mutex);
+
+    // The element count is tracked by hand next to the list itself
+    // (way faster than STL list::size() !)
+    p_events.push_back(event);
+    ++p_num_events;
+
+    pthread_cond_broadcast(&p_events_cond);
+    pthread_mutex_unlock(&p_events_mutex);
+}
+
+/**
+ * \brief Fetch the next event a worker has to run
+ *
+ * Blocks until the list holds an event or the device is stopping. The event
+ * at the front of the list is returned; it is removed from the list unless
+ * it is a kernel event whose \c CPUKernelEvent::reserve() returns false.
+ *
+ * \param stop set to true when the device is stopping, left untouched
+ *        otherwise
+ * \return the event at the front of the list, or 0 when the device is
+ *         stopping
+ */
+Event *CPUDevice::getEvent(bool &stop)
+{
+    pthread_mutex_lock(&p_events_mutex);
+
+    // Sleep until there is work or a stop request
+    while (!p_stop && p_num_events == 0)
+        pthread_cond_wait(&p_events_cond, &p_events_mutex);
+
+    const bool stopping = p_stop;
+    Event *next = 0;
+
+    if (!stopping)
+    {
+        next = p_events.front();
+
+        const bool is_kernel = next->type() == Event::NDRangeKernel ||
+                               next->type() == Event::TaskKernel;
+
+        // Non-kernel events are always dequeued; for kernel events the
+        // answer comes from reserve().
+        bool dequeue = true;
+
+        if (is_kernel)
+            dequeue = ((CPUKernelEvent *)next->deviceData())->reserve();
+
+        if (dequeue)
+        {
+            --p_num_events;
+            p_events.pop_front();
+        }
+    }
+
+    pthread_mutex_unlock(&p_events_mutex);
+
+    if (stopping)
+        stop = true;
+
+    return next;
+}
+
+/******************************************************************************
+* Device's decision about whether CommandQueue should push more events over
+* This number could be tuned (e.g. using ooo example). Note that p_num_events
+* are in device's queue, but not yet executed.
+******************************************************************************/
+bool CPUDevice::gotEnoughToWorkOn()
+{
+    // A single pending event already counts as "enough".
+    // NOTE(review): p_num_events is read here without taking p_events_mutex.
+    return p_num_events != 0;
+}
+
+/**
+ * \brief Number of processors stored in \c p_cores by the constructor
+ *        (the value of <tt>sysconf(_SC_NPROCESSORS_ONLN)</tt>)
+ */
+unsigned int CPUDevice::numCPUs() const
+{
+ return p_cores;
+}
+
+/**
+ * \brief Clock speed stored in \c p_cpu_mhz by the constructor, in MHz
+ *        (1000 when it could not be read from the system)
+ */
+float CPUDevice::cpuMhz() const
+{
+ return p_cpu_mhz;
+}
+
+// From the innermost parentheses to the outermost ones :
+//
+// sizeof * 8 => 8
+// -1 => 7
+// 1 << $ => 10000000
+// -1 => 01111111
+// *2 => 11111110
+// +1 => 11111111
+//
+// A simple way to do this is (1 << (sizeof(type) * 8)) - 1, but it overflows
+// the type (for int8, 1 << $ = 100000000 = 256 > 255)
+#define TYPE_MAX(type) ((((type)1 << ((sizeof(type) * 8) - 1)) - 1) * 2 + 1)
+
+/**
+ * \brief Answer a device information query for the CPU device
+ *
+ * Each supported \p param_name selects a value through the switch below,
+ * leaving its address in \c value and its size in \c value_length. The
+ * common tail of the function then validates the caller's buffer and copies
+ * the value into it.
+ *
+ * \param param_name information being queried
+ * \param param_value_size size in bytes of the memory at \p param_value
+ * \param param_value destination of the value, may be 0 to only query the size
+ * \param param_value_size_ret if not 0, receives the size in bytes of the value
+ * \return \c CL_INVALID_VALUE if \p param_name is not handled or if
+ *         \p param_value is given but too small, \c CL_SUCCESS otherwise
+ */
+cl_int CPUDevice::info(cl_device_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const
+{
+ void *value = 0;
+ size_t value_length = 0;
+
+ // Scratch storage for the value, one member per type that can be returned.
+ // NOTE(review): SIMPLE_ASSIGN and STRING_ASSIGN are defined outside this
+ // file; presumably they fill the matching member (or point at the string)
+ // and set value/value_length - confirm against their definition.
+ union {
+ cl_device_type cl_device_type_var;
+ cl_uint cl_uint_var;
+ size_t size_t_var;
+ cl_ulong cl_ulong_var;
+ cl_bool cl_bool_var;
+ cl_device_fp_config cl_device_fp_config_var;
+ cl_device_mem_cache_type cl_device_mem_cache_type_var;
+ cl_device_local_mem_type cl_device_local_mem_type_var;
+ cl_device_exec_capabilities cl_device_exec_capabilities_var;
+ cl_command_queue_properties cl_command_queue_properties_var;
+ cl_platform_id cl_platform_id_var;
+ size_t work_dims[MAX_WORK_DIMS];
+ };
+
+ switch (param_name)
+ {
+ case CL_DEVICE_TYPE:
+ SIMPLE_ASSIGN(cl_device_type, CL_DEVICE_TYPE_CPU);
+ break;
+
+ case CL_DEVICE_VENDOR_ID:
+ SIMPLE_ASSIGN(cl_uint, 0);
+ break;
+
+ case CL_DEVICE_MAX_COMPUTE_UNITS:
+ SIMPLE_ASSIGN(cl_uint, numCPUs());
+ break;
+
+ case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:
+ SIMPLE_ASSIGN(cl_uint, MAX_WORK_DIMS);
+ break;
+
+ case CL_DEVICE_MAX_WORK_GROUP_SIZE:
+ SIMPLE_ASSIGN(size_t, ONE_GIGABYTE);
+ break;
+
+ case CL_DEVICE_MAX_WORK_ITEM_SIZES:
+ // Array-valued answer: filled by hand instead of through SIMPLE_ASSIGN
+ for (int i=0; i<MAX_WORK_DIMS; ++i)
+ {
+ work_dims[i] = ONE_GIGABYTE;
+ }
+ value_length = MAX_WORK_DIMS * sizeof(size_t);
+ value = &work_dims;
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR:
+ SIMPLE_ASSIGN(cl_uint, 16);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT:
+ SIMPLE_ASSIGN(cl_uint, 8);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT:
+ SIMPLE_ASSIGN(cl_uint, 4);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT:
+ SIMPLE_ASSIGN(cl_uint, 4);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_MAX_CLOCK_FREQUENCY:
+ SIMPLE_ASSIGN(cl_uint, cpuMhz());
+ break;
+
+ case CL_DEVICE_ADDRESS_BITS:
+ SIMPLE_ASSIGN(cl_uint, 8*sizeof(void *));
+ break;
+
+ case CL_DEVICE_MAX_READ_IMAGE_ARGS:
+ SIMPLE_ASSIGN(cl_uint, 0); //images not supported
+ break;
+
+ case CL_DEVICE_MAX_WRITE_IMAGE_ARGS:
+ SIMPLE_ASSIGN(cl_uint, 0); // images not supported
+ break;
+
+ case CL_DEVICE_IMAGE2D_MAX_WIDTH:
+ SIMPLE_ASSIGN(size_t, 0); // images not supported
+ break;
+
+ case CL_DEVICE_IMAGE2D_MAX_HEIGHT:
+ SIMPLE_ASSIGN(size_t, 0); //images not supported
+ break;
+
+ case CL_DEVICE_IMAGE3D_MAX_WIDTH:
+ SIMPLE_ASSIGN(size_t, 0); //images not supported
+ break;
+
+ case CL_DEVICE_IMAGE3D_MAX_HEIGHT:
+ SIMPLE_ASSIGN(size_t, 0); //images not supported
+ break;
+
+ case CL_DEVICE_IMAGE3D_MAX_DEPTH:
+ SIMPLE_ASSIGN(size_t, 0); //images not supported
+ break;
+
+ case CL_DEVICE_IMAGE_SUPPORT:
+ SIMPLE_ASSIGN(cl_bool, CL_FALSE); //images not supported
+ break;
+
+ case CL_DEVICE_MAX_PARAMETER_SIZE:
+ SIMPLE_ASSIGN(size_t, 65536);
+ break;
+
+ case CL_DEVICE_MAX_SAMPLERS:
+ SIMPLE_ASSIGN(cl_uint, 0); //images not supported
+ break;
+
+ case CL_DEVICE_MEM_BASE_ADDR_ALIGN:
+ SIMPLE_ASSIGN(cl_uint, 1024 /* sizeof(long16)*8) */); // 128 byte
+ break;
+
+ case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE:
+ SIMPLE_ASSIGN(cl_uint, 16);
+ break;
+
+ case CL_DEVICE_SINGLE_FP_CONFIG:
+ // TODO: Check what an x86 SSE engine can support.
+ // Currently not supporting CL_FP_DENORM
+ SIMPLE_ASSIGN(cl_device_fp_config,
+ CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST);
+ break;
+
+ case CL_DEVICE_DOUBLE_FP_CONFIG:
+ // These are minimally required to be supported by the OCL spec:
+ SIMPLE_ASSIGN(cl_device_fp_config,
+ CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO |
+ CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM);
+ break;
+
+ case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE:
+ SIMPLE_ASSIGN(cl_device_mem_cache_type,
+ CL_READ_WRITE_CACHE);
+ break;
+
+ case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE:
+ // TODO: Get this information from the processor
+ SIMPLE_ASSIGN(cl_uint, 16);
+ break;
+
+ case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE:
+ // TODO: Get this information from the processor
+ SIMPLE_ASSIGN(cl_ulong, 512*1024*1024);
+ break;
+
+ case CL_DEVICE_GLOBAL_MEM_SIZE:
+ // parse /proc/meminfo to get the value
+ SIMPLE_ASSIGN(cl_ulong, parse_file_line_value("/proc/meminfo",
+ "MemTotal:", 512*1024) * 1024);
+ break;
+
+ // The three limits below share one value: 512 MiB when built for ARM,
+ // 1 GiB otherwise.
+ case CL_DEVICE_MAX_MEM_ALLOC_SIZE:
+ case CL_DEVICE_LOCAL_MEM_SIZE:
+ case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:
+ // TODO: 1 Gio seems to be enough for software acceleration
+
+#if defined(__arm__)
+ SIMPLE_ASSIGN(cl_ulong, 512*1024*1024);
+#else
+ SIMPLE_ASSIGN(cl_ulong, 1*1024*1024*1024);
+#endif
+ break;
+
+ case CL_DEVICE_MAX_CONSTANT_ARGS:
+ SIMPLE_ASSIGN(cl_uint, 65536);
+ break;
+
+ case CL_DEVICE_LOCAL_MEM_TYPE:
+ SIMPLE_ASSIGN(cl_device_local_mem_type, CL_GLOBAL);
+ break;
+
+
+ case CL_DEVICE_ERROR_CORRECTION_SUPPORT:
+ SIMPLE_ASSIGN(cl_bool, CL_FALSE);
+ break;
+
+ case CL_DEVICE_PROFILING_TIMER_RESOLUTION:
+ // TODO
+ SIMPLE_ASSIGN(size_t, 1000); // 1000 nanoseconds = 1 microsecond
+ break;
+
+ case CL_DEVICE_ENDIAN_LITTLE:
+ SIMPLE_ASSIGN(cl_bool, CL_TRUE);
+ break;
+
+ case CL_DEVICE_AVAILABLE:
+ SIMPLE_ASSIGN(cl_bool, CL_TRUE);
+ break;
+
+ case CL_DEVICE_COMPILER_AVAILABLE:
+ SIMPLE_ASSIGN(cl_bool, CL_TRUE);
+ break;
+
+ case CL_DEVICE_EXECUTION_CAPABILITIES:
+ SIMPLE_ASSIGN(cl_device_exec_capabilities, CL_EXEC_KERNEL |
+ CL_EXEC_NATIVE_KERNEL);
+ break;
+
+ case CL_DEVICE_QUEUE_PROPERTIES:
+ SIMPLE_ASSIGN(cl_command_queue_properties,
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+ CL_QUEUE_PROFILING_ENABLE);
+ break;
+
+ case CL_DEVICE_NAME:
+ // Name read from /proc/cpuinfo by the constructor; the length
+ // includes the terminating NUL.
+ value_length = p_device_name.size() + 1;
+ value = const_cast<char*>(p_device_name.c_str());
+ break;
+
+ case CL_DEVICE_VENDOR:
+ STRING_ASSIGN("Generic");
+ break;
+
+ case CL_DRIVER_VERSION:
+ STRING_ASSIGN("" COAL_VERSION);
+ break;
+
+ case CL_DEVICE_PROFILE:
+ STRING_ASSIGN("FULL_PROFILE");
+ break;
+
+ case CL_DEVICE_VERSION:
+ STRING_ASSIGN("OpenCL 1.1 " COAL_VERSION);
+ break;
+
+ case CL_DEVICE_EXTENSIONS:
+ STRING_ASSIGN("cl_khr_global_int32_base_atomics"
+ " cl_khr_global_int32_extended_atomics"
+ " cl_khr_local_int32_base_atomics"
+ " cl_khr_local_int32_extended_atomics"
+ " cl_khr_byte_addressable_store"
+
+ " cl_khr_fp64"
+ " cl_khr_int64_base_atomics"
+ " cl_khr_int64_extended_atomics")
+
+ break;
+
+ case CL_DEVICE_PLATFORM:
+ SIMPLE_ASSIGN(cl_platform_id, 0);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF:
+ SIMPLE_ASSIGN(cl_uint, 0);
+ break;
+
+ case CL_DEVICE_HOST_UNIFIED_MEMORY:
+ SIMPLE_ASSIGN(cl_bool, CL_TRUE);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR:
+ SIMPLE_ASSIGN(cl_uint, 16);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT:
+ SIMPLE_ASSIGN(cl_uint, 8);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT:
+ SIMPLE_ASSIGN(cl_uint, 4);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT:
+ SIMPLE_ASSIGN(cl_uint, 4);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF:
+ SIMPLE_ASSIGN(cl_uint, 0);
+ break;
+
+ case CL_DEVICE_OPENCL_C_VERSION:
+ STRING_ASSIGN("OpenCL C 1.1 LLVM " LLVM_VERSION);
+ break;
+
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ // A destination was given but cannot hold the value
+ if (param_value && param_value_size < value_length)
+ return CL_INVALID_VALUE;
+
+ // Report the size of the value, even when no destination was given
+ if (param_value_size_ret)
+ *param_value_size_ret = value_length;
+
+ if (param_value)
+ std::memcpy(param_value, value, value_length);
+
+ return CL_SUCCESS;
+}
+
+#if !defined(DSPC868X)
+#if 0 // /dev/mem is no longer available
+/**
+ * \brief Clock speed in MHz computed from the SECPLLCTL0 register
+ *
+ * Maps the page at physical address 0x02620000 through \c shmem_persistent,
+ * reads the 32-bit word at offset 0x370 and decodes the pre-divider (bits
+ * 0-5), the multiplier (bits 6-18) and the output divider (bits 19-22), each
+ * stored minus one. The speed is the 125 MHz reference multiplied and divided
+ * accordingly.
+ *
+ * \return speed in MHz
+ */
+unsigned arm_speed()
+{
+    const unsigned TETRIS_PLL = 125000000;
+    const unsigned pagesize = 0x1000;
+
+    shmem_persistent page;
+    page.configure(0x02620000, pagesize);
+    char *host_msmc = (char*)page.map(0x02620000, pagesize);
+    unsigned SECPLLCTL0 = *(unsigned*)(host_msmc + 0x370);
+    unsigned prediv = 1 + (SECPLLCTL0 & 0x3F);
+    unsigned mult = 1 + ((SECPLLCTL0 >> 6) & 0x1FFF);
+    unsigned output_div = 1 + ((SECPLLCTL0 >> 19) & 0xF);
+
+    // The product is computed in 64 bits: 125000000 * mult no longer fits in
+    // a 32-bit unsigned as soon as mult exceeds 34.
+    unsigned long long speed =
+        (unsigned long long)TETRIS_PLL * mult / prediv / output_div;
+    page.unmap(host_msmc, pagesize);
+
+    return (unsigned)(speed / 1000000);
+}
+#endif
+#endif
+
diff --git a/src/core/cpu/device.h b/src/core/cpu/device.h
new file mode 100644
index 0000000..a0ad6ef
--- /dev/null
+++ b/src/core/cpu/device.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/device.h
+ * \brief CPU device
+ */
+
+#ifndef __CPU_DEVICE_H__
+#define __CPU_DEVICE_H__
+
+#include "../deviceinterface.h"
+
+#include <pthread.h>
+#include <list>
+#include <string>
+
+namespace Coal
+{
+
+class MemObject;
+class Event;
+class Program;
+class Kernel;
+
+/**
+ * \brief CPU device
+ *
+ * This class is the base of all the CPU-accelerated OpenCL processing. It
+ * creates and manages subclasses such as \c Coal::DeviceBuffer,
+ * \c Coal::DeviceProgram and \c Coal::DeviceKernel.
+ *
+ * This class and the aforementioned ones work together to compile and run
+ * kernels using the LLVM JIT, manage buffers, provide built-in functions
+ * and do all of this in a multithreaded fashion using worker threads.
+ *
+ * \see \ref events
+ */
+class CPUDevice : public DeviceInterface
+{
+ public:
+ /**
+ * \brief Read the number of cores, the clock speed and the name of
+ * the host CPU. No thread is started until \c init() is called.
+ */
+ CPUDevice();
+
+ /**
+ * \brief Stop and join the worker threads if \c init() was called
+ */
+ ~CPUDevice();
+
+ /**
+ * \brief Initialize the CPU device
+ *
+ * This function initializes the event mutex and condition variable
+ * and creates the worker threads, one per \c numCPUs(). The
+ * information about the host system returned by \c numCPUs() and
+ * \c cpuMhz() is gathered earlier, by the constructor. Calling it
+ * more than once has no further effect.
+ */
+ void init();
+
+ /**
+ * \brief Answer a device information query
+ * \param param_name information being queried
+ * \param param_value_size size of the memory at \p param_value
+ * \param param_value destination of the value, may be 0
+ * \param param_value_size_ret if not 0, receives the size of the value
+ * \return \c CL_SUCCESS, or \c CL_INVALID_VALUE for an unhandled
+ * \p param_name or a too small \p param_value
+ */
+ cl_int info(cl_device_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ /* Factories returning newly allocated CPUBuffer, CPUProgram and
+ * CPUKernel objects bound to this device. */
+ DeviceBuffer *createDeviceBuffer(MemObject *buffer, cl_int *rs);
+ DeviceProgram *createDeviceProgram(Program *program);
+ DeviceKernel *createDeviceKernel(Kernel *kernel,
+ llvm::Function *function);
+
+ /* Set up and release the per-event data used by this device (mapped
+ * pointers for map events, a CPUKernelEvent for kernel events). */
+ cl_int initEventDeviceData(Event *event);
+ void freeEventDeviceData(Event *event);
+
+ /* Event queue shared with the worker threads: pushEvent() appends and
+ * wakes the workers, getEvent() blocks until an event is available or
+ * the device is stopping (in which case \c stop is set to true and 0
+ * is returned), gotEnoughToWorkOn() tells whether at least one event
+ * is queued. */
+ void pushEvent(Event *event);
+ Event *getEvent(bool &stop);
+ bool gotEnoughToWorkOn();
+
+ unsigned int numCPUs() const; /*!< \brief Number of logical CPU cores on the system */
+ float cpuMhz() const; /*!< \brief Speed of the CPU in Mhz */
+
+ std::string builtinsHeader(void) const { return "cpu.h"; }
+
+ private:
+ // p_cores: processors online when the device was constructed.
+ // p_num_events: number of entries in p_events, tracked by hand.
+ unsigned int p_cores, p_num_events;
+ float p_cpu_mhz; // clock speed in MHz, 1000 when unknown
+ std::string p_device_name; // "model name"/"Processor" of /proc/cpuinfo
+ pthread_t *p_workers; // malloc'ed array of numCPUs() threads
+
+ // Pending events, guarded by p_events_mutex; p_events_cond is
+ // broadcast when an event is pushed or the device is stopping.
+ std::list<Event *> p_events;
+ pthread_cond_t p_events_cond;
+ pthread_mutex_t p_events_mutex;
+ // p_stop: set by the destructor to make the workers exit.
+ // p_initialized: set once init() has created the workers.
+ bool p_stop, p_initialized;
+};
+
+}
+
+#endif
diff --git a/src/core/cpu/kernel.cpp b/src/core/cpu/kernel.cpp
new file mode 100644
index 0000000..ef09f6b
--- /dev/null
+++ b/src/core/cpu/kernel.cpp
@@ -0,0 +1,734 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/kernel.cpp
+ * \brief CPU kernel
+ */
+
+#include "kernel.h"
+#include "device.h"
+#include "buffer.h"
+#include "program.h"
+#include "builtins.h"
+
+#include "../kernel.h"
+#include "../memobject.h"
+#include "../events.h"
+#include "../program.h"
+
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Module.h>
+#include <llvm/ExecutionEngine/ExecutionEngine.h>
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <sys/mman.h>
+
+using namespace Coal;
+
+/**
+ * \brief Create the CPU kernel and eagerly build its call stub
+ *
+ * Besides storing \p device, \p kernel and \p function, the constructor
+ * emits, into the module owning \p function, an internal LLVM function
+ * named <tt>"_stub" + kernel name</tt> and keeps it in \c p_call_function.
+ *
+ * The stub takes a single pointer to a packed argument block and forwards
+ * each argument to the kernel:
+ *
+ * \code
+ * void stub(void *args) {
+ *     kernel(*(int *)((char *)args + 0),
+ *            *(float **)((char *)args + sizeof(int)),
+ *            *(sampler_t *)((char *)args + sizeof(int) + sizeof(float *)));
+ * }
+ * \endcode
+ *
+ * In LLVM, it is expressed in the form of:
+ *
+ * \code
+ * @stub(i8* args) {
+ *     kernel(
+ *         load(i32* bitcast(i8* getelementptr(i8* args, i64 0), i32*)),
+ *         load(float** bitcast(i8* getelementptr(i8* args, i64 4), float**)),
+ *         ...
+ *     );
+ * }
+ * \endcode
+ *
+ * Argument offsets are computed with \c typeOffset(), the same helper that
+ * \c CPUKernelWorkGroup::callArgs() uses to fill the block.
+ *
+ * \param device device on which the kernel will be run
+ * \param kernel \c Coal::Kernel object holding information about this kernel
+ * \param function \c llvm::Function of the kernel the stub will call
+ */
+CPUKernel::CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function)
+: DeviceKernel(), p_device(device), p_kernel(kernel), p_function(function),
+  p_call_function(0)
+{
+    pthread_mutex_init(&p_call_function_mutex, 0);
+
+    // The stub name lives in a std::string: the former malloc()+sprintf()
+    // buffer was never freed and leaked once per kernel object.
+    std::string stub_name("_stub");
+    stub_name += kernel->p_name.c_str();
+
+    llvm::LLVMContext &context = function->getContext();
+    llvm::FunctionType *kernel_function_type = function->getFunctionType();
+
+    // <kernel return type> stub(i8 *args)
+    llvm::FunctionType *stub_function_type = llvm::FunctionType::get(
+        function->getReturnType(),
+        llvm::Type::getInt8PtrTy(context),
+        false);
+    llvm::Function *stub_function = llvm::Function::Create(
+        stub_function_type,
+        llvm::Function::InternalLinkage,
+        stub_name.c_str(),
+        function->getParent());
+
+    // The stub consists of a single basic block
+    llvm::BasicBlock *basic_block = llvm::BasicBlock::Create(
+        context,
+        "",
+        stub_function);
+
+    // Unpack every kernel argument from the block pointed to by stub_arg
+    llvm::Argument &stub_arg = stub_function->getArgumentList().front();
+    llvm::SmallVector<llvm::Value *, 8> args;
+    size_t args_offset = 0;
+
+    for (unsigned int i=0; i<kernel_function_type->getNumParams(); ++i)
+    {
+        llvm::Type *param_type = kernel_function_type->getParamType(i);
+        llvm::Type *param_type_ptr = param_type->getPointerTo(); // We'll use pointers to the value
+        const Kernel::Arg *arg = p_kernel->arg(i);
+
+        // Size of the argument, and where it is placed in the block
+        size_t arg_size = arg->valueSize() * arg->vecDim();
+        size_t arg_offset = typeOffset(args_offset, arg_size);
+
+        // %1 = getelementptr(args, $arg_offset);
+        llvm::Value *getelementptr = llvm::GetElementPtrInst::CreateInBounds(
+            &stub_arg,
+            llvm::ConstantInt::get(stub_function->getContext(),
+                                   llvm::APInt(64, arg_offset)),
+            "",
+            basic_block);
+
+        // %2 = bitcast(%1, $param_type_ptr)
+        llvm::Value *bitcast = new llvm::BitCastInst(
+            getelementptr,
+            param_type_ptr,
+            "",
+            basic_block);
+
+        // %3 = load(%2), declared as aligned on the argument size (this is
+        // what enables things like fast movaps).
+        // NOTE(review): LLVM expects a power-of-two alignment; an argument
+        // whose size is not one (e.g. 3-component vectors) deserves a check.
+        llvm::Value *load = new llvm::LoadInst(
+            bitcast,
+            "",
+            false,
+            arg_size,
+            basic_block);
+
+        // We have the value, send it to the function
+        args.push_back(load);
+    }
+
+    // Call the real kernel with the unpacked arguments
+    llvm::CallInst *call_inst = llvm::CallInst::Create(
+        function,
+        args,
+        "",
+        basic_block);
+    call_inst->setCallingConv(function->getCallingConv());
+    call_inst->setTailCall();
+
+    // Create a return instruction to end the stub
+    llvm::ReturnInst::Create(
+        context,
+        basic_block);
+
+    // Retain the stub: callFunction() returns it as-is
+    p_call_function = stub_function;
+}
+
+/**
+ * \brief Remove the call stub from its module and destroy the mutex
+ */
+CPUKernel::~CPUKernel()
+{
+    llvm::Function *stub = p_call_function;
+
+    // The stub was inserted in the kernel's module; take it out again
+    if (stub != 0)
+    {
+        stub->eraseFromParent();
+    }
+
+    pthread_mutex_destroy(&p_call_function_mutex);
+}
+
+/**
+ * \brief Maximum work-group size usable with this kernel
+ *
+ * Simply forwards the device's \c CL_DEVICE_MAX_WORK_GROUP_SIZE.
+ *
+ * \return the device limit, or 0 if the device query fails (the previous
+ *         code returned an uninitialized value in that case)
+ */
+size_t CPUKernel::workGroupSize()
+{
+    size_t max_size = 0;
+    size_t size_ret = 0;
+
+    const cl_int rs = p_device->info(CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                                     sizeof(max_size), &max_size, &size_ret);
+
+    if (rs != CL_SUCCESS)
+        return 0;
+
+    return max_size;
+}
+
+cl_ulong CPUKernel::localMemSize() const
+{
+ return 0; // TODO: local memory usage is not computed yet, 0 is always reported
+}
+
+cl_ulong CPUKernel::privateMemSize() const
+{
+ return 0; // TODO: private memory usage is not computed yet, 0 is always reported
+}
+
+/**
+ * \brief Preferred multiple for the work-group size
+ * \return the number of CPUs reported by the device
+ */
+size_t CPUKernel::preferredWorkGroupSizeMultiple() const
+{
+    return p_device->numCPUs();
+}
+
+/**
+ * \brief Integer power: \p base raised to \p e
+ *
+ * Computed by repeated multiplication; overflow is not detected.
+ *
+ * \param base value to raise
+ * \param e exponent; 0 yields 1 (it used to yield \p base)
+ * \return \p base to the power \p e
+ */
+template<typename T>
+T k_exp(T base, unsigned int e)
+{
+    T rs = 1;
+
+    for (unsigned int i=0; i<e; ++i)
+        rs *= base;
+
+    return rs;
+}
+
+/**
+ * \brief Guess the work-group size that should run the fastest on the CPU
+ *
+ * \param num_dims number of dimensions of the NDRange
+ * \param dim dimension being sized (currently unused)
+ * \param global_work_size global size in that dimension
+ * \return \p global_work_size itself, or \p global_work_size divided by the
+ *         smallest divisor found in [cpus, min(global_work_size, cpus * 32)]
+ */
+size_t CPUKernel::guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
+                                     size_t global_work_size) const
+{
+    (void)dim;
+
+    const unsigned int cpus = p_device->numCPUs();
+
+    // Don't break in too small parts
+    // NOTE(review): as written, ranges whose total size is *above* 64 are
+    // the ones kept whole; confirm the comparison matches the intent.
+    if (k_exp(global_work_size, num_dims) > 64)
+        return global_work_size;
+
+    // Nothing to split, or no usable starting divisor (avoids a modulo by 0)
+    if (cpus == 0 || global_work_size == 0)
+        return global_work_size;
+
+    // Find the divisor of global_work_size the closest to cpus but >= it.
+    // The search stops at global_work_size and at cpus * 32: going further
+    // would cost more than it brings. The former loop never incremented the
+    // divisor and hung whenever cpus did not divide global_work_size.
+    for (unsigned int divisor = cpus;
+         divisor <= global_work_size && divisor <= cpus * 32;
+         ++divisor)
+    {
+        if ((global_work_size % divisor) == 0)
+            return global_work_size / divisor;
+    }
+
+    // No divisor found: one work-group. Not parallel, but it has no
+    // CommandQueue overhead.
+    return global_work_size;
+}
+
+llvm::Function *CPUKernel::function() const
+{
+ return p_function; // the kernel's llvm::Function, as given to the constructor
+}
+
+Kernel *CPUKernel::kernel() const
+{
+ return p_kernel; // the Coal::Kernel given to the constructor
+}
+
+CPUDevice *CPUKernel::device() const
+{
+ return p_device; // the CPUDevice given to the constructor
+}
+
+/**
+ * \brief Round \p k up to the next power of two
+ *
+ * Values that already are a power of two are returned unchanged, and 0
+ * gives 1. Algorithm from Wikipedia:
+ * http://www.wikipedia.org/wiki/Power_of_two#Algorithm_to_round_up_to_power_of_two
+ */
+template <class T>
+T next_power_of_two(T k)
+{
+    if (k == 0)
+        return 1;
+
+    // Copy the highest set bit of (k - 1) into every lower position...
+    T smeared = k - 1;
+    const int bits = static_cast<int>(sizeof(T) * 8);
+    int shift = 1;
+
+    while (shift < bits)
+    {
+        smeared = smeared | (smeared >> shift);
+        shift *= 2;
+    }
+
+    // ...so that adding one carries into the next power of two
+    return smeared + 1;
+}
+
+/**
+ * \brief Compute the aligned offset of the next value in an argument block
+ *
+ * \p type_len is first rounded up to the next power of two; the value is
+ * then placed at the first multiple of that size which is >= \p offset.
+ *
+ * \param offset in: first free byte of the block; out: first byte after
+ *        the slot reserved for this value
+ * \param type_len size in bytes of the value to place
+ * \return offset at which the value must be stored
+ */
+size_t CPUKernel::typeOffset(size_t &offset, size_t type_len)
+{
+    // Slots are sized and aligned on a power of two
+    type_len = next_power_of_two(type_len);
+    const size_t mask = ~(type_len - 1);
+
+    // Round the offset up to a multiple of type_len. The former loop tested
+    // "rs & mask != rs", which C++ parses as "rs & (mask != rs)": it only
+    // skipped odd offsets and did not align on type_len.
+    const size_t rs = (offset + type_len - 1) & mask;
+
+    // Where to try to place the next value
+    offset = rs + type_len;
+
+    return rs;
+}
+
+/**
+ * \brief Stub function used to run the kernel
+ *
+ * Returns the stub cached in \c p_call_function when there is one (the
+ * constructor already builds it). Otherwise a stub named "stub" is built
+ * here, under \c p_call_function_mutex, and cached.
+ *
+ * The stub has the form
+ *
+ * \code
+ * void stub(void *args) {
+ *     kernel(*(int *)((char *)args + 0),
+ *            *(float **)((char *)args + sizeof(int)),
+ *            *(sampler_t *)((char *)args + sizeof(int) + sizeof(float *)));
+ * }
+ * \endcode
+ *
+ * which in LLVM is expressed in the form of:
+ *
+ * \code
+ * @stub(i8* args) {
+ *     kernel(
+ *         load(i32* bitcast(i8* getelementptr(i8* args, i64 0), i32*)),
+ *         load(float** bitcast(i8* getelementptr(i8* args, i64 4), float**)),
+ *         ...
+ *     );
+ * }
+ * \endcode
+ *
+ * \return the stub, or NULL if the kernel cannot be found by name in the
+ *         JIT (callers such as \c CPUKernelWorkGroup::run() test for NULL)
+ */
+llvm::Function *CPUKernel::callFunction()
+{
+    pthread_mutex_lock(&p_call_function_mutex);
+
+    // If we can reuse the same function between work groups, do it
+    if (p_call_function)
+    {
+        llvm::Function *rs = p_call_function;
+        pthread_mutex_unlock(&p_call_function_mutex);
+
+        return rs;
+    }
+
+    // Look the kernel up by name in the JIT of the device-dependent program
+    const char *fn_name = kernel()->p_name.c_str();
+    Program *p = (Program *)kernel()->parent();
+    CPUProgram *prog = (CPUProgram *)(p->deviceDependentProgram(device()));
+    llvm::Function *t_function = prog->jit()->FindFunctionNamed(fn_name);
+
+    // Without a callee there is nothing to build; bail out before anything
+    // is added to the module instead of emitting a call to a null function.
+    if (!t_function)
+    {
+        pthread_mutex_unlock(&p_call_function_mutex);
+
+        return NULL;
+    }
+
+    llvm::FunctionType *kernel_function_type = p_function->getFunctionType();
+    llvm::FunctionType *stub_function_type = llvm::FunctionType::get(
+        p_function->getReturnType(),
+        llvm::Type::getInt8PtrTy(
+            p_function->getContext()),
+        false);
+    llvm::Function *stub_function = llvm::Function::Create(
+        stub_function_type,
+        llvm::Function::InternalLinkage,
+        "stub",
+        p_function->getParent());
+
+    // Insert a basic block
+    llvm::BasicBlock *basic_block = llvm::BasicBlock::Create(
+        p_function->getContext(),
+        "",
+        stub_function);
+
+    // Create the function arguments
+    llvm::Argument &stub_arg = stub_function->getArgumentList().front();
+    llvm::SmallVector<llvm::Value *, 8> args;
+    size_t args_offset = 0;
+
+    for (unsigned int i=0; i<kernel_function_type->getNumParams(); ++i)
+    {
+        llvm::Type *param_type = kernel_function_type->getParamType(i);
+        llvm::Type *param_type_ptr = param_type->getPointerTo(); // We'll use pointers to the value
+        const Kernel::Arg *arg = p_kernel->arg(i);
+
+        // Calculate the size of the arg
+        size_t arg_size = arg->valueSize() * arg->vecDim();
+
+        // Get where to place this argument
+        size_t arg_offset = typeOffset(args_offset, arg_size);
+
+        // %1 = getelementptr(args, $arg_offset);
+        llvm::Value *getelementptr = llvm::GetElementPtrInst::CreateInBounds(
+            &stub_arg,
+            llvm::ConstantInt::get(stub_function->getContext(),
+                                   llvm::APInt(64, arg_offset)),
+            "",
+            basic_block);
+
+        // %2 = bitcast(%1, $param_type_ptr)
+        llvm::Value *bitcast = new llvm::BitCastInst(
+            getelementptr,
+            param_type_ptr,
+            "",
+            basic_block);
+
+        // %3 = load(%2), declared as aligned on the argument size (this is
+        // what enables things like fast movaps)
+        llvm::Value *load = new llvm::LoadInst(
+            bitcast,
+            "",
+            false,
+            arg_size,
+            basic_block);
+
+        // We have the value, send it to the function
+        args.push_back(load);
+    }
+
+    // Create the call instruction
+    llvm::CallInst *call_inst = llvm::CallInst::Create(
+        t_function,
+        args,
+        "",
+        basic_block);
+    call_inst->setCallingConv(p_function->getCallingConv());
+    call_inst->setTailCall();
+
+    // Create a return instruction to end the stub
+    llvm::ReturnInst::Create(
+        p_function->getContext(),
+        basic_block);
+
+    // Retain the function if it can be reused
+    p_call_function = stub_function;
+
+    pthread_mutex_unlock(&p_call_function_mutex);
+
+    return stub_function;
+}
+
+/*
+ * CPUKernelEvent
+ */
+/**
+ * \brief Prepare the dispatching state of a kernel event
+ *
+ * The current work-group index starts at (0, 0, ..., 0). For every
+ * dimension, the index of the last work-group (count - 1) is stored in
+ * \c p_max_work_groups, and \c p_num_wg receives the total number of
+ * work-groups.
+ *
+ * \param device device running the kernel
+ * \param event \c Coal::KernelEvent describing the kernel run
+ */
+CPUKernelEvent::CPUKernelEvent(CPUDevice *device, KernelEvent *event)
+: p_device(device), p_event(event), p_current_wg(0), p_finished_wg(0),
+  p_kernel_args(0)
+{
+    pthread_mutex_init(&p_mutex, 0);
+
+    const cl_uint dims = event->work_dim();
+
+    p_num_wg = 1;
+
+    for (cl_uint d = 0; d < dims; ++d)
+    {
+        // Number of work-groups along this dimension
+        const size_t groups =
+            event->global_work_size(d) / event->local_work_size(d);
+
+        p_current_work_group[d] = 0;
+        p_max_work_groups[d] = groups - 1;  // 0..n-1, not 1..n
+        p_num_wg *= groups;
+    }
+}
+
+/**
+ * \brief Destroy the mutex and release the cached kernel arguments
+ */
+CPUKernelEvent::~CPUKernelEvent()
+{
+    pthread_mutex_destroy(&p_mutex);
+
+    // free() accepts a null pointer, no guard is needed
+    std::free(p_kernel_args);
+}
+
+/**
+ * \brief Lock the event before taking a work-group
+ *
+ * The mutex stays locked on return; \c takeInstance() unlocks it.
+ *
+ * \return true if the work-group about to be taken is the last one
+ */
+bool CPUKernelEvent::reserve()
+{
+    pthread_mutex_lock(&p_mutex);
+
+    const bool next_is_last = (p_current_wg + 1 == p_num_wg);
+
+    return next_is_last;
+}
+
+/**
+ * \brief Tell whether all the work-groups have finished
+ * \return true once \c p_finished_wg has reached \c p_num_wg
+ */
+bool CPUKernelEvent::finished()
+{
+    pthread_mutex_lock(&p_mutex);
+    const bool all_done = (p_num_wg == p_finished_wg);
+    pthread_mutex_unlock(&p_mutex);
+
+    return all_done;
+}
+
+/**
+ * \brief Record, under the event mutex, that one more work-group finished
+ */
+void CPUKernelEvent::workGroupFinished()
+{
+    pthread_mutex_lock(&p_mutex);
+    ++p_finished_wg;
+    pthread_mutex_unlock(&p_mutex);
+}
+
+/**
+ * \brief Create the next work-group and advance the dispatching state
+ *
+ * Must be called with the event locked by \c reserve(); the event is
+ * unlocked before returning.
+ *
+ * \return newly allocated work-group for the current work-group index
+ */
+CPUKernelWorkGroup *CPUKernelEvent::takeInstance()
+{
+    CPUKernel *cpu_kernel = (CPUKernel *)p_event->deviceKernel();
+
+    // The work-group copies the index, so it can be advanced right after
+    CPUKernelWorkGroup *instance =
+        new CPUKernelWorkGroup(cpu_kernel, p_event, this, p_current_work_group);
+
+    incVec(p_event->work_dim(), p_current_work_group, p_max_work_groups);
+    ++p_current_wg;
+
+    // Release the lock taken in reserve()
+    pthread_mutex_unlock(&p_mutex);
+
+    return instance;
+}
+
+void *CPUKernelEvent::kernelArgs() const
+{
+ return p_kernel_args; // null until cacheKernelArgs() has stored a block
+}
+
+void CPUKernelEvent::cacheKernelArgs(void *args)
+{
+ p_kernel_args = args; // stored as-is; the destructor std::free()s it, and a previously cached block is not released here
+}
+
+/*
+ * CPUKernelWorkGroup
+ */
+/**
+ * \brief Describe one work-group of a kernel run
+ *
+ * Copies the work-group index and derives from \p event, per dimension,
+ * the highest local id and the global id of the first work-item, as well
+ * as the total number of work-items in the group.
+ *
+ * \param kernel kernel to run
+ * \param event event containing information about the kernel run
+ * \param cpu_event CPU-specific information and cache about \p event
+ * \param work_group_index index of this work-group in the kernel
+ */
+CPUKernelWorkGroup::CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event,
+                                       CPUKernelEvent *cpu_event,
+                                       const size_t *work_group_index)
+: p_kernel(kernel), p_cpu_event(cpu_event), p_event(event),
+  p_work_dim(event->work_dim()), p_contexts(0), p_stack_size(8192 /* TODO */),
+  p_had_barrier(false)
+{
+    p_num_work_items = 1;
+
+    for (unsigned int d = 0; d < p_work_dim; ++d)
+    {
+        const size_t local_size = event->local_work_size(d);
+
+        p_index[d] = work_group_index[d];
+        p_max_local_id[d] = local_size - 1;  // 0..n-1, not 1..n
+        p_num_work_items *= local_size;
+
+        // Global id of the first work-item of this group
+        p_global_id_start_offset[d] =
+            (p_index[d] * local_size) + event->global_work_offset(d);
+    }
+}
+
+CPUKernelWorkGroup::~CPUKernelWorkGroup()
+{
+ p_cpu_event->workGroupFinished(); // destroying the work-group is what counts it as finished in the CPUKernelEvent
+}
+
+/**
+ * \brief Build the packed argument block read by the call stub
+ *
+ * Every kernel argument is stored at the offset given by
+ * \c CPUKernel::typeOffset(). Buffers are replaced by the address of their
+ * CPU storage, \c __local buffers by a freshly \c malloc()'ed area, images
+ * are allocated and then copied like plain values.
+ *
+ * If the kernel has no \c __local argument, the block is cached in the
+ * \c CPUKernelEvent and reused by later calls.
+ *
+ * \param locals_to_free receives every \c __local area allocated here; the
+ *        caller must \c free() them
+ * \return address of the argument block, or NULL if it cannot be allocated
+ */
+void *CPUKernelWorkGroup::callArgs(std::vector<void *> &locals_to_free)
+{
+    Kernel *kernel = p_kernel->kernel();
+    const bool has_locals = kernel->hasLocals();
+
+    // We have cached the args and can reuse them
+    if (!has_locals && p_cpu_event->kernelArgs())
+        return p_cpu_event->kernelArgs();
+
+    // First pass: size of the block given the alignment constraints
+    const unsigned int num_args = kernel->numArgs();
+    size_t args_size = 0;
+
+    for (unsigned int i=0; i<num_args; ++i)
+    {
+        const Kernel::Arg *arg = kernel->arg(i);
+        CPUKernel::typeOffset(args_size, arg->valueSize() * arg->vecDim());
+    }
+
+    void *rs = std::malloc(args_size);
+
+    if (!rs)
+        return NULL;
+
+    // Second pass: place the arguments
+    size_t arg_offset = 0;
+
+    for (unsigned int i=0; i<num_args; ++i)
+    {
+        const Kernel::Arg *arg = kernel->arg(i);
+        size_t size = arg->valueSize() * arg->vecDim();
+        size_t offset = CPUKernel::typeOffset(arg_offset, size);
+
+        // Where to place the argument
+        unsigned char *target = (unsigned char *)rs + offset;
+
+        // We may have to perform some changes in the values (buffers, etc)
+        switch (arg->kind())
+        {
+            case Kernel::Arg::Buffer:
+            {
+                if (arg->file() == Kernel::Arg::Local)
+                {
+                    // __local arguments carry a size and no value: their
+                    // stored data is deliberately not read. Alloc a buffer
+                    // and pass it to the kernel.
+                    void *local_buffer = std::malloc(arg->allocAtKernelRuntime());
+                    locals_to_free.push_back(local_buffer);
+                    *(void **)target = local_buffer;
+                    break;
+                }
+
+                MemObject *buffer = *(MemObject **)arg->data();
+
+                if (!buffer)
+                {
+                    // We can do that, just send NULL
+                    *(void **)target = NULL;
+                    break;
+                }
+
+                // Get the CPU buffer, allocate it and get its pointer
+                CPUBuffer *cpubuf =
+                    (CPUBuffer *)buffer->deviceBuffer(p_kernel->device());
+
+                buffer->allocate(p_kernel->device());
+                *(void **)target = cpubuf->data();
+                break;
+            }
+            case Kernel::Arg::Image2D:
+            case Kernel::Arg::Image3D:
+            {
+                // We need to ensure the image is allocated
+                Image2D *image = *(Image2D **)arg->data();
+                image->allocate(p_kernel->device());
+
+                // Fall through to the memcpy
+            }
+            default:
+                // Simply copy the arg's data into the buffer
+                std::memcpy(target, arg->data(), size);
+                break;
+        }
+    }
+
+    // Cache the arguments if we can do so
+    if (!has_locals)
+        p_cpu_event->cacheKernelArgs(rs);
+
+    return rs;
+}
+
+bool CPUKernelWorkGroup::run()
+{
+ /* Runs the work-items of this work-group through the call stub.
+ * Returns false when no call stub is available, true otherwise.
+ * First step: get the kernel function (stub) to call. */
+ std::vector<void *> locals_to_free; // __local areas allocated by callArgs(), freed at the end
+ llvm::Function *kernel_func = p_kernel->callFunction();
+
+ if (!kernel_func)
+ return false;
+
+ Program *p = (Program *)p_kernel->kernel()->parent();
+ CPUProgram *prog = (CPUProgram *)(p->deviceDependentProgram(p_kernel->device()));
+
+ // Make object usable for execution: (only applies to MCJIT):
+ prog->jit()->finalizeObject();
+
+ std::string kname = kernel_func->getName().str(); // name of the stub, used for the lookup below
+
+ // Original lookup, through the llvm::Function itself
+ p_kernel_func_addr =
+ (void(*)(void *))prog->jit()->getPointerToFunction(kernel_func);
+
+ // NOTE(review): the address obtained just above is overwritten right
+ // below by a lookup of the stub by name (getFunctionAddress). Whether
+ // the first lookup is still needed for its side effects is not visible
+// here; confirm before removing either of the two assignments.
+ p_kernel_func_addr =(void(*)(void *)) prog->jit()->getFunctionAddress(kname);
+
+ // Get the arguments (NOTE(review): a NULL result is not checked here)
+ p_args = callArgs(locals_to_free);
+
+ // Tell the builtins this thread will run a kernel work group
+ setThreadLocalWorkGroup(this);
+
+ // Initialize the dummy context used by the builtins before a call to barrier()
+ p_current_work_item = 0;
+ p_current_context = &p_dummy_context;
+
+ std::memset(p_dummy_context.local_id, 0, p_work_dim * sizeof(size_t));
+
+ do
+ {
+ // Simply call the "call function", it and the builtins will do the rest
+ p_kernel_func_addr(p_args);
+ } while (!p_had_barrier &&
+ !incVec(p_work_dim, p_dummy_context.local_id, p_max_local_id)); // one call per local id, until a barrier is seen or incVec() returns true
+
+ // If no barrier() call was made, all is fine. Otherwise, only the first
+ // work-item has currently finished. We must let the others run.
+ if (p_had_barrier)
+ {
+ Context *main_context = p_current_context; // After the first swapcontext,
+ // we will not be able to trust
+ // p_current_context anymore.
+
+ // We'll call swapcontext for each remaining work-item. They will
+ // finish, and when they'll do so, this main context will be resumed, so
+ // it's easy (i starts from 1 because the main context already finished)
+ for (unsigned int i=1; i<p_num_work_items; ++i)
+ {
+ Context *ctx = getContextAddr(i);
+ swapcontext(&main_context->context, &ctx->context);
+ }
+ }
+
+ // Free the allocated locals. The argument block is freed with them: when
+ // the kernel has locals, callArgs() does not cache it in the event.
+ if (p_kernel->kernel()->hasLocals())
+ {
+ for (size_t i=0; i<locals_to_free.size(); ++i)
+ {
+ std::free(locals_to_free[i]);
+ }
+
+ std::free(p_args);
+ }
+
+ return true;
+}
+
+/**
+ * \brief Address of the context of a work-item
+ *
+ * \c p_contexts is addressed as an array whose elements are
+ * <tt>p_stack_size + sizeof(Context)</tt> bytes long.
+ *
+ * \param index index of the work-item in the work-group
+ * \return pointer to the \c Context of that work-item
+ */
+CPUKernelWorkGroup::Context *CPUKernelWorkGroup::getContextAddr(unsigned int index)
+{
+    const size_t stride = p_stack_size + sizeof(Context);
+    char *base = (char *)p_contexts;
+
+    return (Context *)(base + stride * index);
+}
diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h
new file mode 100644
index 0000000..ab4d1ac
--- /dev/null
+++ b/src/core/cpu/kernel.h
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/kernel.h
+ * \brief CPU kernel
+ */
+
+#ifndef __CPU_KERNEL_H__
+#define __CPU_KERNEL_H__
+
+#include "../deviceinterface.h"
+#include <core/config.h>
+
+#include <llvm/ExecutionEngine/GenericValue.h>
+#include <vector>
+#include <string>
+
+#include <ucontext.h>
+#include <pthread.h>
+#include <stdint.h>
+
+namespace llvm
+{
+ class Function;
+}
+
+namespace Coal
+{
+
+class CPUDevice;
+class Kernel;
+class KernelEvent;
+class Image2D;
+class Image3D;
+
+/**
+ * \brief CPU kernel
+ *
+ * This class holds passive information about a kernel (\c Coal::Kernel object
+ * and device on which it is run) and provides the \c callFunction() function.
+ *
+ * This function is described at the end of \ref llvm .
+ *
+ * \see Coal::CPUKernelWorkGroup
+ */
+class CPUKernel : public DeviceKernel
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param device device on which the kernel will be run
+ * \param kernel \c Coal::Kernel object holding information about this
+ * kernel
+ * \param function \c llvm::Function to run
+ */
+ CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function);
+ ~CPUKernel();
+
+ size_t workGroupSize(); /*!< \brief Queries the device's \c CL_DEVICE_MAX_WORK_GROUP_SIZE */
+ cl_ulong localMemSize() const; /*!< \brief Not implemented yet, returns 0 */
+ cl_ulong privateMemSize() const; /*!< \brief Not implemented yet, returns 0 */
+ size_t preferredWorkGroupSizeMultiple() const; /*!< \brief Number of CPUs of the device */
+ size_t guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
+ size_t global_work_size) const; /*!< \brief Work-group size to use in a dimension when the caller gave none (presumably; confirm at the call site) */
+
+ Kernel *kernel() const; /*!< \brief \c Coal::Kernel object this kernel will run */
+ CPUDevice *device() const; /*!< \brief device on which the kernel will be run */
+
+ llvm::Function *function() const; /*!< \brief \c llvm::Function representing the kernel but <strong>not to be run</strong> */
+ llvm::Function *callFunction(); /*!< \brief stub function used to run the kernel, see \ref llvm */
+
+ /**
+ * \brief Calculate where to place a value in an array
+ *
+ * This function is used to calculate where to place a value in an
+ * array given its size, properly aligning it. The size is rounded up
+ * to the next power of two before being used.
+ *
+ * This function is called repeatedly to obtain the aligned position of
+ * each value that must be placed in the array
+ *
+ * \code
+ * size_t array_len = 0, array_offset = 0;
+ * void *array;
+ *
+ * // First, get the array size given alignment constraints
+ * typeOffset(array_len, sizeof(int));
+ * typeOffset(array_len, sizeof(float));
+ * typeOffset(array_len, sizeof(void *));
+ *
+ * // Then, allocate memory
+ * array = malloc(array_len);
+ *
+ * // Finally, place the arguments
+ * *(int *)((char *)array + typeOffset(array_offset, sizeof(int))) = 1337;
+ * *(float *)((char *)array + typeOffset(array_offset, sizeof(float))) = 3.1415f;
+ * *(void **)((char *)array + typeOffset(array_offset, sizeof(void *))) = array;
+ * \endcode
+ *
+ * \param offset offset at which the value will be placed. This variable
+ * gets incremented by <tt>type_len + padding</tt>.
+ * \param type_len size in bytes of the value that will be stored
+ * \return offset at which the value will be stored (equal to \p offset
+ * before incrementation, plus the alignment padding).
+ */
+ static size_t typeOffset(size_t &offset, size_t type_len);
+
+ private:
+ CPUDevice *p_device; /*!< device given to the constructor */
+ Kernel *p_kernel; /*!< \c Coal::Kernel given to the constructor */
+ llvm::Function *p_function, *p_call_function; /*!< the kernel itself, and the stub returned by \c callFunction() */
+ pthread_mutex_t p_call_function_mutex; /*!< locked by \c callFunction() while it reads or builds \c p_call_function */
+};
+
+class CPUKernelEvent;
+
+/**
+ * \brief CPU kernel work-group
+ *
+ * This class represents a bulk of work-items that will be run. It is the one
+ * that actually runs the kernel for its work-items.
+ *
+ * \see \ref llvm
+ * \nosubgrouping
+ */
+class CPUKernelWorkGroup
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param kernel kernel to run
+ * \param event event containing information about the kernel run
+ * \param cpu_event CPU-specific information and cache about \p event
+ * \param work_group_index index of this work-group in the kernel
+ */
+ CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event,
+ CPUKernelEvent *cpu_event,
+ const size_t *work_group_index);
+ ~CPUKernelWorkGroup();
+
+ /**
+ * \brief Build a structure of arguments
+ *
+ * As C doesn't support calling functions with variable arguments
+ * unknown at compile time, this function builds the list of
+ * arguments in memory. This array will then be passed to a LLVM stub
+ * function reading it and passing its values to the actual kernel.
+ *
+ * \see \ref llvm
+ * \param locals_to_free if this kernel takes \c __local arguments, they
+ * must be \c malloc()'ed for every work-group.
+ * They are placed in this vector to be
+ * \c free()'ed at the end of \c run().
+ * \return address of a memory location containing the arguments
+ */
+ void *callArgs(std::vector<void *> &locals_to_free);
+
+ /**
+ * \brief Run the work-group
+ *
+ * This function is the core of CPU-acceleration. It runs the work-items
+ * of this work-group given the correct arguments.
+ *
+ * \see \ref llvm
+ * \see \ref barrier
+ * \see callArgs()
+ * \return true if success, false in case of an error
+ */
+ bool run();
+
+ /**
+ * \name Native implementation of built-in OpenCL C functions
+ * @{
+ */
+ size_t getGlobalId(cl_uint dimindx) const;
+ cl_uint getWorkDim() const;
+ size_t getGlobalSize(cl_uint dimindx) const;
+ size_t getLocalSize(cl_uint dimindx) const;
+ size_t getLocalID(cl_uint dimindx) const;
+ size_t getNumGroups(cl_uint dimindx) const;
+ size_t getGroupID(cl_uint dimindx) const;
+ size_t getGlobalOffset(cl_uint dimindx) const;
+
+ void barrier(unsigned int flags);
+
+ void *getImageData(Image2D *image, int x, int y, int z) const;
+
+ void writeImage(Image2D *image, int x, int y, int z, float *color) const;
+ void writeImage(Image2D *image, int x, int y, int z, int32_t *color) const;
+ void writeImage(Image2D *image, int x, int y, int z, uint32_t *color) const;
+
+ void readImage(float *result, Image2D *image, int x, int y, int z,
+ uint32_t sampler) const;
+ void readImage(int32_t *result, Image2D *image, int x, int y, int z,
+ uint32_t sampler) const;
+ void readImage(uint32_t *result, Image2D *image, int x, int y, int z,
+ uint32_t sampler) const;
+
+ void readImage(float *result, Image2D *image, float x, float y, float z,
+ uint32_t sampler) const;
+ void readImage(int32_t *result, Image2D *image, float x, float y, float z,
+ uint32_t sampler) const;
+ void readImage(uint32_t *result, Image2D *image, float x, float y, float z,
+ uint32_t sampler) const;
+ /**
+ * @}
+ */
+
+ /**
+ * \brief Function called when a built-in name cannot be found
+ */
+ void builtinNotFound(const std::string &name) const;
+
+ private:
+ template<typename T>
+ void writeImageImpl(Image2D *image, int x, int y, int z, T *color) const;
+ template<typename T>
+ void readImageImplI(T *result, Image2D *image, int x, int y, int z,
+ uint32_t sampler) const;
+ template<typename T>
+ void readImageImplF(T *result, Image2D *image, float x, float y, float z,
+ uint32_t sampler) const;
+ template<typename T>
+ void linear3D(T *result, float a, float b, float c,
+ int i0, int j0, int k0, int i1, int j1, int k1,
+ Image3D *image) const;
+ template<typename T>
+ void linear2D(T *result, float a, float b, float c, int i0, int j0,
+ int i1, int j1, Image2D *image) const;
+
+ private:
+ CPUKernel *p_kernel;
+ CPUKernelEvent *p_cpu_event; /*!< notified through \c workGroupFinished() by the destructor */
+ KernelEvent *p_event;
+ cl_uint p_work_dim; /*!< \c work_dim() of the event, read once in the constructor */
+ size_t p_index[MAX_WORK_DIMS],
+ p_max_local_id[MAX_WORK_DIMS],
+ p_global_id_start_offset[MAX_WORK_DIMS]; /*!< work-group index, highest local id (local size - 1) and global id of the first work-item, per dimension */
+
+ void (*p_kernel_func_addr)(void *); /*!< JIT-compiled stub, set and called by \c run() */
+ void *p_args; /*!< argument block returned by \c callArgs() */
+
+ // Machinery to have barrier() working
+ struct Context
+ {
+ size_t local_id[MAX_WORK_DIMS];
+ ucontext_t context;
+ unsigned int initialized;
+ };
+
+ Context *getContextAddr(unsigned int index); /*!< \brief Context of work-item \p index inside \c p_contexts */
+
+ Context *p_current_context;
+ Context p_dummy_context; /*!< context used by \c run() until a barrier is encountered */
+ void *p_contexts; /*!< addressed as elements of <tt>p_stack_size + sizeof(Context)</tt> bytes; null after construction */
+ size_t p_stack_size; /*!< set to 8192 by the constructor (marked TODO there) */
+ unsigned int p_num_work_items, p_current_work_item;
+ bool p_had_barrier; /*!< false after construction; \c run() tests it after each kernel call (presumably set by \c barrier(); confirm in builtins.cpp) */
+};
+
+/**
+ * \brief CPU-specific information about a kernel event
+ *
+ * This class, put in a \c Coal::KernelEvent device-data field
+ * (see \c Coal::Event::setDeviceData()), is responsible for dispatching the
+ * \c Coal::CPUKernelWorkGroup objects between the CPU worker threads.
+ *
+ * It also holds the kernel arguments cached by
+ * \c Coal::CPUKernelWorkGroup::callArgs(), which it frees on destruction.
+ */
+class CPUKernelEvent
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param device device running the kernel
+ * \param event \c Coal::KernelEvent holding device-agnostic data
+ * about the event
+ */
+ CPUKernelEvent(CPUDevice *device, KernelEvent *event);
+ ~CPUKernelEvent();
+
+ bool reserve(); /*!< \brief Locks the event. Returns true if the next work-group that will be taken is the last one */
+ bool finished(); /*!< \brief All the work groups have finished */
+ CPUKernelWorkGroup *takeInstance(); /*!< \brief Must be called exactly one time after reserve(). Unlocks the event */
+
+ void *kernelArgs() const; /*!< \brief Return the cached kernel arguments */
+ void cacheKernelArgs(void *args); /*!< \brief Cache pre-built kernel arguments */
+
+ void workGroupFinished(); /*!< \brief A work-group has just finished */
+
+ private:
+ CPUDevice *p_device;
+ KernelEvent *p_event;
+ size_t p_current_work_group[MAX_WORK_DIMS],
+ p_max_work_groups[MAX_WORK_DIMS]; /*!< index of the next work-group to hand out, and highest index (count - 1), per dimension */
+ size_t p_current_wg, p_finished_wg, p_num_wg; /*!< work-groups handed out, work-groups finished, total number of work-groups */
+ pthread_mutex_t p_mutex; /*!< locked by reserve() and released by takeInstance(); also taken by finished() and workGroupFinished() */
+ void *p_kernel_args; /*!< cached argument block, released with std::free() by the destructor */
+};
+
+}
+
+#endif
diff --git a/src/core/cpu/program.cpp b/src/core/cpu/program.cpp
new file mode 100644
index 0000000..7eb632c
--- /dev/null
+++ b/src/core/cpu/program.cpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/program.cpp
+ * \brief CPU program
+ */
+
+#include "program.h"
+#include "device.h"
+#include "kernel.h"
+#include "builtins.h"
+
+#include "../program.h"
+
+#include <llvm/PassManager.h>
+#include <llvm/Analysis/Passes.h>
+#include <llvm/IR/Verifier.h>
+#include <llvm/Transforms/Scalar.h>
+#include <llvm/Transforms/IPO.h>
+#include <llvm/ExecutionEngine/ExecutionEngine.h>
+#include <llvm/ExecutionEngine/MCJIT.h>
+#include <llvm/ExecutionEngine/SectionMemoryManager.h>
+#include <llvm/ExecutionEngine/Interpreter.h>
+#include <llvm/Support/ErrorHandling.h>
+
+#include <string>
+#include <iostream>
+
+using namespace Coal;
+using namespace llvm;
+
+// Create a custom memory manager for MCJIT
+// Section memory manager handed to MCJIT by CPUProgram::initJIT(). It only
+// overrides symbol lookup, see getSymbolAddress() below.
+class ClientMemoryManager : public SectionMemoryManager
+{
+ // Copy construction and assignment are declared private and marked with
+ // LLVM_DELETED_FUNCTION: instances are not meant to be copied.
+ ClientMemoryManager(const ClientMemoryManager&) LLVM_DELETED_FUNCTION;
+ void operator=(const ClientMemoryManager&) LLVM_DELETED_FUNCTION;
+
+public:
+ ClientMemoryManager() {}
+ virtual ~ClientMemoryManager() {}
+
+ /// This method returns the (host) address of the specified function.
+ /// Names unknown to the inherited lookup are passed to getBuiltin().
+ virtual uint64_t getSymbolAddress(const std::string &Name);
+};
+
+/**
+ * \brief Resolve an external symbol referenced by JIT-compiled code
+ *
+ * The lookup inherited from \c RTDyldMemoryManager is tried first. When it
+ * yields no address, the name is handed to \c getBuiltin(). A symbol that
+ * neither source can resolve is reported with \c report_fatal_error().
+ *
+ * NOTE(review): cpu/program.h states that unknown symbols are not resolved
+ * through \c dlsym(); confirm that the inherited lookup honours this.
+ *
+ * \param Name name of the symbol to resolve
+ * \return host address of the symbol
+ */
+uint64_t ClientMemoryManager::getSymbolAddress(const std::string &Name)
+{
+    uint64_t resolved = RTDyldMemoryManager::getSymbolAddress(Name);
+
+    if (resolved)
+        return resolved;
+
+    // Unknown to the inherited lookup: fall back to the native built-ins.
+    resolved = (uint64_t)getBuiltin(Name);
+
+    if (resolved)
+        return resolved;
+
+    report_fatal_error("OpenCL program references external function '" + Name +
+                       "' which could not be resolved!");
+    return resolved;
+}
+
+/**
+ * \brief Constructor
+ *
+ * Both the execution engine and the module start out null. \c initJIT()
+ * tests \c p_module, so it must hold a defined value even when \c build()
+ * has not been called yet.
+ *
+ * \param device CPU device to which this program is attached
+ * \param program \c Coal::Program that will be run
+ */
+CPUProgram::CPUProgram(CPUDevice *device, Program *program)
+: DeviceProgram(), p_device(device), p_program(program), p_jit(0), p_module(0)
+{
+
+}
+
+/**
+ * \brief Destructor
+ *
+ * Destroys the execution engine, if one was created. The module is removed
+ * from the engine first so that deleting the engine does not delete it.
+ */
+CPUProgram::~CPUProgram()
+{
+    // initJIT() was never called, or it failed: nothing to release.
+    if (!p_jit)
+        return;
+
+    // Dont delete the module
+    p_jit->removeModule(p_module);
+    delete p_jit;
+}
+
+// Always answers true for the CPU device.
+// NOTE(review): presumably tells the core program code to link the OpenCL C
+// standard library into the module - confirm against DeviceProgram.
+bool CPUProgram::linkStdLib() const
+{
+ return true;
+}
+
+/**
+ * \brief Fill a pass manager with the passes used for CPU programs
+ *
+ * Passes are only scheduled when \p optimize is true; otherwise \p manager is
+ * left untouched. \p hasBarrier is not used by this implementation.
+ *
+ * \param manager pass manager receiving the passes
+ * \param optimize true to schedule the optimization pipeline
+ * \param hasBarrier ignored
+ */
+void CPUProgram::createOptimizationPasses(llvm::PassManager *manager,
+                                          bool optimize, bool hasBarrier)
+{
+    if (!optimize)
+        return;
+
+    /*
+     * Inspired by code from "The LLVM Compiler Infrastructure"
+     */
+
+    // Inter-procedural clean-up
+    manager->add(llvm::createDeadArgEliminationPass());
+    manager->add(llvm::createInstructionCombiningPass());
+    manager->add(llvm::createFunctionInliningPass());
+    manager->add(llvm::createPruneEHPass());              // Remove dead EH info.
+    manager->add(llvm::createGlobalOptimizerPass());
+    manager->add(llvm::createGlobalDCEPass());            // Remove dead functions.
+    manager->add(llvm::createArgumentPromotionPass());
+
+    // Scalar simplification
+    manager->add(llvm::createInstructionCombiningPass());
+    manager->add(llvm::createJumpThreadingPass());
+    manager->add(llvm::createScalarReplAggregatesPass());
+
+    // Attribute and alias information, then loop and redundancy passes
+    manager->add(llvm::createFunctionAttrsPass());        // Add nocapture.
+    manager->add(llvm::createGlobalsModRefPass());        // IP alias analysis.
+    manager->add(llvm::createLICMPass());                 // Hoist loop invariants.
+    manager->add(llvm::createGVNPass());                  // Remove redundancies.
+    manager->add(llvm::createMemCpyOptPass());            // Remove dead memcpys.
+    manager->add(llvm::createDeadStoreEliminationPass());
+
+    // Final clean-up
+    manager->add(llvm::createInstructionCombiningPass());
+    manager->add(llvm::createJumpThreadingPass());
+    manager->add(llvm::createCFGSimplificationPass());
+}
+
+// Remembers the module so that initJIT() can hand it to the execution engine
+// later. No code is generated here, binary_str is not touched and the call
+// always succeeds.
+bool CPUProgram::build(llvm::Module *module, std::string *binary_str)
+{
+ // Nothing to build
+ p_module = module;
+
+ return true;
+}
+
+/**
+ * \brief Create the execution engine for this program, once
+ *
+ * Returns immediately when an engine already exists, and fails when no module
+ * has been set. Otherwise an MCJIT engine is built for the module, with a
+ * \c ClientMemoryManager in charge of symbol resolution. On failure the error
+ * reported by LLVM is printed on the standard output.
+ *
+ * \return true if an engine is available, false otherwise
+ */
+bool CPUProgram::initJIT()
+{
+    if (p_jit)
+        return true;
+
+    if (!p_module)
+        return false;
+
+    // Create the JIT
+    std::string err;
+    llvm::EngineBuilder builder(p_module);
+
+    builder.setErrorStr(&err);
+    builder.setUseMCJIT(true);
+    builder.setMCJITMemoryManager(new ClientMemoryManager());
+
+    p_jit = builder.create();
+
+    if (p_jit)
+        return true;
+
+    std::cout << "Unable to create a JIT: " << err << std::endl;
+    return false;
+}
+
+// Returns the execution engine created by initJIT(), or null when initJIT()
+// has not run successfully yet.
+llvm::ExecutionEngine *CPUProgram::jit() const
+{
+ return p_jit;
+}
diff --git a/src/core/cpu/program.h b/src/core/cpu/program.h
new file mode 100644
index 0000000..0a08d61
--- /dev/null
+++ b/src/core/cpu/program.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/program.h
+ * \brief CPU program
+ */
+
+#ifndef __CPU_PROGRAM_H__
+#define __CPU_PROGRAM_H__
+
+#include "../deviceinterface.h"
+
+namespace llvm
+{
+ class ExecutionEngine;
+ class Module;
+}
+
+namespace Coal
+{
+
+class CPUDevice;
+class Program;
+
+/**
+ * \brief CPU program
+ *
+ * This class implements the \c Coal::DeviceProgram interface for CPU
+ * acceleration.
+ *
+ * Its main purpose is to initialize an \c llvm::ExecutionEngine to run LLVM
+ * bitcode, in \c initJIT().
+ */
+class CPUProgram : public DeviceProgram
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param device CPU device to which this program is attached
+ * \param program \c Coal::Program that will be run
+ */
+ CPUProgram(CPUDevice *device, Program *program);
+ ~CPUProgram(); /*!< \brief Deletes the execution engine, if any, but not the module */
+
+ bool linkStdLib() const; /*!< \brief Always true for the CPU device */
+ /**
+ * \brief Add the CPU optimization passes to \p manager
+ * \param manager pass manager to fill
+ * \param optimize passes are only added when this is true
+ * \param hasBarrier not used by the CPU implementation
+ */
+ void createOptimizationPasses(llvm::PassManager *manager,
+ bool optimize, bool hasBarrier=false);
+ /**
+ * \brief Store \p module for a later \c initJIT()
+ * \param module LLVM module of the program
+ * \param binary_str not used by the CPU implementation
+ * \return always true
+ */
+ bool build(llvm::Module *module, std::string *binary_str);
+
+ /**
+ * \brief Initialize an LLVM execution engine
+ *
+ * This function creates an MCJIT \c llvm::ExecutionEngine to run this
+ * program on the CPU. A few implementation details :
+ *
+ * - The JIT is set not to resolve unknown symbols using \c dlsym().
+ * This way, a malicious kernel cannot execute arbitrary code on
+ * the host by declaring \c libc functions and calling them.
+ * NOTE(review): the memory manager in program.cpp tries the lookup
+ * inherited from LLVM before \c getBuiltin(); confirm this claim
+ * still holds.
+ * - All the unknown function names are passed to \c getBuiltin() to
+ * get native built-in implementations.
+ *
+ * \return true if success, false otherwise
+ */
+ bool initJIT();
+ llvm::ExecutionEngine *jit() const; /*!< \brief Current LLVM execution engine */
+
+ private:
+ CPUDevice *p_device; // device given to the constructor
+ Program *p_program; // program given to the constructor
+
+ llvm::ExecutionEngine *p_jit; // engine created by initJIT(), null before
+ llvm::Module *p_module; // module stored by build()
+};
+
+}
+
+#endif
diff --git a/src/core/cpu/sampler.cpp b/src/core/cpu/sampler.cpp
new file mode 100644
index 0000000..893e66e
--- /dev/null
+++ b/src/core/cpu/sampler.cpp
@@ -0,0 +1,769 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/sampler.cpp
+ * \brief OpenCL C image access functions
+ *
+ * It is recommended to compile this file using Clang as it supports the
+ * \c __builtin_shufflevector() built-in function, providing SSE or
+ * NEON-accelerated code.
+ */
+
+#include "../memobject.h"
+#include "../sampler.h"
+#include "kernel.h"
+#include "buffer.h"
+#include "builtins.h"
+
+#include <cstdlib>
+#include <cmath>
+// ASW #include <immintrin.h>
+
+using namespace Coal;
+
+/*
+ * Helper functions
+ */
+
+/*
+ * Restrict a to the range [b, c]. The lower bound is tested first, so b is
+ * returned when a is below it, even if b is greater than c.
+ */
+static int clamp(int a, int b, int c)
+{
+    if (a < b)
+        return b;
+
+    if (a > c)
+        return c;
+
+    return a;
+}
+
+/* Smaller of two integers. */
+static int min(int a, int b)
+{
+    if (a < b)
+        return a;
+
+    return b;
+}
+
+/* Larger of two integers. */
+static int max(int a, int b)
+{
+    if (a > b)
+        return a;
+
+    return b;
+}
+
+/*
+ * Fractional part of x, measured from the next lower integer: the result is
+ * in [0, 1) for negative inputs too (frac(-0.25f) is 0.75f).
+ */
+static float frac(float x)
+{
+    const float whole = std::floor(x);
+
+    return x - whole;
+}
+
+/*
+ * Round x to the nearest integer value.
+ *
+ * This helper is only used for the mirrored-repeat addressing mode, which the
+ * OpenCL specification defines with rint(). The previous implementation
+ * truncated toward zero, so a coordinate such as 1.5 (0.5 * 1.5 = 0.75) was
+ * mirrored around 0 instead of around 2.
+ */
+static float round(float x)
+{
+    return ::rintf(x);
+}
+
+/*
+ * Apply the addressing mode of a sampler to integer pixel coordinates.
+ *
+ * - CLK_ADDRESS_CLAMP_TO_EDGE: coordinates are clamped to the last valid pixel.
+ * - CLK_ADDRESS_CLAMP: coordinates are clamped to [-1, size]; both ends lie
+ *   outside the image and select the border color.
+ * - other modes: coordinates are left as they are.
+ *
+ * x, y and z are updated in place; z is only clamped for 3D images.
+ *
+ * Returns true when the border color must be used instead of image data.
+ * Negative coordinates are reported as border too: they used to be clamped to
+ * pixel 0 (CLK_ADDRESS_CLAMP) or read before the start of the image (linear
+ * filtering passes -1 with a null sampler).
+ */
+static bool handle_address_mode(Image2D *image, int &x, int &y, int &z,
+                                uint32_t sampler)
+{
+    bool is_3d = (image->type() == MemObject::Image3D);
+    int w = image->width(),
+        h = image->height(),
+        d = (is_3d ? ((Image3D *)image)->depth() : 1);
+
+    if ((sampler & 0xf0) == CLK_ADDRESS_CLAMP_TO_EDGE)
+    {
+        x = clamp(x, 0, w - 1);
+        y = clamp(y, 0, h - 1);
+        if (is_3d) z = clamp(z, 0, d - 1);
+    }
+    else if ((sampler & 0xf0) == CLK_ADDRESS_CLAMP)
+    {
+        x = clamp(x, -1, w);
+        y = clamp(y, -1, h);
+        if (is_3d) z = clamp(z, -1, d);
+    }
+
+    bool outside_x = (x < 0 || x >= w);
+    bool outside_y = (y < 0 || y >= h);
+    // For 2D images z is not a real coordinate: keep the historical test.
+    bool outside_z = (is_3d ? (z < 0 || z >= d) : (z == d));
+
+    return (outside_x || outside_y || outside_z);
+}
+
+/*
+ * Macros or functions used to accelerate the functions
+ */
+// Compilers without __has_builtin get a fallback that reports every builtin
+// as missing, so the portable code paths below are selected.
+#ifndef __has_builtin
+ #define __has_builtin(x) 0
+#endif
+
+/*
+ * Portable four-lane shuffle. Each selector picks one element of the
+ * concatenation of a and b: 0 to 3 address a, 4 to 7 address b. Lanes are
+ * written in order, from rs[0] to rs[3].
+ */
+static void slow_shuffle4(uint32_t *rs, uint32_t *a, uint32_t *b,
+                          int x, int y, int z, int w)
+{
+    const int selectors[4] = { x, y, z, w };
+
+    for (int lane = 0; lane < 4; ++lane)
+    {
+        const int sel = selectors[lane];
+
+        if (sel < 4)
+            rs[lane] = a[sel];
+        else
+            rs[lane] = b[sel - 4];
+    }
+}
+
+/*
+ * Store float color components in the channel data type of an image.
+ *
+ * dest     : destination pixel
+ * data     : source components
+ * type     : channel data type of the image
+ * channels : number of components to store
+ *
+ * CL_FLOAT is copied verbatim. Normalized types are scaled by the largest
+ * value of the target type; CL_SNORM_INT8 uses 127 (it used 128, so 1.0f
+ * did not fit in an int8_t). Other types leave dest untouched.
+ *
+ * NOTE(review): no saturation or rounding is applied; components outside the
+ * normalized range are not clamped.
+ */
+static void convert_to_format(void *dest, float *data,
+                              cl_channel_type type, unsigned int channels)
+{
+    if (type == CL_FLOAT)
+    {
+        std::memcpy(dest, data, channels * sizeof(float));
+        return;
+    }
+
+    for (unsigned int i=0; i<channels; ++i)
+    {
+        switch (type)
+        {
+            case CL_SNORM_INT8:
+                ((int8_t *)dest)[i] = data[i] * 127.0f;
+                break;
+            case CL_SNORM_INT16:
+                ((int16_t *)dest)[i] = data[i] * 32767.0f;
+                break;
+            case CL_UNORM_INT8:
+                ((uint8_t *)dest)[i] = data[i] * 255.0f;
+                break;
+            case CL_UNORM_INT16:
+                ((uint16_t *)dest)[i] = data[i] * 65535.0f;
+                break;
+        }
+    }
+}
+
+/*
+ * Load float color components from the channel data type of an image.
+ *
+ * data     : destination components
+ * source   : source pixel
+ * type     : channel data type of the image
+ * channels : number of components to load
+ *
+ * CL_FLOAT is copied verbatim. Normalized types are divided by the largest
+ * value of the source type: 127, 32767, 255 and 65535 (the unsigned types
+ * were both divided by 127). Signed results are limited to -1.0f, as the most
+ * negative integer would otherwise map slightly below it. Other types leave
+ * data untouched.
+ */
+static void convert_from_format(float *data, void *source,
+                                cl_channel_type type, unsigned int channels)
+{
+    if (type == CL_FLOAT)
+    {
+        std::memcpy(data, source, channels * sizeof(float));
+        return;
+    }
+
+    for (unsigned int i=0; i<channels; ++i)
+    {
+        switch (type)
+        {
+            case CL_SNORM_INT8:
+                data[i] = (float)((int8_t *)source)[i] / 127.0f;
+                if (data[i] < -1.0f) data[i] = -1.0f;
+                break;
+            case CL_SNORM_INT16:
+                data[i] = (float)((int16_t *)source)[i] / 32767.0f;
+                if (data[i] < -1.0f) data[i] = -1.0f;
+                break;
+            case CL_UNORM_INT8:
+                data[i] = (float)((uint8_t *)source)[i] / 255.0f;
+                break;
+            case CL_UNORM_INT16:
+                data[i] = (float)((uint16_t *)source)[i] / 65535.0f;
+                break;
+        }
+    }
+}
+
+/*
+ * Store signed integer color components in the channel data type of an image.
+ * CL_SIGNED_INT32 is copied verbatim, CL_SIGNED_INT8 and CL_SIGNED_INT16 are
+ * narrowed component by component; any other type leaves dest untouched.
+ */
+static void convert_to_format(void *dest, int *data,
+                              cl_channel_type type, unsigned int channels)
+{
+    switch (type)
+    {
+        case CL_SIGNED_INT32:
+            std::memcpy(dest, data, channels * sizeof(int32_t));
+            break;
+        case CL_SIGNED_INT8:
+        {
+            int8_t *out = (int8_t *)dest;
+            for (unsigned int c = 0; c < channels; ++c)
+                out[c] = data[c];
+            break;
+        }
+        case CL_SIGNED_INT16:
+        {
+            int16_t *out = (int16_t *)dest;
+            for (unsigned int c = 0; c < channels; ++c)
+                out[c] = data[c];
+            break;
+        }
+        default:
+            break;
+    }
+}
+
+/*
+ * Load signed integer color components from the channel data type of an
+ * image. CL_SIGNED_INT32 is copied verbatim, CL_SIGNED_INT8 and
+ * CL_SIGNED_INT16 are widened component by component; any other type leaves
+ * data untouched.
+ */
+static void convert_from_format(int32_t *data, void *source,
+                                cl_channel_type type, unsigned int channels)
+{
+    switch (type)
+    {
+        case CL_SIGNED_INT32:
+            std::memcpy(data, source, channels * sizeof(int32_t));
+            break;
+        case CL_SIGNED_INT8:
+        {
+            int8_t *in = (int8_t *)source;
+            for (unsigned int c = 0; c < channels; ++c)
+                data[c] = in[c];
+            break;
+        }
+        case CL_SIGNED_INT16:
+        {
+            int16_t *in = (int16_t *)source;
+            for (unsigned int c = 0; c < channels; ++c)
+                data[c] = in[c];
+            break;
+        }
+        default:
+            break;
+    }
+}
+
+/*
+ * Store unsigned integer color components in the channel data type of an
+ * image.
+ *
+ * dest     : destination pixel
+ * data     : source components
+ * type     : channel data type of the image
+ * channels : number of components to store
+ *
+ * CL_UNSIGNED_INT32 is copied verbatim, CL_UNSIGNED_INT8 and
+ * CL_UNSIGNED_INT16 are narrowed component by component; any other type
+ * leaves dest untouched. The loop honours channels: it used to process
+ * exactly three components, dropping the fourth one of four-channel images
+ * and writing past the pixel of one- and two-channel images.
+ */
+static void convert_to_format(void *dest, uint32_t *data,
+                              cl_channel_type type, unsigned int channels)
+{
+    if (type == CL_UNSIGNED_INT32)
+    {
+        std::memcpy(dest, data, channels * sizeof(uint32_t));
+        return;
+    }
+
+    for (unsigned int i=0; i<channels; ++i)
+    {
+        switch (type)
+        {
+            case CL_UNSIGNED_INT8:
+                ((uint8_t *)dest)[i] = data[i];
+                break;
+            case CL_UNSIGNED_INT16:
+                ((uint16_t *)dest)[i] = data[i];
+                break;
+        }
+    }
+}
+
+/*
+ * Load unsigned integer color components from the channel data type of an
+ * image. CL_UNSIGNED_INT32 is copied verbatim, CL_UNSIGNED_INT8 and
+ * CL_UNSIGNED_INT16 are widened component by component; any other type leaves
+ * data untouched.
+ */
+static void convert_from_format(uint32_t *data, void *source,
+                                cl_channel_type type, unsigned int channels)
+{
+    switch (type)
+    {
+        case CL_UNSIGNED_INT32:
+            std::memcpy(data, source, channels * sizeof(uint32_t));
+            break;
+        case CL_UNSIGNED_INT8:
+        {
+            uint8_t *in = (uint8_t *)source;
+            for (unsigned int c = 0; c < channels; ++c)
+                data[c] = in[c];
+            break;
+        }
+        case CL_UNSIGNED_INT16:
+        {
+            uint16_t *in = (uint16_t *)source;
+            for (unsigned int c = 0; c < channels; ++c)
+                data[c] = in[c];
+            break;
+        }
+        default:
+            break;
+    }
+}
+
+/*
+ * Multiply the four components of vec by val, in place. The product is
+ * computed in floating point and converted back to T.
+ */
+template<typename T>
+static void vec4_scalar_mul(T *vec, float val)
+{
+    for (T *component = vec; component != vec + 4; ++component)
+        *component *= val;
+}
+
+/* Add the four components of vec2 to those of vec1, in place. */
+template<typename T>
+static void vec4_add(T *vec1, T *vec2)
+{
+    T *dst = vec1;
+    T *src = vec2;
+
+    for (unsigned int left = 4; left != 0; --left)
+        *dst++ += *src++;
+}
+
+/**
+ * \brief Trilinear interpolation between the eight pixels around a position
+ *
+ * Every corner of the cell (i0 or i1, j0 or j1, k0 or k1) is read with a null
+ * sampler, scaled by its weight and accumulated in \p result. The weight of a
+ * corner is the product, over the three axes, of the fraction (\p a, \p b or
+ * \p c) when the corner uses the second index and of its complement when it
+ * uses the first one.
+ *
+ * \param result four components receiving the interpolated color
+ * \param a fraction along x
+ * \param b fraction along y
+ * \param c fraction along z
+ * \param i0,j0,k0 first corner of the cell
+ * \param i1,j1,k1 opposite corner of the cell
+ * \param image image to read
+ */
+template<typename T>
+void CPUKernelWorkGroup::linear3D(T *result, float a, float b, float c,
+                                  int i0, int j0, int k0, int i1, int j1, int k1,
+                                  Image3D *image) const
+{
+    T accum[4];
+
+    // Bit 0 of the corner number selects i1, bit 1 selects j1, bit 2 selects k1
+    for (unsigned int corner = 0; corner < 8; ++corner)
+    {
+        const bool second_i = (corner & 1) != 0;
+        const bool second_j = (corner & 2) != 0;
+        const bool second_k = (corner & 4) != 0;
+
+        const float weight = (second_i ? a : (1.0f - a)) *
+                             (second_j ? b : (1.0f - b)) *
+                             (second_k ? c : (1.0f - c));
+
+        // The first corner initializes result, the others are accumulated
+        T *texel = (corner == 0 ? result : accum);
+
+        readImageImplI<T>(texel, image,
+                          second_i ? i1 : i0,
+                          second_j ? j1 : j0,
+                          second_k ? k1 : k0,
+                          0);
+        vec4_scalar_mul(texel, weight);
+
+        if (corner != 0)
+            vec4_add(result, accum);
+    }
+}
+
+/**
+ * \brief Bilinear interpolation between the four pixels around a position
+ *
+ * Every corner of the cell (i0 or i1, j0 or j1) is read at z = 0 with a null
+ * sampler, scaled by its weight and accumulated in \p result. \p c is not
+ * used.
+ *
+ * \param result four components receiving the interpolated color
+ * \param a fraction along x
+ * \param b fraction along y
+ * \param c ignored
+ * \param i0,j0 first corner of the cell
+ * \param i1,j1 opposite corner of the cell
+ * \param image image to read
+ */
+template<typename T>
+void CPUKernelWorkGroup::linear2D(T *result, float a, float b, float c, int i0, int j0,
+                                  int i1, int j1, Image2D *image) const
+{
+    T accum[4];
+
+    // Bit 0 of the corner number selects i1, bit 1 selects j1
+    for (unsigned int corner = 0; corner < 4; ++corner)
+    {
+        const bool second_i = (corner & 1) != 0;
+        const bool second_j = (corner & 2) != 0;
+
+        const float weight = (second_i ? a : (1.0f - a)) *
+                             (second_j ? b : (1.0f - b));
+
+        // The first corner initializes result, the others are accumulated
+        T *texel = (corner == 0 ? result : accum);
+
+        readImageImplI<T>(texel, image,
+                          second_i ? i1 : i0,
+                          second_j ? j1 : j0,
+                          0, 0);
+        vec4_scalar_mul(texel, weight);
+
+        if (corner != 0)
+            vec4_add(result, accum);
+    }
+}
+
+// shuffle4(rs, a, b, x, y, z, w): rs receives four elements picked from the
+// concatenation of a and b (selectors 0-3 address a, 4-7 address b).
+// The vector builtin is used when the compiler reports it, slow_shuffle4()
+// otherwise. The builtin path needs literal selectors and the __v4sf type.
+// NOTE(review): the <immintrin.h> include above is commented out; confirm
+// where __v4sf comes from when this path is enabled.
+#if __has_builtin(__builtin_shufflevector)
+ #define shuffle4(rs, a, b, x, y, z, w) \
+ *(__v4sf *)rs = __builtin_shufflevector(*(__v4sf *)a, *(__v4sf *)b, \
+ x, y, z, w)
+#else
+ #define shuffle4(rs, a, b, x, y, z, w) \
+ slow_shuffle4(rs, a, b, x, y, z, w)
+#endif
+
+/*
+ * Reorder the four 32-bit components of a pixel between the channel order of
+ * an image and the x, y, z, w order used by kernels.
+ *
+ * target  : four components written by the function
+ * source  : four components to reorder
+ * order   : channel order of the image
+ * reading : true when converting image data to a kernel value, false when
+ *           converting a kernel value to image data
+ * t_max   : bit pattern placed in components that must hold the maximum
+ *           value when reading (see the cases using selector 5)
+ *
+ * When reading, an order that is not listed leaves target untouched. When
+ * writing, an order that is not listed copies source unchanged.
+ */
+static void swizzle(uint32_t *target, uint32_t *source,
+ cl_channel_order order, bool reading, uint32_t t_max)
+{
+ // Second operand of the shuffles: selector 4 yields 0, selector 5 yields
+ // t_max.
+ uint32_t special[4] = {0, t_max, 0, 0 };
+
+ if (reading)
+ {
+ switch (order)
+ {
+ case CL_R:
+ case CL_Rx:
+ // target = {source->x, 0, 0, t_max}
+ shuffle4(target, source, special, 0, 4, 4, 5);
+ break;
+ case CL_A:
+ // target = {0, 0, 0, source->x}
+ shuffle4(target, source, special, 4, 4, 4, 0);
+ break;
+ case CL_INTENSITY:
+ // target = {source->x, source->x, source->x, source->x}
+ shuffle4(target, source, source, 0, 0, 0, 0);
+ break;
+ case CL_LUMINANCE:
+ // target = {source->x, source->x, source->x, t_max}
+ shuffle4(target, source, special, 0, 0, 0, 5);
+ break;
+ case CL_RG:
+ case CL_RGx:
+ // target = {source->x, source->y, 0, t_max}
+ shuffle4(target, source, special, 0, 1, 4, 5);
+ break;
+ case CL_RA:
+ // target = {source->x, 0, 0, source->y}
+ shuffle4(target, source, special, 0, 4, 4, 1);
+ break;
+ case CL_RGB:
+ case CL_RGBx:
+ case CL_RGBA:
+ // Nothing to do, already the good order
+ // (16 bytes: four 32-bit components)
+ std::memcpy(target, source, 16);
+ break;
+ case CL_ARGB:
+ // target = {source->y, source->z, source->w, source->x}
+ shuffle4(target, source, source, 1, 2, 3, 0);
+ break;
+ case CL_BGRA:
+ // target = {source->z, source->y, source->x, source->w}
+ shuffle4(target, source, source, 2, 1, 0, 3);
+ break;
+ }
+ }
+ else
+ {
+ switch (order)
+ {
+ case CL_A:
+ // target = {source->w, undef, undef, undef}
+ shuffle4(target, source, source, 3, 3, 3, 3);
+ break;
+ case CL_RA:
+ // target = {source->x, source->w, undef, undef}
+ shuffle4(target, source, source, 0, 3, 3, 3);
+ break;
+ case CL_ARGB:
+ // target = {source->w, source->x, source->y, source->z}
+ shuffle4(target, source, source, 3, 0, 1, 2);
+ break;
+ case CL_BGRA:
+ // target = {source->z, source->y, source->x, source->w}
+ shuffle4(target, source, source, 2, 1, 0, 3);
+ break;
+ default:
+ // The remaining orders already store their components first
+ std::memcpy(target, source, 16);
+ }
+ }
+}
+
+/*
+ * Actual implementation of the built-ins
+ */
+
+/**
+ * \brief Address of a pixel in the device buffer of an image
+ *
+ * The buffer attached to \p image for the device of the running kernel is
+ * looked up, then \c imageData() locates the pixel from the pitches and the
+ * pixel size of the image.
+ *
+ * \param image image containing the pixel
+ * \param x,y,z coordinates of the pixel
+ * \return pointer to the pixel
+ */
+void *CPUKernelWorkGroup::getImageData(Image2D *image, int x, int y, int z) const
+{
+    DeviceInterface *device = (DeviceInterface *)p_kernel->device();
+    CPUBuffer *buffer = (CPUBuffer *)image->deviceBuffer(device);
+    unsigned char *base = (unsigned char *)buffer->data();
+
+    return imageData(base, x, y, z,
+                     image->row_pitch(), image->slice_pitch(),
+                     image->pixel_size());
+}
+
+/**
+ * \brief Write a color to a pixel of an image
+ *
+ * The components are first reordered to the channel order of the image, then
+ * converted to its channel data type directly in the image memory.
+ *
+ * \param image image to write
+ * \param x,y,z coordinates of the pixel
+ * \param color four components to write
+ */
+template<typename T>
+void CPUKernelWorkGroup::writeImageImpl(Image2D *image, int x, int y, int z,
+                                        T *color) const
+{
+    // float, int and uint are all 32-bit wide: the swizzle can treat them as
+    // plain 32-bit words, whatever T is
+    T ordered[4];
+    uint32_t *ordered_words = (uint32_t *)ordered;
+    uint32_t *color_words = (uint32_t *)color;
+
+    swizzle(ordered_words, color_words,
+            image->format().image_channel_order, false, 0);
+
+    // Convert straight into the pixel
+    void *pixel = getImageData(image, x, y, z);
+
+    convert_to_format(pixel, ordered,
+                      image->format().image_channel_data_type,
+                      image->channels());
+}
+
+/** \brief Write a float color to the pixel (x, y, z) of \p image */
+void CPUKernelWorkGroup::writeImage(Image2D *image, int x, int y, int z,
+                                    float *color) const
+{
+    // T is deduced as float from the color pointer
+    writeImageImpl(image, x, y, z, color);
+}
+
+/** \brief Write a signed integer color to the pixel (x, y, z) of \p image */
+void CPUKernelWorkGroup::writeImage(Image2D *image, int x, int y, int z,
+                                    int32_t *color) const
+{
+    // T is deduced as int32_t from the color pointer
+    writeImageImpl(image, x, y, z, color);
+}
+
+/** \brief Write an unsigned integer color to the pixel (x, y, z) of \p image */
+void CPUKernelWorkGroup::writeImage(Image2D *image, int x, int y, int z,
+                                    uint32_t *color) const
+{
+    // T is deduced as uint32_t from the color pointer
+    writeImageImpl(image, x, y, z, color);
+}
+
+/**
+ * \brief 32-bit pattern standing for the maximum channel value of type T
+ *
+ * The pattern is handed to \c swizzle() as a raw 32-bit word. Types without a
+ * specialization yield 0.
+ */
+template<typename T>
+uint32_t type_max_value()
+{
+    return 0u;
+}
+
+/** \brief Bit pattern of 1.0f (1065353216 in decimal) */
+template<>
+uint32_t type_max_value<float>()
+{
+    return 0x3f800000u;
+}
+
+/** \brief Largest positive 32-bit signed integer */
+template<>
+uint32_t type_max_value<int32_t>()
+{
+    return 0xffffffffu >> 1;
+}
+
+/** \brief Largest 32-bit unsigned integer */
+template<>
+uint32_t type_max_value<uint32_t>()
+{
+    return ~uint32_t(0);
+}
+
+/**
+ * \brief Read a pixel addressed by integer coordinates
+ *
+ * The addressing mode of \p sampler is applied first. When it selects the
+ * border color, \p result receives (0, 0, 0, 1) for the CL_R, CL_RG, CL_RGB
+ * and CL_LUMINANCE channel orders and (0, 0, 0, 0) for the others. Otherwise
+ * the pixel is converted from the channel data type of the image and
+ * reordered to x, y, z, w.
+ *
+ * \param result four components receiving the color
+ * \param image image to read
+ * \param x,y,z coordinates of the pixel
+ * \param sampler sampler bitfield
+ */
+template<typename T>
+void CPUKernelWorkGroup::readImageImplI(T *result, Image2D *image, int x, int y,
+                                        int z, uint32_t sampler) const
+{
+    const bool use_border = handle_address_mode(image, x, y, z, sampler);
+
+    if (!use_border)
+    {
+        // Load the pixel, converting it to T
+        T texel[4];
+
+        convert_from_format(texel,
+                            getImageData(image, x, y, z),
+                            image->format().image_channel_data_type,
+                            image->channels());
+
+        // Reorder the components while storing them in result
+        swizzle((uint32_t *)result, (uint32_t *)texel,
+                image->format().image_channel_order, true, type_max_value<T>());
+        return;
+    }
+
+    // Border color: only the last component depends on the channel order
+    T last;
+
+    switch (image->format().image_channel_order)
+    {
+        case CL_R:
+        case CL_RG:
+        case CL_RGB:
+        case CL_LUMINANCE:
+            last = 1.0f;
+            break;
+        default:
+            last = 0.0f;
+    }
+
+    for (unsigned int n = 0; n < 3; ++n)
+        result[n] = 0.0f;
+
+    result[3] = last;
+}
+
+/** \brief Read a float color at integer coordinates */
+void CPUKernelWorkGroup::readImage(float *result, Image2D *image, int x, int y,
+                                   int z, uint32_t sampler) const
+{
+    // T is deduced as float from the result pointer
+    readImageImplI(result, image, x, y, z, sampler);
+}
+
+/** \brief Read a signed integer color at integer coordinates */
+void CPUKernelWorkGroup::readImage(int32_t *result, Image2D *image, int x, int y,
+                                   int z, uint32_t sampler) const
+{
+    // T is deduced as int32_t from the result pointer
+    readImageImplI(result, image, x, y, z, sampler);
+}
+
+/** \brief Read an unsigned integer color at integer coordinates */
+void CPUKernelWorkGroup::readImage(uint32_t *result, Image2D *image, int x, int y,
+                                   int z, uint32_t sampler) const
+{
+    // T is deduced as uint32_t from the result pointer
+    readImageImplI(result, image, x, y, z, sampler);
+}
+
+/**
+ * \brief Read a pixel addressed by floating-point coordinates
+ *
+ * The sampler bitfield is decoded in three parts: bits 0xf0 hold the
+ * addressing mode, bits 0xf the normalized-coordinates flag and bits 0xf00
+ * the filter. Nearest filtering reads a single pixel through
+ * \c readImageImplI(), linear filtering blends the surrounding pixels through
+ * \c linear2D() or \c linear3D().
+ *
+ * Every nearest-filter case ends with a \c break. They used to fall through
+ * into the linear case, which overwrote the result just read. The z indices
+ * are also initialized to 0, as they are not computed for 2D images.
+ *
+ * \param result four components receiving the color
+ * \param image image to read, possibly an \c Image3D
+ * \param x,y,z coordinates of the position to sample
+ * \param sampler sampler bitfield
+ */
+template<typename T>
+void CPUKernelWorkGroup::readImageImplF(T *result, Image2D *image, float x,
+                                        float y, float z, uint32_t sampler) const
+{
+    bool is_3d = (image->type() == MemObject::Image3D);
+    Image3D *image3d = (Image3D *)image;
+
+    int w = image->width(),
+        h = image->height(),
+        d = (is_3d ? image3d->depth() : 1);
+
+    switch (sampler & 0xf0)
+    {
+        case CLK_ADDRESS_NONE:
+        case CLK_ADDRESS_CLAMP:
+        case CLK_ADDRESS_CLAMP_TO_EDGE:
+            /* De-normalize coordinates */
+            if ((sampler & 0xf) == CLK_NORMALIZED_COORDS_TRUE)
+            {
+                x *= (float)w;
+                y *= (float)h;
+                if (is_3d) z *= (float)d;
+            }
+
+            switch (sampler & 0xf00)
+            {
+                case CLK_FILTER_NEAREST:
+                {
+                    readImageImplI<T>(result, image, std::floor(x),
+                                      std::floor(y), std::floor(z), sampler);
+                    break;
+                }
+                case CLK_FILTER_LINEAR:
+                {
+                    float a, b, c;
+
+                    a = frac(x - 0.5f);
+                    b = frac(y - 0.5f);
+                    c = frac(z - 0.5f);
+
+                    if (is_3d)
+                    {
+                        linear3D<T>(result, a, b, c,
+                                    std::floor(x - 0.5f),
+                                    std::floor(y - 0.5f),
+                                    std::floor(z - 0.5f),
+                                    std::floor(x - 0.5f) + 1,
+                                    std::floor(y - 0.5f) + 1,
+                                    std::floor(z - 0.5f) + 1,
+                                    image3d);
+                    }
+                    else
+                    {
+                        linear2D<T>(result, a, b, c,
+                                    std::floor(x - 0.5f),
+                                    std::floor(y - 0.5f),
+                                    std::floor(x - 0.5f) + 1,
+                                    std::floor(y - 0.5f) + 1,
+                                    image);
+                    }
+                    break;
+                }
+            }
+            break;
+        case CLK_ADDRESS_REPEAT:
+            switch (sampler & 0xf00)
+            {
+                case CLK_FILTER_NEAREST:
+                {
+                    // k stays 0 for 2D images
+                    int i, j, k = 0;
+
+                    x = (x - std::floor(x)) * (float)w;
+                    i = std::floor(x);
+                    if (i > w - 1)
+                        i = i - w;
+
+                    y = (y - std::floor(y)) * (float)h;
+                    j = std::floor(y);
+                    if (j > h - 1)
+                        j = j - h;
+
+                    if (is_3d)
+                    {
+                        z = (z - std::floor(z)) * (float)d;
+                        k = std::floor(z);
+                        if (k > d - 1)
+                            k = k - d;
+                    }
+
+                    readImageImplI<T>(result, image, i, j, k, sampler);
+                    break;
+                }
+                case CLK_FILTER_LINEAR:
+                {
+                    float a, b, c;
+                    // k0 and k1 stay 0 for 2D images
+                    int i0, i1, j0, j1, k0 = 0, k1 = 0;
+
+                    x = (x - std::floor(x)) * (float)w;
+                    i0 = std::floor(x - 0.5f);
+                    i1 = i0 + 1;
+                    if (i0 < 0)
+                        i0 = w + i0;
+                    if (i1 > w - 1)
+                        i1 = i1 - w;
+
+                    y = (y - std::floor(y)) * (float)h;
+                    j0 = std::floor(y - 0.5f);
+                    j1 = j0 + 1;
+                    if (j0 < 0)
+                        j0 = h + j0;
+                    if (j1 > h - 1)
+                        j1 = j1 - h;
+
+                    if (is_3d)
+                    {
+                        z = (z - std::floor(z)) * (float)d;
+                        k0 = std::floor(z - 0.5f);
+                        k1 = k0 + 1;
+                        if (k0 < 0)
+                            k0 = d + k0;
+                        if (k1 > d - 1)
+                            k1 = k1 - d;
+                    }
+
+                    a = frac(x - 0.5f);
+                    b = frac(y - 0.5f);
+                    c = frac(z - 0.5f);
+
+                    if (is_3d)
+                    {
+                        linear3D<T>(result, a, b, c, i0, j0, k0, i1, j1, k1,
+                                    image3d);
+                    }
+                    else
+                    {
+                        linear2D<T>(result, a, b, c, i0, j0, i1, j1, image);
+                    }
+                    break;
+                }
+            }
+            break;
+        case CLK_ADDRESS_MIRRORED_REPEAT:
+            switch (sampler & 0xf00)
+            {
+                case CLK_FILTER_NEAREST:
+                {
+                    x = std::fabs(x - 2.0f * round(0.5f * x)) * (float)w;
+                    y = std::fabs(y - 2.0f * round(0.5f * y)) * (float)h;
+                    if (is_3d)
+                        z = std::fabs(z - 2.0f * round(0.5f * z)) * (float)d;
+
+                    readImageImplI<T>(result, image,
+                                      min(std::floor(x), w - 1),
+                                      min(std::floor(y), h - 1),
+                                      min(std::floor(z), d - 1),
+                                      sampler);
+                    break;
+                }
+                case CLK_FILTER_LINEAR:
+                {
+                    float a, b, c;
+                    // k0 and k1 stay 0 for 2D images
+                    int i0, i1, j0, j1, k0 = 0, k1 = 0;
+
+                    x = std::fabs(x - 2.0f * round(0.5f * x)) * (float)w;
+                    i0 = std::floor(x - 0.5f);
+                    i1 = i0 + 1;
+                    i0 = max(i0, 0);
+                    i1 = min(i1, w - 1);
+
+                    y = std::fabs(y - 2.0f * round(0.5f * y)) * (float)h;
+                    j0 = std::floor(y - 0.5f);
+                    j1 = j0 + 1;
+                    j0 = max(j0, 0);
+                    j1 = min(j1, h - 1);
+
+                    if (is_3d)
+                    {
+                        z = std::fabs(z - 2.0f * round(0.5f * z)) * (float)d;
+                        k0 = std::floor(z - 0.5f);
+                        k1 = k0 + 1;
+                        k0 = max(k0, 0);
+                        k1 = min(k1, d - 1);
+                    }
+
+                    a = frac(x - 0.5f);
+                    b = frac(y - 0.5f);
+                    c = frac(z - 0.5f);
+
+                    if (is_3d)
+                    {
+                        linear3D<T>(result, a, b, c, i0, j0, k0, i1, j1, k1,
+                                    image3d);
+                    }
+                    else
+                    {
+                        linear2D<T>(result, a, b, c, i0, j0, i1, j1, image);
+                    }
+                    break;
+                }
+            }
+            break;
+    }
+}
+
+/** \brief Read a float color at floating-point coordinates */
+void CPUKernelWorkGroup::readImage(float *result, Image2D *image, float x,
+                                   float y, float z, uint32_t sampler) const
+{
+    // T is deduced as float from the result pointer
+    readImageImplF(result, image, x, y, z, sampler);
+}
+
+/** \brief Read a signed integer color at floating-point coordinates */
+void CPUKernelWorkGroup::readImage(int32_t *result, Image2D *image, float x,
+                                   float y, float z, uint32_t sampler) const
+{
+    // T is deduced as int32_t from the result pointer
+    readImageImplF(result, image, x, y, z, sampler);
+}
+
+/** \brief Read an unsigned integer color at floating-point coordinates */
+void CPUKernelWorkGroup::readImage(uint32_t *result, Image2D *image, float x,
+                                   float y, float z, uint32_t sampler) const
+{
+    // T is deduced as uint32_t from the result pointer
+    readImageImplF(result, image, x, y, z, sampler);
+}
diff --git a/src/core/cpu/worker.cpp b/src/core/cpu/worker.cpp
new file mode 100644
index 0000000..e5251f2
--- /dev/null
+++ b/src/core/cpu/worker.cpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/worker.cpp
+ * \brief Code running in the worker threads launched by \c Coal::CPUDevice
+ * \sa builtins.cpp
+ */
+
+#include "worker.h"
+#include "device.h"
+#include "buffer.h"
+#include "kernel.h"
+#include "builtins.h"
+
+#include "../commandqueue.h"
+#include "../events.h"
+#include "../memobject.h"
+#include "../kernel.h"
+
+#include <sys/mman.h>
+
+#include <cstring>
+#include <iostream>
+
+using namespace Coal;
+
+void *worker(void *data)
+{ /*
+ * Thread entry point. \p data is cast to the CPUDevice this thread serves.
+ * Each loop iteration fetches one event with CPUDevice::getEvent(),
+ * executes it according to its Event::Type and then publishes the event
+ * status. The loop ends when getEvent() reports \c stop; the function
+ * always returns 0.
+ */
+ CPUDevice *device = (CPUDevice *)data;
+ bool stop = false; // never assigned here; presumably written by getEvent(), which receives it each iteration
+ cl_int errcode;
+ Event *event;
+
+ // Initialize TLS
+ setWorkItemsData(0, 0);
+
+ while (true)
+ {
+ event = device->getEvent(stop);
+
+ // Ensure we have a good event and we don't have to stop
+ if (stop) break;
+ if (!event) continue;
+
+ // Get info about the event and its command queue
+ Event::Type t = event->type();
+ CommandQueue *queue = 0;
+ cl_command_queue_properties queue_props = 0;
+
+ errcode = CL_SUCCESS;
+
+ event->info(CL_EVENT_COMMAND_QUEUE, sizeof(CommandQueue *), &queue, 0);
+
+ if (queue) // queue_props stays 0 (profiling off) when the event reports no queue
+ queue->info(CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties),
+ &queue_props, 0);
+
+ if (queue_props & CL_QUEUE_PROFILING_ENABLE)
+ event->updateTiming(Event::Start);
+
+ // Execute the action
+ switch (t)
+ {
+ case Event::ReadBuffer:
+ case Event::WriteBuffer:
+ {
+ ReadWriteBufferEvent *e = (ReadWriteBufferEvent *)event;
+ CPUBuffer *buf = (CPUBuffer *)e->buffer()->deviceBuffer(device);
+ char *data = (char *)buf->data(); // shadows the function parameter \p data inside this case block
+
+ data += e->offset();
+
+ if (t == Event::ReadBuffer)
+ std::memcpy(e->ptr(), data, e->cb()); // read: device buffer -> host pointer
+ else std::memcpy(data, e->ptr(), e->cb()); // write: host pointer -> device buffer
+
+ break;
+ }
+ case Event::CopyBuffer:
+ {
+ CopyBufferEvent *e = (CopyBufferEvent *)event;
+ CPUBuffer *src = (CPUBuffer *)e->source()->deviceBuffer(device);
+ CPUBuffer *dst = (CPUBuffer *)e->destination()->deviceBuffer(device);
+
+ std::memcpy((char*)dst->data() + e->dst_offset(), // NOTE(review): memcpy needs non-overlapping ranges; presumably overlap is rejected before queuing - confirm
+ (char*)src->data() + e->src_offset(), e->cb());
+ break;
+ }
+ case Event::ReadBufferRect:
+ case Event::WriteBufferRect:
+ case Event::CopyBufferRect:
+ case Event::ReadImage:
+ case Event::WriteImage:
+ case Event::CopyImage:
+ case Event::CopyBufferToImage:
+ case Event::CopyImageToBuffer:
+ {
+ // src = device buffer; dst = host memory unless this is a copy command
+ ReadWriteCopyBufferRectEvent *e = (ReadWriteCopyBufferRectEvent *)event;
+ CPUBuffer *src_buf = (CPUBuffer *)e->source()->deviceBuffer(device);
+
+ unsigned char *src = (unsigned char *)src_buf->data();
+ unsigned char *dst; // assigned by every branch of the switch below
+
+ switch (t)
+ {
+ case Event::CopyBufferRect:
+ case Event::CopyImage:
+ case Event::CopyImageToBuffer:
+ case Event::CopyBufferToImage:
+ {
+ CopyBufferRectEvent *cbre = (CopyBufferRectEvent *)e;
+ CPUBuffer *dst_buf =
+ (CPUBuffer *)cbre->destination()->deviceBuffer(device);
+
+ dst = (unsigned char *)dst_buf->data();
+ break;
+ }
+ default:
+ {
+ // dst = host memory location
+ ReadWriteBufferRectEvent *rwbre = (ReadWriteBufferRectEvent *)e;
+
+ dst = (unsigned char *)rwbre->ptr();
+ }
+ }
+
+ // Iterate over the lines to copy and use memcpy
+ for (size_t z=0; z<e->region(2); ++z)
+ {
+ for (size_t y=0; y<e->region(1); ++y)
+ {
+ unsigned char *s;
+ unsigned char *d;
+
+ d = imageData(dst,
+ e->dst_origin(0),
+ y + e->dst_origin(1),
+ z + e->dst_origin(2),
+ e->dst_row_pitch(),
+ e->dst_slice_pitch(),
+ 1); // NOTE(review): last argument is 1, so the x origin is presumably a byte offset - confirm against imageData()
+
+ s = imageData(src,
+ e->src_origin(0),
+ y + e->src_origin(1),
+ z + e->src_origin(2),
+ e->src_row_pitch(),
+ e->src_slice_pitch(),
+ 1);
+
+ // Copying an image to a buffer (or a buffer to an image) may need
+ // an extra offset on the buffer-side address (its rectangular
+ // origin is always (0, 0, 0)).
+ if (t == Event::CopyBufferToImage)
+ {
+ CopyBufferToImageEvent *cptie = (CopyBufferToImageEvent *)e;
+ s += cptie->offset();
+ }
+ else if (t == Event::CopyImageToBuffer)
+ {
+ CopyImageToBufferEvent *citbe = (CopyImageToBufferEvent *)e;
+ d += citbe->offset();
+ }
+
+ if (t == Event::WriteBufferRect || t == Event::WriteImage)
+ std::memcpy(s, d, e->region(0)); // Write dest (memory) in src
+ else
+ std::memcpy(d, s, e->region(0)); // Write src (buffer) in dest (memory), or copy the buffers
+ }
+ }
+
+ break;
+ }
+ case Event::MapBuffer:
+ case Event::MapImage:
+ // All was already done in CPUBuffer::initEventDeviceData()
+ break;
+
+ case Event::NativeKernel:
+ {
+ NativeKernelEvent *e = (NativeKernelEvent *)event;
+ void (*func)(void *) = (void (*)(void *))e->function();
+ void *args = e->args();
+
+ func(args); // runs the user callback synchronously on this worker thread
+
+ break;
+ }
+ case Event::NDRangeKernel:
+ case Event::TaskKernel:
+ {
+ KernelEvent *e = (KernelEvent *)event;
+ CPUKernelEvent *ke = (CPUKernelEvent *)e->deviceData();
+
+ // Take an instance
+ CPUKernelWorkGroup *instance = ke->takeInstance();
+ ke = 0; // Unlocked, don't use anymore
+
+ if (!instance->run()) // a false return from run() is reported as CL_INVALID_PROGRAM_EXECUTABLE
+ errcode = CL_INVALID_PROGRAM_EXECUTABLE;
+
+ delete instance;
+
+ break;
+ }
+ default: // any other event type: no work, falls through to the status update below
+ break;
+ }
+
+ // Cleanups
+ if (errcode == CL_SUCCESS)
+ {
+ bool finished = true;
+
+ if (event->type() == Event::NDRangeKernel ||
+ event->type() == Event::TaskKernel)
+ {
+ CPUKernelEvent *ke = (CPUKernelEvent *)event->deviceData();
+ finished = ke->finished(); // a kernel event is only completed once finished() reports true
+ }
+
+ if (finished)
+ {
+ // Record the end time first: an event may be released once it is Complete
+ if (queue_props & CL_QUEUE_PROFILING_ENABLE)
+ event->updateTiming(Event::End);
+ event->setStatus(Event::Complete);
+ }
+ }
+ else
+ {
+ // Record the end time first: an event may be released once its final status is set
+ if (queue_props & CL_QUEUE_PROFILING_ENABLE)
+ event->updateTiming(Event::End);
+ // The event failed: the CL error code itself is stored as the event status
+ event->setStatus((Event::Status)errcode);
+ }
+ }
+
+ // Free mmapped() data if needed
+ size_t mapped_size;
+ void *mapped_data = getWorkItemsData(mapped_size);
+
+ if (mapped_data)
+ munmap(mapped_data, mapped_size);
+
+ return 0;
+}
diff --git a/src/core/cpu/worker.h b/src/core/cpu/worker.h
new file mode 100644
index 0000000..43ddd03
--- /dev/null
+++ b/src/core/cpu/worker.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file worker.h
+ * \brief Function run by the CPU worker threads
+ */
+
+#ifndef __CPU_WORKER_H__
+#define __CPU_WORKER_H__
+
+/**
+ * \brief Main loop of the CPU worker threads
+ *
+ * This function is run by as many threads as there are CPU cores on the host
+ * system. As explained by \ref events , this function waits until there
+ * are \c Coal::Event objects to process and handles them.
+ *
+ * \param data pointer to the \c Coal::CPUDevice whose events the thread
+ * processes (the implementation in worker.cpp casts it to
+ * \c CPUDevice * )
+ * \return always 0 in the implementation in worker.cpp
+ */
+void *worker(void *data);
+
+#endif
diff --git a/src/core/deviceinterface.h b/src/core/deviceinterface.h
new file mode 100644
index 0000000..a321a9e
--- /dev/null
+++ b/src/core/deviceinterface.h
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file deviceinterface.h
+ * \brief Abstraction layer between Clover core and the devices
+ */
+
+#ifndef __DEVICEINTERFACE_H__
+#define __DEVICEINTERFACE_H__
+
+#include <CL/cl.h>
+#include <string>
+#include "object.h"
+
+/* This pulls in legacy::PassManager when LLVM >= 3.4 */
+#include <llvm/PassManager.h>
+
+namespace Coal
+{
+
+class DeviceBuffer;
+class DeviceProgram;
+class DeviceKernel;
+
+class MemObject;
+class Event;
+class Program;
+class Kernel;
+
+/**
+ * \brief Abstraction layer between core Clover objects and the devices
+ *
+ * This interface is used by the core Clover classes to communicate with the
+ * devices, which must implement every pure virtual function declared here.
+ */
+class DeviceInterface : public Object
+{
+ public:
+ DeviceInterface() : Object(Object::T_Device, 0) {}
+ virtual ~DeviceInterface() {}
+
+ /**
+ * \brief Retrieve information about the device
+ *
+ * This function is used to retrieve information about an object.
+ * Sometimes, the size of the data retrieved is unknown (for example, a
+ * string). The application can call this function twice: the first time
+ * to get the size, then it allocates a buffer, and finally gets the data.
+ *
+ * \code
+ * char *string = 0;
+ * size_t len;
+ *
+ * object->info(FOO_PROPERTY_STRING, 0, 0, &len);
+ * string = (char *)std::malloc(len);
+ * object->info(FOO_PROPERTY_STRING, len, string, 0);
+ * \endcode
+ *
+ * \param param_name Name of the property to retrieve
+ * \param param_value_size Size of the application-allocated buffer
+ * in which to put the value.
+ * \param param_value Pointer to an application-allocated buffer
+ * where the property data will be stored. Ignored
+ * if NULL.
+ * \param param_value_size_ret Size of the value retrieved, ignored if
+ * NULL.
+ * \return CL_SUCCESS in case of success, otherwise a CL error code.
+ */
+ virtual cl_int info(cl_device_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const = 0;
+
+ /**
+ * \brief Create a \c Coal::DeviceBuffer object for this device
+ * \param buffer Memory object for which the buffer has to be created
+ * \param rs Error code (\c CL_SUCCESS if no error)
+ * \return a \c Coal::DeviceBuffer object, undefined if there is an error
+ */
+ virtual DeviceBuffer *createDeviceBuffer(MemObject *buffer, cl_int *rs) = 0;
+
+ /**
+ * \brief Create a \c Coal::DeviceProgram object for this device
+ * \param program \c Coal::Program containing the device-independent
+ * program data
+ * \return a \c Coal::DeviceProgram object
+ */
+ virtual DeviceProgram *createDeviceProgram(Program *program) = 0;
+
+ /**
+ * \brief Create a \c Coal::DeviceKernel object for this device
+ * \param kernel \c Coal::Kernel containing the device-independent kernel
+ * data
+ * \param function device-specific \c llvm::Function to be used
+ * \return a \c Coal::DeviceKernel object
+ */
+ virtual DeviceKernel *createDeviceKernel(Kernel *kernel,
+ llvm::Function *function) = 0;
+
+ /**
+ * \brief Push an event on the device
+ * \sa the end of \ref events
+ * \param event the event to be pushed
+ */
+ virtual void pushEvent(Event *event) = 0;
+
+ /**
+ * \brief Initialize device-specific event data
+ *
+ * This call allows a device to initialize device-specific event data,
+ * by using \c Coal::Event::setDeviceData(). For instance, a
+ * hardware-accelerated device can associate a device command to an
+ * event, and use it to manage the event when it gets pushed.
+ *
+ * @note This function has one obligation: it must call
+ * \c Coal::MapBufferEvent::setPtr() and
+ * \c Coal::MapImageEvent::setPtr() (and other functions described
+ * in its documentation)
+ *
+ * \param event the event for which data can be set
+ * \return CL_SUCCESS in case of success
+ */
+ virtual cl_int initEventDeviceData(Event *event) = 0;
+
+ /**
+ * \brief Free device-specific event data
+ *
+ * This function is called just before \p event gets deleted. It allows
+ * a device to free device-specific data of this event, if any.
+ *
+ * \param event the event that will be destroyed
+ */
+ virtual void freeEventDeviceData(Event *event) = 0;
+
+ virtual std::string builtinsHeader(void) const = 0; /**< NOTE(review): not
+ * documented in this header; presumably identifies or contains the
+ * built-ins header used when compiling for this device - confirm
+ * against the device implementations. */
+
+ virtual void init() = 0; /**< NOTE(review): device initialization hook;
+ * when and by whom it is called is not visible in this header. */
+
+ /**
+ * \brief Ask device if it has enough work in its queue
+ * \return the default implementation always returns false; devices may
+ * override it
+ */
+ virtual bool gotEnoughToWorkOn() { return false; }
+};
+
+/**
+ * \brief Device-specific memory buffer
+ *
+ * This class is the backing-store used on a device for a \c Coal::MemObject. It
+ * is created by \c Coal::DeviceInterface::createDeviceBuffer().
+ *
+ * Every operation is pure virtual: each device supplies its own subclass.
+ */
+class DeviceBuffer
+{
+ public:
+ DeviceBuffer() {}
+ virtual ~DeviceBuffer() {}
+
+ /**
+ * \brief Allocate the buffer on the device
+ * \return true when success, false otherwise
+ */
+ virtual bool allocate() = 0;
+
+ /**
+ * \brief \c Coal::DeviceInterface of this buffer
+ * \return parent \c Coal::DeviceInterface
+ */
+ virtual DeviceInterface *device() const = 0;
+
+ /**
+ * \brief Allocation status
+ * \return true if already allocated, false otherwise
+ */
+ virtual bool allocated() const = 0;
+
+ /**
+ * \brief Host-accessible memory pointer
+ *
+ * This function returns what is passed as arguments to native kernels
+ * (\c clEnqueueNativeKernel(), \c Coal::NativeKernelEvent) in place of
+ * \c Coal::MemObject pointers.
+ *
+ * For \c Coal::CPUDevice, it's simply a pointer in RAM, but
+ * hardware-accelerated devices may need to do some copying or mapping.
+ *
+ * \warning Beware that this data may get written to by the native kernel.
+ *
+ * \return A memory pointer usable by a host native kernel
+ */
+ virtual void *nativeGlobalPointer() const = 0;
+};
+
+/**
+ * \brief Device-specific program data
+ */
+class DeviceProgram
+{
+ public:
+ DeviceProgram() {}
+ virtual ~DeviceProgram() {}
+
+ /**
+ * \brief Linking or not \b stdlib with this program
+ *
+ * \b stdlib is a LLVM bitcode file containing some implementations of
+ * OpenCL C built-ins. This function allows a device to tell
+ * \c Coal::Program::build() if it wants \b stdlib to be linked or not.
+ *
+ * Linking the library may allow inlining of functions like \c ceil(),
+ * \c floor(), \c clamp(), etc. So, if these functions are not better
+ * handled by the device itself than by \b stdlib, it's a good thing
+ * to link it.
+ *
+ * But if the device provides instructions for these functions, then
+ * it could be better not to link \b stdlib and to replace the LLVM
+ * calls to these functions with device-specific instructions.
+ *
+ * \warning \b Stdlib currently only works for \c Coal::CPUDevice, as
+ * it contains host-specific code (LLVM IR is not meant to be
+ * portable, pointer size changes for example).
+ *
+ * \return true if \b stdlib must be linked with the program
+ */
+ virtual bool linkStdLib() const = 0;
+
+ /**
+ * \brief Create device-specific optimization passes
+ *
+ * This hook allows a device to add LLVM optimization passes to a
+ * \c llvm::PassManager . This way, devices needing function flattening
+ * or special analysis passes can have them run on the module.
+ *
+ * \param manager \c llvm::PassManager to which add the passes
+ * \param optimize false if \c -cl-opt-disable was given at compilation
+ * time.
+ * \param hasBarrier defaults to false. NOTE(review): its meaning is not
+ * documented here; presumably it tells the device that the
+ * program uses barriers - confirm against the implementations.
+ */
+ virtual void createOptimizationPasses(llvm::PassManager *manager,
+ bool optimize, bool hasBarrier=false) = 0;
+
+ /**
+ * \brief Build a device-specific representation of the program
+ *
+ * This function is called by \c Coal::Program::build() when the module
+ * is compiled and linked. It can be used by the device to build a
+ * device-specific representation of the program.
+ *
+ * \param module \c llvm::Module containing the program's LLVM IR
+ * \param binary_str \c std::string containing dep.unlinked_binary
+ * \return true in case of success, false otherwise
+ */
+ virtual bool build(llvm::Module *module, std::string* binary_str) = 0;
+
+ /**
+ * \brief Extract binaries from MIXED binary
+ *
+ * This function is called to extract LLVM bitcode from the native
+ * binary in the MIXED binary.
+ *
+ * The default implementation below does nothing and returns false;
+ * devices that produce mixed binaries override it.
+ *
+ * \param binary_str \c std::string containing mixed binary
+ * \param bitcode \c std::string returns LLVM bitcode if not NULL
+ * \param native \c std::string returns native binary if not NULL
+ * \return true if the binary is indeed mixed
+ */
+ virtual bool ExtractMixedBinary(std::string *binary_str,
+ std::string *bitcode, std::string *native)
+ { return false; }
+};
+
+/**
+ * \brief Device-specific kernel data
+ */
+class DeviceKernel
+{
+ public:
+ DeviceKernel() {}
+ virtual ~DeviceKernel() {}
+
+ /**
+ * \brief Maximum work-group size of a kernel
+ * \return Maximum work-group size of the kernel based on device-specific
+ * data (such as memory usage, register pressure, etc.)
+ */
+ virtual size_t workGroupSize() = 0;
+
+ /**
+ * \brief Local memory used by the kernel
+ * \return Local memory used by the kernel, in bytes
+ */
+ virtual cl_ulong localMemSize() const = 0;
+
+ /**
+ * \brief Private memory used by the kernel
+ * \return Private memory used by the kernel, in bytes
+ */
+ virtual cl_ulong privateMemSize() const = 0;
+
+ /**
+ * \brief Preferred work-group size multiple
+ * \return The size multiple a work-group can have to work the best and
+ * the fastest on the device
+ */
+ virtual size_t preferredWorkGroupSizeMultiple() const = 0;
+
+ /**
+ * \brief Optimal work-group size
+ *
+ * This function allows a device to calculate the optimal work-group size
+ * for this kernel, using its memory usage, SIMD dimension, etc.
+ *
+ * \c Coal::CPUDevice tries to split the kernel into a number of
+ * work-groups the closest possible to the number of CPU cores.
+ *
+ * \param num_dims Number of working dimensions
+ * \param dim Dimension for which the multiple is being calculated
+ * \param global_work_size Total number of work-items to split into
+ * work-groups
+ * \return optimal size of a work-group, for the \p dim dimension.
+ */
+ virtual size_t guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
+ size_t global_work_size) const = 0;
+};
+
+}
+
+struct _cl_device_id : public Coal::DeviceInterface
+{}; // Gives the struct tag behind the CL/cl.h cl_device_id handle a definition: an empty subclass of Coal::DeviceInterface
+
+#endif
diff --git a/src/core/dsp/buffer.cpp b/src/core/dsp/buffer.cpp
new file mode 100644
index 0000000..72c5419
--- /dev/null
+++ b/src/core/dsp/buffer.cpp
@@ -0,0 +1,149 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "buffer.h"
+#include "device.h"
+#include "driver.h"
+
+#include "CL/cl_ext.h"
+#include "../memobject.h"
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+using namespace Coal;
+
+DSPBuffer::DSPBuffer(DSPDevice *device, MemObject *buffer, cl_int *rs)
+    : DeviceBuffer(),
+      p_device(device),
+      p_buffer(buffer),
+      p_data(0),
+      p_data_malloced(false),
+      p_buffer_idx(0)
+{
+    /*-------------------------------------------------------------------------
+     * Build the DSP-side record for \p buffer. No device memory is obtained
+     * here: p_data stays 0 unless the application supplied its own storage
+     * through CL_MEM_USE_HOST_PTR. \p rs is accepted but neither read nor
+     * written by this constructor.
+     *------------------------------------------------------------------------*/
+
+    // Sub-buffers never take the host-pointer path.
+    if (buffer->type() == MemObject::SubBuffer)
+        return;
+
+    // Without CL_MEM_USE_HOST_PTR there is no address to record yet.
+    if (!(buffer->flags() & CL_MEM_USE_HOST_PTR))
+        return;
+
+    // The host pointer itself is the buffer address: already "allocated".
+    p_data = (DSPDevicePtr64)(uint64_t)buffer->host_ptr();
+}
+
+DSPBuffer::~DSPBuffer()
+{
+    /*-------------------------------------------------------------------------
+     * Only memory obtained by allocate() (p_data_malloced set) is released;
+     * host pointers and sub-buffer addresses are left alone.
+     *------------------------------------------------------------------------*/
+    if (!p_data_malloced)
+        return;
+
+    // Give the memory back to the pool selected by the buffer flags.
+    const bool in_msmc = (p_buffer->flags() & CL_MEM_USE_MSMC_TI) != 0;
+
+    if (in_msmc)
+    {
+        p_device->free_msmc(p_data);
+    }
+    else
+    {
+        p_device->free_global(p_data);
+    }
+}
+
+DSPDevicePtr64 DSPBuffer::data() const
+{
+    /*-------------------------------------------------------------------------
+     * Return the recorded address when there is one. A regular buffer
+     * without an address also ends up here and yields 0 (no error is
+     * raised for that case).
+     *------------------------------------------------------------------------*/
+    if (p_data || p_buffer->type() != MemObject::SubBuffer)
+        return p_data;
+
+    /*-------------------------------------------------------------------------
+     * Sub-buffer without its own address: derive it from the parent's
+     * DSPBuffer, allocating the parent on demand.
+     *------------------------------------------------------------------------*/
+    SubBuffer *sub        = (SubBuffer *)p_buffer;
+    MemObject *owner      = sub->parent();
+    DSPBuffer *owner_dsp  = (DSPBuffer *)owner->deviceBuffer(p_device);
+
+    if (!owner_dsp->data())
+        owner_dsp->allocate();
+
+    // Still no parent address: report 0 to the caller.
+    if (!owner_dsp->data())
+        return 0;
+
+    return owner_dsp->data() + sub->offset();
+}
+
+void *DSPBuffer::nativeGlobalPointer() const
+{ // returns the data() address reinterpreted as a host pointer (cast through uint64_t); no copy or mapping is done here
+ return (void*) (uint64_t) data();
+}
+
+bool DSPBuffer::allocate()
+{
+    /*-------------------------------------------------------------------------
+     * Give this buffer a device address. Returns false for a zero-sized
+     * object or when no memory (own or parent's) can be obtained.
+     *------------------------------------------------------------------------*/
+    const size_t num_bytes = p_buffer->size();
+
+    if (num_bytes == 0)
+        return false;
+
+    const bool needs_address = !p_data;
+
+    if (needs_address && p_buffer->type() == MemObject::SubBuffer)
+    {
+        /*---------------------------------------------------------------------
+         * A sub-buffer lives inside its parent's storage: parent address
+         * plus the sub-buffer offset. Note that this path returns before
+         * p_buffer->deviceAllocated() is called.
+         *--------------------------------------------------------------------*/
+        SubBuffer *sub       = (SubBuffer *)p_buffer;
+        MemObject *owner     = sub->parent();
+        DSPBuffer *owner_dsp = (DSPBuffer *)owner->deviceBuffer(p_device);
+
+        if (!owner_dsp->data())
+            owner_dsp->allocate();
+
+        if (!owner_dsp->data())
+            return false;
+
+        p_data = owner_dsp->data() + sub->offset();
+        return true;
+    }
+
+    if (needs_address)
+    {
+        /*---------------------------------------------------------------------
+         * No host pointer was recorded, so take memory from the pool selected
+         * by the buffer flags and remember that the destructor must free it.
+         *--------------------------------------------------------------------*/
+        const bool in_msmc = (p_buffer->flags() & CL_MEM_USE_MSMC_TI) != 0;
+
+        if (in_msmc)
+        {
+            p_data = (DSPDevicePtr64) p_device->malloc_msmc(num_bytes);
+        }
+        else
+        {
+            p_data = (DSPDevicePtr64) p_device->malloc_global(num_bytes, false);
+        }
+
+        if (!p_data)
+            return false;
+
+        p_data_malloced = true;
+    }
+
+    // CL_MEM_COPY_HOST_PTR: push the initial host contents to the device.
+    if (p_buffer->type() != MemObject::SubBuffer)
+    {
+        if (p_buffer->flags() & CL_MEM_COPY_HOST_PTR)
+        {
+            Driver::instance()->write(p_device->dspID(), p_data,
+                                      (uint8_t*)p_buffer->host_ptr(), num_bytes);
+        }
+    }
+
+    // Tell the memobject that this device buffer is now allocated.
+    p_buffer->deviceAllocated(this);
+
+    return true;
+}
+
+DeviceInterface *DSPBuffer::device() const
+{ // the DSPDevice given to the constructor, returned through the DeviceInterface base type
+ return p_device;
+}
+
+bool DSPBuffer::allocated() const
+{ // true once an address is recorded in p_data (host pointer, sub-buffer address or own allocation)
+ return p_data != 0;
+}
diff --git a/src/core/dsp/buffer.h b/src/core/dsp/buffer.h
new file mode 100644
index 0000000..b8cb860
--- /dev/null
+++ b/src/core/dsp/buffer.h
@@ -0,0 +1,61 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#ifndef __DSP_BUFFER_H__
+#define __DSP_BUFFER_H__
+
+#include "../deviceinterface.h"
+#include "device.h"
+
+namespace Coal
+{
+
+class DSPDevice;
+class MemObject;
+
+class DSPBuffer : public DeviceBuffer
+{ /* DSP implementation of DeviceBuffer: records the device address
+ * (DSPDevicePtr64) backing one MemObject on one DSPDevice. */
+ public:
+ DSPBuffer(DSPDevice *device, MemObject *buffer, cl_int *rs);
+ ~DSPBuffer();
+
+ bool allocate(); // obtain or derive the device address; false on failure
+ DeviceInterface *device() const; // owning device
+ DSPDevicePtr64 data() const ; // device address, 0 when none is available
+ void *nativeGlobalPointer() const ; // data() cast to a host pointer
+ bool allocated() const; // true when p_data is non-zero
+
+ private:
+ DSPDevice * p_device; // device this buffer belongs to
+ MemObject * p_buffer; // memory object this buffer backs
+ DSPDevicePtr64 p_data; // device address, 0 until known
+ bool p_data_malloced; // true when allocate() obtained p_data and the destructor must free it
+ unsigned int p_buffer_idx; // set to 0 by the constructor; not otherwise used in buffer.cpp
+};
+}
+#endif
diff --git a/src/core/dsp/cmem.cpp b/src/core/dsp/cmem.cpp
new file mode 100644
index 0000000..ee0f938
--- /dev/null
+++ b/src/core/dsp/cmem.cpp
@@ -0,0 +1,271 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cmem.h"
+#include <deque>
+#include <iostream>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <sys/stat.h>
+#include <string>
+
+#define ERR(status, msg) if (status) { printf("ERROR: %s\n", msg); exit(-1); }
+
+Cmem* Cmem::pInstance = 0;
+
+/*=============================================================================
+* C M E M
+*============================================================================*/
+/*-----------------------------------------------------------------------------
+* CEIL_DIVIDE: x / y rounded up, for non-negative integer operands. Both
+* arguments are parenthesized everywhere they appear so that expression
+* arguments such as CEIL_DIVIDE(n, a + b) expand as intended.
+*----------------------------------------------------------------------------*/
+#define CEIL_DIVIDE(x,y) (((x) + (y) - 1) / (y))
+
+/******************************************************************************
+* Thread safe instance function for singleton behavior
+******************************************************************************/
+Cmem* Cmem::instance()
+{
+    static Mutex Cmem_instance_mutex;
+
+    /*-------------------------------------------------------------------------
+    * Fast path: an instance that has already been published is returned
+    * without taking the mutex
+    *------------------------------------------------------------------------*/
+    Cmem* existing = pInstance;
+    __sync_synchronize();
+    if (existing != 0) return existing;
+
+    /*-------------------------------------------------------------------------
+    * Slow path: look again under the mutex so that only one caller constructs
+    * the object. The barrier sits between construction and the store that
+    * publishes the pointer.
+    *------------------------------------------------------------------------*/
+    ScopedLock lck(Cmem_instance_mutex);
+
+    Cmem* created = pInstance;
+    if (created == 0)
+    {
+        created = new Cmem;
+        __sync_synchronize();
+        pInstance = created;
+    }
+
+    return created;
+}
+
+/******************************************************************************
+* Cmem::open()
+******************************************************************************/
+void Cmem::open()
+{
+    /*-------------------------------------------------------------------------
+    * Each step below terminates the process through ERR on a non-zero status
+    *------------------------------------------------------------------------*/
+    const int open_rc = cmem_drv_open();
+    ERR(open_rc, "DMA Contiguous Memory Driver Open Error");
+
+    // NOTE(review): a free with a count of 0 precedes the allocation;
+    // presumably it releases dynamic buffers left over from an earlier run --
+    // confirm against the cmem driver.
+    const int free_rc = cmem_drv_free(0, HOST_BUF_TYPE_DYNAMIC, buf_desc);
+    ERR(free_rc, "DMA Contiguous Memory Free Error");
+
+    const int alloc_rc = cmem_drv_alloc(MAX_NUM_HOST_DSP_BUFFERS,
+                                        HOST_CMEM_BUFFER_SIZE,
+                                        HOST_BUF_TYPE_DYNAMIC, buf_desc);
+    ERR(alloc_rc, "DMA Contiguous Memory Alloc Error");
+
+    /*-------------------------------------------------------------------------
+    * Hand the descriptors to the buffer manager; dma_read / dma_write take
+    * their staging buffers from DmaBufPool
+    *------------------------------------------------------------------------*/
+    const int pool_rc = bufmgrCreate(&DmaBufPool, MAX_NUM_HOST_DSP_BUFFERS,
+                                     buf_desc);
+    ERR(pool_rc, "DMA Buffer manager Create Error");
+}
+
+/******************************************************************************
+* Cmem::close()
+******************************************************************************/
+void Cmem::close()
+{
+    /*-------------------------------------------------------------------------
+    * Tear down in the reverse order of open(): buffer manager first, then the
+    * driver buffers, then the driver itself
+    *------------------------------------------------------------------------*/
+    bufmgrDelete(&DmaBufPool);
+
+    const int free_rc = cmem_drv_free(MAX_NUM_HOST_DSP_BUFFERS,
+                                      HOST_BUF_TYPE_DYNAMIC, buf_desc);
+    ERR(free_rc, "DMA Contiguous Memory Driver Free Error");
+
+    const int close_rc = cmem_drv_close();
+    ERR(close_rc, "DMA Contiguous Memory Driver Close Error");
+}
+
+
+/******************************************************************************
+* Cmem::dma_write
+*
+* Copies size bytes from host memory at buf to DSP address addr on device
+* dsp_id. The data is staged through CMEM buffers taken from DmaBufPool, one
+* chunk of HOST_CMEM_BUFFER_SIZE (currently 4M) bytes at a time:
+*
+*   - min(simul_dmas, number of chunks) staging buffers are allocated and used
+*     round-robin (simul_dmas is 4)
+*   - every chunk is memcpy'd into its staging buffer and then sent with one
+*     non-blocking DMA
+*   - as soon as simul_dmas transfers are outstanding the oldest one is waited
+*     for, so a staging buffer is never refilled while its previous transfer
+*     is still in flight
+*
+* To make it concrete, a 48M write results in:
+*      12 memcpy calls of 4M each,
+*      12 dma initiates of 4M each,
+*       4 CMEM staging buffers of 4M each
+*
+* A size of 0 is a no-op. Any driver or buffer manager failure terminates the
+* process through ERR.
+******************************************************************************/
+void Cmem::dma_write(int32_t dsp_id, uint32_t addr, uint8_t *buf, uint32_t size)
+{
+    /*-------------------------------------------------------------------------
+    * Nothing to transfer. Returning here also keeps the last chunk size
+    * computation below from wrapping when there are no chunks at all.
+    *------------------------------------------------------------------------*/
+    if (size == 0) return;
+
+    const uint32_t simul_dmas       = 4;
+    const uint32_t cmem_buffer_size = HOST_CMEM_BUFFER_SIZE;
+
+    /*-------------------------------------------------------------------------
+    * Ceiling division written as quotient plus remainder test, so that sizes
+    * close to 4G cannot wrap a 32 bit intermediate sum
+    *------------------------------------------------------------------------*/
+    const uint32_t tot_buffers      = size / cmem_buffer_size +
+                                      ((size % cmem_buffer_size) ? 1 : 0);
+    const uint32_t circ_buffers     = std::min(simul_dmas, tot_buffers);
+    const uint32_t last_buffer_size = size - ((tot_buffers-1) * cmem_buffer_size);
+
+    uint32_t             trans_id = 0;
+    int32_t              ret_val;
+    std::deque<uint32_t> dma_ids;
+
+    cmem_host_buf_desc_t *host_buf_desc =
+                                  new cmem_host_buf_desc_t[circ_buffers];
+
+    cmem_host_frame_desc_t *host_frame_desc =
+                                  new cmem_host_frame_desc_t[circ_buffers];
+
+    /*---------------------------------------------------------------------
+    * Allocate Host CMEM buffers, one single-buffer frame per staging buffer
+    *--------------------------------------------------------------------*/
+    for (uint32_t i = 0; i < circ_buffers; i++)
+    {
+        ret_val = bufmgrAlloc(DmaBufPool, 1, &host_buf_desc[i]);
+        ERR(ret_val, "dma buffer allocation failed");
+        host_frame_desc[i].bufDescP         = &host_buf_desc[i];
+        host_frame_desc[i].numBuffers       = 1;
+        host_frame_desc[i].frameStartOffset = 0;
+        host_frame_desc[i].frameSize        = cmem_buffer_size;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Stage and initiate one chunk at a time
+    *------------------------------------------------------------------------*/
+    for (uint32_t i = 0; i < tot_buffers; ++i)
+    {
+        const uint32_t circ_i = i % simul_dmas;
+
+        /*---------------------------------------------------------------------
+        * Unsigned on purpose: a signed offset would turn negative for
+        * transfers larger than 2G and corrupt the buf + offset address
+        *--------------------------------------------------------------------*/
+        const uint32_t offset = i * cmem_buffer_size;
+
+        cmem_host_buf_desc_t &staging  = host_buf_desc[circ_i];
+        uint32_t              cpy_size = staging.length;
+
+        /*---------------------------------------------------------------------
+        * Only the final chunk may be shorter than a full staging buffer
+        *--------------------------------------------------------------------*/
+        if (i == tot_buffers-1)
+            host_frame_desc[circ_i].frameSize = cpy_size = last_buffer_size;
+
+        memcpy(staging.userAddr, buf + offset, cpy_size);
+
+        /*---------------------------------------------------------------------
+        * Initiate DMA
+        *--------------------------------------------------------------------*/
+        ret_val = pciedrv_dma_write_initiate(dsp_id, addr + offset,
+                                             &host_frame_desc[circ_i],
+                                             PCIEDRV_DMA_XFER_NON_BLOCKING,
+                                             &trans_id);
+        ERR(ret_val, "DMA initiate failed");
+
+        dma_ids.push_back(trans_id);
+
+        /*---------------------------------------------------------------------
+        * Retire the oldest transfer once simul_dmas are outstanding; this is
+        * what frees its staging buffer for reuse by a later chunk
+        *--------------------------------------------------------------------*/
+        if (dma_ids.size() >= simul_dmas)
+        {
+            while (pciedrv_dma_check(dsp_id, dma_ids.front()));
+            dma_ids.pop_front();
+        }
+    }
+
+    /*---------------------------------------------------------------------
+    * Wait for all remaining dmas to complete
+    *--------------------------------------------------------------------*/
+    while (!dma_ids.empty())
+    {
+        while (pciedrv_dma_check(dsp_id, dma_ids.front()));
+        dma_ids.pop_front();
+    }
+
+    /*---------------------------------------------------------------------
+    * Free host CMEM buffers
+    *--------------------------------------------------------------------*/
+    for (uint32_t i = 0; i < circ_buffers; i++)
+    {
+        ret_val = bufmgrFreeDesc(DmaBufPool, &host_buf_desc[i]);
+        ERR(ret_val, "dma buffer free failed");
+    }
+
+    delete [] host_buf_desc;
+    delete [] host_frame_desc;
+}
+
+/******************************************************************************
+* Cmem::dma_read
+******************************************************************************/
+void Cmem::dma_read(int32_t dsp_id, uint32_t addr, uint8_t *buf, uint32_t size)
+{
+    /*-------------------------------------------------------------------------
+    * Number of HOST_CMEM_BUFFER_SIZE chunks needed to cover size bytes
+    *------------------------------------------------------------------------*/
+    const uint32_t chunk_count = CEIL_DIVIDE(size, HOST_CMEM_BUFFER_SIZE);
+
+    cmem_host_buf_desc_t   staging;
+    cmem_host_frame_desc_t frame;
+    uint32_t               xfer_id;
+
+    /*---------------------------------------------------------------------
+    * A single staging buffer is reused for every chunk
+    *--------------------------------------------------------------------*/
+    int32_t status = bufmgrAlloc(DmaBufPool, 1, &staging);
+    ERR(status, "dma buffer allocation failed");
+
+    frame.bufDescP         = &staging;
+    frame.numBuffers       = 1;
+    frame.frameStartOffset = 0;
+    frame.frameSize        = HOST_CMEM_BUFFER_SIZE;
+
+    /*-------------------------------------------------------------------------
+    * Blocking DMA into the staging buffer, then copy out to the caller, one
+    * chunk at a time. bytes_done is both the device and the host offset.
+    *------------------------------------------------------------------------*/
+    uint32_t bytes_done = 0;
+
+    for (uint32_t chunk = 0; chunk < chunk_count; ++chunk)
+    {
+        uint32_t chunk_len = HOST_CMEM_BUFFER_SIZE;
+
+        /*---------------------------------------------------------------------
+        * The final chunk carries whatever is left over
+        *--------------------------------------------------------------------*/
+        if (chunk + 1 == chunk_count)
+        {
+            chunk_len       = size - bytes_done;
+            frame.frameSize = chunk_len;
+        }
+
+        status = pciedrv_dma_read_initiate(dsp_id, addr + bytes_done, &frame,
+                                           PCIEDRV_DMA_XFER_BLOCKING, &xfer_id);
+        ERR(status, "DMA initiate failed");
+
+        memcpy(buf + bytes_done, staging.userAddr, chunk_len);
+
+        bytes_done += chunk_len;
+    }
+
+    /*---------------------------------------------------------------------
+    * Return the staging buffer to the pool
+    *--------------------------------------------------------------------*/
+    status = bufmgrFreeDesc(DmaBufPool, &staging);
+    ERR(status, "dma buffer free failed");
+}
diff --git a/src/core/dsp/cmem.h b/src/core/dsp/cmem.h
new file mode 100644
index 0000000..24a6de0
--- /dev/null
+++ b/src/core/dsp/cmem.h
@@ -0,0 +1,64 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef _CMEM_H
+#define _CMEM_H
+#include "u_lockable.h"
+
+extern "C"
+{
+ #include "pciedrv.h"
+ #include "cmem_drv.h"
+ #include "bufmgr.h"
+}
+
+#define HOST_CMEM_BUFFER_SIZE 0x400000 // 4M
+#define MAX_NUM_HOST_DSP_BUFFERS 128
+
+class Cmem : public Lockable_off // single shared object; obtain it through instance()
+{
+ public:
+ ~Cmem() { close(); } // tears down the buffer pool and the driver via close()
+ static Cmem* instance ();
+
+ void open();
+ void close();
+ void dma_write(int32_t dsp_id, uint32_t addr, uint8_t *buf, uint32_t size); // host buf -> DSP addr, size bytes
+ void dma_read (int32_t dsp_id, uint32_t addr, uint8_t *buf, uint32_t size); // DSP addr -> host buf, size bytes
+
+ private:
+ static Cmem* pInstance; // assigned by instance() the first time it is called
+
+ cmem_host_buf_desc_t buf_desc[MAX_NUM_HOST_DSP_BUFFERS]; // passed to cmem_drv_alloc() and bufmgrCreate() in open()
+ void * DmaBufPool; // buffer manager handle set up by bufmgrCreate() in open()
+
+ Cmem() : DmaBufPool(NULL) { open(); } // private: construction goes through instance()
+ Cmem(const Cmem&); // copy ctor disallowed
+ Cmem& operator=(const Cmem&); // assignment disallowed
+};
+
+#endif // _CMEM_H
diff --git a/src/core/dsp/core_scheduler.h b/src/core/dsp/core_scheduler.h
new file mode 100644
index 0000000..58d0555
--- /dev/null
+++ b/src/core/dsp/core_scheduler.h
@@ -0,0 +1,62 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "u_lockable.h"
+#ifndef _CORE_SCHEDULER_H
+#define _CORE_SCHEDULER_H
+
+/******************************************************************************
+* CoreScheduler hands out core indices 0..7. Bit i of p_avail is set while
+* core i is available; allocate() blocks until at least one bit is set.
+******************************************************************************/
+class CoreScheduler : public Lockable
+{
+  public:
+    CoreScheduler() : p_avail(0xff) {}
+
+    /*-------------------------------------------------------------------------
+    * Mark core as available again and wake one waiter. Indices outside 0..7
+    * have no bit in p_avail and are ignored (this also avoids shifting by a
+    * negative or oversized count).
+    *------------------------------------------------------------------------*/
+    void free(int core)
+    {
+        if (core < 0 || core >= 8) return;
+
+        Lock lock(this);
+        p_avail |= (1 << core);
+        CV.notify_one();
+    }
+
+    /*-------------------------------------------------------------------------
+    * Block until a core is available, claim the lowest numbered one and
+    * return its index.
+    *------------------------------------------------------------------------*/
+    int allocate()
+    {
+        Lock lock(this);
+
+        /*---------------------------------------------------------------------
+        * Wait in a loop in case the condvar is falsely signalled
+        *--------------------------------------------------------------------*/
+        while (!p_avail) CV.wait(lock.raw());
+
+        for (int i=0, mask = 1; i < 8; ++i, mask <<= 1)
+            if (p_avail & mask) { p_avail &= ~mask; return i; }
+
+        /*---------------------------------------------------------------------
+        * Not reached: p_avail is non-zero here so the loop always returns.
+        * The explicit return keeps control from flowing off the end of a
+        * non-void function.
+        *--------------------------------------------------------------------*/
+        return -1;
+    }
+
+  private:
+    unsigned char p_avail;
+    CondVar       CV;
+};
+
+#endif //_CORE_SCHEDULER_H
diff --git a/src/core/dsp/database.h b/src/core/dsp/database.h
new file mode 100644
index 0000000..ca4d69e
--- /dev/null
+++ b/src/core/dsp/database.h
@@ -0,0 +1,112 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef __DATABASE_H__
+#define __DATABASE_H__
+
+#include <string>
+#include <vector>
+#include <iostream>
+#include <unistd.h>
+#include <sqlite3.h>
+
+using namespace std;
+
+/******************************************************************************
+* Database: thin wrapper around one sqlite3 connection. The connection is
+* opened in the constructor and closed in the destructor.
+******************************************************************************/
+class Database
+{
+  public:
+    /*-------------------------------------------------------------------------
+    * Opens filename. The result of open() is not reported to the caller.
+    *------------------------------------------------------------------------*/
+    Database(const char* filename) : database(NULL) { open(filename); }
+    ~Database() { close(); }
+
+    /*-------------------------------------------------------------------------
+    * Close the connection if there is one; safe to call more than once
+    *------------------------------------------------------------------------*/
+    void close()
+    {
+        if (database) sqlite3_close(database);
+        database = NULL;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Run one SQL statement and return every result row as a vector of column
+    * strings.
+    *
+    *  - preparing the statement is retried up to retry_limit times, 100us
+    *    apart, while sqlite reports SQLITE_BUSY or SQLITE_LOCKED
+    *  - SQL NULL column values are returned as empty strings
+    *  - if the connection holds an error message afterwards, the query text
+    *    and the message are printed to std::cout; rows collected up to that
+    *    point are still returned
+    *------------------------------------------------------------------------*/
+    vector<vector<string> > query(const char* query)
+    {
+        sqlite3_stmt *statement = NULL;
+        vector<vector<string> > results;
+        const int retry_limit = 20;
+        int       retries     = 0;
+
+        int rc = sqlite3_prepare_v2(database, query, -1, &statement, 0);
+
+        while ((rc == SQLITE_BUSY || rc == SQLITE_LOCKED) &&
+               ++retries <= retry_limit)
+        {
+            sqlite3_finalize(statement);
+            usleep(100);
+            rc = sqlite3_prepare_v2(database, query, -1, &statement, 0);
+        }
+
+        if (rc == SQLITE_OK)
+        {
+            int cols   = sqlite3_column_count(statement);
+            int result = 0;
+
+            while (true)
+            {
+                result = sqlite3_step(statement);
+
+                if (result == SQLITE_ROW)
+                {
+                    vector<string> values;
+                    for (int col = 0; col < cols; col++)
+                    {
+                        /*-----------------------------------------------------
+                        * sqlite3_column_text yields a NULL pointer for SQL
+                        * NULL; building a std::string from NULL is undefined
+                        *----------------------------------------------------*/
+                        const unsigned char *text =
+                                        sqlite3_column_text(statement, col);
+                        values.push_back(text ? (const char*)text : "");
+                    }
+                    results.push_back(values);
+                }
+                else break;
+            }
+
+            sqlite3_finalize(statement);
+        }
+
+        string error = sqlite3_errmsg(database);
+        if (error != "not an error")
+            std::cout << query << " " << error << std::endl;
+
+        return results;
+    }
+
+  private:
+    sqlite3 *database;
+
+  private:
+    /*-------------------------------------------------------------------------
+    * Open the database file and install a 1000ms busy timeout on success
+    *------------------------------------------------------------------------*/
+    bool open(const char* filename)
+    {
+        if (sqlite3_open(filename, &database) == SQLITE_OK)
+        {
+            sqlite3_busy_timeout(database, 1000);
+            return true;
+        }
+        return false;
+    }
+
+};
+
+#endif
diff --git a/src/core/dsp/device.cpp b/src/core/dsp/device.cpp
new file mode 100644
index 0000000..32cd9b0
--- /dev/null
+++ b/src/core/dsp/device.cpp
@@ -0,0 +1,1135 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "../platform.h"
+#include "device.h"
+#include "buffer.h"
+#include "kernel.h"
+#include "program.h"
+#include <cstdlib>
+#include <algorithm>
+#include <limits.h>
+#include "CL/cl_ext.h"
+
+#include <core/config.h>
+#include "../propertylist.h"
+#include "../commandqueue.h"
+#include "../events.h"
+#include "../memobject.h"
+#include "../kernel.h"
+#include "../program.h"
+#include "../util.h"
+
+#include "driver.h"
+#include "mailbox.h"
+
+extern "C"
+{
+ #include "dload_api.h"
+ #include <ti/runtime/mmap/include/mmap_resource.h>
+
+}
+
+#include <cstring>
+#include <cstdlib>
+#include <unistd.h>
+
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+using namespace Coal;
+
+Mailbox* Mailbox::pInstance = 0;
+
+/******************************************************************************
+* On DSPC868X the mailboxes are remote on the device DDR. On Hawking the
+* mailboxes are in shared DDR
+******************************************************************************/
+#ifdef DSPC868X
+#define MAILBOX_LOCATION MPM_MAILBOX_MEMORY_LOCATION_REMOTE
+#else
+#define MAILBOX_LOCATION MPM_MAILBOX_MEMORY_LOCATION_LOCAL
+
+#include "shmem.h"
+/*-----------------------------------------------------------------------------
+* dsp_speed: derive the DSP clock from three 32 bit words read through two
+* mapped pages (0x02620000 and 0x02310000) and return it divided by 1000000.
+*
+*   speed = DSP_PLL * mult / prediv / output_div
+*
+* NOTE(review): DSP_PLL is presumably the reference clock in Hz, which makes
+* the return value MHz -- confirm against the device data manual.
+*----------------------------------------------------------------------------*/
+unsigned dsp_speed()
+{
+    const unsigned DSP_PLL  = 122880000;
+    const unsigned pagesize = 0x1000;
+
+    shmem_persistent bootcfg_page;
+    shmem_persistent clock_page;
+
+    bootcfg_page.configure(0x02620000, pagesize);
+    clock_page.configure(0x02310000, pagesize);
+
+    char *BOOTCFG_BASE_ADDR = (char*)bootcfg_page.map(0x02620000, pagesize);
+    char *CLOCK_BASE_ADDR   = (char*)clock_page.map(0x02310000, pagesize);
+
+    int MAINPLLCTL0 = (*(int*)(BOOTCFG_BASE_ADDR + 0x350));
+    int MULT        = (*(int*)(CLOCK_BASE_ADDR + 0x110));
+    int OUTDIV      = (*(int*)(CLOCK_BASE_ADDR + 0x108));
+
+    /*-------------------------------------------------------------------------
+    * The multiplier is split over two words: its low 6 bits come from MULT,
+    * bits 6..12 come from bits 12..18 of MAINPLLCTL0. All three fields are
+    * stored as "value - 1".
+    *------------------------------------------------------------------------*/
+    unsigned mult       = 1 + ((MULT & 0x3F) | ((MAINPLLCTL0 & 0x7F000) >> 6));
+    unsigned prediv     = 1 + (MAINPLLCTL0 & 0x3F);
+    unsigned output_div = 1 + ((OUTDIV >> 19) & 0xF);
+
+    /*-------------------------------------------------------------------------
+    * 64 bit intermediate: DSP_PLL * mult no longer fits in 32 bits once mult
+    * reaches 35, which silently produced a wrong speed
+    *------------------------------------------------------------------------*/
+    uint64_t speed = (uint64_t)DSP_PLL * mult / prediv / output_div;
+
+    bootcfg_page.unmap(BOOTCFG_BASE_ADDR, pagesize);
+    clock_page.unmap(CLOCK_BASE_ADDR, pagesize);
+
+    return (unsigned)(speed / 1000000);
+}
+#endif
+
+/*-----------------------------------------------------------------------------
+* Declare our threaded dsp handler function
+*----------------------------------------------------------------------------*/
+void *dsp_worker(void* data);
+void HOSTwait (unsigned char dsp_id);
+
+/******************************************************************************
+* DSPDevice::DSPDevice(unsigned char dsp_id)
+*
+* Brings up DSP dsp_id:
+*   - reset and load through the Driver, then read the monitor symbols that
+*     describe local memory and the two mailboxes
+*   - query CMEM for the global and MSMC regions and split the DDR range
+*   - register every region with the driver's shared memory layer
+*   - configure the device heaps
+*   - create and open the host->DSP and DSP->host mailboxes
+*   - determine the DSP clock rate (p_dsp_mhz)
+*
+* The worker thread is not started here; init() does that.
+******************************************************************************/
+DSPDevice::DSPDevice(unsigned char dsp_id)
+    : DeviceInterface   (),
+      p_cores           (8),
+      p_num_events      (0),
+      p_dsp_mhz         (1000), // 1.00 GHz
+      p_worker          (0),
+      p_rx_mbox         (0),
+      p_tx_mbox         (0),
+      p_stop            (false),
+      p_initialized     (false),
+      p_dsp_id          (dsp_id),
+      p_device_msmc_heap(),
+      p_device_ddr_heap1(),
+      p_device_ddr_heap2(),
+      p_device_ddr_heap3(),
+      p_device_l2_heap  (),
+      p_dload_handle    (0),
+      p_complete_pending(),
+      p_mpax_default_res(NULL)
+{
+    Driver *driver = Driver::instance();
+
+    void *hdl = driver->reset_and_load(dsp_id);
+
+    p_addr_kernel_config = driver->get_symbol(hdl, "kernel_config_l2");
+    p_addr_local_mem     = driver->get_symbol(hdl, "ocl_local_mem_start");
+    p_addr_mbox_d2h_phys = driver->get_symbol(hdl, "mbox_d2h_phys");
+    p_addr_mbox_h2d_phys = driver->get_symbol(hdl, "mbox_h2d_phys");
+    p_size_local_mem     = driver->get_symbol(hdl, "ocl_local_mem_size");
+    p_size_mbox_d2h      = driver->get_symbol(hdl, "mbox_d2h_size");
+    p_size_mbox_h2d      = driver->get_symbol(hdl, "mbox_h2d_size");
+
+    /*-------------------------------------------------------------------------
+    * These 4 variables were previously retrieved from the monitor out file.
+    * They are now determined by query of the CMEM system.
+    *------------------------------------------------------------------------*/
+    //p_addr_global_mem    = driver->get_symbol(hdl, "ocl_global_mem_start");
+    //p_addr_msmc_mem      = driver->get_symbol(hdl, "ocl_msmc_mem_start");
+    //p_size_global_mem    = driver->get_symbol(hdl, "ocl_global_mem_size");
+    //p_size_msmc_mem      = driver->get_symbol(hdl, "ocl_msmc_mem_size");
+
+#if 0
+    // Adjust p_size_global_mem for PG1.0 board, monitor takes 2MB
+    #define MONITOR_MEM 2
+    uint32_t mem_reserve = parse_file_line_value("/proc/cmdline",
+                                                 "mem_reserve=", 0);
+    if (mem_reserve > 0 && mem_reserve*1024*1024 < p_size_global_mem)
+        p_size_global_mem = (mem_reserve - MONITOR_MEM) * 1024 * 1024;
+
+    char *dsp_global_mem_size = getenv("TI_OCL_DSP_GLOBAL_MEM_SIZE");
+    if (dsp_global_mem_size)
+        p_size_global_mem = atol(dsp_global_mem_size);
+
+    // Ordering is important: global in CMEM block 0, msmc in CMEM block 1
+    driver->cmem_init(p_addr_global_mem, p_size_global_mem,
+                      p_addr_msmc_mem,   p_size_msmc_mem);
+#endif
+    p_addr64_global_mem = 0;
+    p_size64_global_mem = 0;
+    p_addr_msmc_mem     = 0;
+    p_size_msmc_mem     = 0;
+    DSPDevicePtr64 global3 = 0;
+    uint64_t       gsize3  = 0;
+    driver->cmem_init(&p_addr64_global_mem, &p_size64_global_mem,
+                      &p_addr_msmc_mem,     &p_size_msmc_mem,
+                      &global3,             &gsize3);
+
+    DSPDevicePtr64 global1 = p_addr64_global_mem;
+    DSPDevicePtr64 global2 = 0;
+    uint64_t       gsize1  = p_size64_global_mem;
+    uint64_t       gsize2  = 0;
+    driver->split_ddr_memory(p_addr64_global_mem, p_size64_global_mem,
+                             global1, gsize1, global2, gsize2, gsize3);
+
+    /*-------------------------------------------------------------------------
+    * Register every region the host will map: the DDR ranges (the 2nd and
+    * 3rd only when non-empty), MSMC, both mailboxes and the local memory
+    * window of each of the 8 cores at ((0x10 + core) << 24) + local base
+    *------------------------------------------------------------------------*/
+    driver->shmem_configure(global1, gsize1, 0);
+    if (gsize2 > 0) driver->shmem_configure(global2, gsize2, 0);
+    if (gsize3 > 0) driver->shmem_configure(global3, gsize3, 0);
+    driver->shmem_configure(p_addr_msmc_mem,      p_size_msmc_mem, 1);
+    driver->shmem_configure(p_addr_mbox_d2h_phys, p_size_mbox_d2h);
+    driver->shmem_configure(p_addr_mbox_h2d_phys, p_size_mbox_h2d);
+    for (int core=0; core < 8; core++)
+        driver->shmem_configure(((0x10 + core) << 24) + p_addr_local_mem,
+                                p_size_local_mem);
+
+    driver->free_image_handle(hdl);
+
+    /*-------------------------------------------------------------------------
+    * Setup the DSP heaps for memory allocation
+    *------------------------------------------------------------------------*/
+    p_device_ddr_heap1.configure(global1, gsize1);
+    p_device_ddr_heap2.configure(global2, gsize2, true);
+    p_device_ddr_heap3.configure(global3, gsize3, true);
+    p_device_l2_heap.configure  (p_addr_local_mem, p_size_local_mem);
+    p_device_msmc_heap.configure(p_addr_msmc_mem,  p_size_msmc_mem);
+
+    /*-------------------------------------------------------------------------
+    * initialize the mailboxes on the cores, so they can receive an exit cmd
+    *------------------------------------------------------------------------*/
+    Mailbox* mb_instance = Mailbox::instance();
+
+    uint32_t mailboxallocsize = mpm_mailbox_get_alloc_size();
+
+    p_tx_mbox = (void*)malloc(mailboxallocsize);
+    p_rx_mbox = (void*)malloc(mailboxallocsize);
+
+    /*-------------------------------------------------------------------------
+    * Transmit mailbox lives in the host->DSP region, receive mailbox in the
+    * DSP->host region; mbConfig is reused with only address and size changed
+    *------------------------------------------------------------------------*/
+    mpm_mailbox_config_t mbConfig;
+    mbConfig.mem_start_addr =
+               (uint32_t)driver->map(p_addr_mbox_h2d_phys, p_size_mbox_h2d);
+
+    mbConfig.mem_size         = p_size_mbox_h2d;
+    mbConfig.max_payload_size = mbox_payload;
+
+    int tx_status = mb_instance->create(p_tx_mbox,
+                        NULL,
+                        MAILBOX_LOCATION,
+                        MPM_MAILBOX_DIRECTION_SEND, &mbConfig);
+
+    mbConfig.mem_start_addr =
+               (uint32_t)driver->map(p_addr_mbox_d2h_phys, p_size_mbox_d2h);
+    mbConfig.mem_size = p_size_mbox_d2h;
+
+    int rx_status = mb_instance->create(p_rx_mbox,
+                        NULL,
+                        MAILBOX_LOCATION,
+                        MPM_MAILBOX_DIRECTION_RECEIVE, &mbConfig);
+
+    tx_status |= mb_instance->open(p_tx_mbox);
+    rx_status |= mb_instance->open(p_rx_mbox);
+
+    /*-------------------------------------------------------------------------
+    * The id is cast to int so that it is printed as a number; a character
+    * typed id would otherwise be streamed as a raw (control) character
+    *------------------------------------------------------------------------*/
+    if (tx_status != 0 || rx_status != 0)
+        std::cout << "Could not create mailboxes for dsp "
+                  << (int)p_dsp_id << std::endl;
+
+
+#ifdef DSPC868X
+    char *ghz1 = getenv("TI_OCL_DSP_1_25GHZ");
+    if (ghz1) p_dsp_mhz = 1250; // 1.25 GHz
+#else
+    /*-------------------------------------------------------------------------
+    * Ask the DSP for its frequency and spin until a reply other than -1
+    * has been received
+    *------------------------------------------------------------------------*/
+    mail_to(frequencyMsg);
+
+    int ret = 0;
+    do
+    {
+        while (!mail_query())  ;
+        ret = mail_from();
+    } while (ret == -1);
+
+    p_dsp_mhz = ret;
+#endif
+
+}
+
+
+/******************************************************************************
+* void DSPDevice::init()
+******************************************************************************/
+void DSPDevice::init()
+{
+    /*-------------------------------------------------------------------------
+    * Only the first call does any work: set up the event condition variable
+    * and mutex, then start the dsp_worker thread with this device as its
+    * argument
+    *------------------------------------------------------------------------*/
+    if (!p_initialized)
+    {
+        pthread_cond_init(&p_events_cond, 0);
+        pthread_mutex_init(&p_events_mutex, 0);
+        pthread_create(&p_worker, 0, &dsp_worker, this);
+
+        p_initialized = true;
+    }
+}
+
+/******************************************************************************
+* DSPDevice::~DSPDevice()
+******************************************************************************/
+DSPDevice::~DSPDevice()
+{
+    /*-------------------------------------------------------------------------
+    * Inform the cores on the device to stop listening for commands
+    *------------------------------------------------------------------------*/
+    mail_to(exitMsg);
+
+    /*-------------------------------------------------------------------------
+    * Terminate the worker and wait for it. This is done before the mailbox
+    * memory is freed and the driver is closed, so the worker thread cannot
+    * run against resources that have already been released.
+    *------------------------------------------------------------------------*/
+    if (p_initialized)
+    {
+        pthread_mutex_lock(&p_events_mutex);
+
+        p_stop = true;
+
+        pthread_cond_broadcast(&p_events_cond);
+        pthread_mutex_unlock(&p_events_mutex);
+
+        pthread_join(p_worker, 0);
+
+        pthread_mutex_destroy(&p_events_mutex);
+        pthread_cond_destroy(&p_events_cond);
+    }
+
+    free (p_tx_mbox);
+    free (p_rx_mbox);
+
+    /*-------------------------------------------------------------------------
+    * Only need to close the driver for one of the devices
+    *------------------------------------------------------------------------*/
+    if (p_dsp_id == 0) Driver::instance()->close();
+}
+
+/******************************************************************************
+* DeviceBuffer *DSPDevice::createDeviceBuffer(MemObject *buffer, cl_int *rs)
+*
+* Returns a newly allocated DSPBuffer for this device; buffer and rs are
+* forwarded to the DSPBuffer constructor.
+******************************************************************************/
+DeviceBuffer *DSPDevice::createDeviceBuffer(MemObject *buffer, cl_int *rs)
+{
+    DSPBuffer *dsp_buffer = new DSPBuffer(this, buffer, rs);
+    return (DeviceBuffer *)dsp_buffer;
+}
+
+/******************************************************************************
+* DeviceProgram *DSPDevice::createDeviceProgram(Program *program)
+*
+* Returns a newly allocated DSPProgram for this device wrapping program.
+******************************************************************************/
+DeviceProgram *DSPDevice::createDeviceProgram(Program *program)
+{
+    DSPProgram *dsp_program = new DSPProgram(this, program);
+    return (DeviceProgram *)dsp_program;
+}
+
+/******************************************************************************
+* DeviceKernel *DSPDevice::createDeviceKernel(Kernel *kernel, llvm::Function *)
+******************************************************************************/
+DeviceKernel *DSPDevice::createDeviceKernel(Kernel *kernel,
+                                            llvm::Function *function)
+{
+    // Build the DSP-specific kernel object for this device.  The function
+    // argument is part of the interface but is not used here.
+    DSPKernel *dsp_kernel = new DSPKernel(this, kernel);
+    return (DeviceKernel *)dsp_kernel;
+}
+
+/******************************************************************************
+* cl_int DSPDevice::initEventDeviceData(Event *event)
+******************************************************************************/
+cl_int DSPDevice::initEventDeviceData(Event *event)
+{
+ // Sets up per-event, device-side state according to the event type.
+ // Returns CL_SUCCESS unless program loading or buffer pre-allocation for
+ // a kernel event fails; event types not listed below need no preparation.
+ switch (event->type())
+ {
+ case Event::MapBuffer:
+ {
+ MapBufferEvent *e = (MapBufferEvent*) event;
+
+ // Buffer created with CL_MEM_USE_HOST_PTR: the mapped pointer is
+ // simply the application's host pointer advanced by the map offset.
+ if (e->buffer()->flags() & CL_MEM_USE_HOST_PTR)
+ {
+ e->setPtr((char*)e->buffer()->host_ptr() + e->offset());
+ break;
+ }
+
+ // Otherwise compute the device address of the mapped region ...
+ DSPBuffer *buf = (DSPBuffer*) e->buffer()->deviceBuffer(this);
+ DSPDevicePtr64 data = buf->data() + e->offset();
+
+ // DO NOT INVALIDATE! Here only initializes host_addr, it cannot
+ // be used before MapBuffer event is scheduled and processed!
+ // ... and ask the driver for a host mapping of e->cb() bytes at that
+ // address (third argument passed as false).
+ void* host_addr = Driver::instance()->map(data, e->cb(), false);
+ e->setPtr(host_addr);
+ break;
+ }
+
+ // Images: nothing is prepared here.
+ case Event::MapImage: break;
+
+ case Event::NDRangeKernel:
+ case Event::TaskKernel:
+ {
+ KernelEvent *e = (KernelEvent *)event;
+ Program *p = (Program *)e->kernel()->parent();
+ DSPProgram *prog = (DSPProgram *)p->deviceDependentProgram(this);
+
+ /*-----------------------------------------------------------------
+ * Just in time loading
+ *----------------------------------------------------------------*/
+ // load() is only attempted when the program is not loaded yet; a
+ // failed load is reported as CL_MEM_OBJECT_ALLOCATION_FAILURE.
+ if (!prog->is_loaded() && !prog->load())
+ return CL_MEM_OBJECT_ALLOCATION_FAILURE;
+
+ DSPKernel *dspkernel = (DSPKernel*)e->deviceKernel();
+
+ // Any error from pre-allocating the kernel's buffers is returned
+ // to the caller as is.
+ cl_int ret = dspkernel->preAllocBuffers();
+ if (ret != CL_SUCCESS) return ret;
+
+ // ASW TODO do something
+
+ // Set device-specific data
+ // The DSPKernelEvent is heap-allocated here and attached to the
+ // event; freeEventDeviceData() deletes it.
+ DSPKernelEvent *dsp_e = new DSPKernelEvent(this, e);
+ e->setDeviceData((void *)dsp_e);
+ break;
+ }
+ default: break;
+ }
+
+ return CL_SUCCESS;
+}
+
+/******************************************************************************
+* void DSPDevice::freeEventDeviceData(Event *event)
+******************************************************************************/
+void DSPDevice::freeEventDeviceData(Event *event)
+{
+    switch (event->type())
+    {
+        // Only kernel events carry device data: the DSPKernelEvent that
+        // initEventDeviceData() attached to them.
+        case Event::NDRangeKernel:
+        case Event::TaskKernel:
+            // delete on a null pointer is a no-op, so no explicit check.
+            delete (DSPKernelEvent *)event->deviceData();
+            break;
+
+        default:
+            break;
+    }
+}
+
+/******************************************************************************
+* void DSPDevice::pushEvent(Event *event)
+******************************************************************************/
+void DSPDevice::pushEvent(Event *event)
+{
+ /*-------------------------------------------------------------------------
+ * Add an event in the list
+ *------------------------------------------------------------------------*/
+ // The list and its counter are only modified while p_events_mutex is held.
+ pthread_mutex_lock(&p_events_mutex);
+
+ p_events.push_back(event);
+ p_num_events++; // Way faster than STL list::size() !
+
+ // Wake every thread waiting on p_events_cond (getEvent() waits on it while
+ // the queue is empty), then release the lock.
+ pthread_cond_broadcast(&p_events_cond);
+ pthread_mutex_unlock(&p_events_mutex);
+}
+
+// stop(): returns the p_stop flag, which the destructor sets to request
+// shutdown.  availableEvent(): true when at least one event is queued.
+// Both read their member without taking p_events_mutex.
+bool DSPDevice::stop() { return p_stop; }
+bool DSPDevice::availableEvent() { return p_num_events > 0; }
+
+/******************************************************************************
+* Event *DSPDevice::getEvent(bool &stop)
+******************************************************************************/
+Event *DSPDevice::getEvent(bool &stop)
+{
+ /*-------------------------------------------------------------------------
+ * Block until an event is queued or shutdown is requested, then remove and
+ * return the first event in the list.  On shutdown, stop is set to true and
+ * 0 is returned; stop is not written on the normal path, so the caller must
+ * initialize it.
+ *------------------------------------------------------------------------*/
+ pthread_mutex_lock(&p_events_mutex);
+
+ // Loop around the wait so the predicate is re-checked after every wakeup.
+ while (p_num_events == 0 && !p_stop)
+ pthread_cond_wait(&p_events_cond, &p_events_mutex);
+
+ // Shutdown takes precedence over any events still queued.
+ if (p_stop)
+ {
+ pthread_mutex_unlock(&p_events_mutex);
+ stop = true;
+ return 0;
+ }
+
+ // Pop the oldest event and keep the cached count in step with the list.
+ Event *event = p_events.front();
+ p_num_events--;
+ p_events.pop_front();
+
+ pthread_mutex_unlock(&p_events_mutex);
+
+ return event;
+}
+
+// Records data under key idx in the p_complete_pending map.
+void DSPDevice::push_complete_pending(uint32_t idx, Event* const data)
+ { p_complete_pending.push(idx, data); }
+
+// Forwards to p_complete_pending.try_pop(idx, data) and returns its result;
+// data is the output argument of that call.
+bool DSPDevice::get_complete_pending(uint32_t idx, Event*& data)
+ { return p_complete_pending.try_pop(idx, data); }
+
+// Forwards to p_complete_pending.dump().
+void DSPDevice::dump_complete_pending() { p_complete_pending.dump(); }
+
+// True while the p_complete_pending map is not empty.
+bool DSPDevice::any_complete_pending() { return !p_complete_pending.empty(); }
+
+/******************************************************************************
+* Device's decision about whether CommandQueue should push more events over
+* This number could be tuned (e.g. using ooo example). Note that p_num_events
+* are in device's queue, but not yet executed.
+******************************************************************************/
+// Current threshold: a single queued event already counts as "enough".
+// p_num_events is read here without taking p_events_mutex.
+bool DSPDevice::gotEnoughToWorkOn() { return p_num_events > 0; }
+
+/******************************************************************************
+* Getter functions
+******************************************************************************/
+// Plain accessors: each returns the corresponding private member unchanged
+// (p_cores, p_dsp_mhz, p_dsp_id, p_dload_handle).
+unsigned int DSPDevice::numDSPs() const { return p_cores; }
+float DSPDevice::dspMhz() const { return p_dsp_mhz; }
+unsigned char DSPDevice::dspID() const { return p_dsp_id; }
+DLOAD_HANDLE DSPDevice::dload_handle() const { return p_dload_handle; }
+
+
+// Loads the program file filename through the dynamic loader and returns
+// the value DLOAD_load() yields for it.
+// NOTE: if the file cannot be opened, this prints a message and terminates
+// the whole process with exit(1) instead of returning an error.
+int DSPDevice::load(const char *filename)
+{
+ // The loader handle is created lazily on first use, with this device
+ // registered as the client handle passed back to the DLIF_* callbacks.
+ if (!p_dload_handle)
+ {
+ p_dload_handle = DLOAD_create((void*)this);
+ DLOAD_initialize(p_dload_handle);
+ }
+
+ FILE *fp = fopen(filename, "rb");
+ if (!fp) { printf("can't open OpenCL Program file\n"); exit(1); }
+
+ // The file is closed after DLOAD_load() returns, whatever its result.
+ int prog_handle = DLOAD_load(p_dload_handle, fp);
+ fclose(fp);
+ return prog_handle;
+}
+
+bool DSPDevice::unload(int file_handle)
+{
+    // Without a loader handle nothing was ever loaded: report failure.
+    if (!p_dload_handle) return false;
+
+    // Otherwise the loader's own result is the answer.
+    return DLOAD_unload(p_dload_handle, file_handle);
+}
+
+// Queries the L2 heap via max_block_size() and returns the address it
+// reports, narrowed to DSPDevicePtr.
+//   size       - receives the 64-bit size reported by the heap, narrowed to
+//                32 bits
+//   block_size - passed through to, and filled in by, dspheap::max_block_size
+DSPDevicePtr DSPDevice::get_local_scratch(uint32_t &size, uint32_t &block_size)
+{
+    // Start from a defined value, as malloc_global() does before the same
+    // call, so size is never derived from an indeterminate local should the
+    // heap leave the output untouched.
+    uint64_t size64 = 0;
+    DSPDevicePtr64 addr64 = p_device_l2_heap.max_block_size(size64, block_size);
+    size = (uint32_t) size64;
+    return (DSPDevicePtr) addr64;
+}
+
+DSPDevicePtr DSPDevice::malloc_local(size_t size)
+{
+    // Allocate size bytes from the L2 heap; the second argument is passed as
+    // true (its meaning is defined by dspheap::malloc).
+    return p_device_l2_heap.malloc(size, true);
+}
+
+void DSPDevice::free_local(DSPDevicePtr addr)
+{
+    // Give addr back to the L2 heap; the heap's return value is not used.
+    p_device_l2_heap.free(addr);
+}
+
+DSPDevicePtr DSPDevice::malloc_msmc(size_t size)
+{
+    // Allocate size bytes from the MSMC heap; the second argument is passed
+    // as true (its meaning is defined by dspheap::malloc).
+    return p_device_msmc_heap.malloc(size, true);
+}
+
+void DSPDevice::free_msmc(DSPDevicePtr addr)
+{
+    // Give addr back to the MSMC heap; the heap's return value is not used.
+    p_device_msmc_heap.free(addr);
+}
+
+// TODO: examine the flag, the logic, etc
+#define FRACTION_PERSISTENT_FOR_BUFFER 8
+// Allocates size bytes of device global memory and returns its device
+// address, or 0 when every heap refused the request.
+//   prefer_32bit - when true, allocate from p_device_ddr_heap1 only.
+//                  Otherwise heap1 is used first only if its largest free
+//                  block is more than FRACTION_PERSISTENT_FOR_BUFFER times
+//                  the request; then heap2 and heap3 are tried, and finally
+//                  heap1 unconditionally.
+DSPDevicePtr64 DSPDevice::malloc_global(size_t size, bool prefer_32bit)
+{
+    if (prefer_32bit) return p_device_ddr_heap1.malloc(size, true);
+
+    DSPDevicePtr64 addr       = 0;
+    uint64_t       size64     = 0;
+    uint32_t       block_size = 0;
+    p_device_ddr_heap1.max_block_size(size64, block_size);
+
+    // Guard the ratio test: a zero-byte request would otherwise divide by
+    // zero.  Such a request skips the heap1 fast path and falls through to
+    // the remaining heaps like any other.
+    if (size != 0 && size64 / size > FRACTION_PERSISTENT_FOR_BUFFER)
+        addr = p_device_ddr_heap1.malloc(size, true);
+
+    // addr = Driver::instance()->cmem_ondemand_malloc(size);
+    if (!addr) addr = p_device_ddr_heap2.malloc(size, true);
+    if (!addr) addr = p_device_ddr_heap3.malloc(size, true);
+    if (!addr) addr = p_device_ddr_heap1.malloc(size, true); // another chance
+
+    return addr;
+}
+
+void DSPDevice::free_global(DSPDevicePtr64 addr)
+{
+    // Addresses below DSP_36BIT_ADDR are released to heap1.
+    if (addr < DSP_36BIT_ADDR)
+    {
+        p_device_ddr_heap1.free(addr);
+        return;
+    }
+
+    // Everything else is offered to heap2 first; heap3 is tried only when
+    // heap2 answers -1.
+    if (p_device_ddr_heap2.free(addr) == -1)
+    {
+        p_device_ddr_heap3.free(addr);
+    }
+}
+
+// Writes msg (sizeof(Msg_t) bytes) to this device's transmit mailbox.
+void DSPDevice::mail_to(Msg_t &msg)
+{
+ // Function-local static: one transaction-id counter for the whole process,
+ // incremented on every send with no locking around it.
+ static unsigned trans_id = 0xC0DE0000;
+ Mailbox::instance()->write(p_tx_mbox, (uint8_t*)&msg, sizeof(Msg_t),
+ trans_id++);
+}
+
+// Returns the result of querying this device's receive mailbox.
+bool DSPDevice::mail_query()
+{
+ return Mailbox::instance()->query(p_rx_mbox);
+}
+
+// Reads one message from the receive mailbox.  ERROR and PRINT messages are
+// printed to stdout and reported as -1; for any other command the received
+// transaction id is returned (converted from uint32_t to int).
+int DSPDevice::mail_from()
+{
+ // size_rx is filled in by read() but not used afterwards.
+ uint32_t size_rx, trans_id_rx;
+ Msg_t rxmsg;
+
+ // The result of read() is not checked here.
+ Mailbox::instance()->read(p_rx_mbox, (uint8_t*)&rxmsg, &size_rx,
+ &trans_id_rx);
+
+ if (rxmsg.command == ERROR)
+ {
+ printf("%s", rxmsg.u.message);
+ return -1;
+ }
+
+ // PRINT: the first byte of the payload is printed as the core character,
+ // the remainder as the text.
+ if (rxmsg.command == PRINT)
+ {
+ printf("[core %c] %s", rxmsg.u.message[0], rxmsg.u.message+1);
+ return -1;
+ }
+
+ return trans_id_rx;
+}
+
+/******************************************************************************
+* void* DSPDevice::get_mpax_default_res, only need to be computed once
+******************************************************************************/
+// Returns the default MPAX resource description, building it on first use
+// and caching it in p_mpax_default_res.  Returns NULL if the storage cannot
+// be allocated; nothing is cached in that case, so a later call retries.
+void* DSPDevice::get_mpax_default_res()
+{
+    if (p_mpax_default_res != NULL) return p_mpax_default_res;
+
+    // calloc gives zeroed storage (previously malloc + memset) and lets the
+    // allocation be checked before it is written to.
+    keystone_mmap_resources_t *res = (keystone_mmap_resources_t *)
+                            calloc(1, sizeof(keystone_mmap_resources_t));
+    if (res == NULL) return NULL;
+
+#define NUM_VIRT_HEAPS 2
+    // Every entry of both register tables is assigned in the loop below, so
+    // no initializer list is needed.
+    uint32_t xmc_regs[MAX_XMCSES_MPAXS];
+    uint32_t ses_regs[MAX_XMCSES_MPAXS];
+    uint32_t heap_base[NUM_VIRT_HEAPS] = {0x80000000, 0xC0000000};
+    uint32_t heap_size[NUM_VIRT_HEAPS] = {0x20000000, 0x40000000};
+    for (int i = 0; i < MAX_XMCSES_MPAXS; i++)
+    {
+        xmc_regs[i] = FIRST_FREE_XMC_MPAX + i;
+        ses_regs[i] = FIRST_FREE_SES_MPAX + i;
+    }
+    keystone_mmap_resource_init(MAX_XMCSES_MPAXS, xmc_regs, ses_regs,
+                                NUM_VIRT_HEAPS, heap_base, heap_size, res);
+
+    p_mpax_default_res = res;
+    return p_mpax_default_res;
+}
+
+/******************************************************************************
+* cl_int DSPDevice::info
+******************************************************************************/
+cl_int DSPDevice::info(cl_device_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const
+{
+ void *value = 0;
+ size_t value_length = 0;
+
+ union
+ {
+ cl_device_type cl_device_type_var;
+ cl_uint cl_uint_var;
+ size_t size_t_var;
+ cl_ulong cl_ulong_var;
+ cl_bool cl_bool_var;
+ cl_device_fp_config cl_device_fp_config_var;
+ cl_device_mem_cache_type cl_device_mem_cache_type_var;
+ cl_device_local_mem_type cl_device_local_mem_type_var;
+ cl_device_exec_capabilities cl_device_exec_capabilities_var;
+ cl_command_queue_properties cl_command_queue_properties_var;
+ cl_platform_id cl_platform_id_var;
+ size_t work_dims[MAX_WORK_DIMS];
+ };
+
+ uint64_t maxblock;
+ uint32_t dummy;
+
+ switch (param_name)
+ {
+ case CL_DEVICE_TYPE:
+ SIMPLE_ASSIGN(cl_device_type, CL_DEVICE_TYPE_ACCELERATOR);
+ break;
+
+ case CL_DEVICE_VENDOR_ID:
+ SIMPLE_ASSIGN(cl_uint, 0);
+ break;
+
+ case CL_DEVICE_MAX_COMPUTE_UNITS:
+ SIMPLE_ASSIGN(cl_uint, numDSPs());
+ break;
+
+ case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:
+ SIMPLE_ASSIGN(cl_uint, MAX_WORK_DIMS);
+ break;
+
+ /*-----------------------------------------------------------------
+ * Currently reports 0xffffffff. The commented-out alternative, local
+ * mem size / 128, would let conf basic/local_kernel_def allocate and
+ * pass by allowing a long16 for each wi to exist in local mem.
+ *----------------------------------------------------------------*/
+ case CL_DEVICE_MAX_WORK_GROUP_SIZE:
+ SIMPLE_ASSIGN(size_t, 0xffffffff); //p_size_local_mem / 128);
+ break;
+
+ case CL_DEVICE_MAX_WORK_ITEM_SIZES:
+ for (int i=0; i<MAX_WORK_DIMS; ++i)
+ {
+ work_dims[i] = 0xffffffff; //p_size_local_mem / 128;
+ }
+ value_length = MAX_WORK_DIMS * sizeof(size_t);
+ value = &work_dims;
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR:
+ SIMPLE_ASSIGN(cl_uint, 8);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT:
+ SIMPLE_ASSIGN(cl_uint, 4);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE:
+ SIMPLE_ASSIGN(cl_uint, 1);
+ break;
+
+ case CL_DEVICE_MAX_CLOCK_FREQUENCY:
+ SIMPLE_ASSIGN(cl_uint, dspMhz());
+ break;
+
+ case CL_DEVICE_ADDRESS_BITS:
+ SIMPLE_ASSIGN(cl_uint, 32);
+ break;
+
+ case CL_DEVICE_MAX_READ_IMAGE_ARGS:
+ SIMPLE_ASSIGN(cl_uint, 0); //images not supported
+ break;
+
+ case CL_DEVICE_MAX_WRITE_IMAGE_ARGS:
+ SIMPLE_ASSIGN(cl_uint, 0); // images not supported
+ break;
+
+ case CL_DEVICE_MAX_MEM_ALLOC_SIZE:
+ SIMPLE_ASSIGN(cl_ulong, std::min(p_device_ddr_heap1.size(), (cl_ulong)1ul << 30));
+ break;
+
+ case CL_DEVICE_IMAGE2D_MAX_WIDTH:
+ SIMPLE_ASSIGN(size_t, 0); // images not supported
+ break;
+
+ case CL_DEVICE_IMAGE2D_MAX_HEIGHT:
+ SIMPLE_ASSIGN(size_t, 0); //images not supported
+ break;
+
+ case CL_DEVICE_IMAGE3D_MAX_WIDTH:
+ SIMPLE_ASSIGN(size_t, 0); //images not supported
+ break;
+
+ case CL_DEVICE_IMAGE3D_MAX_HEIGHT:
+ SIMPLE_ASSIGN(size_t, 0); //images not supported
+ break;
+
+ case CL_DEVICE_IMAGE3D_MAX_DEPTH:
+ SIMPLE_ASSIGN(size_t, 0); //images not supported
+ break;
+
+ case CL_DEVICE_IMAGE_SUPPORT:
+ SIMPLE_ASSIGN(cl_bool, CL_FALSE); //images not supported
+ break;
+
+ case CL_DEVICE_MAX_PARAMETER_SIZE:
+ SIMPLE_ASSIGN(size_t, 116); // ASW TODO - needs to be 1024
+ break;
+
+ case CL_DEVICE_MAX_SAMPLERS:
+ SIMPLE_ASSIGN(cl_uint, 0); //images not supported
+ break;
+
+ case CL_DEVICE_MEM_BASE_ADDR_ALIGN:
+ SIMPLE_ASSIGN(cl_uint, 1024); // 128 byte aligned
+ break;
+
+ case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE:
+ SIMPLE_ASSIGN(cl_uint, 128);
+ break;
+
+ case CL_DEVICE_SINGLE_FP_CONFIG:
+ // Currently don't support CL_FP_DENORM
+ // ASW TODO: Investigate others
+ SIMPLE_ASSIGN(cl_device_fp_config,
+ CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST);
+ break;
+
+ case CL_DEVICE_DOUBLE_FP_CONFIG:
+ SIMPLE_ASSIGN(cl_device_fp_config,
+ CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO |
+ CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM);
+ break;
+
+ case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE:
+ SIMPLE_ASSIGN(cl_device_mem_cache_type, CL_READ_WRITE_CACHE);
+ break;
+
+ case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE:
+ SIMPLE_ASSIGN(cl_uint, 128);
+ break;
+
+ case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE:
+ SIMPLE_ASSIGN(cl_ulong, 128*1024);
+ break;
+
+ case CL_DEVICE_GLOBAL_MEM_SIZE:
+ SIMPLE_ASSIGN(cl_ulong, p_device_ddr_heap1.size());
+ break;
+
+ case CL_DEVICE_GLOBAL_EXT1_MEM_SIZE_TI:
+ SIMPLE_ASSIGN(cl_ulong, p_device_ddr_heap2.size());
+ break;
+
+ case CL_DEVICE_GLOBAL_EXT2_MEM_SIZE_TI:
+ SIMPLE_ASSIGN(cl_ulong, p_device_ddr_heap3.size());
+ break;
+
+ case CL_DEVICE_MSMC_MEM_SIZE_TI:
+ SIMPLE_ASSIGN(cl_ulong, p_device_msmc_heap.size());
+ break;
+
+ case CL_DEVICE_GLOBAL_MEM_MAX_ALLOC_TI:
+ p_device_ddr_heap1.max_block_size(maxblock, dummy);
+ SIMPLE_ASSIGN(cl_ulong, maxblock);
+ break;
+
+ case CL_DEVICE_GLOBAL_EXT1_MEM_MAX_ALLOC_TI:
+ p_device_ddr_heap2.max_block_size(maxblock, dummy);
+ SIMPLE_ASSIGN(cl_ulong, maxblock);
+ break;
+
+ case CL_DEVICE_GLOBAL_EXT2_MEM_MAX_ALLOC_TI:
+ p_device_ddr_heap3.max_block_size(maxblock, dummy);
+ SIMPLE_ASSIGN(cl_ulong, maxblock);
+ break;
+
+ case CL_DEVICE_MSMC_MEM_MAX_ALLOC_TI:
+ p_device_msmc_heap.max_block_size(maxblock, dummy);
+ SIMPLE_ASSIGN(cl_ulong, maxblock);
+ break;
+
+ case CL_DEVICE_LOCAL_MEM_MAX_ALLOC_TI:
+ p_device_l2_heap.max_block_size(maxblock, dummy);
+ SIMPLE_ASSIGN(cl_ulong, maxblock);
+ break;
+
+ case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:
+ SIMPLE_ASSIGN(cl_ulong, 64<<10);
+ break;
+
+ case CL_DEVICE_MAX_CONSTANT_ARGS:
+ SIMPLE_ASSIGN(cl_uint, 8);
+ break;
+
+ case CL_DEVICE_LOCAL_MEM_TYPE:
+ SIMPLE_ASSIGN(cl_device_local_mem_type, CL_LOCAL);
+ break;
+
+ case CL_DEVICE_LOCAL_MEM_SIZE:
+ SIMPLE_ASSIGN(cl_ulong, p_device_l2_heap.size());
+ break;
+
+ case CL_DEVICE_ERROR_CORRECTION_SUPPORT:
+ // ASW TODO - check answer
+ SIMPLE_ASSIGN(cl_bool, CL_FALSE);
+ break;
+
+ case CL_DEVICE_HOST_UNIFIED_MEMORY:
+ SIMPLE_ASSIGN(cl_bool, CL_FALSE);
+ break;
+
+ case CL_DEVICE_PROFILING_TIMER_RESOLUTION:
+ SIMPLE_ASSIGN(size_t, 1000); // 1000 nanoseconds = 1 microsecond
+ break;
+
+ case CL_DEVICE_ENDIAN_LITTLE:
+ SIMPLE_ASSIGN(cl_bool, CL_TRUE);
+ break;
+
+ case CL_DEVICE_AVAILABLE:
+ SIMPLE_ASSIGN(cl_bool, CL_TRUE);
+ break;
+
+ case CL_DEVICE_COMPILER_AVAILABLE:
+ SIMPLE_ASSIGN(cl_bool, CL_TRUE);
+ break;
+
+ case CL_DEVICE_EXECUTION_CAPABILITIES:
+ SIMPLE_ASSIGN(cl_device_exec_capabilities, CL_EXEC_KERNEL);
+ break;
+
+ case CL_DEVICE_QUEUE_PROPERTIES:
+ SIMPLE_ASSIGN(cl_command_queue_properties,
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+ CL_QUEUE_PROFILING_ENABLE);
+ break;
+
+ case CL_DEVICE_NAME:
+ // ASW TODO add device number suffix
+#ifdef DSPC868X
+ STRING_ASSIGN("TI TMS320C6678 DSP");
+#else
+ STRING_ASSIGN("TI K2H DSP (8x C66)");
+#endif
+ break;
+
+ case CL_DEVICE_VENDOR:
+ STRING_ASSIGN("Texas Instruments, Inc.");
+ break;
+
+ case CL_DRIVER_VERSION:
+ STRING_ASSIGN("" COAL_VERSION);
+ break;
+
+ case CL_DEVICE_PROFILE:
+ STRING_ASSIGN("FULL_PROFILE");
+ break;
+
+ case CL_DEVICE_VERSION:
+ STRING_ASSIGN("OpenCL 1.1 TI " COAL_VERSION);
+ break;
+
+ case CL_DEVICE_EXTENSIONS:
+ STRING_ASSIGN("cl_khr_byte_addressable_store"
+ " cl_khr_global_int32_base_atomics"
+ " cl_khr_global_int32_extended_atomics"
+ " cl_khr_local_int32_base_atomics"
+ " cl_khr_local_int32_extended_atomics"
+ " cl_khr_fp64"
+ " cl_ti_msmc_buffers")
+ break;
+
+ case CL_DEVICE_PLATFORM:
+ SIMPLE_ASSIGN(cl_platform_id, &the_platform);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF:
+ SIMPLE_ASSIGN(cl_uint, 0);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR:
+ SIMPLE_ASSIGN(cl_uint, 8);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT:
+ SIMPLE_ASSIGN(cl_uint, 4);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE:
+ SIMPLE_ASSIGN(cl_uint, 1);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF:
+ SIMPLE_ASSIGN(cl_uint, 0);
+ break;
+
+ case CL_DEVICE_OPENCL_C_VERSION:
+ STRING_ASSIGN("OpenCL C 1.1 LLVM " LLVM_VERSION);
+ break;
+
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ if (param_value && param_value_size < value_length)
+ return CL_INVALID_VALUE;
+
+ if (param_value_size_ret)
+ *param_value_size_ret = value_length;
+
+ if (param_value)
+ std::memcpy(param_value, value, value_length);
+
+ return CL_SUCCESS;
+}
+
+/******************************************************************************
+* Call back functions from the target loader
+******************************************************************************/
+extern "C"
+{
+
+/*****************************************************************************/
+/* DLIF_ALLOCATE() - Return the load address of the segment/section */
+/* described in its parameters and record the run address in */
+/* run_address field of DLOAD_MEMORY_REQUEST. */
+/*****************************************************************************/
+BOOL DLIF_allocate(void* client_handle, struct DLOAD_MEMORY_REQUEST *targ_req)
+{
+ // client_handle is the DSPDevice registered with DLOAD_create().
+ DSPDevice* device = (DSPDevice*) client_handle;
+
+ /*------------------------------------------------------------------------*/
+ /* Get pointers to API segment and file descriptors. */
+ /*------------------------------------------------------------------------*/
+ struct DLOAD_MEMORY_SEGMENT* obj_desc = targ_req->segment;
+
+ uint32_t addr;
+
+ // Pick the heap from the segment's link-time address: 0x008xxxxx goes to
+ // the local heap, 0x0Cxxxxxx to the MSMC heap, anything else to global
+ // memory.  The result is narrowed to 32 bits in every case.
+ if (obj_desc->target_address >> 20 == 0x008)
+ addr = (uint32_t)device->malloc_local (obj_desc->memsz_in_bytes);
+ else if (obj_desc->target_address >> 24 == 0x0C)
+ addr = (uint32_t)device->malloc_msmc (obj_desc->memsz_in_bytes);
+ else addr = (uint32_t)device->malloc_global(obj_desc->memsz_in_bytes);
+
+#if DEBUG
+ printf("DLIF_allocate: %d bytes starting at 0x%x (relocated from 0x%x)\n",
+ obj_desc->memsz_in_bytes, (uint32_t)addr,
+ (uint32_t)obj_desc->target_address);
+#endif
+
+ // The segment address is overwritten even when the allocation failed
+ // (addr == 0).
+ obj_desc->target_address = (TARGET_ADDRESS) addr;
+
+ /*------------------------------------------------------------------------*/
+ /* Report success (1) only if an address was obtained, failure (0) if the */
+ /* allocator returned 0. */
+ /*------------------------------------------------------------------------*/
+ return addr == 0 ? 0 : 1;
+}
+
+/*****************************************************************************/
+/* DLIF_RELEASE() - Unmap or free target memory that was previously */
+/* allocated by DLIF_allocate(). */
+/*****************************************************************************/
+BOOL DLIF_release(void* client_handle, struct DLOAD_MEMORY_SEGMENT* ptr)
+{
+ // client_handle is the DSPDevice registered with DLOAD_create().
+ DSPDevice* device = (DSPDevice*) client_handle;
+
+ // Same address-range test as DLIF_allocate(), applied to the relocated
+ // address, to choose which heap the segment is returned to.
+ if (ptr->target_address >> 20 == 0x008)
+ device->free_local ((DSPDevicePtr)ptr->target_address);
+ else if (ptr->target_address >> 24 == 0x0C)
+ device->free_msmc ((DSPDevicePtr)ptr->target_address);
+ else device->free_global((DSPDevicePtr)ptr->target_address);
+
+#if DEBUG
+ printf("DLIF_free: %d bytes starting at 0x%x\n",
+ ptr->memsz_in_bytes, (uint32_t)ptr->target_address);
+#endif
+
+ // Always reports success.
+ return 1;
+}
+
+/*****************************************************************************/
+/* DLIF_WRITE() - Write updated (relocated) segment contents to target */
+/* memory. */
+/*****************************************************************************/
+BOOL DLIF_write(void* client_handle, struct DLOAD_MEMORY_REQUEST* req)
+{
+ struct DLOAD_MEMORY_SEGMENT* obj_desc = req->segment;
+ DSPDevice* device = (DSPDevice*) client_handle;
+ int dsp_id = device->dspID();
+
+ // Copy memsz_in_bytes bytes from the host image to the segment's target
+ // address on this DSP.  The driver's result is not checked.
+ Driver::instance()->write (dsp_id,
+ (uint32_t)obj_desc->target_address,
+ (uint8_t*)req->host_address,
+ obj_desc->memsz_in_bytes);
+
+#if DEBUG
+ printf("DLIF_write (dsp:%d): %d bytes starting at 0x%x\n",
+ dsp_id, obj_desc->memsz_in_bytes,
+ (uint32_t)obj_desc->target_address);
+#endif
+
+ // segments is a list defined elsewhere; when it is non-null, a descriptor
+ // (address, size, flags) of the segment just written is appended to it.
+ extern DSPProgram::segment_list *segments;
+
+ if (segments) segments->push_back
+ (DSPProgram::seg_desc((DSPDevicePtr)obj_desc->target_address, obj_desc->memsz_in_bytes, req->flags));
+
+ // Always reports success.
+ return 1;
+}
+
+/******************************************************************************
+* DLIF_LOAD_DEPENDENT()
+******************************************************************************/
+int DLIF_load_dependent(void* client_handle, const char* so_name)
+{
+    // Open the dependent object; a missing file is reported through
+    // DLIF_error and signalled to the loader with 0.
+    FILE* dep_file = fopen(so_name, "rb");
+    if (!dep_file)
+    {
+        DLIF_error(DLET_FILE, "Can't open dependent file '%s'.\n", so_name);
+        return 0;
+    }
+
+    // Load it with the loader instance owned by the device that was
+    // registered as client handle.
+    DSPDevice* device = (DSPDevice*) client_handle;
+    int handle = DLOAD_load(device->dload_handle(), dep_file);
+    if (handle == 0)
+    {
+        DLIF_error(DLET_MISC, "Failed load of dependent file '%s'.\n", so_name);
+    }
+
+    // The file is closed whether or not the load succeeded.
+    fclose(dep_file);
+    return handle;
+}
+
+/******************************************************************************
+* DLIF_UNLOAD_DEPENDENT()
+******************************************************************************/
+void DLIF_unload_dependent(void* client_handle, uint32_t file_handle)
+{
+    // Unload through the loader instance of the device registered as client
+    // handle; the loader's result is not reported back.
+    DLOAD_HANDLE loader = ((DSPDevice*) client_handle)->dload_handle();
+    DLOAD_unload(loader, file_handle);
+}
+
+}
+
+// Prints bytes bytes starting at addr as two-digit hex values, 16 per row,
+// preceded by an empty line.  Nothing but that empty line is printed when
+// bytes is zero or negative.
+void dump_hex(char *addr, int bytes)
+{
+    printf("\n");
+
+    for (int i = 0; i < bytes; ++i)
+    {
+        printf("%02x ", addr[i] & 0xff);
+
+        // End the row after every 16th byte and after the very last byte.
+        if (i % 16 == 15 || i == bytes - 1) printf("\n");
+    }
+}
+
diff --git a/src/core/dsp/device.h b/src/core/dsp/device.h
new file mode 100644
index 0000000..4a6f32a
--- /dev/null
+++ b/src/core/dsp/device.h
@@ -0,0 +1,151 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef __DSP_DEVICE_H__
+#define __DSP_DEVICE_H__
+
+extern "C" {
+#include "dload_api.h"
+}
+
+#include "../deviceinterface.h"
+#include "dspheap.h"
+#include "message.h"
+#include "u_concurrent_map.h"
+#include "kernel.h"
+#include <pthread.h>
+#include <string>
+#include <list>
+
+namespace Coal
+{
+
+class MemObject;
+class Event;
+class Program;
+class Kernel;
+
+/*-----------------------------------------------------------------------------
+* DSPDevice: DeviceInterface implementation for a DSP device.  It owns the
+* event queue served by one worker thread, the mailboxes used to exchange
+* messages with the device, the device memory heaps and the dynamic loader
+* handle.
+*----------------------------------------------------------------------------*/
+class DSPDevice : public DeviceInterface
+{
+ public:
+ DSPDevice(unsigned char dsp_id);
+ ~DSPDevice();
+
+ void init();
+
+ cl_int info(cl_device_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ // Factories for the device-specific buffer/program/kernel objects.
+ DeviceBuffer *createDeviceBuffer(MemObject *buffer, cl_int *rs);
+ DeviceProgram *createDeviceProgram(Program *program);
+ DeviceKernel *createDeviceKernel(Kernel *kernel,
+ llvm::Function *function);
+
+ // Per-event device data set-up and tear-down.
+ cl_int initEventDeviceData(Event *event);
+ void freeEventDeviceData(Event *event);
+
+ // Event queue: pushEvent() appends, getEvent() blocks until an event or
+ // a stop request is available.
+ void pushEvent(Event *event);
+ bool stop();
+ bool availableEvent();
+ Event *getEvent(bool &stop);
+
+ unsigned int numDSPs() const;
+ float dspMhz() const;
+ unsigned char dspID() const;
+ DLOAD_HANDLE dload_handle() const;
+
+ // Program load/unload through the dynamic loader.
+ int load(const char *filename);
+ bool unload(int file_handle);
+
+ /*---------------------------------------------------------------------
+ * These malloc routines return a device address (DSPDevicePtr, or
+ * DSPDevicePtr64 for global memory) instead of a host pointer, because
+ * the target memory space is independent of the size of a host pointer
+ * (ie. 32bit vs 64 bit).
+ * Device/Target global memory could be 36-bit.
+ * get_local_scratch returns max local free block for per kernel use.
+ *--------------------------------------------------------------------*/
+ DSPDevicePtr get_local_scratch(uint32_t &size, uint32_t &block_size);
+ DSPDevicePtr malloc_local (size_t size);
+ void free_local (DSPDevicePtr add);
+ DSPDevicePtr malloc_msmc (size_t size);
+ void free_msmc (DSPDevicePtr add);
+ DSPDevicePtr64 malloc_global(size_t size, bool prefer_32bit=true);
+ void free_global (DSPDevicePtr64 add);
+
+ // Mailbox messaging with the device.
+ void mail_to (Msg_t& msg);
+ bool mail_query();
+ int mail_from ();
+
+ // Bookkeeping of events stored under an index in p_complete_pending.
+ void push_complete_pending(uint32_t idx, class Event* const data);
+ bool get_complete_pending(uint32_t idx, class Event* &data);
+ void dump_complete_pending();
+ bool any_complete_pending();
+ bool gotEnoughToWorkOn();
+
+ std::string builtinsHeader(void) const { return "dsp.h"; }
+
+ DSPDevicePtr get_addr_kernel_config() { return p_addr_kernel_config; }
+ void* get_mpax_default_res();
+
+ private:
+ unsigned int p_cores;
+ unsigned int p_num_events; // cached length of p_events
+ float p_dsp_mhz;
+ pthread_t p_worker; // worker thread created in init()
+ void* p_rx_mbox; // int
+ void* p_tx_mbox;
+ std::list<Event *> p_events; // queue guarded by p_events_mutex
+ pthread_cond_t p_events_cond;
+ pthread_mutex_t p_events_mutex;
+ bool p_stop; // set by the destructor to stop the worker
+ bool p_initialized; // true once the worker thread exists
+ unsigned char p_dsp_id;
+ dspheap p_device_ddr_heap1; // persistently mapped memory
+ dspheap p_device_ddr_heap2; // ondemand mapped memory
+ dspheap p_device_ddr_heap3; // addl ondemand mapped memory
+ dspheap p_device_l2_heap;
+ dspheap p_device_msmc_heap;
+ DLOAD_HANDLE p_dload_handle; // created lazily by load()
+ concurrent_map<uint32_t, class Event*> p_complete_pending;
+
+ DSPDevicePtr p_addr_kernel_config;
+ DSPDevicePtr64 p_addr64_global_mem;
+ DSPDevicePtr p_addr_local_mem;
+ DSPDevicePtr p_addr_msmc_mem;
+ DSPDevicePtr p_addr_mbox_d2h_phys;
+ DSPDevicePtr p_addr_mbox_h2d_phys;
+ uint64_t p_size64_global_mem;
+ uint32_t p_size_local_mem;
+ uint32_t p_size_msmc_mem;
+ uint32_t p_size_mbox_d2h;
+ uint32_t p_size_mbox_h2d;
+ void* p_mpax_default_res; // lazily built by get_mpax_default_res()
+};
+}
+#endif
diff --git a/src/core/dsp/driver.cpp b/src/core/dsp/driver.cpp
new file mode 100644
index 0000000..08e97f7
--- /dev/null
+++ b/src/core/dsp/driver.cpp
@@ -0,0 +1,34 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifdef DSPC868X
+#include "driver_shannon.cpp"
+#include "cmem.cpp"
+#else
+#include "driver_hawking.cpp"
+#include "shmem.cpp"
+#endif
diff --git a/src/core/dsp/driver.h b/src/core/dsp/driver.h
new file mode 100644
index 0000000..1e41a28
--- /dev/null
+++ b/src/core/dsp/driver.h
@@ -0,0 +1,100 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef _DRIVER_H
+#define _DRIVER_H
+#include <vector>
+#include "u_lockable.h"
+#include "device.h"
+
+#ifdef DSPC868X
+extern "C"
+{
+ #include "pciedrv.h"
+ #include "dnldmgr.h"
+ #include "cmem_drv.h"
+ #include "bufmgr.h"
+}
+#else
+#include "shmem.h"
+#endif
+
+// Host-side access point to the DSP device(s).
+//
+// Both constructors and the assignment operator are private; callers obtain
+// the object through the static instance() function. The default constructor
+// calls open() and the destructor calls close(). Members inside the DSPC868X
+// conditional exist only in that build; otherwise the shmem based members
+// are used instead.
+class Driver : public Lockable_off
+{
+ public:
+ // Destruction closes the driver.
+ ~Driver() { close(); }
+ // Value of pNum_dsps, which is assigned by open().
+ int32_t num_dsps() const { return pNum_dsps; }
+ int32_t close();
+
+ // Transfer sz bytes between the host buffer buf and device address addr
+ // on device number dsp.
+ int32_t write(int32_t dsp, DSPDevicePtr64 addr, uint8_t *buf, uint32_t sz);
+ int32_t read (int32_t dsp, DSPDevicePtr64 addr, uint8_t *buf, uint32_t sz);
+
+ // reset_and_load() returns an opaque image handle; the same handle is
+ // what free_image_handle() and get_symbol() accept.
+ void* reset_and_load (int chip);
+ void free_image_handle(void *handle);
+ void cmem_init(DSPDevicePtr64 *addr1, uint64_t *size1,
+ DSPDevicePtr *addr2, uint32_t *size2,
+ DSPDevicePtr64 *addr3, uint64_t *size3);
+ void cmem_exit();
+ DSPDevicePtr64 cmem_ondemand_malloc(uint64_t size);
+ void cmem_ondemand_free (DSPDevicePtr64 addr);
+ // addr1/size1, addr2/size2 and size3 are reference parameters written by
+ // the call; size3 is also read by the implementation.
+ void split_ddr_memory (DSPDevicePtr64 addr, uint64_t size,
+ DSPDevicePtr64& addr1, uint64_t& size1,
+ DSPDevicePtr64& addr2, uint64_t& size2,
+ uint64_t& size3);
+ // cmem_block defaults to -1.
+ void shmem_configure (DSPDevicePtr64 addr, uint64_t size,
+ int cmem_block = -1);
+ // map() returns a host pointer for a device address range; unmap() takes
+ // that host pointer together with the device address it was mapped from.
+ void* map (DSPDevicePtr64 addr, uint32_t sz,
+ bool is_read = false);
+ int32_t unmap (void *host_addr, DSPDevicePtr64 buf_addr,
+ uint32_t sz, bool is_write = false);
+ // Device address of the symbol called name in the image behind
+ // image_handle.
+ DSPDevicePtr get_symbol(void* image_handle, const char *name);
+
+ // Accessor for the single Driver object.
+ static Driver* instance ();
+
+ private:
+ static Driver* pInstance; // object handed out by instance()
+ int32_t pNum_dsps; // device count, set in open()
+
+#ifdef DSPC868X
+ pciedrv_open_config_t config;
+ pciedrv_device_info_t *pDevices_info;
+#else
+ // Regions registered through shmem_configure(); get_memory_region()
+ // looks an address up among them.
+ std::vector<shmem*> pShmem_areas;
+ shmem* get_memory_region(DSPDevicePtr64 addr);
+#endif
+
+ int32_t open ();
+ bool wait_for_ready(int chip);
+ // Single-destination write used by write().
+ int32_t write_core(int32_t dsp, DSPDevicePtr64 addr, uint8_t *buf,
+ uint32_t sz);
+
+ // Private: construction happens only inside instance().
+ Driver() { open(); }
+ Driver(const Driver&); // copy ctor disallowed
+ Driver& operator=(const Driver&); // assignment disallowed
+};
+
+#endif // _DRIVER_H
diff --git a/src/core/dsp/driver_hawking.cpp b/src/core/dsp/driver_hawking.cpp
new file mode 100644
index 0000000..7cb2857
--- /dev/null
+++ b/src/core/dsp/driver_hawking.cpp
@@ -0,0 +1,451 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "driver.h"
+#include <deque>
+#include <iostream>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <sys/stat.h>
+#include <string>
+#include <bfd.h>
+
+extern "C"
+{
+ #include "mpmclient.h"
+};
+
+
+#define ERR(status, msg) if (status) { printf("ERROR: %s\n", msg); exit(-1); }
+#define BOOT_ENTRY_LOCATION_ADDR 0x87FFFC
+#define BOOT_MAGIC_ADDR(core) (0x10000000 | (core << 24) | 0x87FFFC)
+
+// The single Driver object; stays 0 until the first instance() call.
+Driver* Driver::pInstance = 0;
+
+/******************************************************************************
+* Thread safe instance function for singleton behavior
+*
+* Returns the Driver object, creating it on the first call. pInstance is
+* read once without the mutex; only when that read yields 0 is the mutex
+* taken, pInstance re-read, and the object created if it is still 0.
+******************************************************************************/
+Driver* Driver::instance ()
+{
+ static Mutex Driver_instance_mutex;
+ // Unlocked first look at the pointer.
+ Driver* tmp = pInstance;
+
+ // GCC builtin: full memory barrier, placed after the unlocked read.
+ __sync_synchronize();
+
+ if (tmp == 0)
+ {
+ ScopedLock lck(Driver_instance_mutex);
+
+ // Re-read under the lock: another thread may have created the object
+ // between the unlocked read and acquiring the mutex.
+ tmp = pInstance;
+ if (tmp == 0)
+ {
+ tmp = new Driver;
+ // Barrier between constructing the object and publishing its pointer.
+ __sync_synchronize();
+ pInstance = tmp;
+ }
+ }
+ return tmp;
+}
+
+/******************************************************************************
+* get_board
+*   Translate the switch device id found in the pci data into a board name.
+*   An id that is not listed is reported through ERR, which prints
+*   "ERROR: Unsupported device" and terminates the process.
+******************************************************************************/
+const char *get_board(unsigned switch_device)
+{
+    if (switch_device == 0x8624) return "dspc8681";
+    if (switch_device == 0x8748) return "dspc8682";
+
+    ERR(1, "Unsupported device");
+    return "unknown";
+}
+
+#define TOTAL_NUM_CORES_PER_CHIP 8
+
+/******************************************************************************
+* Driver::wait_for_ready
+*   This implementation does no polling: it reports ready for any chip and
+*   does not use the chip argument.
+******************************************************************************/
+bool Driver::wait_for_ready(int chip)
+{
+    return true;
+}
+
+// Debug helper for one core name (e.g. the string built in reset_and_load).
+// The whole body sits inside "#if 0", so as compiled the function does
+// nothing. The disabled code queries mpm_state() for curr_core and prints
+// the state as text.
+static void report_core_state(const char *curr_core)
+{
+#if 0
+ char state[50];
+ int ret;
+ mpm_slave_state_e core_state;
+
+ ret = mpm_state(curr_core, &core_state);
+ if ( ret < 0)
+ printf("state query failed, %s\n", curr_core);
+
+ // NOTE(review): core_state is still examined below when the query failed.
+ switch (core_state)
+ {
+ case mpm_slave_state_idle: sprintf(state, "idle"); break;
+ case mpm_slave_state_loaded: sprintf(state, "loaded"); break;
+ case mpm_slave_state_running: sprintf(state, "running"); break;
+ case mpm_slave_state_crashed: sprintf(state, "crashed"); break;
+ case mpm_slave_state_error: sprintf(state, "in error"); break;
+
+ default: sprintf(state, "in undefined state"); break;
+ }
+
+ printf("DSP core state: %s is %s\n", curr_core, state);
+#endif
+}
+
+/******************************************************************************
+* Driver::reset_and_load
+*   Reset, load and run the monitor image "<get_ocl_dsp()>/dsp.out" on cores
+*   dsp0 .. dsp<TOTAL_NUM_CORES_PER_CHIP-1> through the mpm client calls, then
+*   open the same file with BFD and return the bfd* as an opaque handle (the
+*   handle type that get_symbol() and free_image_handle() cast back to bfd*).
+*
+*   The chip argument is not used by this implementation.
+*   Any failure prints a message and terminates the process, including an
+*   image that BFD does not recognise as an object file.
+******************************************************************************/
+void *Driver::reset_and_load(int chip)
+{
+    int  ret;
+    int  error_code = 0;
+    char curr_core[10];
+
+    std::string get_ocl_dsp();
+    std::string monitor = get_ocl_dsp() + "/dsp.out";
+
+    /*-------------------------------------------------------------------------
+    * Reset the devices
+    *------------------------------------------------------------------------*/
+    for (int core = 0; core < TOTAL_NUM_CORES_PER_CHIP; core++)
+    {
+        snprintf(curr_core, sizeof(curr_core), "dsp%d", core);
+
+        ret = mpm_reset(curr_core, &error_code);
+        if (ret < 0)
+            printf("reset failed, core %d (retval: %d, error: %d)\n",
+                   core, ret, error_code);
+        // TODO (JKN): update ERR to handle error_code
+        ERR(ret, "DSP out of reset failed");
+
+        report_core_state(curr_core);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Load monitor on the devices
+    *------------------------------------------------------------------------*/
+    for (int core = 0; core < TOTAL_NUM_CORES_PER_CHIP; core++)
+    {
+        snprintf(curr_core, sizeof(curr_core), "dsp%d", core);
+
+        ret = mpm_load(curr_core, const_cast<char*>(monitor.c_str()),
+                       &error_code);
+        if (ret < 0)
+            printf("load failed, core %d (retval: %d, error: %d)\n",
+                   core, ret, error_code);
+        ERR(ret, "Download image failed");
+
+        report_core_state(curr_core);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Run monitor on the devices
+    *------------------------------------------------------------------------*/
+    for (int core = 0; core < TOTAL_NUM_CORES_PER_CHIP; core++)
+    {
+        snprintf(curr_core, sizeof(curr_core), "dsp%d", core);
+
+        ret = mpm_run(curr_core, &error_code);
+        if (ret < 0)
+            printf("run failed, core %d (retval: %d, error: %d)\n",
+                   core, ret, error_code);
+        ERR(ret, "DSP run failed");
+
+        report_core_state(curr_core);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Open the monitor image so that symbols can be looked up later
+    *------------------------------------------------------------------------*/
+    bfd *dsp_bfd = bfd_openr(monitor.c_str(), NULL);
+    if (dsp_bfd == NULL)
+    {
+        printf("\nERROR:driver: %s Error Open image %s\n",
+               bfd_errmsg(bfd_get_error()), monitor.c_str());
+        exit(-1);
+    }
+
+    /* Check format with matching */
+    char **matching = NULL;
+    if (!bfd_check_format_matches(dsp_bfd, bfd_object, &matching))
+    {
+        const bfd_error_type err = bfd_get_error();
+        fprintf(stderr, "\nERROR:driver %s: %s\n", monitor.c_str(),
+                bfd_errmsg(err));
+
+        /*---------------------------------------------------------------------
+        * On an ambiguous match BFD hands back a NULL terminated array of
+        * candidate target names that the caller must free. Walk the array
+        * entries (not the characters of the first name).
+        *--------------------------------------------------------------------*/
+        if (err == bfd_error_file_ambiguously_recognized)
+        {
+            for (char **candidate = matching; *candidate != NULL; candidate++)
+                printf("%s: \n", *candidate);
+            free(matching);
+        }
+
+        /* An image of unknown format cannot serve symbol lookups. */
+        bfd_close(dsp_bfd);
+        exit(-1);
+    }
+
+    return (void *)dsp_bfd;
+}
+
+/******************************************************************************
+* Driver::open
+*   Called from the constructor. Under the object lock, records a DSP count
+*   of one; always returns 0.
+******************************************************************************/
+int32_t Driver::open()
+{
+    Lock guard(this);
+    pNum_dsps = 1;
+    return 0;
+}
+
+/******************************************************************************
+* Driver::close()
+*   Under the object lock, delete every registered shmem area (last one
+*   first), then call cmem_exit(). Always returns 0.
+******************************************************************************/
+int32_t Driver::close()
+{
+    Lock guard(this);
+
+    while (!pShmem_areas.empty())
+    {
+        delete pShmem_areas.back();
+        pShmem_areas.pop_back();
+    }
+
+    cmem_exit();
+    return 0;
+}
+
+// Passes all six pointers through unchanged to the static
+// shmem_cmem_persistent::cmem_init(); no other work is done here.
+void Driver::cmem_init(DSPDevicePtr64 *addr1, uint64_t *size1,
+ DSPDevicePtr *addr2, uint32_t *size2,
+ DSPDevicePtr64 *addr3, uint64_t *size3)
+{
+ shmem_cmem_persistent::cmem_init(addr1, size1, addr2, size2, addr3, size3);
+}
+
+// Forwards to shmem_cmem_persistent::cmem_exit(). Driver::close() calls this
+// after it has deleted the shmem areas.
+void Driver::cmem_exit()
+{
+ shmem_cmem_persistent::cmem_exit();
+}
+
+// Returns whatever shmem_cmem_ondemand::cmem_malloc(size) returns.
+DSPDevicePtr64 Driver::cmem_ondemand_malloc(uint64_t size)
+{
+ return shmem_cmem_ondemand::cmem_malloc(size);
+}
+
+// Forwards addr to shmem_cmem_ondemand::cmem_free().
+// presumably addr must come from cmem_ondemand_malloc() - confirm.
+void Driver::cmem_ondemand_free(DSPDevicePtr64 addr)
+{
+ shmem_cmem_ondemand::cmem_free(addr);
+}
+
+/******************************************************************************
+* Driver::split_ddr_memory: partition DDR into a persistently mapped part
+*                           (addr1/size1) and an on demand mapped part
+*                           (addr2/size2)
+*
+*   addr, size    the DDR range to partition
+*   addr1, size1  out: first chunk; starts as the whole range
+*   addr2, size2  out: second chunk; stays 0/0 unless a split happens
+*   size3         in/out: forced to 0 when TI_OCL_DSP_NOMAP is set in the
+*                 environment, otherwise only read
+******************************************************************************/
+void Driver::split_ddr_memory(DSPDevicePtr64 addr, uint64_t size,
+                              DSPDevicePtr64& addr1, uint64_t& size1,
+                              DSPDevicePtr64& addr2, uint64_t& size2,
+                              uint64_t& size3)
+{
+    // start out with everything in the first chunk
+    addr1 = addr;
+    size1 = size;
+    addr2 = 0;
+    size2 = 0;
+
+    const bool mapping_disabled = (getenv("TI_OCL_DSP_NOMAP") != NULL);
+    if (mapping_disabled)
+    {
+        size3 = 0;
+    }
+    else
+    {
+        const bool past_persistent  =
+            addr + size > ALL_PERSISTENT_MAX_DSP_ADDR;
+        const bool past_user_mapped =
+            size3 > 0 && addr + size > MPAX_USER_MAPPED_DSP_ADDR;
+
+        // split ddr memory 1 into two chunks
+        if (past_persistent || past_user_mapped)
+        {
+            size2 = addr + size - MPAX_USER_MAPPED_DSP_ADDR;
+            size1 = size - size2;
+            addr2 = addr + size1;
+        }
+    }
+
+    // translate first chunk to using 32-bit aliased physical addresses
+    if (addr > DSP_36BIT_ADDR)
+    {
+        addr1 = addr + 0xA0000000 - 0x820000000ULL;
+
+        /*---------------------------------------------------------------------
+        * if the ddr size is greater than we can currently support, limit it
+        *--------------------------------------------------------------------*/
+        const uint64_t ddr_size_limit = ALL_PERSISTENT_MAX_DSP_ADDR - addr;
+        if (size1 > ddr_size_limit)
+        {
+            size1 = ddr_size_limit;
+        }
+    }
+}
+
+/******************************************************************************
+* Driver::shmem_configure
+*   Register the device range [addr, addr + size) as a new shmem area.
+*   A zero size is ignored. The area type depends on the arguments:
+*     addr >= MPAX_USER_MAPPED_DSP_ADDR  -> shmem_cmem_ondemand
+*     otherwise, cmem_block >= 0         -> shmem_cmem_persistent(cmem_block)
+*     otherwise                          -> shmem_persistent
+******************************************************************************/
+void Driver::shmem_configure(DSPDevicePtr64 addr, uint64_t size, int cmem_block)
+{
+    if (size > 0)
+    {
+        shmem *region = NULL;
+
+        if (addr >= MPAX_USER_MAPPED_DSP_ADDR)
+        {
+            region = new shmem_cmem_ondemand();
+        }
+        else if (cmem_block >= 0)
+        {
+            region = new shmem_cmem_persistent(cmem_block);
+        }
+        else
+        {
+            region = new shmem_persistent();
+        }
+
+        region->configure(addr, size);
+        pShmem_areas.push_back(region);
+    }
+}
+
+/******************************************************************************
+* Driver::get_memory_region
+*   Return the registered shmem area whose range [start, start + size)
+*   contains addr. If no area contains it, print the address and terminate
+*   the process; the function never returns in that case.
+******************************************************************************/
+shmem* Driver::get_memory_region(DSPDevicePtr64 addr)
+{
+    for (size_t i = 0; i < pShmem_areas.size(); ++i)
+    {
+        shmem   *area          = pShmem_areas[i];
+        uint64_t end_exclusive = (uint64_t)area->start() + area->size();
+
+        if (addr >= area->start() && addr < end_exclusive)
+            return area;
+    }
+
+    // %llx requires unsigned long long; cast so the argument type matches
+    // on every platform.
+    printf("Illegal memory region: addr = 0x%llx\n", (unsigned long long)addr);
+    exit(-1);
+}
+
+
+/******************************************************************************
+* Driver::write
+*   Copy size bytes from buf to device address addr on device dsp_id.
+*   If addr is an L2 address (addr >> 20 == 0x008), the write is repeated
+*   once per core at ((0x10 + core) << 24) + addr.
+*
+*   Returns 0 on success, otherwise the first non-zero status reported by
+*   write_core().
+******************************************************************************/
+int32_t Driver::write(int32_t dsp_id, DSPDevicePtr64 addr, uint8_t *buf,
+                      uint32_t size)
+{
+    if ((addr >> 20) != 0x008)
+        return write_core(dsp_id, addr, buf, size);
+
+    /*-------------------------------------------------------------------------
+    * the write is to L2, so write for each core
+    *------------------------------------------------------------------------*/
+    int32_t status = 0;
+    for (int core = 0; core < TOTAL_NUM_CORES_PER_CHIP; core++)
+    {
+        int32_t ret = write_core(dsp_id, ((0x10 + core) << 24) + addr,
+                                 buf, size);
+        if (ret != 0 && status == 0) status = ret;
+    }
+    return status;
+}
+
+/******************************************************************************
+* Driver::write_core
+*   Under the object lock: map the device range, copy size bytes from buf
+*   into it, and unmap it again with the write flag set. A failed mapping
+*   is fatal (ERR terminates the process). Always returns 0.
+******************************************************************************/
+int32_t Driver::write_core(int32_t dsp_id, DSPDevicePtr64 addr, uint8_t *buf,
+                           uint32_t size)
+{
+    Lock guard(this);
+
+    shmem *area   = get_memory_region(addr);
+    void  *mapped = area->map(addr, size, false);
+    if (!mapped)
+    {
+        ERR(1, "Unable to map dsp addr for write");
+    }
+
+    memcpy((char*)mapped, buf, size);
+    area->unmap(mapped, size, true);
+
+    return 0;
+}
+
+/******************************************************************************
+* Driver::map
+*   Under the object lock, ask the shmem area containing addr to map sz
+*   bytes and return the resulting host pointer. A NULL mapping is fatal
+*   (ERR terminates the process).
+******************************************************************************/
+void* Driver::map(DSPDevicePtr64 addr, uint32_t sz, bool is_read)
+{
+    Lock guard(this);
+
+    void *mapped = get_memory_region(addr)->map(addr, sz, is_read);
+    if (!mapped)
+    {
+        ERR(1, "Unable to map a dsp address");
+    }
+    return mapped;
+}
+
+/******************************************************************************
+* Driver::unmap
+*   Under the object lock, hand host_addr back to the shmem area that
+*   contains buf_addr. Always returns 0.
+******************************************************************************/
+int32_t Driver::unmap(void *host_addr, DSPDevicePtr64 buf_addr, uint32_t sz,
+                      bool is_write)
+{
+    Lock guard(this);
+    get_memory_region(buf_addr)->unmap(host_addr, sz, is_write);
+    return 0;
+}
+
+/******************************************************************************
+* Driver::read
+*   Under the object lock: map the device range for reading, copy size bytes
+*   from it into buf, and unmap it with the write flag clear. A failed
+*   mapping is fatal (ERR terminates the process). Always returns 0.
+******************************************************************************/
+int32_t Driver::read(int32_t dsp_id, DSPDevicePtr64 addr, uint8_t *buf,
+                     uint32_t size)
+{
+    Lock guard(this);
+
+    shmem *area   = get_memory_region(addr);
+    void  *mapped = area->map(addr, size, true);
+    if (!mapped)
+    {
+        ERR(1, "Unable to map dsp addr for read");
+    }
+
+    memcpy(buf, (char*)mapped, size);
+    area->unmap(mapped, size, false);
+
+    return 0;
+}
+
+/******************************************************************************
+* Driver::free_image_handle
+*   Closes the BFD object behind handle. handle is cast to bfd* without any
+*   check, so it has to be a bfd* such as the one reset_and_load() returns.
+******************************************************************************/
+void Driver::free_image_handle(void *handle)
+{
+ bfd_close((bfd*)handle);
+}
+
+/******************************************************************************
+* Driver::get_symbol
+*   Look up the symbol called name in the image behind image_handle (a bfd*
+*   passed as void*) and return its value as a device address.
+*
+*   Every failure is fatal: a NULL handle, a symbol table that cannot be
+*   read, an allocation failure, or a name that is not found all print a
+*   message and terminate the process.
+******************************************************************************/
+DSPDevicePtr Driver::get_symbol(void* image_handle, const char *name)
+{
+    if (!image_handle)
+    {
+        std::cout << "ERROR: Failed to get image handle" << std::endl;
+        exit(-1);
+    }
+
+    bfd *dsp_bfd = (bfd *)image_handle;
+
+    /*-------------------------------------------------------------------------
+    * Both BFD calls below return a long that is negative on error, so the
+    * results are kept signed and checked before use.
+    *------------------------------------------------------------------------*/
+    long nsize = bfd_get_symtab_upper_bound(dsp_bfd);
+    if (nsize <= 0)
+    {
+        std::cout << "ERROR: Get symbol failed" << std::endl;
+        exit(-1);
+    }
+
+    asymbol **symtab = (asymbol**)malloc(nsize);
+    if (symtab == NULL)
+    {
+        std::cout << "ERROR: Failed to malloc memory in get_symbol" << std::endl;
+        exit(-1);
+    }
+
+    long nsyms = bfd_canonicalize_symtab(dsp_bfd, symtab);
+
+    // A negative nsyms (error) skips the loop and ends in the failure path.
+    for (long i = 0; i < nsyms; i++)
+    {
+        if (strcmp(symtab[i]->name, name) == 0)
+        {
+            symbol_info syminfo;
+            bfd_symbol_info(symtab[i], &syminfo);
+            DSPDevicePtr addr = syminfo.value;
+            free(symtab);
+
+            return addr;
+        }
+    }
+
+    free(symtab);
+    std::cout << "ERROR: Get symbol failed" << std::endl;
+    exit(-1);
+}
diff --git a/src/core/dsp/driver_shannon.cpp b/src/core/dsp/driver_shannon.cpp
new file mode 100644
index 0000000..b428dbb
--- /dev/null
+++ b/src/core/dsp/driver_shannon.cpp
@@ -0,0 +1,313 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "driver.h"
+#include "cmem.h"
+#include <deque>
+#include <iostream>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <sys/stat.h>
+#include <string>
+
+#define ERR(status, msg) if (status) { printf("ERROR: %s\n", msg); exit(-1); }
+#define BOOT_ENTRY_LOCATION_ADDR 0x87FFFC
+#define BOOT_MAGIC_ADDR(core) (0x10000000 | (core << 24) | 0x87FFFC)
+
+// The single Driver object; stays 0 until the first instance() call.
+Driver* Driver::pInstance = 0;
+
+/******************************************************************************
+* Thread safe instance function for singleton behavior
+*
+* Returns the Driver object, creating it on the first call. pInstance is
+* read once without the mutex; only when that read yields 0 is the mutex
+* taken, pInstance re-read, and the object created if it is still 0.
+******************************************************************************/
+Driver* Driver::instance ()
+{
+ static Mutex Driver_instance_mutex;
+ // Unlocked first look at the pointer.
+ Driver* tmp = pInstance;
+
+ // GCC builtin: full memory barrier, placed after the unlocked read.
+ __sync_synchronize();
+
+ if (tmp == 0)
+ {
+ ScopedLock lck(Driver_instance_mutex);
+
+ // Re-read under the lock: another thread may have created the object
+ // between the unlocked read and acquiring the mutex.
+ tmp = pInstance;
+ if (tmp == 0)
+ {
+ tmp = new Driver;
+ // Barrier between constructing the object and publishing its pointer.
+ __sync_synchronize();
+ pInstance = tmp;
+ }
+ }
+ return tmp;
+}
+
+/******************************************************************************
+* get_board
+*   Translate the switch device id found in the pci data into a board name.
+*   An id that is not listed is reported through ERR, which prints
+*   "ERROR: Unsupported device" and terminates the process.
+******************************************************************************/
+const char *get_board(unsigned switch_device)
+{
+    if (switch_device == 0x8624) return "dspc8681";
+    if (switch_device == 0x8748) return "dspc8682";
+
+    ERR(1, "Unsupported device");
+    return "unknown";
+}
+
+#define TOTAL_NUM_CORES_PER_CHIP 8
+
+/******************************************************************************
+* Driver::wait_for_ready
+*   Poll the 4-byte word at BOOT_ENTRY_LOCATION_ADDR of each core on chip
+*   until all of them read as zero. A pass stops at the first core whose
+*   word is non-zero; the loop then sleeps 1000 microseconds and retries.
+*
+*   Returns true once a pass sees zero on every core, false after more than
+*   1000 unsuccessful passes. A failing pciedrv_dsp_read is fatal.
+******************************************************************************/
+bool Driver::wait_for_ready(int chip)
+{
+    for (int failed_polls = 0; ; )
+    {
+        bool all_clear = true;
+
+        for (int core = 0; all_clear && core < TOTAL_NUM_CORES_PER_CHIP; core++)
+        {
+            uint32_t boot_entry_value;
+            int ret = pciedrv_dsp_read(chip,
+                          ((0x10 + core) << 24) + BOOT_ENTRY_LOCATION_ADDR,
+                          (unsigned char *) &boot_entry_value, 4);
+            ERR(ret, "pciedrv_dsp_read failed");
+
+            all_clear = (boot_entry_value == 0);
+        }
+
+        if (all_clear)             return true;
+        if (++failed_polls > 1000) return false;
+
+        usleep(1000);
+    }
+}
+
+// Defined elsewhere; the returned string is used below as the directory
+// prefix for the "/lib/..." image paths.
+char *get_ocl_install();
+
+/******************************************************************************
+* Driver::reset_and_load
+*   Reset one chip, boot it with the board specific init image, wait for the
+*   reset to finish, then download the monitor image "/lib/dsp.out".
+*
+*   Returns the image handle of the monitor image; it is not freed here.
+*   presumably the caller releases it with free_image_handle() - confirm.
+*   Every failing step is fatal (ERR terminates the process).
+******************************************************************************/
+void *Driver::reset_and_load(int chip)
+{
+ char *installation = get_ocl_install();
+
+ /*------------------------------------------------------------------------
+ * Determine DSP speed. 1 Ghz by default. Set Env Var for 1.25Ghz Oper
+ *-----------------------------------------------------------------------*/
+ uint32_t pll_multiplier = 0x00000014; // 1.00 Ghz by default
+ if (getenv("TI_OCL_DSP_1_25GHZ")) pll_multiplier = 0x00000019;
+
+ /*-------------------------------------------------------------------------
+ * Configure boot config
+ *------------------------------------------------------------------------*/
+ uint32_t bootcfg_words[]= { 0xBABEFACE, pll_multiplier };
+ boot_cfg_t bootcfg = { 0x86FF00, sizeof(bootcfg_words), bootcfg_words};
+
+ /*-------------------------------------------------------------------------
+ * reset the devices
+ *------------------------------------------------------------------------*/
+ int ret = dnldmgr_reset_dsp(chip, 0, NULL, 0 , NULL);
+ ERR (ret, "DSP putting in reset failed");
+
+ // Path of the init image: <installation>/lib/init_<board>.out
+ const char *board = get_board(pDevices_info[chip].switch_device);
+ std::string init(installation);
+ init += "/lib/init_";
+ init += board;
+ init += ".out";
+
+ void * image_handle;
+ uint32_t entry;
+
+ ret = dnldmgr_get_image(init.c_str(), &image_handle, &entry);
+ ERR(ret, "Get reset image failed");
+
+ // Second reset call: this time with the init image, its entry point and
+ // the boot configuration built above.
+ ret = dnldmgr_reset_dsp(chip, 1, image_handle, entry, &bootcfg);
+ ERR (ret, "DSP out of reset failed");
+
+ // The init image handle is released here; image_handle is reused below.
+ dnldmgr_free_image(image_handle);
+
+ /*---------------------------------------------------------------------
+ * wait for reset to complete
+ *--------------------------------------------------------------------*/
+ ERR(!wait_for_ready(chip), "Reset Failed due to timeout");
+
+ /*-------------------------------------------------------------------------
+ * Load monitor on the devices
+ *------------------------------------------------------------------------*/
+ std::string monitor(installation);
+ monitor += "/lib/dsp.out";
+
+ ret = dnldmgr_get_image(monitor.c_str(), &image_handle, &entry);
+ ERR(ret, "Get DSP image failed");
+
+ ret = dnldmgr_load_image(chip, 0xFFFF, image_handle, entry, NULL);
+ ERR(ret, "Download image failed");
+
+ return image_handle;
+}
+
+/******************************************************************************
+* Driver::open
+*   Called from the constructor. Under the object lock: open the PCIe driver
+*   with the configuration set up below, record the device count in
+*   pNum_dsps, allocate pDevices_info with one entry per device and fill it
+*   from pciedrv_get_pci_info(), then touch Cmem::instance().
+*   Every failing step is fatal (ERR terminates the process); otherwise the
+*   function returns 0.
+******************************************************************************/
+int32_t Driver::open()
+{
+ Lock lock(this);
+
+ // Start from an all-zero configuration, then set the fields one by one.
+ memset((void*)&config, 0, sizeof(pciedrv_open_config_t));
+ config.dsp_outbound_reserved_mem_size = 0;
+ config.start_dma_chan_num = 0;
+ config.num_dma_channels = 4;
+ config.start_param_set_num = 0;
+ config.num_param_sets = 32;
+ config.dsp_outbound_block_size = 0x400000;
+ config.max_dma_transactions = 256;
+
+ int status = pciedrv_open(&config);
+ ERR(status, "PCIe Driver Open Error");
+
+ pNum_dsps = pciedrv_get_num_devices();
+
+ /*-------------------------------------------------------------------------
+ * Allocate space for and retrieve device info
+ *------------------------------------------------------------------------*/
+ pDevices_info = (pciedrv_device_info_t*)
+ malloc(pNum_dsps * sizeof(pciedrv_device_info_t));
+ ERR (!pDevices_info, "malloc failed pciedrv_devices_info_t");
+
+ int ret = pciedrv_get_pci_info(pDevices_info);
+ ERR(ret, "get pci info failed");
+
+ Cmem::instance(); // Prime the setup of cmem
+ return 0;
+}
+
+/******************************************************************************
+* Driver::close()
+*   Under the object lock, release the device info array and close the PCIe
+*   driver. A failing pciedrv_close() is fatal; otherwise returns 0.
+*
+*   close() is public and is also called by the destructor, so the pointer
+*   is cleared after free() to keep a second call from freeing it again.
+******************************************************************************/
+int32_t Driver::close()
+{
+    Lock lock(this);
+
+    free(pDevices_info);
+    pDevices_info = NULL;
+
+    int status = pciedrv_close();
+    ERR(status, "PCIe Driver Close Error");
+    return 0;
+}
+
+
+/******************************************************************************
+* Driver::write
+*   Write size bytes from buf to device address addr on device dsp_id.
+*   If addr is an L2 address (addr >> 20 == 0x008), the write is repeated
+*   once per core at ((0x10 + core) << 24) + addr.
+*
+*   Returns 0 on success, otherwise the first non-zero status reported by
+*   write_core().
+*
+*   NOTE(review): driver.h declares addr as DSPDevicePtr64 - confirm that
+*   the two typedefs are identical in the DSPC868X build.
+******************************************************************************/
+int32_t Driver::write(int32_t dsp_id, DSPDevicePtr addr, uint8_t *buf,
+                      uint32_t size)
+{
+    if ((addr >> 20) != 0x008)
+        return write_core(dsp_id, addr, buf, size);
+
+    /*-------------------------------------------------------------------------
+    * the write is to L2, so write for each core
+    *------------------------------------------------------------------------*/
+    int32_t status = 0;
+    for (int core = 0; core < TOTAL_NUM_CORES_PER_CHIP; core++)
+    {
+        int32_t ret = write_core(dsp_id, ((0x10 + core) << 24) + addr,
+                                 buf, size);
+        if (ret != 0 && status == 0) status = ret;
+    }
+    return status;
+}
+
+
+/******************************************************************************
+* Driver::write_core
+*   Write size bytes from buf to one device address. Transfers of 24k or
+*   more go through Cmem's DMA write while holding the object lock; smaller
+*   ones use pciedrv_dsp_write directly, without the lock. Always returns 0;
+*   a failing pciedrv_dsp_write is fatal.
+******************************************************************************/
+int32_t Driver::write_core(int32_t dsp_id, DSPDevicePtr addr, uint8_t *buf,
+                           uint32_t size)
+{
+    if (size >= 24 * 1024)
+    {
+        Lock guard(this);
+        Cmem::instance()->dma_write(dsp_id, addr, buf, size);
+        return 0;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Regular writes under 24k are faster than DMA writes (may change)
+    *------------------------------------------------------------------------*/
+    int rc = pciedrv_dsp_write(dsp_id, addr, buf, size);
+    ERR(rc, "PCIe Driver Write Error");
+    return 0;
+}
+
+// No mapping is performed in this implementation: the device address value
+// itself is returned, converted to a pointer. sz and is_read are unused.
+void* Driver::map(DSPDevicePtr addr, uint32_t sz, bool is_read)
+{
+ return (void*) (uint64_t) addr;
+}
+
+/******************************************************************************
+* Driver::unmap
+*   Nothing to undo in this implementation, because map() only converts the
+*   device address to a pointer. All arguments are ignored. Returns 0.
+******************************************************************************/
+int32_t Driver::unmap(void *host_addr, DSPDevicePtr buf_addr,
+                      uint32_t sz, bool is_write)
+{
+    return 0;
+}
+
+/******************************************************************************
+* Driver::read
+*   Hands the transfer to Cmem's dma_read with the arguments unchanged and
+*   returns 0. Whatever dma_read returns is not examined here.
+******************************************************************************/
+int32_t Driver::read(int32_t dsp_id, DSPDevicePtr addr, uint8_t *buf,
+ uint32_t size)
+{
+ Cmem::instance()->dma_read(dsp_id, addr, buf, size);
+ return 0;
+}
+
+/******************************************************************************
+* Driver::get_symbol
+*   Ask the download manager for the address of the symbol called name in
+*   the image behind image_handle. A non-zero result from the lookup prints
+*   "ERROR: Get symbol failed" and terminates the process.
+******************************************************************************/
+DSPDevicePtr Driver::get_symbol(void* image_handle, const char *name)
+{
+    DSPDevicePtr symbol_addr;
+
+    if (dnldmgr_get_symbol_address(image_handle, name, &symbol_addr) != 0)
+    {
+        printf("ERROR: Get symbol failed\n");
+        exit(-1);
+    }
+
+    return symbol_addr;
+}
+
+/******************************************************************************
+* Driver::free_image_handle
+*   Passes handle to dnldmgr_free_image(); reset_and_load() returns a handle
+*   obtained from dnldmgr_get_image().
+******************************************************************************/
+void Driver::free_image_handle(void *handle)
+{
+ dnldmgr_free_image(handle);
+}
+
+/******************************************************************************
+* Driver::cmem_init
+*   Empty in this implementation: none of the output pointers is written.
+*   The parameter list matches the six-parameter declaration in driver.h.
+******************************************************************************/
+void Driver::cmem_init(DSPDevicePtr64 *addr1, uint64_t *size1,
+                       DSPDevicePtr   *addr2, uint32_t *size2,
+                       DSPDevicePtr64 *addr3, uint64_t *size3)
+{
+}
+
+// Empty in this implementation.
+void Driver::cmem_exit()
+{
+}
+
+/******************************************************************************
+* Driver::shmem_configure
+*   Empty in this implementation; all arguments are ignored.
+*   The parameter types match the declaration in driver.h (DSPDevicePtr64
+*   address, 64-bit size), where cmem_block has its default value.
+******************************************************************************/
+void Driver::shmem_configure(DSPDevicePtr64 addr, uint64_t size, int cmem_block)
+{
+}
+
diff --git a/src/core/dsp/dspheap.h b/src/core/dsp/dspheap.h
new file mode 100644
index 0000000..0668647
--- /dev/null
+++ b/src/core/dsp/dspheap.h
@@ -0,0 +1,200 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+/**************************************************************************//**
+* @file dspheap.h
+*
+* @brief Define a dsp device heap manager run on the host.
+*
+* @version 1.00.00
+*
+******************************************************************************/
+#ifndef _DSPHEAP_H
+#define _DSPHEAP_H
+#include <map>
+#include <assert.h>
+#include <cstdio>
+#include <cstdlib>
+#include "u_lockable.h"
+#include "dspmem.h"
+
+#define ROUNDUP(val, pow2) (((val) + (pow2) - 1) & ~((pow2) - 1)) /* round val up to a multiple of pow2; the mask trick is only valid when pow2 is a power of two */
+#define MIN_BLOCK_SIZE 128 /* block granularity dspheap::configure() selects for an ordinary heap */
+#define MIN_CMEM_ONDEMAND_BLOCK_SIZE 4096 /* block granularity dspheap::configure() selects when is_cmem_ondemand_heap is true */
+
+/*-----------------------------------------------------------------------------
+* dspheap
+*
+* Host-side bookkeeping for a contiguous range of DSP device addresses.
+* Two ordered maps (address -> size in bytes) are kept: free_list for the
+* unallocated blocks and alloc_list for the blocks handed out by malloc().
+* Allocation is first-fit in ascending address order; free() coalesces the
+* returned block with any free block that touches it.
+*
+* Every public operation that touches the lists takes the object's lock via
+* Lock(this).
+*
+* All sizes are carried as 64-bit values internally so that heaps of 4GB or
+* more (and merges that produce free blocks of that size) are not truncated.
+*----------------------------------------------------------------------------*/
+class dspheap : public Lockable
+{
+    typedef std::map<DSPDevicePtr64, uint64_t> block_list;
+    typedef block_list::iterator               block_iter;
+    typedef block_list::value_type             block_descriptor;
+
+  public:
+    /*-------------------------------------------------------------------------
+    * Construct a heap that manages [start_addr, start_addr + length).
+    *------------------------------------------------------------------------*/
+    dspheap(DSPDevicePtr64 start_addr, uint64_t length)
+        : p_start_addr(0), p_length(0), p_block_size(MIN_BLOCK_SIZE)
+    {
+        configure(start_addr, length);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Construct an empty, unconfigured heap.  The members are given defined
+    * values so that size() and max_block_size() are safe to call before
+    * configure(); malloc() on such a heap finds no free block.
+    *------------------------------------------------------------------------*/
+    dspheap() : p_start_addr(0), p_length(0), p_block_size(MIN_BLOCK_SIZE) { }
+
+    /*-------------------------------------------------------------------------
+    * Record the managed range and the block granularity.  The free list is
+    * seeded with one block covering the whole range only if it is currently
+    * empty, so calling configure() again on a heap that still has free
+    * blocks does not reset them.
+    *
+    * The original code carried (disabled) assertions that start_addr and
+    * length are multiples of 16M, the granularity of a C6x MAR register;
+    * that requirement is not enforced here.
+    *------------------------------------------------------------------------*/
+    void configure(DSPDevicePtr64 start_addr, uint64_t length,
+                   bool is_cmem_ondemand_heap = false)
+    {
+        Lock lock(this);
+
+        p_start_addr = start_addr;
+        p_length     = length;
+        p_block_size = is_cmem_ondemand_heap ? MIN_CMEM_ONDEMAND_BLOCK_SIZE
+                                             : MIN_BLOCK_SIZE;
+
+        if (free_list.empty())
+            free_list[start_addr] = length;
+    }
+
+    ~dspheap() { }
+
+    /*-------------------------------------------------------------------------
+    * Allocate size bytes, rounded up to the heap's block granularity, from
+    * the lowest-addressed free block that is large enough.
+    *
+    * Returns the block's device address.  When no block fits: returns 0 if
+    * allow_fail is true, otherwise prints a diagnostic and calls abort().
+    *------------------------------------------------------------------------*/
+    DSPDevicePtr64 malloc(uint32_t size, bool allow_fail=false)
+    {
+        const uint64_t wanted = min_block_size(size);
+
+        Lock lock(this);
+        for (block_iter it = free_list.begin(); it != free_list.end(); ++it)
+        {
+            const DSPDevicePtr64 block_addr = it->first;
+            const uint64_t       block_size = it->second;
+
+            if (block_size < wanted) continue;
+
+            free_list.erase(it);
+            alloc_list[block_addr] = wanted;
+
+            /*-----------------------------------------------------------------
+            * Return the unused tail of the block to the free list
+            *----------------------------------------------------------------*/
+            if (block_size > wanted)
+                free_list[block_addr + wanted] = block_size - wanted;
+
+            return block_addr;
+        }
+
+        if (!allow_fail)
+        {
+            printf("Malloc failed for size 0x%llx from range (0x%08llx, 0x%08llx)\n",
+                   (unsigned long long) wanted,
+                   (unsigned long long) p_start_addr,
+                   (unsigned long long) (p_start_addr + p_length - 1));
+            abort();
+        }
+
+        return 0;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Release the block that malloc() returned at addr and merge it with any
+    * adjacent free blocks.
+    *
+    * Returns 0 on success, -1 if addr is not a currently allocated block.
+    *------------------------------------------------------------------------*/
+    int free(DSPDevicePtr64 addr)
+    {
+        Lock lock(this);
+        block_iter it = alloc_list.find(addr);
+        if (it == alloc_list.end()) return -1;
+
+        /*---------------------------------------------------------------------
+        * 64-bit on purpose: after merging, the block may be 4GB or larger.
+        *--------------------------------------------------------------------*/
+        uint64_t size = it->second;
+        alloc_list.erase(it);
+
+        /*---------------------------------------------------------------------
+        * Absorb every free block that ends where this one starts or starts
+        * where this one ends.  The map is address ordered, so a preceding
+        * neighbour is visited (and folded in) before a following one.
+        *--------------------------------------------------------------------*/
+        it = free_list.begin();
+        while (it != free_list.end())
+        {
+            const DSPDevicePtr64 block_addr = it->first;
+            const uint64_t       block_size = it->second;
+
+            const bool touches = (block_addr + block_size == addr)
+                              || (addr + size == block_addr);
+            if (!touches)
+            {
+                ++it;
+                continue;
+            }
+
+            block_iter absorbed = it++;
+            if (block_addr < addr) addr = block_addr;
+            size += block_size;
+            free_list.erase(absorbed);
+        }
+
+        free_list[addr] = size;
+        return 0;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Total length of the managed range, as given to configure().
+    *------------------------------------------------------------------------*/
+    DSPDevicePtr64 size() const { return p_length; }
+
+    /*-------------------------------------------------------------------------
+    * Report the largest free block.
+    *
+    *   size       (out) size of the largest free block found.  The search
+    *                    starts from the block granularity, so when no free
+    *                    block reaches that size the granularity itself is
+    *                    reported together with a return address of 0.  Set
+    *                    to 0 when the whole heap is smaller than one block.
+    *   block_size (out) the heap's block granularity.
+    *
+    * Returns the address of the largest free block, or 0 as described above.
+    * Among equally large blocks the highest-addressed one wins.
+    *------------------------------------------------------------------------*/
+    DSPDevicePtr64 max_block_size(uint64_t &size, uint32_t &block_size)
+    {
+        Lock lock(this);
+
+        block_size = p_block_size;
+
+        if (p_length < p_block_size)
+        {
+            size = 0;
+            return 0;
+        }
+
+        DSPDevicePtr64 best_addr = 0;
+        uint64_t       best_size = p_block_size;
+
+        for (block_iter it = free_list.begin(); it != free_list.end(); ++it)
+        {
+            if (it->second >= best_size)
+            {
+                best_addr = it->first;
+                best_size = it->second;
+            }
+        }
+
+        size = best_size;
+        return best_addr;
+    }
+
+  private:
+    block_list     free_list;     // address -> size of each unallocated block
+    block_list     alloc_list;    // address -> size of each allocated block
+    DSPDevicePtr64 p_start_addr;  // first address of the managed range
+    uint64_t       p_length;      // length of the managed range in bytes
+    uint32_t       p_block_size;  // allocation granularity (a power of two)
+
+    /*-------------------------------------------------------------------------
+    * Round size up to the block granularity.  Done in 64 bits so that a
+    * request close to UINT32_MAX does not wrap around to 0.
+    *------------------------------------------------------------------------*/
+    uint64_t min_block_size(uint32_t size)
+    {
+        return ROUNDUP((uint64_t) size, (uint64_t) p_block_size);
+    }
+};
+
+#endif // _DSPHEAP_H
diff --git a/src/core/dsp/dspmem.h b/src/core/dsp/dspmem.h
new file mode 100644
index 0000000..f6c7c64
--- /dev/null
+++ b/src/core/dsp/dspmem.h
@@ -0,0 +1,59 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdint.h>
+#ifndef _DSPMEM_H
+#define _DSPMEM_H
+
+
+typedef uint32_t DSPDevicePtr; // device address that fits in 32 bits
+typedef uint64_t DSPDevicePtr64; // device address wide enough for the 36-bit physical addresses mapped out below
+typedef uint32_t DSPVirtPtr; // 32-bit DSP virtual address
+// typedef uint64_t DSPVirtPtr64; // for future C7x?
+
+/*****************************************************************************
+ * DSP Device Memory Physical Address (8GB)
+ * 0x8:0000_0000 - 0x8:1FFF_FFFF: Linux reserved
+ * 0x8:2000_0000 - 0x8:21FF_FFFF: OCL runtime reserved
+ * using default MPAX translation, map to
+ * DSP virtual address 0xA000_0000 - 0xA1FF_FFFF
+ * 0x8:2200_0000 - 0x8:3FFF_FFFF: using default MPAX translation, map to
+ * DSP virtual address 0xA200_0000 - 0xBFFF_FFFF
+ * used for kernel code, user app small buffers
+ * 0x8:4000_0000 - 0x9:FFFF_FFFF: using custom MPAX translation settings, map
+ * to unused DSP virtual address spaces
+ * used for user app big buffers
+ *****************************************************************************/
+#define DSP_36BIT_ADDR 0x800000000ULL // 0x8:0000_0000, the first address of the map above
+#define MPAX_USER_MAPPED_DSP_ADDR 0x840000000ULL // 0x8:4000_0000, where the custom-MPAX region above begins
+#define ALL_PERSISTENT_MAX_DSP_ADDR 0x880000000ULL // 0x8:8000_0000; NOTE(review): not described in the map above - confirm its meaning with the users of this macro
+
+#define MSMC_OCL_START_ADDR 0x0C040000 // NOTE(review): presumably the start of the MSMC range given to OpenCL - confirm
+#define MSMC_OCL_END_ADDR 0x0C500000 // NOTE(review): presumably the end of that range; whether it is inclusive is not visible here
+
+
+#endif // _DSPMEM_H
diff --git a/src/core/dsp/genfile_cache.cpp b/src/core/dsp/genfile_cache.cpp
new file mode 100644
index 0000000..c9b2472
--- /dev/null
+++ b/src/core/dsp/genfile_cache.cpp
@@ -0,0 +1,94 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "genfile_cache.h"
+
+/*-----------------------------------------------------------------------------
+* genfile_cache::lookup
+*   Looks in the programs table for a file recorded under the CRC of
+*   (module bitcode + options).
+*
+*   Returns the recorded file name when a row exists and stat() succeeds on
+*   that name.  Returns an empty string when there is no row, or when the
+*   recorded file can no longer be stat'ed; in the latter case the stale
+*   rows for that hash are deleted from the database first.
+*----------------------------------------------------------------------------*/
+std::string genfile_cache::lookup(llvm::Module *module, std::string options)
+{
+    const uint32_t crc = convert_mod2crc(module, options);
+
+    std::string select_stmt("select value from programs where hash = " +
+                            boost::lexical_cast<std::string>(crc));
+
+    std::vector<std::vector<std::string> > rows;
+    rows = p_database.query(select_stmt.c_str());
+
+    if (rows.empty()) return std::string();
+
+    /*-------------------------------------------------------------------------
+    * Only the first column of the first row is consulted
+    *------------------------------------------------------------------------*/
+    std::string &cached_name = rows[0][0];
+
+    struct stat file_info;
+    if (stat(cached_name.c_str(), &file_info) == 0) return cached_name;
+
+    /*-------------------------------------------------------------------------
+    * The cached file no longer exists: drop its entry from the database
+    *------------------------------------------------------------------------*/
+    std::string delete_stmt("delete from programs where hash = " +
+                            boost::lexical_cast<std::string>(crc));
+    p_database.query(delete_stmt.c_str());
+
+    return std::string();
+}
+
+/*-----------------------------------------------------------------------------
+* genfile_cache::remember
+*   Inserts a (hash, value) row into the programs table, where hash is the
+*   CRC of (module bitcode + options) and value is outfile.
+*
+*   NOTE(review): outfile is placed between double quotes in the statement
+*   text without any escaping, so a name containing a double quote would
+*   produce a malformed statement.
+*----------------------------------------------------------------------------*/
+void genfile_cache::remember(const char *outfile, llvm::Module *module,
+                             std::string options)
+{
+    const uint32_t crc = convert_mod2crc(module, options);
+
+    std::string insert_stmt("insert into programs(hash, value) values(");
+    insert_stmt += boost::lexical_cast<std::string>(crc);
+    insert_stmt += ", \"";
+    insert_stmt += std::string(outfile);
+    insert_stmt += "\");";
+
+    p_database.query(insert_stmt.c_str());
+}
+
+/*-----------------------------------------------------------------------------
+* genfile_cache::convert_mod2crc
+*   Serialises module as LLVM bitcode into a string, appends options to
+*   that string and returns get_crc() of the result.
+*----------------------------------------------------------------------------*/
+uint32_t genfile_cache::convert_mod2crc(llvm::Module *module,
+                                        std::string options)
+{
+    std::string key_text;
+
+    llvm::raw_string_ostream bitcode_sink(key_text);
+    llvm::WriteBitcodeToFile(module, bitcode_sink);
+    bitcode_sink.str();          // called for its effect on key_text; result unused
+
+    key_text.append(options);
+
+    return get_crc(key_text);
+}
+
+/*-----------------------------------------------------------------------------
+* genfile_cache::get_crc
+*   Feeds every byte of my_string to a boost::crc_32_type and returns the
+*   resulting checksum.
+*----------------------------------------------------------------------------*/
+uint32_t genfile_cache::get_crc(std::string& my_string)
+{
+    boost::crc_32_type crc_engine;
+
+    const char  *bytes = my_string.data();
+    const size_t count = my_string.length();
+    crc_engine.process_bytes(bytes, count);
+
+    return crc_engine.checksum();
+}
diff --git a/src/core/dsp/genfile_cache.h b/src/core/dsp/genfile_cache.h
new file mode 100644
index 0000000..46b27f2
--- /dev/null
+++ b/src/core/dsp/genfile_cache.h
@@ -0,0 +1,101 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef _genfile_cache_
+#define _genfile_cache_
+
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+
+#include <boost/lexical_cast.hpp>
+#include <boost/crc.hpp>
+
+#include <sys/stat.h>
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <stdint.h>
+#include "u_locks_pthread.h"
+#include "database.h"
+
+/*-----------------------------------------------------------------------------
+* genfile_cache
+*
+* Process-wide cache that maps a hash of (LLVM module + build options) to
+* the name of a previously generated file.  The mapping is stored in the
+* "programs" table of a Database opened on a per-user path under /tmp.
+*
+* The class is a singleton: use genfile_cache::instance().
+*----------------------------------------------------------------------------*/
+class genfile_cache
+{
+  public:
+    /*-------------------------------------------------------------------------
+    * lookup   : file name recorded for (module, options), or "" if none
+    * remember : record outfile as the result for (module, options)
+    *------------------------------------------------------------------------*/
+    std::string lookup  (llvm::Module *module, std::string options);
+    void        remember(const char *outfile, llvm::Module *module,
+                         std::string options);
+
+    /*-------------------------------------------------------------------------
+    * Thread safe instance function for singleton behavior.
+    *
+    * Double-checked locking: pInstance is read once without the mutex and,
+    * if still null, again while holding it, so only one thread constructs
+    * the object.  The __sync_synchronize() calls order the construction
+    * before the publication of the pointer.
+    *
+    * The database lives at /tmp/opencl_ofdb_<USER>.  getenv() returns a
+    * null pointer when USER is not set (e.g. under some daemons or
+    * containers); constructing a std::string from a null pointer is
+    * undefined, so "unknown" is used as the suffix in that case.
+    *------------------------------------------------------------------------*/
+    static genfile_cache* instance ()
+    {
+        static Mutex Cache_instance_mutex;
+        genfile_cache* tmp = pInstance;
+
+        __sync_synchronize();
+
+        if (tmp == 0)
+        {
+            ScopedLock lck(Cache_instance_mutex);
+
+            tmp = pInstance;
+            if (tmp == 0)
+            {
+                const char *user = getenv("USER");
+
+                std::string db_path("/tmp/opencl_ofdb_");
+                db_path += (user != 0) ? user : "unknown";
+
+                tmp = new genfile_cache(db_path);
+                __sync_synchronize();
+                pInstance = tmp;
+            }
+        }
+        return tmp;
+    }
+
+
+  private:
+    static genfile_cache* pInstance;   // the singleton, null until first use
+    std::string           p_dbname;    // path the database was opened on
+    Database              p_database;  // handle used for every query
+
+  private:
+    /*-------------------------------------------------------------------------
+    * Opens the database at db_name and makes sure the programs table
+    * exists.  Private: instances are only created by instance().
+    *------------------------------------------------------------------------*/
+    genfile_cache(std::string db_name)
+        : p_dbname(db_name), p_database(db_name.c_str())
+    {
+        p_database.query("create table if not exists "
+                         "programs(hash integer, value string);");
+    }
+
+    uint32_t convert_mod2crc (llvm::Module *module, std::string options);
+    uint32_t get_crc         (std::string& my_string);
+
+    genfile_cache(const genfile_cache&);            // copy ctor disallowed
+    genfile_cache& operator=(const genfile_cache&); // assignment disallowed
+};
+
+#endif // _genfile_cache_
diff --git a/src/core/dsp/kernel.cpp b/src/core/dsp/kernel.cpp
new file mode 100644
index 0000000..291673a
--- /dev/null
+++ b/src/core/dsp/kernel.cpp
@@ -0,0 +1,718 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "kernel.h"
+#include "device.h"
+#include "buffer.h"
+#include "program.h"
+#include "utils.h"
+#include "u_locks_pthread.h"
+#include "mailbox.h"
+
+#include "../kernel.h"
+#include "../memobject.h"
+#include "../events.h"
+#include "../program.h"
+
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Module.h>
+#include <llvm/ExecutionEngine/ExecutionEngine.h>
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <unistd.h>
+#include <sys/mman.h>
+
+extern "C"
+{
+ #include <ti/runtime/mmap/include/mmap_resource.h>
+}
+
+
+#define ROUNDUP(val, pow2) (((val) + (pow2) - 1) & ~((pow2) - 1)) /* round val up to a multiple of pow2; only valid when pow2 is a power of two */
+#define QERR(msg, retcode) do {if (getenv("TI_OCL_VERBOSE_ERROR")) std::cerr << msg << std::endl; return retcode; } while(0) /* returns retcode from the enclosing function; msg is printed only when TI_OCL_VERBOSE_ERROR is set */
+#define ERR(x) std::cerr << x << std::endl /* print x on std::cerr; expands without a trailing semicolon and does not return */
+#define ERROR() std::cerr << "Unknown error in dsp/kernel.cpp" << std::endl /* print a fixed message on std::cerr; execution continues afterwards */
+
+using namespace Coal;
+
+/*-----------------------------------------------------------------------------
+* DSPKernel constructor / destructor
+*   The entry point starts out as 0 and the data page pointer as 0xffffffff;
+*   device_entry_pt() and data_page_ptr() treat those values as "not yet
+*   looked up".  The destructor has nothing to release.
+*----------------------------------------------------------------------------*/
+DSPKernel::DSPKernel(DSPDevice *device, Kernel *kernel)
+    : DeviceKernel(),
+      p_device(device),
+      p_kernel(kernel),
+      p_device_entry_pt((DSPDevicePtr)0),
+      p_data_page_ptr  ((DSPDevicePtr)0xffffffff)
+{
+    /* all state is set by the initialiser list */
+}
+
+DSPKernel::~DSPKernel()
+{
+    /* nothing owned, nothing to free */
+}
+
+
+/*-----------------------------------------------------------------------------
+* k_exp
+*   Returns base raised to the power e by repeated multiplication.
+*
+*   An exponent of 0 yields 1.  (Previously the accumulator was seeded with
+*   base and the loop started at 1, so k_exp(base, 0) wrongly returned base.)
+*   No overflow detection is performed; the result wraps or overflows
+*   according to the rules of T.
+*----------------------------------------------------------------------------*/
+template<typename T>
+T k_exp(T base, unsigned int e)
+{
+    T rs = 1;
+    for (unsigned int i = 0; i < e; ++i) rs *= base;
+    return rs;
+}
+
+/*-----------------------------------------------------------------------------
+* DSPKernel::device_entry_pt
+*
+* Returns the device address of this kernel's entry point, looking it up on
+* first use and caching it in p_device_entry_pt (0 means "not looked up").
+*
+* The lookup asks the kernel for its function name and then queries the
+* device-dependent program for the symbol of that name.  If the program is
+* not loaded an error is printed, but the symbol query is still attempted.
+*
+* The name is held in a std::vector<char> rather than a malloc'ed buffer:
+* the buffer is released on every path, there is no unchecked allocation
+* result to write through, and it always carries a terminating NUL.
+*
+* This and the next function are called from multiple worker threads.  They
+* may all enter the lookup section at once, but they all store the same
+* value, so although there is a race it does not produce a wrong result.
+* When work group division is pushed down to the DSP the race goes away.
+*----------------------------------------------------------------------------*/
+DSPDevicePtr DSPKernel::device_entry_pt()
+{
+    if (!p_device_entry_pt)
+    {
+        size_t name_length = 0;
+        p_kernel->info(CL_KERNEL_FUNCTION_NAME, 0, 0, &name_length);
+
+        /*---------------------------------------------------------------------
+        * One extra byte so the buffer is never empty and always terminated
+        *--------------------------------------------------------------------*/
+        std::vector<char> name(name_length + 1, '\0');
+        p_kernel->info(CL_KERNEL_FUNCTION_NAME, name_length, &name[0], 0);
+
+        Program    *p    = (Program *)p_kernel->parent();
+        DSPProgram *prog = (DSPProgram *)(p->deviceDependentProgram(p_device));
+
+        if (!prog->is_loaded()) ERROR();
+        p_device_entry_pt = prog->query_symbol(&name[0]);
+    }
+    return p_device_entry_pt;
+}
+
+/******************************************************************************
+* DSPKernel::data_page_ptr
+*
+* Returns the program's data page pointer, fetching it from the
+* device-dependent program on first use and caching it afterwards.
+*
+* 0 is a frequent, legitimate data page pointer, so 0xffffffff is used as
+* the "not fetched yet" marker instead.  If the program is not loaded an
+* error is printed, but the value is still fetched.
+*
+* An earlier approach, querying the symbol __TI_STATIC_BASE, is no longer
+* used.
+******************************************************************************/
+DSPDevicePtr DSPKernel::data_page_ptr()
+{
+    const DSPDevicePtr not_fetched = (DSPDevicePtr)0xffffffff;
+
+    if (p_data_page_ptr != not_fetched) return p_data_page_ptr;
+
+    Program    *owner    = (Program *)p_kernel->parent();
+    DSPProgram *dsp_prog = (DSPProgram *)(owner->deviceDependentProgram(p_device));
+
+    if (!dsp_prog->is_loaded()) ERROR();
+
+    p_data_page_ptr = dsp_prog->data_page_ptr();
+    return p_data_page_ptr;
+}
+
+/******************************************************************************
+* cl_int DSPKernel::preAllocBuffers()
+*
+* Allocates, on this kernel's device, every memory object passed as a
+* non-local buffer argument.
+*
+* Returns CL_SUCCESS when every such buffer allocated, or
+* CL_MEM_OBJECT_ALLOCATION_FAILURE as soon as one allocation fails.
+*
+* An argument whose data pointer is null, or that holds a null memory
+* object, is skipped.  DSPKernelEvent::callArgs guards arg.data() the same
+* way before dereferencing it; this function previously dereferenced it
+* unconditionally.
+******************************************************************************/
+cl_int DSPKernel::preAllocBuffers()
+{
+    for (unsigned int i=0; i < kernel()->numArgs(); ++i)
+    {
+        const Kernel::Arg &arg = kernel()->arg(i);
+
+        if (arg.kind() == Kernel::Arg::Buffer &&
+            arg.file() != Kernel::Arg::Local)
+        {
+            MemObject *buffer = 0;
+            if (arg.data()) buffer = *(MemObject **)arg.data();
+
+            if (buffer && !buffer->allocate(device()))
+                return CL_MEM_OBJECT_ALLOCATION_FAILURE;
+        }
+    }
+    return CL_SUCCESS;
+}
+
+
+/******************************************************************************
+* DSPKernel::guessWorkGroupSize
+*
+* Picks a work group size for one dimension of an NDRange.
+*
+*  - If global_work_size raised to the power num_dims exceeds 64, the whole
+*    global_work_size is returned as the group size.
+*  - Otherwise the search starts at the number of DSPs (1 if that is 0) and
+*    walks downwards to the first value that divides global_work_size
+*    evenly; the result is global_work_size divided by that value.  If the
+*    candidate ever exceeds global_work_size or 32 times the DSP count, 1 is
+*    used as the divisor instead.
+*
+* NOTE(review): an earlier comment described the search as finding the
+* divisor closest to the DSP count "but >= than it"; the code searches
+* downwards, not upwards.  The dim argument is not used.
+******************************************************************************/
+size_t DSPKernel::guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
+                                     size_t global_work_size) const
+{
+    unsigned int dsp_count = p_device->numDSPs();
+
+    /*-------------------------------------------------------------------------
+    * Large ranges are returned whole
+    *------------------------------------------------------------------------*/
+    if (k_exp(global_work_size, num_dims) > 64)
+        return global_work_size;
+
+    /*-------------------------------------------------------------------------
+    * Walk down from the DSP count until a divisor of global_work_size is hit
+    *------------------------------------------------------------------------*/
+    unsigned int candidate = (dsp_count > 0) ? dsp_count : 1;
+
+    for ( ; (global_work_size % candidate) != 0; --candidate)
+    {
+        const bool out_of_range = candidate > global_work_size
+                               || candidate > dsp_count * 32;
+        if (out_of_range)
+        {
+            candidate = 1;  // Not parallel but has no CommandQueue overhead
+            break;
+        }
+    }
+
+    return global_work_size / candidate;
+}
+
+/******************************************************************************
+* DSPKernel::localMemSize
+*   Sums allocAtKernelRuntime() over every kernel argument that is a buffer
+*   in the Local address space and returns the total.
+******************************************************************************/
+cl_ulong DSPKernel::localMemSize() const
+{
+    cl_ulong total = 0;
+
+    for (int idx = 0; idx < kernel()->numArgs(); ++idx)
+    {
+        const Kernel::Arg &candidate = kernel()->arg(idx);
+
+        if (candidate.kind() != Kernel::Arg::Buffer) continue;
+        if (candidate.file() != Kernel::Arg::Local)  continue;
+
+        total += candidate.allocAtKernelRuntime();
+    }
+
+    return total;
+}
+
+/*-----------------------------------------------------------------------------
+* Accessors for the Kernel and DSPDevice given to the constructor
+*----------------------------------------------------------------------------*/
+Kernel * DSPKernel::kernel() const
+{
+    return p_kernel;
+}
+
+DSPDevice * DSPKernel::device() const
+{
+    return p_device;
+}
+
+/*-----------------------------------------------------------------------------
+* next_power_of_two
+*   Returns the smallest power of two that is >= k; 0 maps to 1.
+*
+*   Works by subtracting one, copying the highest set bit into every lower
+*   position with doubling shifts, then adding one back.
+*   From Wikipedia : http://www.wikipedia.org/wiki/Power_of_two#Algorithm_to_round_up_to_power_of_two
+*----------------------------------------------------------------------------*/
+template <class T>
+T next_power_of_two(T k)
+{
+    if (k == 0) return 1;
+
+    T smeared = k;
+    smeared--;
+
+    int shift = 1;
+    while (shift < sizeof(T)*8)
+    {
+        smeared = smeared | smeared >> shift;
+        shift <<= 1;
+    }
+
+    return smeared+1;
+}
+
+/*-----------------------------------------------------------------------------
+* DSPKernel::typeOffset
+*
+* Computes where a value of type_len bytes should be placed at or after
+* offset so that it is naturally aligned.
+*
+*   offset   (in/out) on entry, the first candidate position; on return,
+*                     the position just past the slot that was reserved.
+*   type_len          size of the value; rounded up to a power of two and
+*                     capped at 8 to obtain the alignment.
+*
+* Returns the aligned position.
+*
+* The alignment used to be done with `while (rs & mask != rs) rs++;`.
+* Because != binds tighter than &, that tested `rs & (mask != rs)`, i.e.
+* only the lowest bit of rs, so positions were merely made even.  The
+* position is now rounded up to the alignment directly.
+*
+* NOTE(review): the slot reserved after the returned position is the capped
+* alignment (at most 8 bytes), not the original type_len; confirm that no
+* caller passes values wider than 8 bytes.
+*----------------------------------------------------------------------------*/
+size_t DSPKernel::typeOffset(size_t &offset, size_t type_len)
+{
+    // Align offset to type_len
+    type_len = next_power_of_two(type_len);
+    if (type_len > 8) type_len = 8; // The c66 has no alignment need > 8 bytes
+
+    // type_len is a non-zero power of two here, so the mask form is exact
+    size_t rs = (offset + type_len - 1) & ~(type_len - 1);
+
+    // Where to try to place the next value
+    offset = rs + type_len;
+
+    return rs;
+}
+
+/*-----------------------------------------------------------------------------
+* Source of kernel ids: each DSPKernelEvent takes the current value and
+* increments it.
+*----------------------------------------------------------------------------*/
+static int kernelID = 0;
+
+/*=============================================================================
+* DSPKernelEvent
+*
+* The constructor captures the device, the event and the event's device
+* kernel, assigns the next kernel id, enables kernel debugging when the
+* environment variable TI_OCL_DEBUG_KERNEL is set (to any value), and then
+* marshals the kernel arguments by calling callArgs(MAX_ARG_BUF_SIZE).
+*============================================================================*/
+DSPKernelEvent::DSPKernelEvent(DSPDevice *device, KernelEvent *event)
+    : p_device(device),
+      p_event(event),
+      p_kernel((DSPKernel*)event->deviceKernel()),
+      p_kernel_id(kernelID++),
+      p_debug_kernel(false),
+      p_num_arg_words(0),
+      p_WG_alloca_start(0)
+{
+    if (getenv("TI_OCL_DEBUG_KERNEL") != NULL)
+        p_debug_kernel = true;
+
+    callArgs(MAX_ARG_BUF_SIZE);
+}
+
+DSPKernelEvent::~DSPKernelEvent()
+{
+    /* nothing to release */
+}
+
+/*-----------------------------------------------------------------------------
+* Helper macros for DSPKernelEvent::callArgs.  SETARG and SETMOREARG use
+* that function's locals (arg_words, args_in_mem, args_in_mem_size,
+* more_arg_offset, more_args_in_mem) and the member p_msg by name.
+*
+* READ_ONLY_BUFFER / WRITE_ONLY_BUFFER
+*   Non-zero when the memory object carries the corresponding CL_MEM_ flag.
+*
+* SETARG(val)
+*   Appends val as the next 32-bit word of the in-message argument buffer,
+*   or reports an error when the buffer is full.  Wrapped in do/while(0) so
+*   that it is a single statement and can safely sit in an if/else.
+*
+* SETMOREARG(sz, pval)
+*   Aligns more_arg_offset to sz and copies sz bytes from pval into the
+*   overflow area.  If the copy would not fit, an error is reported and the
+*   copy is skipped.  Previously the error was reported but the memcpy was
+*   still performed, writing past the end of the overflow area.
+*   NOTE(review): callArgs computes the overflow slot address of a buffer
+*   argument before invoking this macro; in the does-not-fit case that
+*   address is still recorded by callArgs and should be reviewed there.
+*----------------------------------------------------------------------------*/
+#define READ_ONLY_BUFFER(buffer)  ((buffer)->flags() & CL_MEM_READ_ONLY)
+#define WRITE_ONLY_BUFFER(buffer) ((buffer)->flags() & CL_MEM_WRITE_ONLY)
+
+#define SETARG(val) do \
+    { \
+        if (arg_words < args_in_mem_size) args_in_mem[arg_words++] = (val); \
+        else std::cerr << "To many argument bytes are needed" << std::endl; \
+    } while(0)
+
+#define SETMOREARG(sz, pval) do \
+    { \
+        more_arg_offset = ROUNDUP(more_arg_offset, sz); \
+        if (ROUNDUP(more_arg_offset + sz, 8) > sizeof(p_msg.u.k.flush.buffers))\
+            std::cerr << "Too many arguments, does not fit" << std::endl; \
+        else \
+        { \
+            memcpy(more_args_in_mem+more_arg_offset, pval, sz); \
+            more_arg_offset += sz; \
+        } \
+    } while(0)
+
+//#define SETMOREARG(sz,psrc)
+
+/******************************************************************************
+* DSPKernelEvent::callArgs
+*
+* Marshals the kernel's arguments into the message p_msg.
+*
+* Arguments 0..9 are written as 32-bit words into p_msg.u.k.kernel.argBuf:
+* for each argument a size word, then the value word(s).  A 0 word ends
+* the list.  args_in_mem_size is the limit SETARG compares the word index
+* against.
+*
+* Arguments 10 and up are packed into p_msg.u.k.flush.buffers, starting at
+* byte offset 4, each aligned to its own size by SETMOREARG.
+*
+* Buffer arguments are handled by kind:
+*   - Local buffers are passed as 0 and recorded in p_local_bufs together
+*     with the address of the slot that holds the value.
+*   - CL_MEM_USE_HOST_PTR buffers are passed as 0 and recorded in
+*     p_hostptr_tmpbufs in the same way.
+*   - Other buffers are allocated on the device.  An address below
+*     0xFFFFFFFF is passed directly; otherwise 0 is passed and the buffer
+*     is recorded in p_64bit_bufs.  Unless the buffer is write-only it is
+*     also recorded in p_flush_bufs.
+*
+* On return p_num_arg_words holds the number of words written to argBuf
+* and p_msg.u.k.flush.sizeMoreArgs the size of the overflow area in use.
+******************************************************************************/
+void DSPKernelEvent::callArgs(unsigned args_in_mem_size)
+{
+ int arg_words = 0; // index of the next free 32-bit word in args_in_mem
+ unsigned *args_in_mem = (unsigned*)p_msg.u.k.kernel.argBuf; // word view of the in-message argument buffer
+ char *more_args_in_mem = (char *)p_msg.u.k.flush.buffers; // byte view of the overflow area used for arguments 10 and up
+ int more_arg_offset = 4; // byte offset of the next overflow argument; starts at 4, not 0
+ bool is_more_arg = false;
+
+ /*-------------------------------------------------------------------------
+ * Write Arguments
+ *------------------------------------------------------------------------*/
+ for (int i = 0; i < p_kernel->kernel()->numArgs(); ++i)
+ {
+ is_more_arg = (i >= 10); // only the first 10 arguments go through args_in_mem
+
+ const Kernel::Arg & arg = p_kernel->kernel()->arg(i);
+ size_t size = arg.valueSize() * arg.vecDim();
+
+ if (size == 0) ERR("Kernel Argument has size == 0"); // reported on std::cerr only; processing continues
+ if (size != 1 && size != 2 && size != 4 && size != 8)
+ ERR("Invalid Kernel Argument size"); // reported on std::cerr only; processing continues
+
+ /*---------------------------------------------------------------------
+ * We may have to perform some changes in the values (buffers, etc)
+ *--------------------------------------------------------------------*/
+ switch (arg.kind())
+ {
+ case Kernel::Arg::Buffer:
+ {
+ MemObject *buffer = 0;
+ DSPDevicePtr buf_ptr = 0; // stays 0 unless a directly usable device address is found below
+ if (arg.data()) buffer = *(MemObject **)arg.data();
+ if (!is_more_arg) SETARG(sizeof(DSPVirtPtr)); // size word that precedes the pointer word
+
+ DSPVirtPtr *buf_dspvirtptr = (!is_more_arg) ?
+ (&args_in_mem[arg_words]) :
+ (DSPVirtPtr *)(more_args_in_mem+ROUNDUP(more_arg_offset,4)); // address of the slot that will hold this argument's pointer value
+
+ /*-------------------------------------------------------------
+ * Alloc a buffer and pass it to the kernel
+ *------------------------------------------------------------*/
+ if (arg.file() == Kernel::Arg::Local)
+ {
+ uint32_t lbufsz = arg.allocAtKernelRuntime();
+ p_local_bufs.push_back(LocalPair(buf_dspvirtptr, lbufsz));
+
+ /*-----------------------------------------------------
+ * Since the only reader and writer of local memory (L2)
+ * will be the core itself, I do not believe we need
+ * to flush local buffers for correctness.
+ *----------------------------------------------------*/
+ //p_flush_bufs->push_back(DSPMemRange(lbuf, lbufsz));
+ }
+ else if (buffer != NULL)
+ {
+ /*---------------------------------------------------------
+ * Get the DSP buffer, allocate it and get its pointer
+ *--------------------------------------------------------*/
+ if (buffer->flags() & CL_MEM_USE_HOST_PTR)
+ {
+ p_hostptr_tmpbufs.push_back(
+ HostptrPair(buffer, DSPPtrPair(0, buf_dspvirtptr)));
+ }
+ else
+ {
+ DSPBuffer *dspbuf = (DSPBuffer *)buffer->deviceBuffer(p_device);
+ buffer->allocate(p_device); // the result of allocate() is not checked here
+ DSPDevicePtr64 addr64 = dspbuf->data();
+ if (addr64 < 0xFFFFFFFF) // address is passed to the kernel as is
+ buf_ptr = addr64;
+ else
+ p_64bit_bufs.push_back(DSPMemRange(DSPPtrPair(
+ addr64, buf_dspvirtptr), buffer->size()));
+
+ if (! WRITE_ONLY_BUFFER(buffer))
+ p_flush_bufs.push_back(DSPMemRange(DSPPtrPair(
+ addr64, buf_dspvirtptr), buffer->size()));
+ }
+ }
+
+ /*---------------------------------------------------------
+ * Use 0 for local buffer address here, it will be overwritten
+ * with allocated local buffer address at kernel dispatch time.
+ * Same for allocating temporary buffer for use_host_ptr.
+ *--------------------------------------------------------*/
+ if (!is_more_arg) SETARG(buf_ptr);
+ else { SETMOREARG(4, &buf_ptr); }
+
+ break;
+ }
+
+ case Kernel::Arg::Image2D:
+ case Kernel::Arg::Image3D: ERR("Images not yet supported"); break;
+
+ /*-----------------------------------------------------------------
+ * Non-Buffers
+ *----------------------------------------------------------------*/
+ default:
+ if (!is_more_arg)
+ {
+ SETARG((size < 4 ? 4 : size)); // size word; values narrower than 4 bytes occupy a full word
+ // Cast to (int) to avoid a codegen bug
+ // ZEXT will happen in LLVM and ICODE, so don't worry
+ if (size == 1) SETARG(((int) *((signed char*)arg.data())));
+ else if (size == 2) SETARG(((int) *((short*)arg.data())));
+ else SETARG(*((unsigned*) arg.data()));
+ if (size == 8) { SETARG(*(((unsigned*)arg.data()) + 1)); } // second word of an 8-byte value
+ }
+ else { SETMOREARG(size, arg.data()); }
+ break;
+ }
+ }
+ SETARG(0); // 0 terminator for args area
+
+ p_num_arg_words = arg_words;
+ p_msg.u.k.flush.sizeMoreArgs = (more_arg_offset > 4) ?
+ ROUNDUP(more_arg_offset, 8) : 0; // 0 when no overflow argument was written
+}
+
+/******************************************************************************
+* debug_pause
+*   Prints instructions for attaching a debugger to the kernel about to be
+*   launched (kernel name, DSP id, entry address, symbol file) and then
+*   blocks until one character can be extracted from std::cin.
+******************************************************************************/
+static void debug_pause(uint32_t entry, uint32_t dsp_id,
+                        const char* outfile, char *name)
+{
+    printf("[OCL] Launching kernel %s on DSP %d\n", name, dsp_id);
+    printf("[OCL] Connect debugger and set breakpoint at 0x%08x\n", entry);
+    printf("[OCL] Load symbols from file %s\n", outfile);
+    printf("[OCL] Press any key, then enter to continue\n");
+
+    char keypress;
+    std::cin >> keypress;
+}
+
+
+
+/******************************************************************************
+* bool DSPKernelEvent::run()
+******************************************************************************/
+/*-----------------------------------------------------------------------------
+* Build the launch message for this kernel event and mail it to the DSP.
+*
+* evtype : Event::TaskKernel produces a TASK message; any other value
+*          produces an NDRKERNEL message populated from the event's
+*          work dimensions, sizes and offsets.
+*
+* Steps, in order:
+*   1. fill in the kernel configuration, entry point and data page pointer
+*   2. carve local buffers out of the device's local scratch area
+*   3. allocate global memory for kernel alloca'ed data
+*   4. allocate (and, unless write-only, fill) temporaries for the buffers
+*      recorded in p_hostptr_tmpbufs
+*   5. compute MPAX mappings for every range recorded in p_64bit_bufs and
+*      patch the resulting virtual addresses into the argument words
+*   6. optionally print debug information and pause for a debugger
+*   7. post the message through p_device->mail_to()
+*
+* Returns CL_SUCCESS once the message has been mailed; completion is not
+* awaited.  Failures are reported through QERR with the error code shown at
+* each call site.
+* NOTE(review): QERR is defined outside this view; it presumably reports the
+* message and returns the given code from this function -- confirm.
+*----------------------------------------------------------------------------*/
+cl_int DSPKernelEvent::run(Event::Type evtype)
+{
+    Program    *p    = (Program *)p_kernel->kernel()->parent();
+    DSPProgram *prog = (DSPProgram *)(p->deviceDependentProgram(p_device));
+
+    // TODO perhaps ensure that prog is loaded.
+
+    int dim = p_event->work_dim();
+
+    /*-------------------------------------------------------------------------
+    * Create a message for the DSP
+    *------------------------------------------------------------------------*/
+    Msg_t           &msg = p_msg;
+    kernel_config_t *cfg = &msg.u.k.kernel.config;
+
+    if (evtype == Event::TaskKernel)
+    {
+        msg.command    = TASK;
+        cfg->Kernel_id = p_kernel_id;
+
+        // For tasks, global_sz_0 tells the device whether the owning queue
+        // executes in order or out of order.
+        CommandQueue *q = (CommandQueue *) p_event->parent();
+        cl_command_queue_properties q_prop = 0;
+        q->info(CL_QUEUE_PROPERTIES, sizeof(q_prop), &q_prop, NULL);
+        cfg->global_sz_0 = (q_prop & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) ?
+                           OUT_OF_ORDER_TASK_SIZE : IN_ORDER_TASK_SIZE;
+        cfg->local_sz_0  = 1;
+        cfg->local_sz_1  = 1;
+        cfg->local_sz_2  = 1;
+    }
+    else
+    {
+        msg.command = NDRKERNEL;
+
+        // Dimensions beyond work_dim are given a size of 1.
+        cfg->num_dims       = dim;
+        cfg->global_sz_0    = p_event->global_work_size(0);
+        cfg->global_sz_1    = dim > 1 ? p_event->global_work_size(1) : 1;
+        cfg->global_sz_2    = dim > 2 ? p_event->global_work_size(2) : 1;
+        cfg->local_sz_0     = p_event->local_work_size(0);
+        cfg->local_sz_1     = dim > 1 ? p_event->local_work_size(1) : 1;
+        cfg->local_sz_2     = dim > 2 ? p_event->local_work_size(2) : 1;
+        cfg->global_off_0   = p_event->global_work_offset(0);
+        cfg->global_off_1   = p_event->global_work_offset(1);
+        cfg->global_off_2   = p_event->global_work_offset(2);
+        cfg->WG_gid_start_0 = 0;
+        cfg->WG_gid_start_1 = 0;
+        cfg->WG_gid_start_2 = 0;
+        cfg->Kernel_id      = p_kernel_id;
+        cfg->WG_id          = 0;
+        cfg->stats          = 0;
+    }
+
+    msg.u.k.kernel.entry_point   = (unsigned)p_kernel->device_entry_pt();
+    msg.u.k.kernel.data_page_ptr = (unsigned)p_kernel->data_page_ptr();
+
+    /*-------------------------------------------------------------------------
+    * Allocating local buffer in L2 per kernel run instance
+    *------------------------------------------------------------------------*/
+    uint32_t     total_sz, block_sz;
+    DSPDevicePtr local_scratch = p_device->get_local_scratch(total_sz, block_sz);
+    for (size_t i = 0; i < p_local_bufs.size(); ++i)
+    {
+        DSPVirtPtr *p_arg_word     = p_local_bufs[i].first;
+        unsigned    local_buf_size = p_local_bufs[i].second;
+
+        // Each buffer consumes a whole number of scratch blocks.
+        uint32_t rounded_sz = ROUNDUP(local_buf_size, block_sz);
+        if (rounded_sz > total_sz)
+        {
+            QERR("Total local buffer size exceeds available local size",
+                 CL_MEM_OBJECT_ALLOCATION_FAILURE);
+        }
+        *p_arg_word    = local_scratch;
+        local_scratch += rounded_sz;
+        total_sz      -= rounded_sz;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Allocating temporary space in global memory for kernel alloca'ed data
+    *------------------------------------------------------------------------*/
+#define NUM_CORES_PER_CHIP 8
+    cfg->WG_alloca_size = p_kernel->kernel()->get_wi_alloca_size() *
+                      cfg->local_sz_0 * cfg->local_sz_1 * cfg->local_sz_2;
+    if (cfg->WG_alloca_size > 0)
+    {
+        cfg->WG_alloca_size += 4096; // 4K bytes padding between WGs' allocas
+        uint32_t chip_alloca_size = cfg->WG_alloca_size * NUM_CORES_PER_CHIP;
+        p_WG_alloca_start = p_device->malloc_global( // malloc abort if fail
+                                                    chip_alloca_size, true);
+        if (!p_WG_alloca_start)
+        {
+            QERR("Alloca size exceeds available global memory",
+                 CL_OUT_OF_RESOURCES);
+        }
+
+        // Addresses below 0xFFFFFFFF go into the message directly; larger
+        // ones are queued for MPAX translation further down.
+        if (p_WG_alloca_start < 0xFFFFFFFF)
+            cfg->WG_alloca_start = (DSPVirtPtr) p_WG_alloca_start;
+        else
+            p_64bit_bufs.push_back(DSPMemRange(DSPPtrPair(
+                p_WG_alloca_start, &cfg->WG_alloca_start), chip_alloca_size));
+    }
+
+    /*-------------------------------------------------------------------------
+    * Allocating temporary global buffer for use_host_ptr
+    *------------------------------------------------------------------------*/
+    for (size_t i = 0; i < p_hostptr_tmpbufs.size(); ++i)
+    {
+        MemObject      *buffer     = p_hostptr_tmpbufs[i].first;
+        DSPDevicePtr64 *p_addr64   = &p_hostptr_tmpbufs[i].second.first;
+        DSPVirtPtr     *p_arg_word = p_hostptr_tmpbufs[i].second.second;
+
+        *p_addr64 = p_device->malloc_global(buffer->size(), false);
+
+        // Test the allocated address, not the pointer to its storage slot
+        // (which is the address of a vector element and never null).
+        if (!*p_addr64)
+        {
+            QERR("Temporary memory for CL_MEM_USE_HOST_PTR buffer exceeds available global memory",
+                 CL_MEM_OBJECT_ALLOCATION_FAILURE);
+        }
+
+        if (*p_addr64 < 0xFFFFFFFF)
+            *p_arg_word = *p_addr64;
+        else
+            p_64bit_bufs.push_back(DSPMemRange(DSPPtrPair(
+                                      *p_addr64, p_arg_word), buffer->size()));
+
+        // Unless the buffer is write-only, seed the temporary with the
+        // current host contents and record the range in p_flush_bufs.
+        if (! WRITE_ONLY_BUFFER(buffer))
+        {
+            void *mapped_tmpbuf = Driver::instance()->map(*p_addr64,
+                                                     buffer->size(), false);
+            memcpy(mapped_tmpbuf, buffer->host_ptr(), buffer->size());
+            p_flush_bufs.push_back(DSPMemRange(DSPPtrPair(
+                                      *p_addr64, p_arg_word), buffer->size()));
+            Driver::instance()->unmap(mapped_tmpbuf, *p_addr64,
+                                                     buffer->size(), true);
+        }
+    }
+
+    /*-------------------------------------------------------------------------
+    * Compute MPAX mappings from DSPDevicePtr64 to DSPVirtPtr in p_64bit_bufs
+    *------------------------------------------------------------------------*/
+    msg.u.k.flush.num_mpaxs = 0;
+    uint32_t num_64bit_bufs = p_64bit_bufs.size();
+    if (num_64bit_bufs > 0)
+    {
+        // Scratch arrays live in vectors so they are released on every way
+        // out of this block, including an early exit on allocation failure.
+        std::vector<uint64_t> phys_addrs(num_64bit_bufs);
+        std::vector<uint32_t> lengths   (num_64bit_bufs);
+        std::vector<uint32_t> prots     (num_64bit_bufs);
+        std::vector<uint32_t> virt_addrs(num_64bit_bufs);
+        for (size_t i = 0; i < p_64bit_bufs.size(); ++i)
+        {
+            phys_addrs[i] = p_64bit_bufs[i].first.first;
+            lengths[i]    = p_64bit_bufs[i].second;
+            prots[i]      = 0;  // don't care yet
+        }
+
+        // Work on a private copy of the device's default MPAX resources.
+        keystone_mmap_resources_t mpax_res;
+        memcpy(&mpax_res, p_device->get_mpax_default_res(),
+               sizeof(keystone_mmap_resources_t));
+        if (keystone_mmap_resource_alloc(num_64bit_bufs, &phys_addrs[0],
+                &lengths[0], &prots[0], &virt_addrs[0], &mpax_res)
+            != KEYSTONE_MMAP_RESOURCE_NOERR)
+        {
+            QERR("MPAX allocation failed!",
+                 CL_OUT_OF_RESOURCES);
+        }
+
+        // set the MPAX settings in the message
+        // NOTE(review): the loop stops at the first mapping whose
+        // segsize_power2 is 0 and is not bounded by MAX_XMCSES_MPAXS --
+        // confirm the allocator can never return more entries than that.
+        uint32_t mpax_used = 0;
+        for (; mpax_res.mapping[mpax_used].segsize_power2 > 0; mpax_used += 1)
+        {
+            msg.u.k.flush.mpax_settings[2*mpax_used  ] = (uint32_t)
+                (mpax_res.mapping[mpax_used].raddr >> 12);  // e.g. 0x822004
+            msg.u.k.flush.mpax_settings[2*mpax_used+1] =    // e.g. 0xC000000D
+                  mpax_res.mapping[mpax_used].baddr
+                | (mpax_res.mapping[mpax_used].segsize_power2-1);
+        }
+        msg.u.k.flush.num_mpaxs = mpax_used;
+
+        // set the virtual address in arguments
+        for (size_t i = 0; i < p_64bit_bufs.size(); ++i)
+        {
+            *(p_64bit_bufs[i].first.second) = virt_addrs[i];
+            if (p_debug_kernel)
+                printf("Virtual = 0x%x, physical = 0x%llx\n",
+                       virt_addrs[i],
+                       (unsigned long long) p_64bit_bufs[i].first.first);
+        }
+    }
+
+    /*-------------------------------------------------------------------------
+    * Helpful information for debugging a kernel
+    *------------------------------------------------------------------------*/
+    if (p_debug_kernel)
+    {
+        for (int i = 0; i < msg.u.k.flush.num_mpaxs; i++)
+            printf("mpax %d: l=0x%x, h=0x%x\n", i,
+                   msg.u.k.flush.mpax_settings[2*i],
+                   msg.u.k.flush.mpax_settings[2*i+1]);
+
+        // The argument area is walked as (size word, data word(s)) records;
+        // only records whose size word is 4 or 8 are printed.
+        uint32_t *args    = msg.u.k.kernel.argBuf;
+        int       arg_num = 1;
+        // TODO: print more args properly
+        for (int i=0; i < p_num_arg_words; i++)
+        {
+            if (args[i] == 4)
+            {
+                i++;
+                printf("[OCL] Kernel argument %d = 0x%08x\n", arg_num, args[i]);
+            }
+            else if (args[i] == 8)
+            {
+                printf("[OCL] Kernel argument %d = 0x%08x 0x%08x\n",
+                       arg_num, args[i+1], args[i+2]);
+                i+=2;
+            }
+            arg_num++;
+        }
+    }
+
+    /*-------------------------------------------------------------------------
+    * Make sure we do not overflow the number of commands a mailbox can handle
+    *------------------------------------------------------------------------*/
+    if (p_flush_bufs.size() > MAX_KERNEL_ARGUMENTS)
+    {
+        QERR("To many buffers to flush", CL_OUT_OF_RESOURCES);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Populate Flush commands for any buffers that are read by the DSP
+    *------------------------------------------------------------------------*/
+    msg.u.k.flush.numBuffers = p_flush_bufs.size();
+
+#if 0 // YUAN: flush buffers used for more arguments (for now)
+    for (int i=0; i < p_flush_bufs.size(); ++i)
+    {
+        msg.u.k.flush.buffers[2*i]   = p_flush_bufs[i].first;
+        msg.u.k.flush.buffers[2*i+1] = p_flush_bufs[i].second;
+    }
+#endif
+
+    /*-------------------------------------------------------------------------
+    * Feedback to user for debug
+    *------------------------------------------------------------------------*/
+    if (p_debug_kernel)
+    {
+        // First query obtains the name length, second one fetches the name.
+        size_t name_length;
+        p_kernel->kernel()->info(CL_KERNEL_FUNCTION_NAME, 0, 0, &name_length);
+        char *name = (char*)malloc(name_length);
+        if (!name) return CL_OUT_OF_HOST_MEMORY;
+        p_kernel->kernel()->info(CL_KERNEL_FUNCTION_NAME, name_length, name, 0);
+
+        debug_pause(p_kernel->device_entry_pt(), p_device->dspID(),
+                    prog->outfile_name(), name);
+        free (name);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Dispatch the commands through the mailbox
+    *------------------------------------------------------------------------*/
+    p_device->mail_to(msg);
+
+    /*-------------------------------------------------------------------------
+    * Do not wait for completion
+    *------------------------------------------------------------------------*/
+    return CL_SUCCESS;
+}
+
+/******************************************************************************
+* free_tmp_bufs allocated for kernel allocas, and for use_host_ptr
+******************************************************************************/
+void DSPKernelEvent::free_tmp_bufs()
+{
+    /*-------------------------------------------------------------------------
+    * Global memory reserved for kernel alloca'ed data, if any was allocated
+    *------------------------------------------------------------------------*/
+    if (p_WG_alloca_start > 0) p_device->free_global(p_WG_alloca_start);
+
+    /*-------------------------------------------------------------------------
+    * Temporaries standing in for host-pointer buffers: unless the buffer is
+    * read-only, copy the temporary back to the host pointer, then free it.
+    *------------------------------------------------------------------------*/
+    std::vector<HostptrPair>::iterator it;
+    for (it = p_hostptr_tmpbufs.begin(); it != p_hostptr_tmpbufs.end(); ++it)
+    {
+        MemObject      *mem      = it->first;
+        DSPDevicePtr64  dev_addr = it->second.first;
+
+        if (! READ_ONLY_BUFFER(mem))
+        {
+            void *host_view = Driver::instance()->map(dev_addr,
+                                                      mem->size(), true);
+            memcpy(mem->host_ptr(), host_view, mem->size());
+            Driver::instance()->unmap(host_view, dev_addr,
+                                      mem->size(), false);
+        }
+
+        p_device->free_global(dev_addr);
+    }
+}
+
diff --git a/src/core/dsp/kernel.h b/src/core/dsp/kernel.h
new file mode 100644
index 0000000..850941d
--- /dev/null
+++ b/src/core/dsp/kernel.h
@@ -0,0 +1,119 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef __DSP_KERNEL_H__
+#define __DSP_KERNEL_H__
+
+#include "../events.h"
+#include "../memobject.h"
+#include "../deviceinterface.h"
+#include "message.h"
+#include "device.h"
+#include <core/config.h>
+
+#include <vector>
+#include <string>
+#include <pthread.h>
+#include <stdint.h>
+
+namespace llvm
+{
+ class Function;
+}
+
+typedef std::pair<DSPDevicePtr64, DSPVirtPtr *> DSPPtrPair; // (64-bit device address, pointer to the 32-bit word that receives the address the DSP will use)
+typedef std::pair<DSPPtrPair, uint32_t> DSPMemRange; // (address pair, size of the range)
+typedef std::pair<DSPVirtPtr *, uint32_t> LocalPair; // (argument word to patch with a local scratch address, requested local buffer size)
+typedef std::pair<Coal::MemObject *, DSPPtrPair> HostptrPair; // (host-pointer buffer, address pair of its temporary device copy)
+
+
+namespace Coal
+{
+class DSPDevice;
+class Kernel;
+class KernelEvent;
+
+class DSPKernel : public DeviceKernel
+{ /* DeviceKernel declared for the DSP device: pairs one Kernel with one
+   * DSPDevice.  Members without an inline body are defined outside this
+   * header. */
+ public:
+ DSPKernel(DSPDevice *device, Kernel *kernel);
+ ~DSPKernel();
+
+ size_t workGroupSize() const { return 128; } // fixed value, independent of the kernel
+ cl_ulong localMemSize() const ;
+ cl_ulong privateMemSize() const { return 0; } // always reports 0
+ size_t preferredWorkGroupSizeMultiple() const { return 0; } // always reports 0
+
+ size_t guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
+ size_t global_work_size) const;
+ DSPDevicePtr device_entry_pt(); // DSPKernelEvent::run() sends this as the kernel entry point
+ DSPDevicePtr data_page_ptr(); // DSPKernelEvent::run() sends this as the data page pointer
+ cl_int preAllocBuffers();
+
+ Kernel * kernel() const;
+ DSPDevice * device() const;
+
+ llvm::Function *function() const;
+ static size_t typeOffset(size_t &offset, size_t type_len);
+
+ private:
+ DSPDevice * p_device;
+ Kernel * p_kernel;
+ DSPDevicePtr p_device_entry_pt; // presumably backs device_entry_pt() -- definition not visible here
+ DSPDevicePtr p_data_page_ptr; // presumably backs data_page_ptr() -- definition not visible here
+};
+
+class DSPKernelEvent
+{ /* One launch of a kernel on a DSP device: owns the launch message and the
+   * bookkeeping for the temporary device memory allocated for the launch. */
+ public:
+ DSPKernelEvent (DSPDevice *device, KernelEvent *event);
+ ~DSPKernelEvent ();
+
+ cl_int run (Event::Type evtype); // completes p_msg and mails it to the device; returns without waiting for completion
+ void callArgs (unsigned rs_size); // presumably packs the kernel arguments into p_msg -- definition not fully visible here
+
+ DSPDevice* device() { return p_device; }
+ uint32_t kernel_id() { return p_kernel_id; }
+
+ void free_tmp_bufs(); // frees the alloca area and the host-pointer temporaries, copying non-read-only temporaries back to the host first
+
+ private:
+ DSPDevice * p_device;
+ KernelEvent * p_event;
+ DSPKernel * p_kernel;
+ uint32_t p_kernel_id; // copied into kernel_config_t::Kernel_id by run()
+ bool p_debug_kernel; // when set, run() prints launch details and pauses for a debugger
+ int p_num_arg_words; // number of words used in the message's argBuf
+ Msg_t p_msg; // the message run() completes and mails
+ DSPDevicePtr64 p_WG_alloca_start; // global memory for kernel alloca'ed data; allocated in run(), released in free_tmp_bufs()
+ std::vector<DSPMemRange> p_flush_bufs; // run() adds the temporaries it filled from host memory; only the count is sent
+ std::vector<LocalPair> p_local_bufs; // local buffers that run() places in the device's local scratch area
+ std::vector<HostptrPair> p_hostptr_tmpbufs; // buffers that get a temporary device copy for the duration of the launch
+ std::vector<DSPMemRange> p_64bit_bufs; // ranges that run() maps through MPAX to obtain 32-bit virtual addresses
+};
+}
+#endif
diff --git a/src/core/dsp/mailbox.h b/src/core/dsp/mailbox.h
new file mode 100644
index 0000000..f87c08c
--- /dev/null
+++ b/src/core/dsp/mailbox.h
@@ -0,0 +1,114 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef _MAILBOX_H_
+#define _MAILBOX_H_
+#include "u_locks_pthread.h"
+#include "driver.h"
+
+extern "C"
+{
+ #include "mpm_mailbox.h"
+}
+
+/*-----------------------------------------------------------------------------
+* Singleton wrapper around the mpm_mailbox C interface.  Apart from write(),
+* every member forwards its arguments unchanged and returns the library's
+* result.
+*----------------------------------------------------------------------------*/
+class Mailbox
+{
+  public:
+
+    int32_t create(void* mbox_handle, char *slave_node_name,
+                   uint32_t mem_location, uint32_t direction,
+                   mpm_mailbox_config_t *mbox_config)
+    {
+        return mpm_mailbox_create(mbox_handle, slave_node_name,
+                                  mem_location, direction, mbox_config);
+    }
+
+    int32_t open(void* mbox_handle)
+    {
+        return mpm_mailbox_open(mbox_handle);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Retry the write for as long as the library reports a full mailbox.
+    * NOTE(review): the final library result is discarded and true is always
+    * returned, so failures other than "mailbox full" are not visible to the
+    * caller.
+    *------------------------------------------------------------------------*/
+    int32_t write (void* mbox_handle, uint8_t *buf, uint32_t size,
+                   uint32_t trans_id)
+    {
+        for (;;)
+        {
+            int status = mpm_mailbox_write (mbox_handle, buf, size, trans_id);
+            if (status != MPM_MAILBOX_ERR_MAIL_BOX_FULL) break;
+        }
+
+        return true;
+    }
+
+    int32_t read (void* mbox_handle, uint8_t *buf, uint32_t *size,
+                  uint32_t *trans_id)
+    {
+        return mpm_mailbox_read (mbox_handle, buf, size, trans_id);
+    }
+
+    int32_t query (void* mbox_handle)
+    {
+        return mpm_mailbox_query (mbox_handle);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Thread safe instance function for singleton behavior.  The instance
+    * pointer is read once without the lock; only when it is still null is
+    * the mutex taken, the pointer re-read, and the object created.  Memory
+    * barriers follow the unlocked read and precede the publishing store.
+    *------------------------------------------------------------------------*/
+    static Mailbox* instance ()
+    {
+        static Mutex Mailbox_instance_mutex;
+
+        Mailbox* current = pInstance;
+        __sync_synchronize();
+        if (current != 0) return current;
+
+        ScopedLock lck(Mailbox_instance_mutex);
+
+        current = pInstance;
+        if (current == 0)
+        {
+            current = new Mailbox;
+            __sync_synchronize();
+            pInstance = current;
+        }
+        return current;
+    }
+
+  private:
+    static Mailbox* pInstance;
+
+    Mailbox() { }                        // ctor private
+    Mailbox(const Mailbox&);             // copy ctor disallowed
+    Mailbox& operator=(const Mailbox&);  // assignment disallowed
+};
+
+#endif // _MAILBOX_H_
diff --git a/src/core/dsp/memmap.h b/src/core/dsp/memmap.h
new file mode 100644
index 0000000..503540e
--- /dev/null
+++ b/src/core/dsp/memmap.h
@@ -0,0 +1,120 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#define ERR(status, msg) if (status) { printf("ERROR: %s\n", msg); exit(-1); }
+
+/*-----------------------------------------------------------------------------
+* A DSP address range backed by a set of host CMEM buffers.
+*
+* The constructor allocates enough HOST_CMEM_BUFFER_SIZE buffers to cover
+* 'size', allocates a DSP memory range and maps the buffers onto it; the
+* destructor undoes those steps.  copy_in()/copy_out() move data between a
+* caller buffer and the host buffers, one buffer-sized chunk at a time.
+* Every failure is reported through ERR(), which terminates the process.
+*----------------------------------------------------------------------------*/
+class DSP_MappedMem
+{
+  public:
+    DSP_MappedMem(uint32_t dsp_id, uint32_t size)
+        : p_size(size), p_dsp_id(dsp_id), p_dsp_addr(0),
+          p_num_buffers(CEIL_DIVIDE(size, HOST_CMEM_BUFFER_SIZE))
+    {
+        p_buffers = new cmem_host_buf_desc_t[p_num_buffers];
+        ERR(!p_buffers, "Cannot allocate host memory for a DSP Mapped Region");
+
+        int status;
+        for (uint32_t i = 0; i < p_num_buffers; i++)
+        {
+            status = bufmgrAlloc(DmaBufPool, 1, &p_buffers[i]);
+            ERR(status, "Cannot allocate CMEM pool for a DSP Mapped Region");
+        }
+
+        /*---------------------------------------------------------------------
+        * Allocate DSP range
+        * NOTE(review): p_dsp_addr is passed by value here, so this call
+        * cannot store the allocated address in it -- confirm against the
+        * pciedrv API whether a pointer is expected.
+        *--------------------------------------------------------------------*/
+        status = pciedrv_dsp_memrange_alloc(p_dsp_id, p_size, p_dsp_addr);
+        ERR(status, "PCIe driver dsp memrange alloc failed");
+
+        /*---------------------------------------------------------------------
+        * Map Input buffers to dsp range
+        *--------------------------------------------------------------------*/
+        status = pciedrv_map_bufs_to_dsp_memrange(p_dsp_id, p_num_buffers,
+                                          p_buffers, (uint32_t) p_dsp_addr);
+        ERR(status, "PCIe driver dsp map bufs to memrange failed");
+    }
+
+    ~DSP_MappedMem()
+    {
+        /*---------------------------------------------------------------------
+        * Free DSP range
+        *--------------------------------------------------------------------*/
+        int status = pciedrv_dsp_memrange_free(p_dsp_id, p_size, p_dsp_addr);
+        ERR(status, "PCIe driver dsp memrange free failed");
+
+        for (uint32_t i = 0; i < p_num_buffers; i++)
+        {
+            status = bufmgrFreeDesc(DmaBufPool, &p_buffers[i]);
+            ERR(status, "Cannot free CMEM pool for a DSP Mapped Region");
+        }
+
+        delete [] p_buffers;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Copy 'size' bytes from p into the host buffers, filling them in order.
+    * Requests larger than the mapped region are rejected through ERR().
+    *------------------------------------------------------------------------*/
+    void copy_in(void* p, uint32_t size)
+    {
+        ERR(size > p_size, "DSP Mapped region input overflow");
+
+        uint32_t remaining_size = size;
+        uint32_t offset         = 0;
+
+        // The index bound keeps the walk inside p_buffers even if the
+        // buffer lengths add up to less than the requested size.
+        for (uint32_t i = 0; remaining_size && i < p_num_buffers; i++)
+        {
+            uint32_t chunk_size = std::min<uint32_t>(remaining_size,
+                                                     p_buffers[i].length);
+
+            // Byte arithmetic requires a char pointer, not void*.
+            memcpy(p_buffers[i].user_addr, (char *)p + offset, chunk_size);
+
+            remaining_size -= chunk_size;
+            offset         += chunk_size;
+        }
+    }
+
+    /*-------------------------------------------------------------------------
+    * Copy 'size' bytes from the host buffers, in order, out to p.
+    * Requests larger than the mapped region are rejected through ERR().
+    *------------------------------------------------------------------------*/
+    void copy_out(void* p, uint32_t size)
+    {
+        ERR(size > p_size, "DSP Mapped region output underrflow");
+
+        uint32_t remaining_size = size;
+        uint32_t offset         = 0;
+
+        for (uint32_t i = 0; remaining_size && i < p_num_buffers; i++)
+        {
+            uint32_t chunk_size = std::min<uint32_t>(remaining_size,
+                                                     p_buffers[i].length);
+
+            memcpy((char *)p + offset, p_buffers[i].user_addr, chunk_size);
+
+            remaining_size -= chunk_size;
+            offset         += chunk_size;
+        }
+    }
+
+  private:
+    // The object owns p_buffers and the DSP range; copying it would release
+    // both twice, so copy construction and assignment are disallowed.
+    DSP_MappedMem(const DSP_MappedMem&);
+    DSP_MappedMem& operator=(const DSP_MappedMem&);
+
+    uint32_t              p_size;
+    uint32_t              p_dsp_id;
+    uint32_t              p_dsp_addr;
+    uint32_t              p_num_buffers;
+    cmem_host_buf_desc_t *p_buffers;
+};
diff --git a/src/core/dsp/message.h b/src/core/dsp/message.h
new file mode 100644
index 0000000..d93fe1e
--- /dev/null
+++ b/src/core/dsp/message.h
@@ -0,0 +1,115 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef __MESSAGE_H_
+#define __MESSAGE_H_
+
+#include <stdint.h>
+
+typedef enum { READY, EXIT, TASK, NDRKERNEL, WORKGROUP, CACHEINV, FREQUENCY, SUCCESS, ERROR, PRINT } command_codes; /* value carried in Msg_t.command; enumerator order fixes the numeric codes */
+
+/* Sizing of the argument and flush areas of a kernel message.  Expansions
+ * are fully parenthesized so they can be used inside larger expressions. */
+#define MAX_KERNEL_ARGUMENTS 10
+#define MAX_ARG_BUF_SIZE     ((MAX_KERNEL_ARGUMENTS*3)+1)  /* 3 words per argument + 1 terminator word */
+#define MAX_FLUSH_BUF_SIZE   (MAX_KERNEL_ARGUMENTS*2)      /* 2 words per flushed buffer */
+
+#define MAX_XMCSES_MPAXS     7
+#define FIRST_FREE_XMC_MPAX  3  // XMC MPAXs available: 3 - F
+#define FIRST_FREE_SES_MPAX  1  // SES MPAXs available: 1 - 7
+
+/******************************************************************************
+* Need to ensure that the alignments and therefore the offsets of all fields
+* are consistent between the host and the device.
+******************************************************************************/
+typedef struct
+{ /* Launch configuration sent to the device inside kernel_msg_t. */
+ uint32_t num_dims; /* number of work dimensions (set for NDRKERNEL launches) */
+
+ uint32_t global_sz_0; /* global work size per dimension; for TASK launches global_sz_0 carries IN_ORDER_TASK_SIZE or OUT_OF_ORDER_TASK_SIZE */
+ uint32_t global_sz_1;
+ uint32_t global_sz_2;
+ uint32_t local_sz_0; /* local work size per dimension; the host writes 1 for unused dimensions and for tasks */
+ uint32_t local_sz_1;
+ uint32_t local_sz_2;
+ uint32_t global_off_0; /* global work offset per dimension */
+ uint32_t global_off_1;
+ uint32_t global_off_2;
+ uint32_t WG_gid_start_0; /* the host initializes the WG_gid_start_* fields to 0 */
+ uint32_t WG_gid_start_1;
+ uint32_t WG_gid_start_2;
+ uint32_t Kernel_id;
+ uint32_t WG_id; /* the host initializes this to 0 */
+ uint32_t stats; /* the host initializes this to 0 */
+ uint32_t WG_alloca_start; /* 32-bit address of the global memory reserved for kernel alloca'ed data */
+ uint32_t WG_alloca_size; /* per-work-group alloca size including 4K padding; 0 when none is needed */
+} kernel_config_t;
+
+typedef struct
+{ /* Buffer-flush and MPAX information that accompanies a kernel launch. */
+ uint8_t numBuffers; /* the host stores the number of recorded flush ranges here */
+ uint8_t num_mpaxs; // number of (MPAXL, MPAXH) pairs filled in mpax_settings. TODO: XMC only mpax for kernel alloca memory
+ uint16_t sizeMoreArgs; /* size of the extra-argument area rounded up to a multiple of 8, or 0 */
+ uint32_t buffers[MAX_FLUSH_BUF_SIZE]; /* two words per buffer; the host code that fills this is currently compiled out */
+ uint32_t mpax_settings[2*MAX_XMCSES_MPAXS]; // (MPAXL, MPAXH) pair
+} flush_msg_t;
+
+typedef struct
+{ /* Kernel launch description: configuration, entry point and arguments. */
+ kernel_config_t config;
+ uint32_t entry_point; /* device address of the kernel entry point */
+ uint32_t data_page_ptr; /* data page pointer supplied by the host kernel object */
+ uint32_t argBuf[MAX_ARG_BUF_SIZE]; // NULL size terminated
+} kernel_msg_t;
+
+typedef struct
+{ /* Message exchanged through the mailbox: a command code plus a payload. */
+ command_codes command;
+ union
+ { /* the payload is viewed either as a kernel launch (k) or as raw bytes (message) */
+ struct
+ {
+ kernel_msg_t kernel;
+ flush_msg_t flush;
+ } k;
+ char message[sizeof(kernel_msg_t) + sizeof(flush_msg_t)]; /* raw view, as large as kernel + flush together */
+ } u;
+} Msg_t;
+
+static Msg_t exitMsg = {EXIT}; /* prebuilt command-only messages: only 'command' is given, the rest is zero-initialized */
+static Msg_t successMsg = {SUCCESS};
+static Msg_t readyMsg = {READY};
+static Msg_t errorMsg = {ERROR};
+static Msg_t frequencyMsg = {FREQUENCY};
+// static far Msg_t printMsg = {PRINT}; // moved to L2 in monitor
+
+static const uint32_t mbox_payload = sizeof(Msg_t); /* size of one message */
+
+#define MBOX_SIZE 0x2000
+
+#define IN_ORDER_TASK_SIZE 1 /* written to global_sz_0 of a TASK launch from an in-order queue */
+#define OUT_OF_ORDER_TASK_SIZE (IN_ORDER_TASK_SIZE+1) /* written to global_sz_0 of a TASK launch from an out-of-order queue */
+
+#endif
diff --git a/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.c b/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.c
new file mode 100644
index 0000000..545ba92
--- /dev/null
+++ b/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.c
@@ -0,0 +1,200 @@
+/*
+* c60_dynamic.c
+*
+* C6x-specific dynamic loader functionality
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifdef C60_TARGET
+#include "c60_elf32.h"
+#include <inttypes.h>
+#include "dload.h"
+
+/*****************************************************************************/
+/* DLDYN_c60_process_dynamic_tag() */
+/* */
+/* Process C6x specific dynamic tags. */
+/*****************************************************************************/
+BOOL DLDYN_c60_process_dynamic_tag(DLIMP_Dynamic_Module* dyn_module, int i)
+{
+   /*------------------------------------------------------------------------*/
+   /* Entry i of the dynamic table is examined.  C6x specific tags are       */
+   /* recorded in the dynamic module object and TRUE is returned; any other  */
+   /* tag leaves the module untouched and FALSE is returned.                 */
+   /*------------------------------------------------------------------------*/
+   BOOL recognized = TRUE;
+
+   switch (dyn_module->dyntab[i].d_tag)
+   {
+      /*---------------------------------------------------------------------*/
+      /* DT_C6000_GSYM_OFFSET: offset into the dynamic symbol table at which */
+      /* the global symbols start (the table is split into local and global  */
+      /* symbols).                                                           */
+      /*---------------------------------------------------------------------*/
+      case DT_C6000_GSYM_OFFSET:
+         dyn_module->gsymtab_offset = dyn_module->dyntab[i].d_un.d_val;
+#if LOADER_DEBUG
+         if (debugging_on)
+            DLIF_trace("Found global symbol table: %d\n",
+                       dyn_module->gsymtab_offset);
+#endif
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* DT_C6000_GSTR_OFFSET: offset into the dynamic string table at which */
+      /* the global symbol names start.                                      */
+      /*---------------------------------------------------------------------*/
+      case DT_C6000_GSTR_OFFSET:
+         dyn_module->gstrtab_offset = dyn_module->dyntab[i].d_un.d_val;
+#if LOADER_DEBUG
+         if (debugging_on)
+            DLIF_trace("Found global string table: %d\n",
+                       dyn_module->gstrtab_offset);
+#endif
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* DT_C6000_DSBT_BASE: address of the DSBT in the executable or shared */
+      /* object.  Only the index of the tag is remembered so that its value  */
+      /* can be updated once the sections have been allocated (the tag value */
+      /* is relocated).                                                      */
+      /*---------------------------------------------------------------------*/
+      case DT_C6000_DSBT_BASE:
+         dyn_module->dsbt_base_tagidx = i;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* DT_C6000_DSBT_INDEX: specific request for a DSBT index.  The load   */
+      /* fails if the module does not get the index it asked for, because    */
+      /* references to the index carry no relocation entries.                */
+      /*---------------------------------------------------------------------*/
+      case DT_C6000_DSBT_INDEX:
+         dyn_module->dsbt_index = dyn_module->dyntab[i].d_un.d_val;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* DT_C6000_DSBT_SIZE: size of the DSBT allocated for this module; it  */
+      /* must be big enough to hold the content of the master DSBT.          */
+      /*---------------------------------------------------------------------*/
+      case DT_C6000_DSBT_SIZE:
+         dyn_module->dsbt_size = dyn_module->dyntab[i].d_un.d_val;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* Not a C6x specific tag.                                             */
+      /*---------------------------------------------------------------------*/
+      default:
+         recognized = FALSE;
+         break;
+   }
+
+   return recognized;
+}
+
+/*****************************************************************************/
+/* DLDYN_c60_relocate_dynamic_tag_info() */
+/* */
+/* Update any target specific dynamic tag values that are associated with */
+/* a section address. Return TRUE if the tag value is successfully */
+/* updated or if the tag is not associated with a section address, and */
+/* FALSE if we can't find the section associated with the tag or if the */
+/* tag type is not recognized. */
+/* */
+/*****************************************************************************/
+BOOL DLDYN_c60_relocate_dynamic_tag_info(DLIMP_Dynamic_Module *dyn_module,
+                                         int32_t i)
+{
+   switch (dyn_module->dyntab[i].d_tag)
+   {
+      /*---------------------------------------------------------------------*/
+      /* DT_C6000_DSBT_BASE: the tag value is the virtual address of the     */
+      /* .dsbt section.  The update helper finds the program header entry    */
+      /* for that section and stores the section's run address in the tag.   */
+      /*---------------------------------------------------------------------*/
+      case DT_C6000_DSBT_BASE:
+         return DLIMP_update_dyntag_section_address(dyn_module, i);
+
+      /*---------------------------------------------------------------------*/
+      /* Tags that do not point to sections need no update.                  */
+      /*---------------------------------------------------------------------*/
+      case DT_C6000_GSYM_OFFSET:
+      case DT_C6000_GSTR_OFFSET:
+      case DT_C6000_DSBT_INDEX:
+      case DT_C6000_DSBT_SIZE:
+         return TRUE;
+
+      /*---------------------------------------------------------------------*/
+      /* Anything else is not a tag this target knows how to relocate.       */
+      /*---------------------------------------------------------------------*/
+      default:
+         DLIF_error(DLET_MISC, "Invalid dynamic tag encountered, %d\n",
+                    (int)dyn_module->dyntab[i].d_tag);
+         return FALSE;
+   }
+}
+
+/*****************************************************************************/
+/* DLDYN_c60_process_eiosabi() */
+/* */
+/* Process the EI_OSABI value. Verify that the OSABI is supported and set */
+/* any variables which depend on the OSABI. */
+/*****************************************************************************/
+BOOL DLDYN_c60_process_eiosabi(DLIMP_Dynamic_Module* dyn_module)
+{
+   uint8_t file_osabi = dyn_module->fhdr.e_ident[EI_OSABI];
+
+   /*------------------------------------------------------------------------*/
+   /* Static executables should have an OSABI of NONE.                       */
+   /*------------------------------------------------------------------------*/
+   if (!dyn_module->relocatable)
+      return (file_osabi == ELFOSABI_NONE) ? TRUE : FALSE;
+
+   /*------------------------------------------------------------------------*/
+   /* Relocatable modules: only ELFOSABI_C6000_ELFABI (C6x Baremetal ABI) is */
+   /* accepted.  ELFOSABI_C6000_LINUX (C6x Linux ABI) is presently           */
+   /* unsupported and is therefore rejected like any other value.            */
+   /*------------------------------------------------------------------------*/
+   return (file_osabi == ELFOSABI_C6000_ELFABI) ? TRUE : FALSE;
+}
+
+#endif
diff --git a/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.h b/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.h
new file mode 100644
index 0000000..da99604
--- /dev/null
+++ b/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.h
@@ -0,0 +1,53 @@
+/*
+* c60_dynamic.h
+*
+* Interface into C6x-specific dynamic loader functionality
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef DLOAD_C60_H
+#define DLOAD_C60_H
+
+#include "dload.h"
+
+/*---------------------------------------------------------------------------*/
+/* Process the C6x-specific dynamic tag at index 'i' of the module's dynamic */
+/* table (dyn_module->dyntab). Reports "Invalid dynamic tag encountered" and */
+/* returns FALSE for a tag it does not handle. */
+/*---------------------------------------------------------------------------*/
+BOOL DLDYN_c60_process_dynamic_tag(DLIMP_Dynamic_Module* dyn_module, int i);
+
+/*---------------------------------------------------------------------------*/
+/* Return TRUE if the module's EI_OSABI value is supported: */
+/* ELFOSABI_C6000_ELFABI for relocatable modules, ELFOSABI_NONE otherwise. */
+/*---------------------------------------------------------------------------*/
+BOOL DLDYN_c60_process_eiosabi(DLIMP_Dynamic_Module* dyn_module);
+
+/*---------------------------------------------------------------------------*/
+/* NOTE(review): the definition is not visible next to this header; from the */
+/* name it presumably relocates the value held by dynamic tag 'i' -- confirm */
+/* against the implementation. Note the index is int32_t here but plain int */
+/* in DLDYN_c60_process_dynamic_tag(). */
+/*---------------------------------------------------------------------------*/
+BOOL DLDYN_c60_relocate_dynamic_tag_info(DLIMP_Dynamic_Module *dyn_module, int32_t i);
+
+/*---------------------------------------------------------------------------*/
+/* Target data-model constants. */
+/* NOTE(review): units are not stated here; presumably bit widths of the */
+/* target's int, char and pointer types, with MEM_INC the number of bits */
+/* per addressable memory unit -- confirm against the users of these macros. */
+/*---------------------------------------------------------------------------*/
+#define T_INTSZ 32
+#define T_CHARSZ 8
+#define MEM_INC 8
+#define PTR_SZ 32
+
+#endif
diff --git a/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_elf32.h b/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_elf32.h
new file mode 100644
index 0000000..418db17
--- /dev/null
+++ b/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_elf32.h
@@ -0,0 +1,160 @@
+/*
+* c60_elf32.h
+*
+* C6x-specific data structures for 32-bit ELF object format files.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef C60_ELF32_H
+#define C60_ELF32_H
+
+#include "elf32.h"
+
+/*---------------------------------------------------------------------------*/
+/* C6x specific EI_OSABI values */
+/*---------------------------------------------------------------------------*/
+/* Values compared against e_ident[EI_OSABI] by DLDYN_c60_process_eiosabi(). */
+/* Only ELFOSABI_C6000_ELFABI is accepted there (for relocatable modules); */
+/* the ELFOSABI_C6000_LINUX check is compiled out as presently unsupported. */
+enum
+{
+ ELFOSABI_C6000_ELFABI = 64, /* C6X Baremetal OSABI */
+ ELFOSABI_C6000_LINUX = 65 /* C6X Linux OSABI */
+};
+
+/*---------------------------------------------------------------------------*/
+/* File Header Flags (value of "e_flags") */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ EF_C6000_REL = 0x01 /* Contains static relocations. An ET_EXEC or */
+ /* ET_DYN file with this flag set can be */
+ /* treated as ET_REL during static linking. */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Segment Types (value of "p_type") */
+/*---------------------------------------------------------------------------*/
+/* NOTE(review): 0x70000000 is presumably the first value of the */
+/* processor-specific p_type range -- confirm against the ELF gABI. */
+enum
+{
+ PT_C6000_PHATTRS = 0x70000000 /* Extended Program Header Attributes */
+};
+
+/*---------------------------------------------------------------------------*/
+/* C6x specific section types */
+/*---------------------------------------------------------------------------*/
+enum
+{
+
+ /*------------------------------------------------------------------------*/
+ /* Section types defined by the C6x ELFABI. */
+ /* Note: ABI defined section types should be named SHT_C6000_xxx */
+ /*------------------------------------------------------------------------*/
+ SHT_C6000_UNWIND = 0x70000001, /* Exception Index Table */
+ SHT_C6000_PREEMPTMAP = 0x70000002, /* Pre-emption Map */
+
+ SHT_C6000_ATTRIBUTES = 0x70000003, /* Obj File Compatibility Attributes */
+
+ /*------------------------------------------------------------------------*/
+ /* The following section types are not part of the C6x ABI. As per the */
+ /* ABI, the processor specific values not defined in the ABI are reserved */
+ /* for future use. Here we reserve the range 0x7F000000 through */
+ /* 0x7FFFFFFF for the TI specific processor section types. */
+ /* Note: TI specific section types should be named SHT_TI_xxx */
+ /*------------------------------------------------------------------------*/
+ SHT_TI_ICODE = 0x7F000000, /* ICODE representation */
+ SHT_TI_XREF = 0x7F000001, /* Symbol cross reference */
+ SHT_TI_HANDLER = 0x7F000002, /* Handler function table */
+ SHT_TI_INITINFO = 0x7F000003, /* Info for C auto-init of variables */
+ SHT_TI_PHATTRS = 0x7F000004 /* Extended Program Header Attributes */
+};
+
+/*****************************************************************************/
+/* C6x-Specific Dynamic Array Tags (C6x ELF ABI Section ??? - AEGUPD) */
+/* NOTE: */
+/* As per the gABI, a tag whose value is an even number indicates a dynamic */
+/* tag that uses d_ptr. An odd number indicates a tag that uses d_val, or */
+/* one that uses neither d_val nor d_ptr. */
+/*****************************************************************************/
+/* Each tag's trailing comment names the d_un member it uses; the values */
+/* follow the even (d_ptr) / odd (d_val) convention. */
+enum
+{
+ /*------------------------------------------------------------------------*/
+ /* OSABI specific tags: */
+ /* From 0x6000000D thru 0x6FFFF000 */
+ /*------------------------------------------------------------------------*/
+ DT_C6000_GSYM_OFFSET = 0x6000000D, /* d_val -- OSABI Specific -- */
+ DT_C6000_GSTR_OFFSET = 0x6000000F, /* d_val -- OSABI Specific -- */
+
+ /*------------------------------------------------------------------------*/
+ /* Processor specific tags: */
+ /* From 0x70000000 thru 0x7FFFFFFF */
+ /*------------------------------------------------------------------------*/
+ /* DT_C6000_DSBT_BASE is resolved by DLDYN_c60_process_dynamic_tag(), */
+ /* which updates the tag with the run address of the .dsbt section. */
+ /*------------------------------------------------------------------------*/
+ DT_C6000_DSBT_BASE = 0x70000000, /* d_ptr -- Platform Specific -- */
+ DT_C6000_DSBT_SIZE = 0x70000001, /* d_val -- Platform Specific -- */
+ DT_C6000_PREEMPTMAP = 0x70000002, /* d_ptr -- Platform Specific -- */
+ DT_C6000_DSBT_INDEX = 0x70000003 /* d_val -- Platform Specific -- */
+};
+
+/*---------------------------------------------------------------------------*/
+/* C6x Dynamic Relocation Types */
+/*---------------------------------------------------------------------------*/
+/* Relocation type numbers; the values are fixed and must not be renumbered. */
+/* R_C6000_COPY is documented in c60_reloc.c as not yet supported. */
+/* NOTE(review): the R_C6000_SBR_GOT_* types have no case in the relocation */
+/* helpers visible in c60_reloc.c (write_reloc_r, pack_result, mask_result, */
+/* rel_overflow, reloc_do) -- confirm whether they are handled elsewhere. */
+typedef enum
+{
+ R_C6000_NONE = 0,
+ R_C6000_ABS32 = 1,
+ R_C6000_ABS16 = 2,
+ R_C6000_ABS8 = 3,
+ R_C6000_PCR_S21 = 4,
+ R_C6000_PCR_S12 = 5,
+ R_C6000_PCR_S10 = 6,
+ R_C6000_PCR_S7 = 7,
+ R_C6000_ABS_S16 = 8,
+ R_C6000_ABS_L16 = 9,
+ R_C6000_ABS_H16 = 10,
+ R_C6000_SBR_U15_B = 11,
+ R_C6000_SBR_U15_H = 12,
+ R_C6000_SBR_U15_W = 13,
+ R_C6000_SBR_S16 = 14,
+ R_C6000_SBR_L16_B = 15,
+ R_C6000_SBR_L16_H = 16,
+ R_C6000_SBR_L16_W = 17,
+ R_C6000_SBR_H16_B = 18,
+ R_C6000_SBR_H16_H = 19,
+ R_C6000_SBR_H16_W = 20,
+ R_C6000_SBR_GOT_U15_W = 21,
+ R_C6000_SBR_GOT_L16_W = 22,
+ R_C6000_SBR_GOT_H16_W = 23,
+ R_C6000_DSBT_INDEX = 24,
+ R_C6000_PREL31 = 25,
+ R_C6000_COPY = 26
+}C60_RELOC_TYPE;
+
+#endif /* C60_ELF32_H */
diff --git a/src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.c b/src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.c
new file mode 100644
index 0000000..3c79e35
--- /dev/null
+++ b/src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.c
@@ -0,0 +1,1101 @@
+/*
+* c60_reloc.c
+*
+* Process C6x-specific dynamic relocations for core dynamic loader.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#include <limits.h>
+#include "relocate.h"
+#include "symtab.h"
+#include "c60_elf32.h"
+#include "dload_api.h"
+#include "util.h"
+#include "dload_endian.h"
+#include "c60_reloc.h"
+
+/*---------------------------------------------------------------------------*/
+/* MASK(n,s) - Bit mask made of 'n' consecutive one-bits whose least         */
+/*             significant bit sits at bit position 's'.                     */
+/*                                                                           */
+/*    Requires 0 <= n < 32 and n + s <= 32.  The mask is computed in         */
+/*    unsigned arithmetic so that wide masks such as MASK(31,0) do not       */
+/*    shift into the sign bit, and both arguments are parenthesized so that  */
+/*    expression arguments expand as expected.                               */
+/*---------------------------------------------------------------------------*/
+#define MASK(n,s) (((1u << (n)) - 1u) << (s))
+
+/*---------------------------------------------------------------------------*/
+/* C6x Relocations Supported */
+/* */
+/* See the C6000 ELF ABI Specification for more details. */
+/* */
+/* R_C6000_ABS32 | .field X,32 */
+/* R_C6000_ABS16 | .field X,16 */
+/* R_C6000_ABS8 | .field X,8 */
+/* R_C6000_PCR_S21 | B foo */
+/* CALLP foo, B3 */
+/* R_C6000_PCR_S12 | BNOP foo */
+/* R_C6000_PCR_S10 | BPOS foo, A10 */
+/* BDEC foo, A1 */
+/* R_C6000_PCR_S7 | ADDKPC foo, B3, 4 */
+/* R_C6000_ABS_S16 | MVK sym, A0 */
+/* R_C6000_ABS_L16 | MVKL sym, A0 */
+/* MVKLH sym, A0 */
+/* R_C6000_ABS_H16 | MVKH sym, A0 */
+/* R_C6000_SBR_U15_B | LDB *+B14(sym), A1 */
+/* ADDAB B14, sym, A1 */
+/* R_C6000_SBR_U15_H | LDH *+B14(sym), A1 */
+/* ADDAH B14, sym, A1 */
+/* R_C6000_SBR_U15_W | LDW *+B14(sym), A1 */
+/* ADDAW B14, sym, A1 */
+/* R_C6000_SBR_S16 | MVK sym-$bss, A0 */
+/* R_C6000_SBR_L16_B | MVKL (sym-$bss), A0 */
+/* R_C6000_SBR_L16_H | MVKL (sym-$bss)/2,A0 */
+/* R_C6000_SBR_L16_W | MVKL (sym-$bss)/4,A0 */
+/* R_C6000_SBR_H16_B | MVKH (sym-$bss), A0 */
+/* R_C6000_SBR_H16_H | MVKH (sym-$bss)/2,A0 */
+/* R_C6000_SBR_H16_W | MVKH (sym-$bss)/4,A0 */
+/* R_C6000_SBR_GOT_U15_W | LDW *+B14[GOT(sym)],A0 */
+/* R_C6000_SBR_GOT_L16_W | MVKL $DPR_GOT(sym), A0 */
+/* R_C6000_SBR_GOT_H16_W | MVKH $DPR_GOT(sym), A0 */
+/* R_C6000_DSBT_INDEX | LDW *+B14[$DSBT_index()], DP */
+/* */
+/*---------------------------------------------------------------------------*/
+
+/*****************************************************************************/
+/* WRITE_RELOC_R() - Perform a relocation into a buffered segment.           */
+/*                                                                           */
+/*    buffered_segment - host copy of the segment's data                     */
+/*    segment_offset   - byte offset of the relocation field in that copy    */
+/*    r_type           - relocation type (a C60_RELOC_TYPE value)            */
+/*    r                - relocation result; it is OR'ed into the field       */
+/*                       without further masking, so it must already be      */
+/*                       packed and masked to the field's width              */
+/*                                                                           */
+/*    For instruction relocations only the bits of the relocation field are  */
+/*    replaced; the other bits of the 32-bit word are preserved.  An         */
+/*    unsupported relocation type is reported through DLIF_error() and       */
+/*    nothing is written.                                                    */
+/*****************************************************************************/
+static void write_reloc_r(uint8_t* buffered_segment,
+                          uint32_t segment_offset,
+                          int r_type, uint32_t r)
+{
+   uint32_t* rel_field_ptr = (uint32_t*)(buffered_segment + segment_offset);
+
+#if LOADER_DEBUG
+   /*------------------------------------------------------------------------*/
+   /* Print some details about the relocation we are about to process.       */
+   /*------------------------------------------------------------------------*/
+   if(debugging_on)
+   {
+      DLIF_trace("RWRT: segment_offset: %d\n", segment_offset);
+      DLIF_trace("RWRT: buffered_segment: 0x%x\n",
+                 (uint32_t)buffered_segment);
+      DLIF_trace("RWRT: rel_field_ptr: 0x%x\n", (uint32_t)rel_field_ptr);
+      DLIF_trace("RWRT: result: 0x%x\n", r);
+   }
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* Given the relocation type, carry out relocation into a 4 byte packet   */
+   /* within the buffered segment.                                           */
+   /*------------------------------------------------------------------------*/
+   switch(r_type)
+   {
+      /*---------------------------------------------------------------------*/
+      /* Data fields: the whole 32-, 16- or 8-bit field is overwritten.      */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_ABS32:
+         *rel_field_ptr = r;
+         break;
+      case R_C6000_ABS16:
+         *((uint16_t*)(buffered_segment + segment_offset)) = r;
+         break;
+      case R_C6000_ABS8:
+         *((uint8_t*)(buffered_segment + segment_offset)) = r;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* 31-bit place-relative field in bits 0-30; bit 31 is preserved.      */
+      /* All 31 field bits are cleared first, matching the 0x7FFFFFFF mask   */
+      /* that mask_result() applies to the value for this type, so no stale  */
+      /* bit of the previous field content can leak into the result.         */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_PREL31:
+         *rel_field_ptr = (*rel_field_ptr & ~0x7FFFFFFFu) | r;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* PC-relative branch displacement fields.                             */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_PCR_S21:
+         *rel_field_ptr = (*rel_field_ptr & ~MASK(21,7)) | (r << 7);
+         break;
+      case R_C6000_PCR_S12:
+         *rel_field_ptr = (*rel_field_ptr & ~MASK(12,16)) | (r << 16);
+         break;
+      case R_C6000_PCR_S10:
+         *rel_field_ptr = (*rel_field_ptr & ~MASK(10,13)) | (r << 13);
+         break;
+      case R_C6000_PCR_S7:
+         *rel_field_ptr = (*rel_field_ptr & ~MASK(7,16)) | (r << 16);
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* 16-bit constant field in bits 7-22.                                 */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_ABS_S16:
+      case R_C6000_ABS_L16:
+      case R_C6000_ABS_H16:
+      case R_C6000_SBR_S16:
+      case R_C6000_SBR_L16_B:
+      case R_C6000_SBR_L16_H:
+      case R_C6000_SBR_L16_W:
+      case R_C6000_SBR_H16_B:
+      case R_C6000_SBR_H16_H:
+      case R_C6000_SBR_H16_W:
+         *rel_field_ptr = (*rel_field_ptr & ~MASK(16,7)) | (r << 7);
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* 15-bit unsigned offset field in bits 8-22.                          */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_SBR_U15_B:
+      case R_C6000_SBR_U15_H:
+      case R_C6000_SBR_U15_W:
+      case R_C6000_DSBT_INDEX:
+         *rel_field_ptr = (*rel_field_ptr & ~MASK(15,8)) | (r << 8);
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* Linux "import-as-own" copy relocations are not yet supported.       */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_COPY:
+
+      default:
+         DLIF_error(DLET_RELOC,
+                    "write_reloc_r called with invalid relocation type!\n");
+   }
+
+#if LOADER_DEBUG
+   if (debugging_on)
+      DLIF_trace("reloc_field 0x%x\n", *rel_field_ptr);
+#endif
+}
+
+/*****************************************************************************/
+/* PACK_RESULT() - Scale a computed relocation value down to the units in    */
+/*                 which the relocation field stores it.                     */
+/*                                                                           */
+/*    The value is right-shifted by an amount selected by the relocation     */
+/*    type (0, 1, 2, 16, 17 or 18 bits).  An unsupported type is reported    */
+/*    through DLIF_error() and 0 is returned.                                */
+/*****************************************************************************/
+static int32_t pack_result(int32_t unpacked_result, int r_type)
+{
+   int scale_shift;
+
+   switch(r_type)
+   {
+      /*---------------------------------------------------------------------*/
+      /* Stored as is.                                                       */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_ABS32:
+      case R_C6000_ABS16:
+      case R_C6000_ABS8:
+      case R_C6000_ABS_S16:
+      case R_C6000_ABS_L16:
+      case R_C6000_SBR_U15_B:
+      case R_C6000_SBR_S16:
+      case R_C6000_SBR_L16_B:
+         scale_shift = 0;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* Stored divided by 2.                                                */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_SBR_U15_H:
+      case R_C6000_SBR_L16_H:
+      case R_C6000_PREL31:
+         scale_shift = 1;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* Stored divided by 4.                                                */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_PCR_S21:
+      case R_C6000_PCR_S12:
+      case R_C6000_PCR_S10:
+      case R_C6000_PCR_S7:
+      case R_C6000_SBR_U15_W:
+      case R_C6000_SBR_L16_W:
+      case R_C6000_DSBT_INDEX:
+         scale_shift = 2;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* Upper 16 bits of the byte, halfword or word scaled value.           */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_ABS_H16:
+      case R_C6000_SBR_H16_B:
+         scale_shift = 16;
+         break;
+
+      case R_C6000_SBR_H16_H:
+         scale_shift = 17;
+         break;
+
+      case R_C6000_SBR_H16_W:
+         scale_shift = 18;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* Linux "import-as-own" copy relocations are not yet supported.       */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_COPY:
+
+      default:
+         DLIF_error(DLET_RELOC,
+                    "pack_result called with invalid relocation type!\n");
+         return 0;
+   }
+
+   return unpacked_result >> scale_shift;
+}
+
+/*****************************************************************************/
+/* MASK_RESULT() - Strip a packed relocation value down to the number of     */
+/*                 bits held by the relocation type's field.                 */
+/*                                                                           */
+/*    Returns the value ANDed with the field mask for r_type (unchanged for  */
+/*    the full-width R_C6000_ABS32).  An unsupported type is reported        */
+/*    through DLIF_error() and 0 is returned.                                */
+/*****************************************************************************/
+static int32_t mask_result(int32_t unmasked_result, int r_type)
+{
+   int32_t field_mask;
+
+   switch(r_type)
+   {
+      /*---------------------------------------------------------------------*/
+      /* Full 32-bit field: nothing to strip.                                */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_ABS32:
+         return unmasked_result;
+
+      case R_C6000_PREL31:
+         field_mask = 0x7FFFFFFF;
+         break;
+
+      case R_C6000_PCR_S21:
+         field_mask = 0x1FFFFF;
+         break;
+
+      case R_C6000_ABS16:
+      case R_C6000_ABS_S16:
+      case R_C6000_ABS_L16:
+      case R_C6000_ABS_H16:
+      case R_C6000_SBR_S16:
+      case R_C6000_SBR_L16_B:
+      case R_C6000_SBR_L16_H:
+      case R_C6000_SBR_L16_W:
+      case R_C6000_SBR_H16_B:
+      case R_C6000_SBR_H16_H:
+      case R_C6000_SBR_H16_W:
+         field_mask = 0xFFFF;
+         break;
+
+      case R_C6000_SBR_U15_B:
+      case R_C6000_SBR_U15_H:
+      case R_C6000_SBR_U15_W:
+      case R_C6000_DSBT_INDEX:
+         field_mask = 0x7FFF;
+         break;
+
+      case R_C6000_PCR_S12:
+         field_mask = 0xFFF;
+         break;
+
+      case R_C6000_PCR_S10:
+         field_mask = 0x3FF;
+         break;
+
+      case R_C6000_ABS8:
+         field_mask = 0xFF;
+         break;
+
+      case R_C6000_PCR_S7:
+         field_mask = 0x7F;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* Linux "import-as-own" copy relocations are not yet supported.       */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_COPY:
+
+      default:
+         DLIF_error(DLET_RELOC,
+                    "mask_result called with invalid relocation type!\n");
+         return 0;
+   }
+
+   return unmasked_result & field_mask;
+}
+
+/*****************************************************************************/
+/* REL_OVERFLOW()                                                            */
+/*                                                                           */
+/*    Report whether an unpacked relocation value lies outside the range     */
+/*    allowed for the given relocation type.  Returns non-zero on overflow   */
+/*    and zero when the value is acceptable (or the type is not checked).    */
+/*    An unexpected relocation type is reported through DLIF_error() and     */
+/*    treated as an overflow.                                                */
+/*                                                                           */
+/*****************************************************************************/
+static BOOL rel_overflow(C60_RELOC_TYPE r_type, int32_t reloc_value)
+{
+   /*------------------------------------------------------------------------*/
+   /* Accepted values satisfy  range_min <= reloc_value < range_end.         */
+   /*------------------------------------------------------------------------*/
+   int32_t range_min;
+   int32_t range_end;
+
+   switch(r_type)
+   {
+      /*---------------------------------------------------------------------*/
+      /* Data fields accept both the signed and the unsigned interpretation. */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_ABS16:
+         range_min = -32768;    range_end = 65536;    break;
+      case R_C6000_ABS8:
+         range_min = -128;      range_end = 256;      break;
+
+      /*---------------------------------------------------------------------*/
+      /* Signed fields.                                                      */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_PCR_S21:
+         range_min = -0x400000; range_end = 0x400000; break;
+      case R_C6000_PCR_S12:
+         range_min = -0x2000;   range_end = 0x2000;   break;
+      case R_C6000_PCR_S10:
+         range_min = -0x800;    range_end = 0x800;    break;
+      case R_C6000_PCR_S7:
+         range_min = -0x100;    range_end = 0x100;    break;
+      case R_C6000_SBR_S16:
+      case R_C6000_ABS_S16:
+         range_min = -0x8000;   range_end = 0x8000;   break;
+
+      /*---------------------------------------------------------------------*/
+      /* Unsigned fields: any negative value is out of range.                */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_SBR_U15_B:
+         range_min = 0;         range_end = 0x8000;   break;
+      case R_C6000_SBR_U15_H:
+         range_min = 0;         range_end = 0xFFFF;   break;
+      case R_C6000_DSBT_INDEX:
+      case R_C6000_SBR_U15_W:
+         range_min = 0;         range_end = 0x1FFFD;  break;
+
+      /*---------------------------------------------------------------------*/
+      /* Some relocation types suppress overflow checking at link-time.      */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_ABS_L16:
+      case R_C6000_ABS_H16:
+      case R_C6000_SBR_L16_B:
+      case R_C6000_SBR_L16_H:
+      case R_C6000_SBR_L16_W:
+      case R_C6000_SBR_H16_B:
+      case R_C6000_SBR_H16_H:
+      case R_C6000_SBR_H16_W:
+      /*---------------------------------------------------------------------*/
+      /* 32-bit relocation field values are not checked for overflow.        */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_ABS32:
+      case R_C6000_PREL31:
+         return 0;
+
+      /*---------------------------------------------------------------------*/
+      /* Any other relocation type is not expected here.                     */
+      /*---------------------------------------------------------------------*/
+      default:
+         DLIF_error(DLET_RELOC,
+                    "rel_overflow called with invalid relocation type!\n");
+         return 1;
+   }
+
+   return (reloc_value < range_min) || (reloc_value >= range_end);
+}
+
+/*---------------------------------------------------------------------------*/
+/* Debug/profile counters defined outside this file. reloc_do() increments */
+/* DLREL_relocations once per relocation and adds the clock() ticks spent */
+/* on each relocation to DLREL_total_reloc_time. */
+/* NOTE(review): the total is declared time_t although it accumulates */
+/* clock() differences -- confirm the intended unit where it is reported. */
+/*---------------------------------------------------------------------------*/
+#if LOADER_DEBUG || LOADER_PROFILE
+extern int DLREL_relocations;
+extern time_t DLREL_total_reloc_time;
+#endif
+
+/*****************************************************************************/
+/* RELOC_DO() - Process a single relocation entry.                           */
+/*                                                                           */
+/*    r_type         - relocation type                                       */
+/*    segment_vaddr  - address of the segment; added to spc to form the      */
+/*                     address of the place being relocated                  */
+/*    segment_buffer - host copy of the segment's data                       */
+/*    addend         - relocation addend                                     */
+/*    symval         - value of the referenced symbol                        */
+/*    spc            - byte offset of the relocation field in the segment    */
+/*    wrong_endian   - non-zero if the field must be byte-swapped around     */
+/*                     the update                                            */
+/*    base_pointer   - static base used by the SBR and DSBT_INDEX types      */
+/*    dsbt_index     - DSBT index used by R_C6000_DSBT_INDEX                 */
+/*                                                                           */
+/*    The relocation value is computed, range checked, packed, masked and    */
+/*    written into the field.  An out of range value is reported through     */
+/*    DLIF_error() but the (truncated) value is still written.  An           */
+/*    unsupported relocation type is reported once and the segment data is   */
+/*    left untouched.                                                        */
+/*****************************************************************************/
+static void reloc_do(C60_RELOC_TYPE r_type,
+                     uint32_t segment_vaddr,
+                     uint8_t *segment_buffer,
+                     uint32_t addend,
+                     uint32_t symval,
+                     uint32_t spc,
+                     int      wrong_endian,
+                     uint32_t base_pointer,
+                     int32_t  dsbt_index)
+{
+   int32_t reloc_value = 0;
+
+#if LOADER_DEBUG || LOADER_PROFILE
+   /*------------------------------------------------------------------------*/
+   /* In debug mode, keep a count of the number of relocations processed.    */
+   /* In profile mode, start the clock on a given relocation.  The start     */
+   /* time is kept as a clock_t so the value returned by clock() is not      */
+   /* truncated.                                                             */
+   /*------------------------------------------------------------------------*/
+   clock_t start_time = 0;
+   if (debugging_on || profiling_on)
+   {
+      DLREL_relocations++;
+      if (profiling_on) start_time = clock();
+   }
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* Calculate the relocation value according to the rules associated with  */
+   /* the given relocation type.                                             */
+   /*------------------------------------------------------------------------*/
+   switch(r_type)
+   {
+      /*---------------------------------------------------------------------*/
+      /* Straight-Up Address relocations (address references).               */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_ABS32:
+      case R_C6000_ABS16:
+      case R_C6000_ABS8:
+      case R_C6000_ABS_S16:
+      case R_C6000_ABS_L16:
+      case R_C6000_ABS_H16:
+         reloc_value = symval + addend;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* PC-Relative relocations (calls and branches).                       */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_PCR_S21:
+      case R_C6000_PCR_S12:
+      case R_C6000_PCR_S10:
+      case R_C6000_PCR_S7:
+      {
+         /*------------------------------------------------------------------*/
+         /* Add SPC to segment address to get the PC.  Mask for exec-packet  */
+         /* boundary (the low five address bits are cleared).                */
+         /*------------------------------------------------------------------*/
+         int32_t opnd_p = (spc + segment_vaddr) & 0xffffffe0;
+         reloc_value = symval + addend - opnd_p;
+         break;
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* "Place"-relative relocations (TDEH).                                */
+      /*---------------------------------------------------------------------*/
+      /* These relocations occur in data and refer to a label that occurs    */
+      /* at some signed 32-bit offset from the place where the relocation    */
+      /* occurs.                                                             */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_PREL31:
+      {
+         /*------------------------------------------------------------------*/
+         /* Compute location of relocation entry and subtract it from the    */
+         /* address of the location being referenced (it is computed very    */
+         /* much like a PC-relative relocation, but it occurs in data and    */
+         /* is called a "place"-relative relocation).                        */
+         /*------------------------------------------------------------------*/
+         /* If this is an Elf32_Rel type relocation, then addend is assumed  */
+         /* to have been scaled when it was unpacked (field << 1).           */
+         /*------------------------------------------------------------------*/
+         /* For Elf32_Rela type relocations the addend is assumed to be a    */
+         /* signed 32-bit integer value.                                     */
+         /*------------------------------------------------------------------*/
+         /* Offset is not fetch-packet relative; doesn't need to be masked.  */
+         /*------------------------------------------------------------------*/
+         int32_t opnd_p = (spc + segment_vaddr);
+         reloc_value = symval + addend - opnd_p;
+         break;
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* Static-Base Relative relocations (near-DP).                         */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_SBR_U15_B:
+      case R_C6000_SBR_U15_H:
+      case R_C6000_SBR_U15_W:
+      case R_C6000_SBR_S16:
+      case R_C6000_SBR_L16_B:
+      case R_C6000_SBR_L16_H:
+      case R_C6000_SBR_L16_W:
+      case R_C6000_SBR_H16_B:
+      case R_C6000_SBR_H16_H:
+      case R_C6000_SBR_H16_W:
+         reloc_value = symval + addend - base_pointer;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* R_C6000_DSBT_INDEX - uses value assigned by the dynamic loader to   */
+      /* be the DSBT index for this module as a scaled offset when           */
+      /* referencing the DSBT.  The DSBT base address is in symval and the   */
+      /* static base is in base_pointer.  DP-relative offset to slot in      */
+      /* DSBT is the offset of the DSBT relative to the DP plus the          */
+      /* scaled DSBT index into the DSBT.                                    */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_DSBT_INDEX:
+         reloc_value = ((symval + addend) - base_pointer) + (dsbt_index << 2);
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* Linux "import-as-own" copy relocation: after DSO initialization,    */
+      /* copy the named object from the DSO into the executable's BSS        */
+      /*---------------------------------------------------------------------*/
+      /* Linux "import-as-own" copy relocations are not yet supported.       */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_COPY:
+
+      /*---------------------------------------------------------------------*/
+      /* Unrecognized relocation type.  Stop here: none of the later steps   */
+      /* can do anything with this type except report the same problem       */
+      /* again (including a misleading "relocation overflow!"), and nothing  */
+      /* would be written to the segment.                                    */
+      /*---------------------------------------------------------------------*/
+      default:
+         DLIF_error(DLET_RELOC,
+                    "reloc_do called with invalid relocation type!\n");
+         return;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Overflow checking.  Is relocation value out of range for the size and  */
+   /* type of the current relocation?                                        */
+   /*------------------------------------------------------------------------*/
+   if (rel_overflow(r_type, reloc_value))
+      DLIF_error(DLET_RELOC, "relocation overflow!\n");
+
+   /*------------------------------------------------------------------------*/
+   /* Move relocation value to appropriate offset for relocation field's     */
+   /* location.                                                              */
+   /*------------------------------------------------------------------------*/
+   reloc_value = pack_result(reloc_value, r_type);
+
+   /*------------------------------------------------------------------------*/
+   /* Mask packed result to the size of the relocation field.                */
+   /*------------------------------------------------------------------------*/
+   reloc_value = mask_result(reloc_value, r_type);
+
+   /*------------------------------------------------------------------------*/
+   /* If necessary, Swap endianness of data at relocation address.           */
+   /* NOTE(review): the swap is always 32 bits wide, even for R_C6000_ABS16  */
+   /* and R_C6000_ABS8 whose fields write_reloc_r() stores as 2 and 1 bytes; */
+   /* confirm this is correct for data relocations in a wrong-endian module. */
+   /*------------------------------------------------------------------------*/
+   if (wrong_endian)
+      DLIMP_change_endian32((int32_t*)(segment_buffer + spc));
+
+   /*------------------------------------------------------------------------*/
+   /* Write the relocated 4-byte packet back to the segment buffer.          */
+   /*------------------------------------------------------------------------*/
+   write_reloc_r(segment_buffer, spc, r_type, reloc_value);
+
+   /*------------------------------------------------------------------------*/
+   /* Change endianness of segment address back to original.                 */
+   /*------------------------------------------------------------------------*/
+   if (wrong_endian)
+      DLIMP_change_endian32((int32_t*)(segment_buffer + spc));
+
+#if LOADER_DEBUG || LOADER_PROFILE
+   /*------------------------------------------------------------------------*/
+   /* In profile mode, add elapsed time for this relocation to total time    */
+   /* spent doing relocations.                                               */
+   /*------------------------------------------------------------------------*/
+   if (profiling_on)
+      DLREL_total_reloc_time += (clock() - start_time);
+   if (debugging_on)
+      DLIF_trace("reloc_value = 0x%x\n", reloc_value);
+#endif
+}
+
+/*****************************************************************************/
+/* REL_UNPACK_ADDEND()                                                       */
+/*                                                                           */
+/*    Placeholder for extracting the addend of an Elf32_Rel type relocation  */
+/*    from the relocation field itself.  This is not implemented for C6000:  */
+/*    the addend is set to 0, an internal error is reported and              */
+/*    DLIF_exit(1) is called.  r_type and address are not used.              */
+/*                                                                           */
+/*****************************************************************************/
+static void rel_unpack_addend(C60_RELOC_TYPE r_type,
+                              uint8_t *address,
+                              uint32_t *addend)
+{
+   (void)r_type;
+   (void)address;
+
+   /*------------------------------------------------------------------------*/
+   /* Leave a defined value in the caller's addend before giving up.         */
+   /*------------------------------------------------------------------------*/
+   *addend = 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Elf32_Rel type relocations are not supported by the C6000 part of the  */
+   /* dynamic loader core.  Unpacking is necessarily target-specific, so     */
+   /* the abort happens here rather than in the generic relocation code.     */
+   /*------------------------------------------------------------------------*/
+   DLIF_error(DLET_RELOC,
+              "Internal Error: unpacking addend values from the "
+              "relocation field is not supported in the C6000 "
+              "dynamic loader at this time; aborting\n");
+   DLIF_exit(1);
+}
+
+/*****************************************************************************/
+/* REL_SWAP_ENDIAN()                                                         */
+/*                                                                           */
+/*    Tell the caller whether relocation fields of this module must be       */
+/*    byte-swapped: TRUE when the module is flagged wrong_endian, FALSE      */
+/*    otherwise.  The relocation type does not influence the answer.         */
+/*                                                                           */
+/*****************************************************************************/
+static BOOL rel_swap_endian(DLIMP_Dynamic_Module *dyn_module,
+                            C60_RELOC_TYPE r_type)
+{
+   return (dyn_module->wrong_endian) ? TRUE : FALSE;
+}
+
+/*****************************************************************************/
+/* REL_CHANGE_ENDIAN()                                                       */
+/*                                                                           */
+/*    Byte-swap the 32-bit word located at 'address' in the segment's data.  */
+/*    The relocation type is ignored: the swap is always 32 bits wide.       */
+/*                                                                           */
+/*****************************************************************************/
+static void rel_change_endian(C60_RELOC_TYPE r_type, uint8_t *address)
+{
+   /*------------------------------------------------------------------------*/
+   /* On C6000, all instructions are 32-bits wide.                           */
+   /*------------------------------------------------------------------------*/
+   int32_t *word_to_swap = (int32_t *)address;
+
+   DLIMP_change_endian32(word_to_swap);
+}
+
+/*****************************************************************************/
+/* READ_REL_TABLE()                                                          */
+/*                                                                           */
+/*    Read in an Elf32_Rel type relocation table.  This function allocates   */
+/*    host memory for the table.                                             */
+/*                                                                           */
+/*    rel_table    - receives the address of the allocated table, or NULL    */
+/*                   when the table is empty or could not be read            */
+/*    table_offset - file offset of the table                                */
+/*    relnum       - number of entries in the table                          */
+/*    relent       - size in bytes of one entry                              */
+/*    fd           - file to read the table from                             */
+/*    wrong_endian - if set, every entry is byte-swapped after reading       */
+/*                                                                           */
+/*    A table whose size does not fit in 32 bits, a zero entry size, or an   */
+/*    allocation failure is reported through DLIF_error() and leaves         */
+/*    *rel_table set to NULL.                                                */
+/*    NOTE(review): callers are not visible here; confirm they tolerate a    */
+/*    NULL table together with a non-zero relnum.                            */
+/*                                                                           */
+/*****************************************************************************/
+static void read_rel_table(struct Elf32_Rel **rel_table,
+                           int32_t table_offset,
+                           uint32_t relnum, uint32_t relent,
+                           LOADER_FILE_DESC *fd, BOOL wrong_endian)
+{
+   uint32_t i;
+
+   *rel_table = NULL;
+
+   if (relnum == 0) return;
+
+   /*------------------------------------------------------------------------*/
+   /* Reject sizes that would wrap the 32-bit byte count below; the buffer   */
+   /* would otherwise be smaller than the number of entries processed.       */
+   /*------------------------------------------------------------------------*/
+   if (relent == 0 || relnum > 0xFFFFFFFFu / relent)
+   {
+      DLIF_error(DLET_RELOC,
+                 "Invalid relocation table size; cannot read table!\n");
+      return;
+   }
+
+   *rel_table = (struct Elf32_Rel *)DLIF_malloc(relnum * relent);
+   if (*rel_table == NULL)
+   {
+      DLIF_error(DLET_RELOC,
+                 "Could not allocate memory for relocation table!\n");
+      return;
+   }
+
+   DLIF_fseek(fd, table_offset, LOADER_SEEK_SET);
+   DLIF_fread(*rel_table, relnum, relent, fd);
+
+   /*------------------------------------------------------------------------*/
+   /* Convert each entry to host byte order if the file's order differs.     */
+   /*------------------------------------------------------------------------*/
+   if (wrong_endian)
+   {
+      for (i = 0; i < relnum; i++)
+         DLIMP_change_rel_endian(*rel_table + i);
+   }
+}
+
+/*****************************************************************************/
+/* PROCESS_REL_TABLE() */
+/* */
+/* Process table of Elf32_Rel type relocations. */
+/* */
+/*****************************************************************************/
+static void process_rel_table(DLOAD_HANDLE handle,
+                              DLIMP_Loaded_Segment* seg,
+                              struct Elf32_Rel *rel_table,
+                              uint32_t relnum,
+                              int32_t *start_relidx,
+                              uint32_t ti_static_base,
+                              DLIMP_Dynamic_Module* dyn_module)
+{
+   Elf32_Addr seg_lo = seg->input_vaddr;
+   Elf32_Addr seg_hi = seg_lo + seg->phdr.p_memsz;
+   BOOL       in_run = FALSE;
+   int32_t    idx    = *start_relidx;
+
+   /*------------------------------------------------------------------------*/
+   /* A start index beyond the table restarts the scan at entry 0.  Note     */
+   /* that *start_relidx is only read here; it is never written back.        */
+   /*------------------------------------------------------------------------*/
+   if (idx >= relnum) idx = 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Walk the table.  Entries outside [seg_lo, seg_hi) are skipped until    */
+   /* the first hit; after that, the first miss ends the scan.               */
+   /*------------------------------------------------------------------------*/
+   for ( ; idx < relnum; idx++)
+   {
+      struct Elf32_Rel *rel = &rel_table[idx];
+      Elf32_Addr        r_symval = 0;
+      C60_RELOC_TYPE    r_type;
+      int32_t           r_symid;
+      uint8_t          *field = NULL;
+      uint32_t          pc = 0;
+      uint32_t          addend = 0;
+      BOOL              swap = FALSE;
+
+      if (rel->r_offset < seg_lo || rel->r_offset >= seg_hi)
+      {
+         if (in_run) break;
+         continue;
+      }
+
+      r_type  = (C60_RELOC_TYPE)ELF32_R_TYPE(rel->r_info);
+      r_symid = ELF32_R_SYM(rel->r_info);
+      in_run  = TRUE;
+
+      /*---------------------------------------------------------------------*/
+      /* Without a symbol definition the relocation is skipped; the lookup   */
+      /* function is the one that reports the error.                         */
+      /*---------------------------------------------------------------------*/
+      if (!DLSYM_canonical_lookup(handle, r_symid, dyn_module, &r_symval))
+         continue;
+
+      /*---------------------------------------------------------------------*/
+      /* REL entries keep the addend in the relocated field itself, so       */
+      /* locate the field in the host copy of the segment, fix its           */
+      /* endianness if required, and unpack the addend from it.              */
+      /*---------------------------------------------------------------------*/
+      pc    = rel->r_offset - seg->input_vaddr;
+      field = (uint8_t *)seg->host_address + pc;
+
+      swap = rel_swap_endian(dyn_module, r_type);
+      if (swap)
+         rel_change_endian(r_type, field);
+
+      rel_unpack_addend(r_type, field, &addend);
+
+      /*---------------------------------------------------------------------*/
+      /* Apply the relocation.                                               */
+      /*---------------------------------------------------------------------*/
+      reloc_do(r_type,
+               seg->phdr.p_vaddr,
+               seg->host_address,
+               addend,
+               r_symval,
+               pc,
+               dyn_module->wrong_endian,
+               ti_static_base,
+               dyn_module->dsbt_index);
+   }
+}
+
+/*****************************************************************************/
+/* READ_RELA_TABLE() */
+/* */
+/* Read in an Elf32_Rela type relocation table. This function allocates */
+/* host memory for the table. */
+/* */
+/*****************************************************************************/
+static void read_rela_table(struct Elf32_Rela **rela_table,
+                            int32_t table_offset,
+                            uint32_t relanum, uint32_t relaent,
+                            LOADER_FILE_DESC *fd, BOOL wrong_endian)
+{
+   uint32_t i;
+
+   /*------------------------------------------------------------------------*/
+   /* Parameters:                                                            */
+   /*   rela_table   - out: receives the host copy of the table, or NULL if  */
+   /*                  the table is empty or could not be allocated.         */
+   /*   table_offset - file offset of the table.                              */
+   /*   relanum      - number of entries; relaent - size of one entry.        */
+   /*   fd           - file to read from.                                     */
+   /*   wrong_endian - if set, each entry is endian-swapped after reading.    */
+   /* The caller owns the returned buffer and must release it.               */
+   /*------------------------------------------------------------------------*/
+   *rela_table = NULL;
+   if (relanum == 0) return;
+
+   /*------------------------------------------------------------------------*/
+   /* Refuse a byte count that does not fit in 32 bits; a wrapped product    */
+   /* would under-allocate the buffer that is indexed by relanum below.      */
+   /*------------------------------------------------------------------------*/
+   if (relaent != 0 && relanum > 0xFFFFFFFFu / relaent)
+   {
+      DLIF_error(DLET_RELOC,
+                 "Relocation table size overflows: %u entries of %u bytes\n",
+                 (unsigned)relanum, (unsigned)relaent);
+      return;
+   }
+
+   *rela_table = (struct Elf32_Rela *)DLIF_malloc(relanum * relaent);
+   if (*rela_table == NULL)
+   {
+      DLIF_error(DLET_RELOC,
+                 "Could not allocate memory for relocation table\n");
+      return;
+   }
+
+   DLIF_fseek(fd, table_offset, LOADER_SEEK_SET);
+   DLIF_fread(*rela_table, relanum, relaent, fd);
+
+   /*------------------------------------------------------------------------*/
+   /* Index is unsigned to match relanum and avoid a signed/unsigned compare.*/
+   /* NOTE(review): entries are stepped by sizeof(struct Elf32_Rela), not by */
+   /* relaent -- confirm the two always agree for supported files.           */
+   /*------------------------------------------------------------------------*/
+   if (wrong_endian)
+   {
+      for (i = 0; i < relanum; i++)
+         DLIMP_change_rela_endian(*rela_table + i);
+   }
+}
+
+/*****************************************************************************/
+/* PROCESS_RELA_TABLE() */
+/* */
+/* Process a table of Elf32_Rela type relocations. */
+/* */
+/*****************************************************************************/
+static void process_rela_table(DLOAD_HANDLE handle,
+                               DLIMP_Loaded_Segment *seg,
+                               struct Elf32_Rela *rela_table,
+                               uint32_t relanum,
+                               int32_t *start_relidx,
+                               uint32_t ti_static_base,
+                               DLIMP_Dynamic_Module *dyn_module)
+{
+   Elf32_Addr seg_start_addr = seg->input_vaddr;
+   Elf32_Addr seg_end_addr   = seg_start_addr + seg->phdr.p_memsz;
+   BOOL       found          = FALSE;
+   int32_t    relidx         = *start_relidx;
+
+   /*-----------------------------------------------------------------------*/
+   /* If the given start reloc index is out of range, then start from       */
+   /* the beginning of the given table.  An index equal to relanum is       */
+   /* already out of range, so the test is ">=" (as in process_rel_table).  */
+   /* Note that *start_relidx is only read; it is never written back.       */
+   /*-----------------------------------------------------------------------*/
+   if (relidx >= relanum) relidx = 0;
+
+   /*-----------------------------------------------------------------------*/
+   /* Spin through RELA relocation table.  Entries outside the segment are  */
+   /* skipped until the first hit; after that, the first miss ends the scan.*/
+   /*-----------------------------------------------------------------------*/
+   for ( ; relidx < relanum; relidx++)
+   {
+      /*-------------------------------------------------------------------*/
+      /* If the relocation offset falls within the segment, process it.    */
+      /*-------------------------------------------------------------------*/
+      if (rela_table[relidx].r_offset >= seg_start_addr &&
+          rela_table[relidx].r_offset <  seg_end_addr)
+      {
+         Elf32_Addr     r_symval = 0;
+         C60_RELOC_TYPE r_type   =
+                   (C60_RELOC_TYPE)ELF32_R_TYPE(rela_table[relidx].r_info);
+         int32_t        r_symid  = ELF32_R_SYM(rela_table[relidx].r_info);
+
+         found = TRUE;
+
+         /*---------------------------------------------------------------*/
+         /* If symbol definition is not found, don't do the relocation.   */
+         /* An error is generated by the lookup function.                 */
+         /*---------------------------------------------------------------*/
+         if (!DLSYM_canonical_lookup(handle, r_symid, dyn_module, &r_symval))
+            continue;
+
+         /*---------------------------------------------------------------*/
+         /* Perform actual relocation.  RELA entries carry their addend   */
+         /* in the table entry, so nothing is unpacked from the field.    */
+         /*---------------------------------------------------------------*/
+         reloc_do(r_type,
+                  seg->phdr.p_vaddr,
+                  seg->host_address,
+                  rela_table[relidx].r_addend,
+                  r_symval,
+                  rela_table[relidx].r_offset - seg->input_vaddr,
+                  dyn_module->wrong_endian,
+                  ti_static_base,
+                  dyn_module->dsbt_index);
+      }
+
+      else if (found)
+         break;
+   }
+}
+
+/*****************************************************************************/
+/* PROCESS_GOT_RELOCS() */
+/* */
+/* Process all GOT relocations. It is possible to have both Elf32_Rel */
+/* and Elf32_Rela type relocations in the same file, so we handle them */
+/* both. */
+/* */
+/*****************************************************************************/
+static void process_got_relocs(DLOAD_HANDLE handle,
+                               struct Elf32_Rel* rel_table, uint32_t relnum,
+                               struct Elf32_Rela* rela_table, uint32_t relanum,
+                               DLIMP_Dynamic_Module* dyn_module)
+{
+   DLIMP_Loaded_Segment *segs =
+      (DLIMP_Loaded_Segment*)(dyn_module->loaded_module->loaded_segments.buf);
+   uint32_t seg_count   = dyn_module->loaded_module->loaded_segments.size;
+   int32_t  rel_start   = 0;
+   int32_t  rela_start  = 0;
+   uint32_t static_base = 0;
+   uint32_t s;
+
+   /*------------------------------------------------------------------------*/
+   /* Look up __TI_STATIC_BASE once; its value is handed to every table      */
+   /* processing call.  A failed lookup is reported and processing continues */
+   /* with a static base of 0.                                               */
+   /*------------------------------------------------------------------------*/
+   if (!DLSYM_lookup_local_symtab("__TI_STATIC_BASE", dyn_module->symtab,
+                                  dyn_module->symnum, &static_base))
+      DLIF_error(DLET_RELOC, "Could not resolve value of __TI_STATIC_BASE\n");
+
+   /*------------------------------------------------------------------------*/
+   /* Visit each loaded segment; RELA entries are applied before REL ones.   */
+   /*------------------------------------------------------------------------*/
+   for (s = 0; s < seg_count; s++)
+   {
+      DLIMP_Loaded_Segment *cur = segs + s;
+
+      /* Segments with no file contents are uninitialized: nothing to patch. */
+      if (cur->phdr.p_filesz == 0) continue;
+
+      if (rela_table != NULL)
+         process_rela_table(handle, cur,
+                            rela_table, relanum, &rela_start,
+                            static_base, dyn_module);
+
+      if (rel_table != NULL)
+         process_rel_table(handle, cur,
+                           rel_table, relnum, &rel_start,
+                           static_base, dyn_module);
+   }
+}
+
+/*****************************************************************************/
+/* PROCESS_PLTGOT_RELOCS() */
+/* */
+/* Process all PLTGOT relocation entries. The PLTGOT relocation table */
+/* can be either Elf32_Rel or Elf32_Rela type. All PLTGOT relocations */
+/* are guaranteed to belong to the same segment. */
+/* */
+/*****************************************************************************/
+static void process_pltgot_relocs(DLOAD_HANDLE handle,
+                                  void* plt_reloc_table,
+                                  int reltype,
+                                  uint32_t pltnum,
+                                  DLIMP_Dynamic_Module* dyn_module)
+{
+   DLIMP_Loaded_Segment *segs =
+      (DLIMP_Loaded_Segment*)(dyn_module->loaded_module->loaded_segments.buf);
+   uint32_t   seg_count   = dyn_module->loaded_module->loaded_segments.size;
+   Elf32_Addr first_offset;
+   int32_t    start_idx   = 0;
+   uint32_t   static_base = 0;
+   uint32_t   s;
+
+   /*------------------------------------------------------------------------*/
+   /* The offset of the first table entry decides which segment owns the     */
+   /* whole table; the entry layout depends on the table type.               */
+   /*------------------------------------------------------------------------*/
+   if (reltype == DT_REL)
+      first_offset = ((struct Elf32_Rel *)plt_reloc_table)->r_offset;
+   else
+      first_offset = ((struct Elf32_Rela *)plt_reloc_table)->r_offset;
+
+   /*------------------------------------------------------------------------*/
+   /* Look up __TI_STATIC_BASE; a failed lookup is reported and processing   */
+   /* continues with a static base of 0.                                     */
+   /*------------------------------------------------------------------------*/
+   if (!DLSYM_lookup_local_symtab("__TI_STATIC_BASE", dyn_module->symtab,
+                                  dyn_module->symnum, &static_base))
+      DLIF_error(DLET_RELOC, "Could not resolve value of __TI_STATIC_BASE\n");
+
+   /*------------------------------------------------------------------------*/
+   /* Find the first initialized segment containing the first relocation,    */
+   /* process the whole table against it, and stop.                          */
+   /*------------------------------------------------------------------------*/
+   for (s = 0; s < seg_count; s++)
+   {
+      DLIMP_Loaded_Segment *cur = segs + s;
+      Elf32_Addr lo = cur->input_vaddr;
+      Elf32_Addr hi = lo + cur->phdr.p_memsz;
+
+      /* Relocations should not occur in uninitialized segments. */
+      if (cur->phdr.p_filesz == 0) continue;
+
+      if (first_offset < lo || first_offset >= hi) continue;
+
+      if (reltype == DT_REL)
+         process_rel_table(handle, cur,
+                           (struct Elf32_Rel *)plt_reloc_table,
+                           pltnum, &start_idx,
+                           static_base, dyn_module);
+      else
+         process_rela_table(handle, cur,
+                            (struct Elf32_Rela *)plt_reloc_table,
+                            pltnum, &start_idx,
+                            static_base, dyn_module);
+
+      return;
+   }
+}
+
+/*****************************************************************************/
+/* RELOCATE() - Perform RELA and REL type relocations for given ELF object */
+/* file that we are in the process of loading and relocating. */
+/*****************************************************************************/
+void DLREL_c60_relocate(DLOAD_HANDLE handle,
+                        LOADER_FILE_DESC *fd, DLIMP_Dynamic_Module *dyn_module)
+{
+   struct Elf32_Dyn  *dyn_nugget     = dyn_module->dyntab;
+   struct Elf32_Rela *rela_table     = NULL;
+   struct Elf32_Rel  *rel_table      = NULL;
+   struct Elf32_Rela *rela_plt_table = NULL;
+   struct Elf32_Rel  *rel_plt_table  = NULL;
+
+   /*------------------------------------------------------------------------*/
+   /* Read the size of the relocation table (DT_RELASZ) and the size per     */
+   /* relocation (DT_RELAENT) from the dynamic segment.  The checks below    */
+   /* treat INT_MAX as "tag not present" (presumably the sentinel returned   */
+   /* by DLIMP_get_first_dyntag -- confirm against its definition).          */
+   /*------------------------------------------------------------------------*/
+   uint32_t relasz  = DLIMP_get_first_dyntag(DT_RELASZ, dyn_nugget);
+   uint32_t relaent = DLIMP_get_first_dyntag(DT_RELAENT, dyn_nugget);
+   uint32_t relanum = 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Read the size of the relocation table (DT_RELSZ) and the size per      */
+   /* relocation (DT_RELENT) from the dynamic segment.                       */
+   /*------------------------------------------------------------------------*/
+   uint32_t relsz   = DLIMP_get_first_dyntag(DT_RELSZ, dyn_nugget);
+   uint32_t relent  = DLIMP_get_first_dyntag(DT_RELENT, dyn_nugget);
+   uint32_t relnum  = 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Read the size of the relocation table (DT_PLTRELSZ) and the type of    */
+   /* the PLTGOT relocation table (DT_PLTREL): one of DT_REL or DT_RELA.     */
+   /*------------------------------------------------------------------------*/
+   uint32_t pltrelsz  = DLIMP_get_first_dyntag(DT_PLTRELSZ, dyn_nugget);
+   int      pltreltyp = DLIMP_get_first_dyntag(DT_PLTREL, dyn_nugget);
+   uint32_t pltnum    = 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Find/record DSBT index associated with this module.                    */
+   /*------------------------------------------------------------------------*/
+   if (is_dsbt_module(dyn_module) &&
+       (dyn_module->dsbt_index == DSBT_INDEX_INVALID))
+      dyn_module->dsbt_index =
+                  DLIF_get_dsbt_index(dyn_module->loaded_module->file_handle);
+
+   /*------------------------------------------------------------------------*/
+   /* Read the PLTGOT relocation table from the file.                        */
+   /* The PLTGOT table is a subsection at the end of either the DT_REL or    */
+   /* DT_RELA table.  The size of the table it belongs to, DT_REL(A)SZ,      */
+   /* includes the size of the PLTGOT table, so it must be adjusted so that  */
+   /* the GOT relocation tables only contain actual GOT relocations.         */
+   /*                                                                        */
+   /* The adjustment is skipped when the enclosing size is the "absent"      */
+   /* sentinel (subtracting would turn it into a bogus non-zero size) and    */
+   /* is clamped at zero so a too-small size cannot wrap around.  A zero     */
+   /* entry size is reported instead of being used as a divisor.             */
+   /*------------------------------------------------------------------------*/
+   if (pltrelsz != INT_MAX && pltrelsz != 0)
+   {
+      if (pltreltyp == DT_REL)
+      {
+         if (relent == 0)
+            DLIF_error(DLET_RELOC,
+                       "DT_RELENT is zero; cannot read PLTGOT relocations\n");
+         else
+         {
+            pltnum = pltrelsz/relent;
+            if (relsz != INT_MAX)
+               relsz = (relsz > pltrelsz) ? (relsz - pltrelsz) : 0;
+            read_rel_table((&rel_plt_table),
+                           DLIMP_get_first_dyntag(DT_JMPREL, dyn_nugget),
+                           pltnum, relent, fd, dyn_module->wrong_endian);
+         }
+      }
+
+      else if (pltreltyp == DT_RELA)
+      {
+         if (relaent == 0)
+            DLIF_error(DLET_RELOC,
+                       "DT_RELAENT is zero; cannot read PLTGOT relocations\n");
+         else
+         {
+            pltnum = pltrelsz/relaent;
+            if (relasz != INT_MAX)
+               relasz = (relasz > pltrelsz) ? (relasz - pltrelsz) : 0;
+            read_rela_table((&rela_plt_table),
+                            DLIMP_get_first_dyntag(DT_JMPREL, dyn_nugget),
+                            pltnum, relaent, fd, dyn_module->wrong_endian);
+         }
+      }
+
+      else
+      {
+         DLIF_error(DLET_RELOC,
+                    "DT_PLTREL is invalid: must be either %d or %d\n",
+                    DT_REL, DT_RELA);
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Read the DT_RELA GOT relocation table from the file.                   */
+   /*------------------------------------------------------------------------*/
+   if (relasz != INT_MAX && relasz != 0)
+   {
+      if (relaent == 0)
+         DLIF_error(DLET_RELOC,
+                    "DT_RELAENT is zero; cannot read DT_RELA relocations\n");
+      else
+      {
+         relanum = relasz/relaent;
+         read_rela_table(&rela_table,
+                         DLIMP_get_first_dyntag(DT_RELA, dyn_nugget),
+                         relanum, relaent, fd, dyn_module->wrong_endian);
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Read the DT_REL GOT relocation table from the file.                    */
+   /*------------------------------------------------------------------------*/
+   if (relsz != INT_MAX && relsz != 0)
+   {
+      if (relent == 0)
+         DLIF_error(DLET_RELOC,
+                    "DT_RELENT is zero; cannot read DT_REL relocations\n");
+      else
+      {
+         relnum = relsz/relent;
+         read_rel_table(&rel_table,
+                        DLIMP_get_first_dyntag(DT_REL, dyn_nugget),
+                        relnum, relent, fd, dyn_module->wrong_endian);
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Process the PLTGOT relocations.  At most one of the two PLT tables is  */
+   /* non-NULL, because only one branch above can have read one.             */
+   /*------------------------------------------------------------------------*/
+   if (rela_plt_table)
+      process_pltgot_relocs(handle, rela_plt_table, pltreltyp, pltnum,
+                            dyn_module);
+
+   if (rel_plt_table)
+      process_pltgot_relocs(handle, rel_plt_table, pltreltyp, pltnum,
+                            dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Process the GOT relocations.                                           */
+   /*------------------------------------------------------------------------*/
+   if (rel_table || rela_table)
+      process_got_relocs(handle, rel_table, relnum, rela_table, relanum,
+                         dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Free memory used for ELF relocation table copies.                      */
+   /*------------------------------------------------------------------------*/
+   if (rela_table)     DLIF_free(rela_table);
+   if (rel_table)      DLIF_free(rel_table);
+   if (rela_plt_table) DLIF_free(rela_plt_table);
+   if (rel_plt_table)  DLIF_free(rel_plt_table);
+}
+
+/*****************************************************************************/
+/* UNIT TESTING INTERFACE */
+/*****************************************************************************/
+#ifdef UNIT_TEST
+void unit_c60_reloc_do(C60_RELOC_TYPE r_type,
+                       uint8_t *address_space,
+                       uint32_t addend, uint32_t symval, uint32_t pc,
+                       uint32_t static_base, int wrong_endian,
+                       int32_t dsbt_index)
+{
+   /*------------------------------------------------------------------------*/
+   /* Test hook for the file-local reloc_do().  The buffer at address_space  */
+   /* is passed both as the segment address (cast to 32 bits) and as the     */
+   /* host address of the data to patch.                                     */
+   /*                                                                        */
+   /* wrong_endian is forwarded to reloc_do() rather than being replaced by  */
+   /* a hard-coded FALSE, so callers passing 0 see no change while callers   */
+   /* passing non-zero get the behavior the parameter advertises.            */
+   /*                                                                        */
+   /* NOTE(review): the pointer-to-uint32_t cast loses bits on hosts with    */
+   /* pointers wider than 32 bits -- confirm the supported test hosts.       */
+   /*------------------------------------------------------------------------*/
+   reloc_do(r_type, (uint32_t)address_space, address_space,
+            addend, symval, pc, (wrong_endian ? TRUE : FALSE),
+            static_base, dsbt_index);
+}
+
+#if 0 /* RELA TYPE RELOCATIONS HAVE ADDEND IN RELOCATION ENTRY */
+/* Test hook that forwards its arguments unchanged to the file-local */
+/* rel_unpack_addend(). It sits inside an "#if 0" region, so it is */
+/* currently not compiled. */
+void unit_c60_rel_unpack_addend(C60_RELOC_TYPE r_type,
+ uint8_t* address,
+ uint32_t* addend)
+{
+ rel_unpack_addend(r_type, address, addend);
+}
+#endif
+
+BOOL unit_c60_rel_overflow(C60_RELOC_TYPE r_type, int32_t reloc_value)
+{
+   /* Test hook: expose the file-local rel_overflow() result unchanged. */
+   BOOL overflowed = rel_overflow(r_type, reloc_value);
+
+   return overflowed;
+}
+#endif
+
diff --git a/src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.h b/src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.h
new file mode 100644
index 0000000..8ccd60e
--- /dev/null
+++ b/src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.h
@@ -0,0 +1,30 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+/*****************************************************************************/
+/* DLREL_c60_relocate() - Perform RELA and REL type relocations for the ELF */
+/* object file that is being loaded and relocated. */
+/* */
+/* handle - loader handle passed through to the relocation code. */
+/* fd - file from which the relocation tables are read. */
+/* dyn_module - dynamic module being relocated. */
+/*****************************************************************************/
+void DLREL_c60_relocate(DLOAD_HANDLE handle, LOADER_FILE_DESC *fd,
+ DLIMP_Dynamic_Module *dyn_module);
diff --git a/src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.cpp b/src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.cpp
new file mode 100644
index 0000000..acde023
--- /dev/null
+++ b/src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.cpp
@@ -0,0 +1,825 @@
+/*
+* test_c60_reloc.cpp
+*
+* C6x Relocation Unit Tests.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#include "test_c60_reloc.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+/*****************************************************************************/
+/* C60_TestRelocDo */
+/* */
+/* Tests the C60 version of reloc_do. In cases where multiple relocation */
+/* types are implemented in the same way, only one type is tested. For */
+/* instance, R_C6000_xxx, R_C6000_yyy, and R_C6000_zzz are implemented in */
+/* the exact same way and, therefore, only R_C6000_xxx is tested. */
+/* */
+/* Each test follows the same flow: */
+/* 1. A valid instruction is constructed for the relocation type being */
+/* tested. */
+/* 2. Addend, symbol value, and pc are then created. */
+/* (NOTE: static base is not needed, and so 0 is passed. Also, same */
+/* endianness is assumed.) */
+/* 3. reloc_do() is called */
+/* 4. The result is checked. */
+/* 5. Repeat if variations should be considered. */
+/* */
+/*****************************************************************************/
+//void C60_TestRelocDo::test_R_C6000_NONE() { }
+
+void C60_TestRelocDo::test_R_C6000_ABS32()
+{
+    /* Inputs: symval + addend = 0x2001004 is the value expected in the word. */
+    const uint32_t addend = 0x4;
+    const uint32_t symval = 0x2001000;
+    const uint32_t pc     = 0x0;
+
+    /* 32-bit field to relocate, initially zero. */
+    uint32_t address_space = 0x0;
+    uint8_t *field = (uint8_t*) &address_space;
+
+    unit_c60_reloc_do(R_C6000_ABS32, field, addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(address_space, 0x2001004);
+}
+
+void C60_TestRelocDo::test_R_C6000_ABS16()
+{
+    /* Inputs: symval + addend = 0x1002 is the value expected in the field. */
+    const uint32_t addend = 0x4;
+    const uint32_t symval = 0xFFE;
+    const uint32_t pc     = 0x0;
+
+    /* 16-bit field to relocate, initially zero. */
+    uint16_t address_space = 0x0;
+    uint8_t *field = (uint8_t*) &address_space;
+
+    unit_c60_reloc_do(R_C6000_ABS16, field, addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(address_space, 0x1002);
+}
+
+void C60_TestRelocDo::test_R_C6000_ABS8()
+{
+    /* Inputs: symval + addend = 0x12 is the value expected in the byte. */
+    const uint32_t addend = 0x4;
+    const uint32_t symval = 0xE;
+    const uint32_t pc     = 0x0;
+
+    /* 8-bit field to relocate, initially zero. */
+    uint8_t address_space = 0x0;
+    uint8_t *field = &address_space;
+
+    unit_c60_reloc_do(R_C6000_ABS8, field, addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(address_space, 0x12);
+}
+
+/*---------------------------------------------------------------------------*/
+/* PC-Relative Relocation Tests */
+/* */
+/* Our relocation handler assumes that the address of 'opcode' is where the */
+/* relocation is. Therefore, when creating a PCR test case, we will compute */
+/* a value for symval and pc in terms of &opcode. */
+/* */
+/*---------------------------------------------------------------------------*/
+void C60_TestRelocDo::test_R_C6000_PCR_S21()
+{
+    uint32_t opcode = 0x00000010;
+    uint8_t *field  = (uint8_t*) &opcode;
+    const uint32_t addend = 0x4;
+    const uint32_t pc     = 0x0;
+    uint32_t symval;
+
+    /* Case 1: target lies 0x50000 past the 32-byte-aligned address of    */
+    /* opcode, so the branch is forward.  PCR21 offset = 0x14001.         */
+    symval = ((uint32_t)&opcode & 0xffffffe0) + 0x50000;
+    unit_c60_reloc_do(R_C6000_PCR_S21, field, addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x00a00090);
+
+    /* Case 2: target lies 0xb0000 before that aligned address, so the    */
+    /* offset is negative.  PCR21 offset = 0x1d4001 (signed).             */
+    opcode = 0x00000010;
+    symval = ((uint32_t)&opcode & 0xffffffe0) - 0xb0000;
+    unit_c60_reloc_do(R_C6000_PCR_S21, field, addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x0ea00090);
+}
+
+void C60_TestRelocDo::test_R_C6000_PCR_S12()
+{
+    uint32_t opcode = 0x00002120;   /* BNOP */
+    uint8_t *field  = (uint8_t*) &opcode;
+    const uint32_t addend = 0x4;
+    const uint32_t pc     = 0x0;
+    uint32_t symval;
+
+    /* Case 1: target lies 0x500 past the 32-byte-aligned address of      */
+    /* opcode, so the branch is forward.  PCR12 offset = 0x141.           */
+    symval = ((uint32_t)&opcode & 0xffffffe0) + 0x500;
+    unit_c60_reloc_do(R_C6000_PCR_S12, field, addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x01412120);
+
+    /* Case 2: target lies 0xb00 before that aligned address, so the      */
+    /* offset is negative.  PCR12 offset = 0xd41 (signed).                */
+    opcode = 0x00002120;
+    symval = ((uint32_t)&opcode & 0xffffffe0) - 0xb00;
+    unit_c60_reloc_do(R_C6000_PCR_S12, field, addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x0d412120);
+}
+
+void C60_TestRelocDo::test_R_C6000_PCR_S10()
+{
+ uint32_t opcode = 0x01001020; /* BDEC */
+ uint32_t addend = 0x4;
+ uint32_t symval = ((uint32_t)&opcode & 0xffffffe0) + 0x50;
+ uint32_t pc = 0x0;
+
+ /* Test #1 -- destination is forward from PC */
+ /* PCR10 offset = 0x15 */
+ unit_c60_reloc_do(R_C6000_PCR_S10,
+ (uint8_t*) &opcode,
+ addend, symval, pc, 0, 0, 0);
+
+ TS_ASSERT_EQUALS(opcode, 0x0102b020);
+
+ /* Test #2 -- symval definition implies offset is negative */
+ /* PCR10 offset = 0x3d5 (signed - negative); this is the field value */
+ /* that the expected opcode below encodes starting at bit 13. */
+ /* opcode is reset first so Test #1's result does not carry over. */
+ opcode = 0x01001020;
+ symval = ((uint32_t)&opcode & 0xffffffe0) - 0xb0;
+ unit_c60_reloc_do(R_C6000_PCR_S10,
+ (uint8_t*) &opcode,
+ addend, symval, pc, 0, 0, 0);
+
+ TS_ASSERT_EQUALS(opcode, 0x017ab020);
+}
+
+void C60_TestRelocDo::test_R_C6000_PCR_S7()
+{
+    uint32_t opcode = 0x03006160;   /* ADDKPC */
+    uint8_t *field  = (uint8_t*) &opcode;
+    const uint32_t addend = 0x4;
+    const uint32_t pc     = 0x0;
+    uint32_t symval;
+
+    /* Case 1: target lies 0x50 past the 32-byte-aligned address of       */
+    /* opcode, so the offset is positive.  PCR7 offset = 0x15.            */
+    symval = ((uint32_t)&opcode & 0xffffffe0) + 0x50;
+    unit_c60_reloc_do(R_C6000_PCR_S7, field, addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x03156160);
+
+    /* Case 2: target lies 0x30 before that aligned address, so the       */
+    /* offset is negative.  PCR7 offset = 0x75 (signed).                  */
+    opcode = 0x03006160;
+    symval = ((uint32_t)&opcode & 0xffffffe0) - 0x30;
+    unit_c60_reloc_do(R_C6000_PCR_S7, field, addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x03756160);
+}
+
+void C60_TestRelocDo::test_R_C6000_ABS_S16()
+{
+    /* symval + addend = 0x1002; the expected opcode carries that value   */
+    /* shifted left by 7 on top of the original MVK bits.                 */
+    const uint32_t addend = 0x4;
+    const uint32_t symval = 0xFFE;
+    const uint32_t pc     = 0x0;
+
+    uint32_t opcode = 0x03000028;   /* MVK */
+    uint8_t *field  = (uint8_t*) &opcode;
+
+    unit_c60_reloc_do(R_C6000_ABS_S16, field, addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x03080128);
+}
+
+void C60_TestRelocDo::test_R_C6000_ABS_L16()
+{
+    /* symval + addend = 0x04561002; the expected opcode carries its low  */
+    /* 16 bits (0x1002) shifted left by 7 on top of the original bits.    */
+    const uint32_t addend = 0x4;
+    const uint32_t symval = 0x04560FFE;
+    const uint32_t pc     = 0x0;
+
+    uint32_t opcode = 0x03000028;   /* MVKL */
+    uint8_t *field  = (uint8_t*) &opcode;
+
+    unit_c60_reloc_do(R_C6000_ABS_L16, field, addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x03080128);
+}
+
+void C60_TestRelocDo::test_R_C6000_ABS_H16()
+{
+    /* symval + addend = 0x04561002; the expected opcode carries its high */
+    /* 16 bits (0x0456) shifted left by 7 on top of the original bits.    */
+    const uint32_t addend = 0x4;
+    const uint32_t symval = 0x04560FFE;
+    const uint32_t pc     = 0x0;
+
+    uint32_t opcode = 0x03000068;   /* MVKH */
+    uint8_t *field  = (uint8_t*) &opcode;
+
+    unit_c60_reloc_do(R_C6000_ABS_H16, field, addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x03022b68);
+}
+
+void C60_TestRelocDo::test_R_C6000_SBR_U15_B()
+{
+    const uint32_t static_base = 0x04000000;
+    const uint32_t symval      = (static_base + 0x1357);
+    const uint32_t addend      = 0x0;
+    const uint32_t pc          = 0x0;
+
+    uint32_t opcode = 0x0300002c;   /* LDB */
+    uint8_t *field  = (uint8_t*) &opcode;
+
+    /* unsigned 15-bit SBR offset = 0x1357 */
+    /* encoded in bits 22 - 8              */
+    unit_c60_reloc_do(R_C6000_SBR_U15_B, field,
+                      addend, symval, pc, static_base, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x0313572c);
+}
+
+void C60_TestRelocDo::test_R_C6000_SBR_U15_H()
+{
+    const uint32_t static_base = 0x04000000;
+    const uint32_t symval      = (static_base + 0x2246);
+    const uint32_t addend      = 0x0;
+    const uint32_t pc          = 0x0;
+
+    uint32_t opcode = 0x0300004c;   /* LDH */
+    uint8_t *field  = (uint8_t*) &opcode;
+
+    /* unsigned 16-bit SBR offset = 0x2246 */
+    /* scaled 15-bit SBR offset   = 0x1123 */
+    /* encoded in bits 22 - 8              */
+    unit_c60_reloc_do(R_C6000_SBR_U15_H, field,
+                      addend, symval, pc, static_base, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x0311234c);
+}
+
+void C60_TestRelocDo::test_R_C6000_SBR_U15_W()
+{
+    const uint32_t static_base = 0x04000000;
+    const uint32_t symval      = (static_base + 0x448c);
+    const uint32_t addend      = 0x0;
+    const uint32_t pc          = 0x0;
+
+    uint32_t opcode = 0x0300006c;   /* LDW */
+    uint8_t *field  = (uint8_t*) &opcode;
+
+    /* unsigned 17-bit SBR offset = 0x448c */
+    /* scaled 15-bit SBR offset   = 0x1123 */
+    /* encoded in bits 22 - 8              */
+    unit_c60_reloc_do(R_C6000_SBR_U15_W, field,
+                      addend, symval, pc, static_base, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x0311236c);
+}
+
+void C60_TestRelocDo::test_R_C6000_SBR_S16()
+{
+    uint32_t opcode      = 0x03000028; /* MVK */
+    uint32_t addend      = 0x0;
+    uint32_t static_base = 0x04000000;
+    uint32_t symval      = (static_base + 0x1357);
+    uint32_t pc          = 0x0;
+
+    /* Test #1 positive signed 16-bit offset */
+    /* 16-bit SBR offset = 0x1357            */
+    /* encoded in bits 22-7 of opcode        */
+    unit_c60_reloc_do(R_C6000_SBR_S16,
+                      (uint8_t*) &opcode,
+                      addend, symval, pc, static_base, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x0309aba8);
+
+    /* Test #2 negative signed 16-bit offset */
+    /* 16-bit SBR offset = 0xeca9 (-0x1357)  */
+    /* encoded in bits 22-7 of opcode        */
+    /* Start again from the pristine MVK opcode, as the two-part PCR    */
+    /* tests do, so this case does not depend on how the relocation     */
+    /* handler treats bits left in the field by Test #1.                */
+    opcode = 0x03000028;
+    symval = (static_base - 0x1357);
+    unit_c60_reloc_do(R_C6000_SBR_S16,
+                      (uint8_t*) &opcode,
+                      addend, symval, pc, static_base, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x037654a8);
+}
+
+void C60_TestRelocDo::test_R_C6000_SBR_L16_B()
+{
+    const uint32_t static_base = 0x04000000;
+    const uint32_t symval      = (static_base + 0x11123);
+    const uint32_t addend      = 0x0;
+    const uint32_t pc          = 0x0;
+
+    uint32_t opcode = 0x03000028;   /* MVKL */
+    uint8_t *field  = (uint8_t*) &opcode;
+
+    /* 16-bit SBR offset = 0x1123     */
+    /* encoded in bits 22-7 of opcode */
+    unit_c60_reloc_do(R_C6000_SBR_L16_B, field,
+                      addend, symval, pc, static_base, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x030891a8);
+}
+
+void C60_TestRelocDo::test_R_C6000_SBR_L16_H()
+{
+    const uint32_t static_base = 0x04000000;
+    const uint32_t symval      = (static_base + 0x12246);
+    const uint32_t addend      = 0x0;
+    const uint32_t pc          = 0x0;
+
+    uint32_t opcode = 0x03000028;   /* MVKL */
+    uint8_t *field  = (uint8_t*) &opcode;
+
+    /* 17-bit SBR offset = 0x12246    */
+    /* scaled SBR offset = 0x9123     */
+    /* encoded in bits 22-7 of opcode */
+    unit_c60_reloc_do(R_C6000_SBR_L16_H, field,
+                      addend, symval, pc, static_base, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x034891a8);
+}
+
+void C60_TestRelocDo::test_R_C6000_SBR_L16_W()
+{
+    const uint32_t static_base = 0x04000000;
+    const uint32_t symval      = (static_base + 0x1448c);
+    const uint32_t addend      = 0x0;
+    const uint32_t pc          = 0x0;
+
+    uint32_t opcode = 0x03000028;   /* MVKL */
+    uint8_t *field  = (uint8_t*) &opcode;
+
+    /* 18-bit SBR offset = 0x1448c    */
+    /* scaled SBR offset = 0x5123     */
+    /* encoded in bits 22-7 of opcode */
+    unit_c60_reloc_do(R_C6000_SBR_L16_W, field,
+                      addend, symval, pc, static_base, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x032891a8);
+}
+
+void C60_TestRelocDo::test_R_C6000_SBR_H16_B()
+{
+   const uint32_t static_base = 0x04000000;
+   const uint32_t symval      = static_base + 0x357448c;
+   const uint32_t addend      = 0x0;
+   const uint32_t pc          = 0x0;
+   uint32_t       opcode      = 0x03000068; /* MVKH */
+
+   /*------------------------------------------------------------------------*/
+   /* Total SBR offset is 0x357448c; its upper 16 bits (0x357) are expected  */
+   /* in bits 22-7 of the opcode.                                            */
+   /*------------------------------------------------------------------------*/
+   unit_c60_reloc_do(R_C6000_SBR_H16_B, (uint8_t*) &opcode,
+                     addend, symval, pc, static_base, 0, 0);
+
+   TS_ASSERT_EQUALS(opcode, 0x0301abe8);
+}
+
+void C60_TestRelocDo::test_R_C6000_SBR_H16_H()
+{
+   const uint32_t static_base = 0x04000000;
+   const uint32_t symval      = static_base + 0x357448c;
+   const uint32_t addend      = 0x0;
+   const uint32_t pc          = 0x0;
+   uint32_t       opcode      = 0x03000068; /* MVKH */
+
+   /*------------------------------------------------------------------------*/
+   /* Total SBR offset 0x357448c, halfword-scaled to 0x1aba246; the upper    */
+   /* 16 bits of the scaled value (0x1ab) are expected in bits 22-7 of the   */
+   /* opcode.                                                                */
+   /*------------------------------------------------------------------------*/
+   unit_c60_reloc_do(R_C6000_SBR_H16_H, (uint8_t*) &opcode,
+                     addend, symval, pc, static_base, 0, 0);
+
+   TS_ASSERT_EQUALS(opcode, 0x0300d5e8);
+}
+
+void C60_TestRelocDo::test_R_C6000_SBR_H16_W()
+{
+   const uint32_t static_base = 0x04000000;
+   const uint32_t symval      = static_base + 0x357448c;
+   const uint32_t addend      = 0x0;
+   const uint32_t pc          = 0x0;
+   uint32_t       opcode      = 0x03000068; /* MVKH */
+
+   /*------------------------------------------------------------------------*/
+   /* Total SBR offset 0x357448c, word-scaled to 0x0d5d123; the upper        */
+   /* 16 bits of the scaled value (0x0d5) are expected in bits 22-7 of the   */
+   /* opcode.                                                                */
+   /*------------------------------------------------------------------------*/
+   unit_c60_reloc_do(R_C6000_SBR_H16_W, (uint8_t*) &opcode,
+                     addend, symval, pc, static_base, 0, 0);
+
+   TS_ASSERT_EQUALS(opcode, 0x03006ae8);
+}
+
+/* The DSBT table is accessed via DP-relative addressing with */
+/* an LDW instruction, but the DSBT_INDEX is really an index */
+/* into the DSBT table, the index is scaled to a 4-word offset. */
+void C60_TestRelocDo::test_R_C6000_DSBT_INDEX()
+{
+   const uint32_t static_base = 0x04000000;
+   const uint32_t symval      = static_base;
+   const uint32_t addend      = 0x0;
+   const uint32_t pc          = 0x0;
+   const int32_t  dsbt_index  = 3;
+   uint32_t       opcode      = 0x0300006c; /* LDW */
+
+   /*------------------------------------------------------------------------*/
+   /* The symbol value equals the static base, so only the DSBT index (3)    */
+   /* handed to the handler is expected to show up in the patched opcode.    */
+   /*------------------------------------------------------------------------*/
+   unit_c60_reloc_do(R_C6000_DSBT_INDEX, (uint8_t*) &opcode,
+                     addend, symval, pc, static_base, 0, dsbt_index);
+
+   TS_ASSERT_EQUALS(opcode, 0x0300036c);
+}
+
+/*****************************************************************************/
+/* C60_TestRelUnpackAddend */
+/* */
+/* Tests the C60 rel_unpack_addend function. */
+/* */
+/* In cases where the addends are unpacked in the same way, only one is */
+/* tested. */
+/* */
+/* All tests follow the same flow: */
+/* */
+/* 1. Create a valid instruction for the relocation type, where the addend */
+/* is packed in the instruction. */
+/* 2. Call rel_unpack_addend(). */
+/* 3. Check that the addend is correct. */
+/* */
+/* Relocations may be tested multiple times to handle variations, such as */
+/* positive/negative addends, extra bits depending on the encoding, etc. */
+/* */
+/* NOTE!! C60 ONLY SUPPORTS RELA TYPE RELOCATIONS, SO ADDEND FIELD IS STORED */
+/* IN RELOCATION ENTRY ITSELF. */
+/*****************************************************************************/
+/*---------------------------------------------------------------------------*/
+/* The C60_TestRelUnpackAddend definitions below are compiled out.  The      */
+/* banner above notes that C60 only supports RELA relocations, whose addend  */
+/* lives in the relocation entry rather than in the instruction.  Note that  */
+/* test_c60_reloc.h declares no C60_TestRelUnpackAddend suite, so that       */
+/* declaration would have to be added before enabling this code.             */
+/*---------------------------------------------------------------------------*/
+#if 0
+/* ABS32: the addend read back is expected to equal the raw 32-bit word. */
+void C60_TestRelUnpackAddend::test_R_C6000_ABS32()
+{
+ uint32_t address_space=0xFEDCBA9;
+ uint32_t addend;
+
+ unit_c60_rel_unpack_addend(R_C6000_ABS32,
+ (uint8_t*)&address_space,
+ &addend);
+
+ TS_ASSERT_EQUALS(addend, address_space);
+}
+
+/* ABS16: 0x7FFF is expected back unchanged, while 0x8000 is expected back */
+/* sign-extended to 32 bits (0xFFFF8000).                                  */
+void C60_TestRelUnpackAddend::test_R_C6000_ABS16()
+{
+ uint16_t address_space=0x7FFF;
+ uint32_t addend;
+
+ unit_c60_rel_unpack_addend(R_C6000_ABS16,
+ (uint8_t*)&address_space,
+ &addend);
+
+ TS_ASSERT_EQUALS(addend, 0x7FFF);
+
+ address_space = 0x8000;
+
+ unit_c60_rel_unpack_addend(R_C6000_ABS16,
+ (uint8_t*)&address_space,
+ &addend);
+
+ TS_ASSERT_EQUALS(addend, 0xFFFF8000);
+}
+#endif
+
+
+/*****************************************************************************/
+/* C60_TestRelOverflow */
+/* */
+/* Test the C60 rel_overflow function. */
+/* */
+/* In each case, we test the upper and lower bounds of each relocation type. */
+/* Only relocation types where the overflow is checked in rel_overflow are */
+/* considered. In most cases four tests are performed to test the upper and */
+/* lower bounds (1 pass and 1 fail for each). */
+/* */
+/* NOTE!! HAVEN'T REFACTORED OVERFLOW CHECK OUT OF RELOCATION HANDLERS FOR */
+/* C60, SO OVERFLOW SHOULD BE TESTED AS PART OF THE RELOC DO(???) */
+/* */
+/*****************************************************************************/
+void C60_TestRelOverflow::test_R_C6000_ABS16()
+{
+   int     rval;
+   int32_t probe;
+
+   /* Upper bound: 0xFFFF must be accepted (result 0). */
+   probe = 0xFFFF;
+   rval  = unit_c60_rel_overflow(R_C6000_ABS16, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* One past the upper bound must be reported (result 1). */
+   probe = 0x10000;
+   rval  = unit_c60_rel_overflow(R_C6000_ABS16, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+
+   /* Lower bound: -0x8000 must be accepted. */
+   probe = -0x8000;
+   rval  = unit_c60_rel_overflow(R_C6000_ABS16, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* One below the lower bound must be reported. */
+   probe = -0x8001;
+   rval  = unit_c60_rel_overflow(R_C6000_ABS16, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+}
+
+void C60_TestRelOverflow::test_R_C6000_ABS8()
+{
+   int     rval;
+   int32_t probe;
+
+   /* Upper bound: 0xFF must be accepted (result 0). */
+   probe = 0xFF;
+   rval  = unit_c60_rel_overflow(R_C6000_ABS8, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* One past the upper bound must be reported (result 1). */
+   probe = 0x100;
+   rval  = unit_c60_rel_overflow(R_C6000_ABS8, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+
+   /* Lower bound: -0x80 must be accepted. */
+   probe = -0x80;
+   rval  = unit_c60_rel_overflow(R_C6000_ABS8, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* One below the lower bound must be reported. */
+   probe = -0x81;
+   rval  = unit_c60_rel_overflow(R_C6000_ABS8, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+}
+
+void C60_TestRelOverflow::test_R_C6000_PCR_S21()
+{
+   int     rval;
+   int32_t probe;
+
+   /* Upper bound: 0x3FFFFC must be accepted (result 0). */
+   probe = 0x3FFFFC;
+   rval  = unit_c60_rel_overflow(R_C6000_PCR_S21, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* 0x400000 is past the upper bound and must be reported (result 1). */
+   probe = 0x400000;
+   rval  = unit_c60_rel_overflow(R_C6000_PCR_S21, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+
+   /* Lower bound: -0x400000 must be accepted. */
+   probe = -0x400000;
+   rval  = unit_c60_rel_overflow(R_C6000_PCR_S21, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* -0x400001 is below the lower bound and must be reported. */
+   probe = -0x400001;
+   rval  = unit_c60_rel_overflow(R_C6000_PCR_S21, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+}
+
+void C60_TestRelOverflow::test_R_C6000_PCR_S12()
+{
+   int     rval;
+   int32_t probe;
+
+   /* Upper bound: 0x1FFC must be accepted (result 0). */
+   probe = 0x1FFC;
+   rval  = unit_c60_rel_overflow(R_C6000_PCR_S12, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* 0x2000 is past the upper bound and must be reported (result 1). */
+   probe = 0x2000;
+   rval  = unit_c60_rel_overflow(R_C6000_PCR_S12, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+
+   /* Lower bound: -0x2000 must be accepted. */
+   probe = -0x2000;
+   rval  = unit_c60_rel_overflow(R_C6000_PCR_S12, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* -0x2001 is below the lower bound and must be reported. */
+   probe = -0x2001;
+   rval  = unit_c60_rel_overflow(R_C6000_PCR_S12, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+}
+
+void C60_TestRelOverflow::test_R_C6000_PCR_S10()
+{
+   int     rval;
+   int32_t probe;
+
+   /* Upper bound: 0x7FC must be accepted (result 0). */
+   probe = 0x7FC;
+   rval  = unit_c60_rel_overflow(R_C6000_PCR_S10, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* 0x800 is past the upper bound and must be reported (result 1). */
+   probe = 0x800;
+   rval  = unit_c60_rel_overflow(R_C6000_PCR_S10, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+
+   /* Lower bound: -0x800 must be accepted. */
+   probe = -0x800;
+   rval  = unit_c60_rel_overflow(R_C6000_PCR_S10, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* -0x801 is below the lower bound and must be reported. */
+   probe = -0x801;
+   rval  = unit_c60_rel_overflow(R_C6000_PCR_S10, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+}
+
+void C60_TestRelOverflow::test_R_C6000_PCR_S7()
+{
+   int     rval;
+   int32_t probe;
+
+   /* Upper bound: 0xFC must be accepted (result 0). */
+   probe = 0xFC;
+   rval  = unit_c60_rel_overflow(R_C6000_PCR_S7, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* 0x100 is past the upper bound and must be reported (result 1). */
+   probe = 0x100;
+   rval  = unit_c60_rel_overflow(R_C6000_PCR_S7, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+
+   /* Lower bound: -0x100 must be accepted. */
+   probe = -0x100;
+   rval  = unit_c60_rel_overflow(R_C6000_PCR_S7, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* -0x101 is below the lower bound and must be reported. */
+   probe = -0x101;
+   rval  = unit_c60_rel_overflow(R_C6000_PCR_S7, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+}
+
+void C60_TestRelOverflow::test_R_C6000_SBR_S16()
+{
+   int     rval;
+   int32_t probe;
+
+   /* Upper bound: 0x7FFF must be accepted (result 0). */
+   probe = 0x7FFF;
+   rval  = unit_c60_rel_overflow(R_C6000_SBR_S16, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* 0x8000 is past the upper bound and must be reported (result 1). */
+   probe = 0x8000;
+   rval  = unit_c60_rel_overflow(R_C6000_SBR_S16, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+
+   /* Lower bound: -0x8000 must be accepted. */
+   probe = -0x8000;
+   rval  = unit_c60_rel_overflow(R_C6000_SBR_S16, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* -0x8001 is below the lower bound and must be reported. */
+   probe = -0x8001;
+   rval  = unit_c60_rel_overflow(R_C6000_SBR_S16, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+}
+
+void C60_TestRelOverflow::test_R_C6000_ABS_S16()
+{
+   int     rval;
+   int32_t probe;
+
+   /* Upper bound: 0x7FFF must be accepted (result 0). */
+   probe = 0x7FFF;
+   rval  = unit_c60_rel_overflow(R_C6000_ABS_S16, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* 0x8000 is past the upper bound and must be reported (result 1). */
+   probe = 0x8000;
+   rval  = unit_c60_rel_overflow(R_C6000_ABS_S16, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+
+   /* Lower bound: -0x8000 must be accepted. */
+   probe = -0x8000;
+   rval  = unit_c60_rel_overflow(R_C6000_ABS_S16, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* -0x8001 is below the lower bound and must be reported. */
+   probe = -0x8001;
+   rval  = unit_c60_rel_overflow(R_C6000_ABS_S16, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+}
+
+void C60_TestRelOverflow::test_R_C6000_SBR_U15_B()
+{
+   int      rval;
+   uint32_t probe;
+
+   /* 0x7FFF must be accepted (result 0). */
+   probe = 0x7FFF;
+   rval  = unit_c60_rel_overflow(R_C6000_SBR_U15_B, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* 0x8000 must be reported (result 1). */
+   probe = 0x8000;
+   rval  = unit_c60_rel_overflow(R_C6000_SBR_U15_B, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+}
+
+void C60_TestRelOverflow::test_R_C6000_SBR_U15_H()
+{
+   int      rval;
+   uint32_t probe;
+
+   /* 0xFFFE must be accepted (result 0). */
+   probe = 0xFFFE;
+   rval  = unit_c60_rel_overflow(R_C6000_SBR_U15_H, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* 0xFFFF must be reported (result 1). */
+   probe = 0xFFFF;
+   rval  = unit_c60_rel_overflow(R_C6000_SBR_U15_H, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+}
+
+void C60_TestRelOverflow::test_R_C6000_SBR_U15_W()
+{
+   int      rval;
+   uint32_t probe;
+
+   /* 0x1FFFC must be accepted (result 0). */
+   probe = 0x1FFFC;
+   rval  = unit_c60_rel_overflow(R_C6000_SBR_U15_W, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* 0x1FFFD must be reported (result 1). */
+   probe = 0x1FFFD;
+   rval  = unit_c60_rel_overflow(R_C6000_SBR_U15_W, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+}
+
+void C60_TestRelOverflow::test_R_C6000_DSBT_INDEX()
+{
+   int      rval;
+   uint32_t probe;
+
+   /* 0x1FFFC must be accepted (result 0). */
+   probe = 0x1FFFC;
+   rval  = unit_c60_rel_overflow(R_C6000_DSBT_INDEX, probe);
+   TS_ASSERT_EQUALS(rval, 0);
+
+   /* 0x1FFFD must be reported (result 1). */
+   probe = 0x1FFFD;
+   rval  = unit_c60_rel_overflow(R_C6000_DSBT_INDEX, probe);
+   TS_ASSERT_EQUALS(rval, 1);
+}
+
diff --git a/src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.h b/src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.h
new file mode 100644
index 0000000..67a437d
--- /dev/null
+++ b/src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.h
@@ -0,0 +1,101 @@
+/*
+* test_c60_reloc.h
+*
+* Specification of C6x-specific relocation handler unit tests.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+/*---------------------------------------------------------------------------*/
+/* Include guard.  The macro name avoids a leading underscore followed by    */
+/* an uppercase letter, since such identifiers are reserved for the          */
+/* implementation.                                                           */
+/*---------------------------------------------------------------------------*/
+#ifndef TEST_C60_RELOC_H
+#define TEST_C60_RELOC_H
+#include <inttypes.h>   /* fixed-width integer types used in the prototypes  */
+#include "c60_elf32.h"
+#include <cxxtest/TestSuite.h>
+
+/*---------------------------------------------------------------------------*/
+/* C-linkage entry points, defined outside this header, that the suites      */
+/* below call.                                                               */
+/*---------------------------------------------------------------------------*/
+extern "C"
+{
+/* Applies relocation r_type to the bytes at address; the suites pass the    */
+/* address of an opcode word and then inspect the patched word.              */
+extern void unit_c60_reloc_do(C60_RELOC_TYPE r_type, uint8_t* address,
+ uint32_t addend, uint32_t symval, uint32_t pc,
+ uint32_t base_pointer, int wrong_endian, int32_t dsbt_index);
+
+/* Reads the addend for r_type from the bytes at address into *addend.       */
+extern void unit_c60_rel_unpack_addend(C60_RELOC_TYPE r_type,
+ uint8_t* address,
+ uint32_t* addend);
+
+/* Range check of reloc_value for r_type; the suites expect 0 for an         */
+/* in-range value and 1 for an out-of-range one.                             */
+extern int unit_c60_rel_overflow(C60_RELOC_TYPE r_type, int32_t reloc_value);
+
+}
+
+/*---------------------------------------------------------------------------*/
+/* Suite covering unit_c60_reloc_do(), one case per relocation type.         */
+/*---------------------------------------------------------------------------*/
+class C60_TestRelocDo : public CxxTest::TestSuite
+{
+ public:
+ void test_R_C6000_ABS32();
+ void test_R_C6000_ABS16();
+ void test_R_C6000_ABS8();
+ void test_R_C6000_PCR_S21();
+ void test_R_C6000_PCR_S12();
+ void test_R_C6000_PCR_S10();
+ void test_R_C6000_PCR_S7();
+ void test_R_C6000_ABS_S16();
+ void test_R_C6000_ABS_L16();
+ void test_R_C6000_ABS_H16();
+ void test_R_C6000_SBR_U15_B();
+ void test_R_C6000_SBR_U15_H();
+ void test_R_C6000_SBR_U15_W();
+ void test_R_C6000_SBR_S16();
+ void test_R_C6000_SBR_L16_B();
+ void test_R_C6000_SBR_L16_H();
+ void test_R_C6000_SBR_L16_W();
+ void test_R_C6000_SBR_H16_B();
+ void test_R_C6000_SBR_H16_H();
+ void test_R_C6000_SBR_H16_W();
+ void test_R_C6000_DSBT_INDEX();
+};
+
+/*---------------------------------------------------------------------------*/
+/* Suite covering unit_c60_rel_overflow(), one case per relocation type      */
+/* whose range is checked.                                                   */
+/*---------------------------------------------------------------------------*/
+class C60_TestRelOverflow : public CxxTest::TestSuite
+{
+ public:
+ void test_R_C6000_ABS16();
+ void test_R_C6000_ABS8();
+ void test_R_C6000_PCR_S21();
+ void test_R_C6000_PCR_S12();
+ void test_R_C6000_PCR_S10();
+ void test_R_C6000_PCR_S7();
+ void test_R_C6000_SBR_S16();
+ void test_R_C6000_ABS_S16();
+ void test_R_C6000_SBR_U15_B();
+ void test_R_C6000_SBR_U15_H();
+ void test_R_C6000_SBR_U15_W();
+ void test_R_C6000_DSBT_INDEX();
+};
+
+#endif /* TEST_C60_RELOC_H */
diff --git a/src/core/dsp/ocl_load/CMakeLists.txt b/src/core/dsp/ocl_load/CMakeLists.txt
new file mode 100644
index 0000000..a459542
--- /dev/null
+++ b/src/core/dsp/ocl_load/CMakeLists.txt
@@ -0,0 +1,26 @@
+# Directories searched for headers when compiling the loader sources.
+include_directories(
+   .
+   C60_DLOAD_REL
+   C60_DLOAD_DYN
+   DLOAD_SYM
+   DLOAD
+   DLOAD_API
+   DLWRAPPER
+)
+
+# Append to the C flags: position-independent code, the C60_TARGET and
+# LOADER_DEBUG defines, debug info, and suppression of the two
+# pointer/integer cast warnings.
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -DC60_TARGET -DLOADER_DEBUG -g -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast")
+
+# Sources that make up the loader library.
+set(OCL_LOAD_SRC_FILES
+   ocl_load.c
+   C60_DLOAD_REL/c60_reloc.c
+   C60_DLOAD_DYN/c60_dynamic.c
+   DLOAD_SYM/symtab.c
+   DLOAD/ArrayList.c
+   DLOAD/dload.c
+   DLOAD/elf32.c
+   DLOAD/dload_endian.c
+)
+
+# Build the sources above into the static library "oclload".
+add_library(oclload STATIC ${OCL_LOAD_SRC_FILES})
+
+# Library output location: <build dir>/lib.
+set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}/lib)
+SET(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}/lib)
+
diff --git a/src/core/dsp/ocl_load/DLOAD/ArrayList.c b/src/core/dsp/ocl_load/DLOAD/ArrayList.c
new file mode 100644
index 0000000..4452bfc
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/ArrayList.c
@@ -0,0 +1,122 @@
+/*
+* ArrayList.c
+*
+* Array_List is a C implementation of a C++ vector class.
+*
+* This class emulates a resizable array along the lines of a C++
+* vector or Java ArrayList class in C, and uses the convention
+* of passing a pointer to the current "object" as the first
+* argument.
+*
+* Usage is defined as follows:
+*
+* Array_List obj;
+* AL_initialize(&obj, sizeof(type_name), initial_num_elem);
+*
+* ...
+*
+* type_name *ptr = (type_name*)(obj.buf);
+* for(i = 0; i < AL_size(&obj); i++)
+* do_something_to(ptr[i]);
+* type_name to_append = ...;
+* AL_append(&obj, &to_append);
+*
+* ...
+*
+* AL_destroy(&obj);
+*
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#include <inttypes.h>
+#include <string.h>
+#include "ArrayList.h"
+#include "dload_api.h"
+
+/*****************************************************************************/
+/* AL_INITIALIZE() - Initialize a newly created Array_List object. */
+/*****************************************************************************/
+void AL_initialize(Array_List* obj, int32_t type_size, int32_t num_elem)
+{
+   /*------------------------------------------------------------------------*/
+   /* A request for zero elements still reserves room for one, so the        */
+   /* recorded capacity is never zero after initialization.                  */
+   /*------------------------------------------------------------------------*/
+   int32_t capacity = (num_elem == 0) ? 1 : num_elem;
+
+   /*------------------------------------------------------------------------*/
+   /* Allocate the element storage, then record the element size, the        */
+   /* (empty) element count and the capacity in elements.                    */
+   /*------------------------------------------------------------------------*/
+   obj->buf         = DLIF_malloc(type_size * capacity);
+   obj->type_size   = type_size;
+   obj->size        = 0;
+   obj->buffer_size = capacity;
+}
+
+/*****************************************************************************/
+/* AL_APPEND() - Append an element to the end of an Array_List. */
+/*****************************************************************************/
+void AL_append(Array_List* obj, void* to_append)
+{
+   /*------------------------------------------------------------------------*/
+   /* obj       - list to extend.                                            */
+   /* to_append - address of the element to copy; obj->type_size bytes are   */
+   /*             read from it.                                              */
+   /*                                                                        */
+   /* If the buffer is full, it is replaced by one with twice the capacity   */
+   /* before the element is copied in.  The new buffer is installed only     */
+   /* after its allocation has succeeded; if DLIF_malloc() returns NULL the  */
+   /* list is left exactly as it was (old buffer, size and capacity intact)  */
+   /* and the element is NOT appended.  A caller that needs to detect this   */
+   /* can compare AL_size() before and after the call.                       */
+   /*------------------------------------------------------------------------*/
+   if (obj->size >= obj->buffer_size)
+   {
+      int32_t new_capacity = obj->buffer_size * 2;
+      void*   new_buffer   = DLIF_malloc(new_capacity * obj->type_size);
+
+      if (!new_buffer) return;
+
+      memcpy(new_buffer, obj->buf, obj->size * obj->type_size);
+      DLIF_free(obj->buf);
+
+      obj->buf         = new_buffer;
+      obj->buffer_size = new_capacity;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* There is now room for one more element: copy it in after the last      */
+   /* stored element and bump the element count.                             */
+   /*------------------------------------------------------------------------*/
+   memcpy(((uint8_t*)obj->buf) + obj->type_size * obj->size, to_append,
+          obj->type_size);
+   obj->size++;
+}
+
+/*****************************************************************************/
+/* AL_SIZE() - Get the number of elements in an Array_List. */
+/*****************************************************************************/
+int32_t AL_size(Array_List* obj)
+{
+   /* The count is kept up to date by AL_initialize() and AL_append(). */
+   const int32_t element_count = obj->size;
+
+   return element_count;
+}
+
+/*****************************************************************************/
+/* AL_DESTROY() - Free up memory associated with an Array_List that is no */
+/* longer in use. */
+/*****************************************************************************/
+void AL_destroy(Array_List* obj)
+{
+   /*------------------------------------------------------------------------*/
+   /* Release the element buffer.  obj itself is not freed; it belongs to    */
+   /* the caller.                                                            */
+   /*------------------------------------------------------------------------*/
+   DLIF_free(obj->buf);
+
+   /*------------------------------------------------------------------------*/
+   /* Do not leave a dangling pointer or a stale element count behind: a     */
+   /* later AL_size() reports 0, so loops bounded by it no longer walk the   */
+   /* released storage.  The list must be set up again with AL_initialize()  */
+   /* before it is appended to.                                              */
+   /*------------------------------------------------------------------------*/
+   obj->buf  = NULL;
+   obj->size = 0;
+}
diff --git a/src/core/dsp/ocl_load/DLOAD/ArrayList.h b/src/core/dsp/ocl_load/DLOAD/ArrayList.h
new file mode 100644
index 0000000..2c03788
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/ArrayList.h
@@ -0,0 +1,92 @@
+/*
+* ArrayList.h
+*
+* This implementation of ArrayList is a replacement for the C++
+* vector class in C.
+*
+* This class emulates a resizable array along the lines of a C++
+* vector or Java ArrayList class in C, and uses the convention
+* of passing a pointer to the current "object" as the first
+* argument.
+*
+* Usage is defined as follows:
+*
+* Array_List obj;
+* AL_initialize(&obj, sizeof(type_name), initial_num_elem);
+*
+* ...
+*
+* type_name *ptr = (type_name*)(obj.buf);
+* for(i = 0; i < AL_size(&obj); i++)
+* do_something_to(ptr[i]);
+* type_name to_append = ...;
+* AL_append(&obj, &to_append);
+*
+* ...
+*
+* AL_destroy(&obj);
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef ARRAYLIST_H
+#define ARRAYLIST_H
+
+#include <inttypes.h>
+
+/**********************************************************************/
+/* Array_List - structure type specification. */
+/**********************************************************************/
+typedef struct
+{
+ void *buf; /* element storage, obtained from DLIF_malloc() */
+ int32_t type_size; /* size of one element, in bytes */
+ int32_t size; /* number of elements currently stored */
+ int32_t buffer_size; /* capacity of buf, in elements */
+} Array_List;
+
+/*--------------------------------------------------------------------*/
+/* Array_List Member Functions: */
+/* */
+/* AL_initialize() - Initialize a newly created Array_List object. */
+/* AL_append() - Append an element to the end of an Array_List. */
+/* AL_size() - Get number of elements in an Array_List. */
+/* AL_destroy() - Free memory associated with an Array_List that is */
+/* no longer in use. */
+/*--------------------------------------------------------------------*/
+/* Allocates room for num_elem elements of type_size bytes each (a num_elem */
+/* of 0 is treated as 1) and marks the list empty. */
+void AL_initialize(Array_List* obj, int32_t type_size, int32_t num_elem);
+/* Copies one element (type_size bytes) from to_append onto the end of the */
+/* list, moving to a buffer of twice the capacity when the current one is */
+/* full. */
+void AL_append(Array_List* obj, void* to_append);
+/* Returns the number of elements currently stored. */
+int32_t AL_size(Array_List* obj);
+/* Releases the element buffer; obj itself is not freed. */
+void AL_destroy(Array_List* obj);
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/Queue.h b/src/core/dsp/ocl_load/DLOAD/Queue.h
new file mode 100644
index 0000000..3f85c16
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/Queue.h
@@ -0,0 +1,194 @@
+/*
+* Queue.h
+*
+* Interface to Linked List
+* ------------------------
+*
+* This is an implementation of a type-independent linked list class for C.
+* It's basically a template class, but uses macros instead so that it can
+* be compiled with a C-only compiler.
+*
+* To define a linked list class:
+* #include "Queue.h"
+* TYPE_QUEUE_DEFINITION(object_type,Class_Identifier)
+*
+* In a separate C file:
+* #include "Queue.h"
+* TYPE_QUEUE_DEFINITION(object_type,Class_Identifier)
+* TYPE_QUEUE_IMPLEMENTATION(object_type,Class_Identifier)
+*
+* Now, to create a list:
+* Class_Identifier_Queue name;
+* Get it initialized to zero everywhere somehow, maybe like this:
+* Class_Identifier_initialize_queue(&name);
+*
+* To add to the list:
+* Class_Identifier_enqueue(&name, object);
+*
+* To iterate over the list:
+* Class_Identifier_Queue_Node *it = name.front_ptr;
+* while(it) { do_something_to_(it->value); it = it->next_ptr; }
+*
+* To delete from the list:
+* If it's the first node:
+* Class_Identifier_dequeue(&name);
+* If it's not:
+* predecessor_node->next_ptr = deleted_node->next_ptr;
+* name.size--;
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef QUEUE_H
+#define QUEUE_H
+
+#include <inttypes.h>
+#include "dload_api.h"
+
+/*****************************************************************************/
+/* TYPE_QUEUE_DEFINITION() - Define structure specifications for a linked */
+/* list of t_name objects. */
+/*****************************************************************************/
+/* t is the element type, stored by value in each node; t_name is the */
+/* identifier prefix pasted onto every generated type and function name. */
+/* The expansion declares the node type, the queue type, and prototypes */
+/* for the four functions that TYPE_QUEUE_IMPLEMENTATION() defines. */
+#define TYPE_QUEUE_DEFINITION(t, t_name) \
+struct t_name##_Queue_Node_ \
+{ \
+ t value; /* element held by this node */ \
+ struct t_name##_Queue_Node_* next_ptr; /* following node, NULL at the back */ \
+}; \
+typedef struct t_name##_Queue_Node_ t_name##_Queue_Node; \
+ \
+typedef struct \
+{ \
+ t_name##_Queue_Node* front_ptr; /* first node, NULL when empty */ \
+ t_name##_Queue_Node* back_ptr; /* last node, NULL when empty */ \
+ int32_t size; /* number of nodes in the queue */ \
+} t_name##_Queue; \
+ \
+extern void t_name##_initialize_queue(t_name##_Queue* queue); \
+extern void t_name##_enqueue(t_name##_Queue* queue, t to_enqueue); \
+extern t t_name##_dequeue(t_name##_Queue* queue); \
+extern void t_name##_remove(t_name##_Queue* queue, t to_remove);
+
+/*****************************************************************************/
+/* TYPE_QUEUE_INITIALIZER() - Define the initializer to initialize Queues. */
+/*****************************************************************************/
+/* Static initializer for an empty queue; the values follow the member */
+/* order of the queue struct: front_ptr, back_ptr, size. */
+#define TYPE_QUEUE_INITIALIZER {NULL, NULL, 0}
+
+
+/*****************************************************************************/
+/* TYPE_QUEUE_IMPLEMENTATION() - Define member functions of new linked list */
+/* "class" of t_name objects. */
+/* */
+/* <type>_initialize_queue() - clears the queue */
+/* <type>_enqueue() - adds a <t> type object to the end of the queue */
+/* <type>_dequeue() - remove a <t> type object from the front of the queue */
+/* and provide access to it to the caller */
+/* <type>_remove() - find and remove a <t> type object from the queue */
+/*****************************************************************************/
+#define TYPE_QUEUE_IMPLEMENTATION(t, t_name) \
+/* Put the queue into the empty state.  Existing nodes are not freed. */ \
+void t_name##_initialize_queue (t_name##_Queue* queue) \
+{ \
+   queue->front_ptr = queue->back_ptr = NULL; \
+   queue->size = 0; \
+} \
+ \
+/* Append to_enqueue at the back of the queue.  The node is allocated and */ \
+/* filled in before the queue is touched, so if DLIF_malloc() returns NULL */ \
+/* the queue (pointers and size) is left unchanged and nothing is added. */ \
+void t_name##_enqueue(t_name##_Queue* queue, t to_enqueue) \
+{ \
+   t_name##_Queue_Node* new_ptr = \
+      (t_name##_Queue_Node*)(DLIF_malloc(sizeof(t_name##_Queue_Node))); \
+ \
+   if (!new_ptr) return; \
+ \
+   new_ptr->value = to_enqueue; \
+   new_ptr->next_ptr = NULL; \
+ \
+   if (queue->back_ptr) queue->back_ptr->next_ptr = new_ptr; \
+   else queue->front_ptr = new_ptr; \
+ \
+   queue->back_ptr = new_ptr; \
+   queue->size++; \
+} \
+ \
+/* Remove the front node and return its value.  An empty queue yields */ \
+/* NULL cast to t. */ \
+t t_name##_dequeue(t_name##_Queue* queue) \
+{ \
+   t to_ret; \
+   t_name##_Queue_Node* next_ptr = NULL; \
+ \
+   if (!queue->size) return (t) NULL; \
+ \
+   next_ptr = queue->front_ptr->next_ptr; \
+   queue->size--; \
+   to_ret = queue->front_ptr->value; \
+   DLIF_free((void*)(queue->front_ptr)); \
+ \
+   if(!queue->size) \
+      queue->front_ptr = queue->back_ptr = NULL; \
+   else \
+      queue->front_ptr = next_ptr; \
+ \
+   return to_ret; \
+} \
+ \
+/* Unlink and free the first node whose value compares equal (==) to */ \
+/* to_remove; the queue is left as it was when no node matches. */ \
+void t_name##_remove(t_name##_Queue* queue, t to_remove) \
+{ \
+   t_name##_Queue_Node* prev_ptr = NULL; \
+   t_name##_Queue_Node* curr_ptr = queue->front_ptr; \
+   t_name##_Queue_Node* next_ptr = NULL; \
+ \
+   /* Walk the list, remembering the neighbours of the current node. */ \
+   for (; curr_ptr; curr_ptr = next_ptr) \
+   { \
+      next_ptr = curr_ptr->next_ptr; \
+      if (curr_ptr->value == to_remove) break; \
+      prev_ptr = curr_ptr; \
+   } \
+ \
+   if (curr_ptr) \
+   { \
+      if (prev_ptr) prev_ptr->next_ptr = next_ptr; \
+      queue->size--; \
+      DLIF_free((void*)(curr_ptr)); \
+   } \
+ \
+   /* Repair the end pointers: removing the first node moves front_ptr, */ \
+   /* removing the last node moves back_ptr. */ \
+   if (!queue->size) \
+      queue->front_ptr = queue->back_ptr = NULL; \
+   else \
+   { \
+      if (!prev_ptr) queue->front_ptr = next_ptr; \
+      if (!next_ptr) queue->back_ptr = prev_ptr; \
+   } \
+}
+
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/Stack.h b/src/core/dsp/ocl_load/DLOAD/Stack.h
new file mode 100644
index 0000000..d36f5e0
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/Stack.h
@@ -0,0 +1,155 @@
+/*
+* Stack.h
+*
+* Interface to Stack
+* ------------------
+*
+* This is an implementation of a type-independent stack implemented as
+* a singly linked list class for C. It's basically a template class, but
+* uses macros instead, so that it can be compiled with a C-only compiler.
+*
+* To define a Stack class:
+* #include "Stack.h"
+* TYPE_STACK_DEFINITION(object_type,Class_Identifier)
+*
+* In a separate C file:
+* #include "Stack.h"
+* TYPE_STACK_DEFINITION(object_type,Class_Identifier)
+* TYPE_STACK_IMPLEMENTATION(object_type,Class_Identifier)
+*
+* Now, to create a stack:
+* Class_Identifier_Stack name;
+* Get it initialized to zero everywhere somehow, maybe like this:
+* Class_Identifier_initialize_stack(&name);
+*
+* To add to the stack:
+* Class_Identifier_push(&name, object);
+*
+* To access the top of the stack:
+* Class_Identifier_Stack_Node *tos = name.top_ptr;
+* do_something_to_(tos->value);
+*
+* To delete from the stack:
+* if (name.size > 0) Class_Identifier_pop(&name);
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef STACK_H
+#define STACK_H
+
+#include <inttypes.h>
+#include "dload_api.h"
+
+/*****************************************************************************/
+/* TYPE_STACK_DEFINITION() - Define structure specifications for a last-in, */
+/* first-out linked list of t_name objects. */
+/*****************************************************************************/
+#define TYPE_STACK_DEFINITION(t, t_name) /* declares the node type, the stack type and the three stack functions for element type t */ \
+struct t_name##_Stack_Node_ /* one node of the singly linked list */ \
+{ \
+ t value; /* element stored in this node */ \
+ struct t_name##_Stack_Node_* next_ptr; /* node that was on top before this one was pushed; NULL for the first node */ \
+}; \
+typedef struct t_name##_Stack_Node_ t_name##_Stack_Node; \
+ \
+typedef struct /* anonymous: refer to it as <t_name>_Stack, not "struct <t_name>_Stack" */ \
+{ \
+ t_name##_Stack_Node* top_ptr; /* most recently pushed node; NULL when the stack is empty */ \
+ t_name##_Stack_Node* bottom_ptr; /* node pushed onto the empty stack; NULL when the stack is empty */ \
+ int size; /* number of nodes currently on the stack */ \
+} t_name##_Stack; \
+ \
+extern void t_name##_initialize_stack(t_name##_Stack* stack); /* reset to the empty state */ \
+extern void t_name##_push(t_name##_Stack* stack, t to_push); /* add to_push as the new top */ \
+extern t t_name##_pop(t_name##_Stack* stack); /* remove the top node and return its value */
+
+/*****************************************************************************/
+/* TYPE_STACK_INITIALIZER - Define the initializer to initialize Stacks. */
+/*****************************************************************************/
+#define TYPE_STACK_INITIALIZER {NULL, NULL, 0 } /* positional: top_ptr, bottom_ptr, size - i.e. an empty stack */
+
+/*****************************************************************************/
+/* TYPE_STACK_IMPLEMENTATION() - Define member functions of new LIFO linked */
+/* list "class" of t_name objects. */
+/* */
+/* <type>_initialize_stack() - clears the stack */
+/* <type>_push() - pushes a <t> type object to the top of the stack */
+/* <type>_pop() - pop a <t> type object from the top of the stack */
+/* and provide access to it to the caller */
+/*****************************************************************************/
+#define TYPE_STACK_IMPLEMENTATION(t, t_name) \
+/* <t_name>_initialize_stack() - put the stack into the empty state. */ \
+void t_name##_initialize_stack (t_name##_Stack* stack) \
+{ \
+    stack->top_ptr = stack->bottom_ptr = NULL; \
+    stack->size = 0; \
+} \
+ \
+/* <t_name>_push() - make to_push the new top of the stack. */ \
+/* The node is allocated before any stack state is modified, so a */ \
+/* failed DLIF_malloc() leaves the stack (and its size) unchanged */ \
+/* instead of dereferencing a NULL node. */ \
+void t_name##_push(t_name##_Stack* stack, t to_push) \
+{ \
+    t_name##_Stack_Node* new_node = \
+        (t_name##_Stack_Node*)(DLIF_malloc(sizeof(t_name##_Stack_Node))); \
+ \
+    if (!new_node) return; \
+ \
+    new_node->value = to_push; \
+    new_node->next_ptr = stack->top_ptr; /* NULL for the first node */ \
+ \
+    if (!stack->top_ptr) stack->bottom_ptr = new_node; \
+ \
+    stack->top_ptr = new_node; \
+    stack->size++; \
+} \
+ \
+/* <t_name>_pop() - unlink the top node, free it and return its value. */ \
+/* Returns (t) NULL when the stack is empty, mirroring what the queue */ \
+/* dequeue function does, rather than dereferencing a NULL top_ptr. */ \
+t t_name##_pop(t_name##_Stack* stack) \
+{ \
+    t to_ret; \
+    t_name##_Stack_Node* next_ptr = NULL; \
+ \
+    if (!stack->top_ptr) return (t) NULL; \
+ \
+    next_ptr = stack->top_ptr->next_ptr; \
+    stack->size--; \
+    to_ret = stack->top_ptr->value; \
+    DLIF_free((void*)(stack->top_ptr)); \
+ \
+    if(!stack->size) \
+        stack->top_ptr = stack->bottom_ptr = NULL; \
+    else \
+        stack->top_ptr = next_ptr; \
+ \
+    return to_ret; \
+}
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/dload.c b/src/core/dsp/ocl_load/DLOAD/dload.c
new file mode 100644
index 0000000..e5924d8
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/dload.c
@@ -0,0 +1,3534 @@
+/*
+* dload.c
+*
+* Core Dynamic Loader Reference Implementation
+*
+* This implementation of the core dynamic loader is platform independent,
+* but it is object file format dependent. In particular, this
+* implementation supports ELF object file format.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#include <limits.h>
+#include <inttypes.h>
+#include <string.h>
+#include <time.h>
+
+#include "ArrayList.h"
+#include "Queue.h"
+#include "Stack.h"
+
+#include "symtab.h"
+#include "dload_endian.h"
+#include "elf32.h"
+#include "dload.h"
+#include "relocate.h"
+#include "dload_api.h"
+
+#ifdef ARM_TARGET
+#include "arm_dynamic.h"
+#endif
+
+#ifdef C60_TARGET
+#include "c60_dynamic.h"
+#endif
+
+#include "virtual_targets.h"
+
+/*---------------------------------------------------------------------------*/
+/* These globals are used only to test the reference client implementation. */
+/*---------------------------------------------------------------------------*/
+int global_argc;
+char **global_argv;
+
+/*---------------------------------------------------------------------------*/
+/* Contains filenames (type const char*) the system is in the process of */
+/* loading. Used to detect cycles in incorrectly compiled ELF binaries. */
+/*---------------------------------------------------------------------------*/
+Array_List DLIMP_module_dependency_list;
+
+/*---------------------------------------------------------------------------*/
+/* Contains objects (type DLIMP_Loaded_Module) that the system has loaded */
+/* into target memory. */
+/*---------------------------------------------------------------------------*/
+TYPE_QUEUE_IMPLEMENTATION(DLIMP_Loaded_Module*, loaded_module_ptr)
+loaded_module_ptr_Queue DLIMP_loaded_objects = TYPE_QUEUE_INITIALIZER;
+
+/*---------------------------------------------------------------------------*/
+/* Dependency Stack - LIFO stack of dynamic modules that are loaded */
+/* when client asks to load a dynamic executable or library. Note that */
+/* dependents that have already been loaded with another module will not */
+/* appear on this stack. */
+/*---------------------------------------------------------------------------*/
+TYPE_STACK_IMPLEMENTATION(DLIMP_Dynamic_Module*, dynamic_module_ptr)
+dynamic_module_ptr_Stack DLIMP_dependency_stack = TYPE_STACK_INITIALIZER;
+
+/*---------------------------------------------------------------------------*/
+/* Current virtual target set after reading the file headers. This is used */
+/* to access target specific functions. */
+/*---------------------------------------------------------------------------*/
+VIRTUAL_TARGET *cur_target = NULL;
+
+/*---------------------------------------------------------------------------*/
+/* Support for profiling performance of dynamic loader core. */
+/* The helpers are called from code guarded by */
+/* "#if LOADER_DEBUG || LOADER_PROFILE" (see initialize_loaded_module()), */
+/* so they must be defined under that same condition; otherwise a build */
+/* with only LOADER_PROFILE enabled references undefined names. */
+/*---------------------------------------------------------------------------*/
+#if LOADER_DEBUG || LOADER_PROFILE
+static clock_t cycle0 = 0;
+static clock_t cycle_end = 0;
+#define profile_start_clock() (cycle0 = clock())
+#define profile_stop_clock() (cycle_end = clock())
+#define profile_cycle_count() (cycle_end - cycle0)
+#endif
+
+/*---------------------------------------------------------------------------*/
+/* The dynamic loader will now create a table TI_init_table to store */
+/* pre-init and init data. This is done because pre-init and */
+/* init functions could reference as-yet unrelocated symbols from other */
+/* modules. As such it is safer to store relevant function addresses and */
+/* execute them only after all modules are relocated. */
+/*---------------------------------------------------------------------------*/
+TYPE_QUEUE_IMPLEMENTATION(IF_single_record*, IF_table)
+IF_table_Queue TI_init_table = TYPE_QUEUE_INITIALIZER;
+
+/*---------------------------------------------------------------------------*/
+/* Forward declarations of file-local helpers. */
+/*---------------------------------------------------------------------------*/
+static VIRTUAL_TARGET *get_vt_obj(int given_id);
+static void read_args_from_section(DLIMP_Loaded_Module* ep_module);
+static BOOL seg_has_space_for_write(DLIMP_Loaded_Module* lmodule, int sz);
+static BOOL write_arguments_to_args_section(DLOAD_HANDLE handle,
+                                            int argc, char** argv,
+                                            DLIMP_Loaded_Module *ep_module);
+
+/*****************************************************************************/
+/* DLOAD_create() */
+/* */
+/* Create an instance of the dynamic loader core. */
+/* */
+/* client_handle: Private client token to be returned during select DLIF */
+/* function calls. */
+/* */
+/* returns: an opaque DLOAD core loader handle, identifying this instance.*/
+/* */
+/*****************************************************************************/
+DLOAD_HANDLE DLOAD_create(void *client_handle)
+{
+    LOADER_OBJECT *pLoaderObject = DLIF_malloc(sizeof(LOADER_OBJECT));
+
+    /*-----------------------------------------------------------------------*/
+    /* If the instance object could not be allocated there is nothing to */
+    /* fill out; hand the null handle back to the caller instead of */
+    /* dereferencing it below. */
+    /*-----------------------------------------------------------------------*/
+    if (pLoaderObject == NULL) return((DLOAD_HANDLE)pLoaderObject);
+
+    /*-----------------------------------------------------------------------*/
+    /* Fill out the Loader Object: */
+    /*-----------------------------------------------------------------------*/
+    /* Set up initial objects_loading queue. */
+    /*-----------------------------------------------------------------------*/
+    AL_initialize(&(pLoaderObject->DLIMP_module_dependency_list),
+                  sizeof (const char*), 1);
+
+    /*-----------------------------------------------------------------------*/
+    /* Initialize Loaded Module Ptr Queue */
+    /*-----------------------------------------------------------------------*/
+    loaded_module_ptr_initialize_queue(&pLoaderObject->DLIMP_loaded_objects);
+
+    /*-----------------------------------------------------------------------*/
+    /* Initialize Dynamic Module Ptr Stack */
+    /*-----------------------------------------------------------------------*/
+    dynamic_module_ptr_initialize_stack(&pLoaderObject->DLIMP_dependency_stack);
+
+    /*-----------------------------------------------------------------------*/
+    /* File handles are handed out starting at 1; elsewhere in this file a */
+    /* handle value of 0 is treated as "load failed". */
+    /*-----------------------------------------------------------------------*/
+    pLoaderObject->file_handle = 1;
+
+    /*-----------------------------------------------------------------------*/
+    /* Store client token, so it can be handed back during DLIF calls */
+    /*-----------------------------------------------------------------------*/
+    pLoaderObject->client_handle = client_handle;
+
+    return((DLOAD_HANDLE)pLoaderObject);
+}
+
+/*****************************************************************************/
+/* DLOAD_destroy() */
+/* */
+/* Remove an instance of the dynamic loader core, and free all resources */
+/* allocated during DLOAD_create(). */
+/* */
+/* handle: Opaque DLOAD core loader handle, as returned by DLOAD_create(), */
+/* identifying the instance to remove. */
+/* Preconditions: 1) handle must be valid. */
+/* 2) Loader instance must be in "UNLOADED" state. */
+/* */
+/*****************************************************************************/
+void DLOAD_destroy(DLOAD_HANDLE handle)
+{
+    /*-----------------------------------------------------------------------*/
+    /* The opaque handle is really a pointer to the loader instance object. */
+    /*-----------------------------------------------------------------------*/
+    LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+
+    /*-----------------------------------------------------------------------*/
+    /* Tear down the module dependency list owned by the instance ... */
+    /*-----------------------------------------------------------------------*/
+    AL_destroy(&(loader->DLIMP_module_dependency_list));
+
+    /*-----------------------------------------------------------------------*/
+    /* ... and then give back the instance object itself. */
+    /*-----------------------------------------------------------------------*/
+    DLIF_free (loader);
+}
+
+/*****************************************************************************/
+/* DLIMP_get_first_dyntag() */
+/* */
+/* Return value for first tag entry in the given dynamic table whose */
+/* tag type matches the given key. */
+/* */
+/*****************************************************************************/
+uint32_t DLIMP_get_first_dyntag(int tag, struct Elf32_Dyn* dyn_table)
+{
+    struct Elf32_Dyn *entry;
+
+    /*------------------------------------------------------------------------*/
+    /* Walk the dynamic table up to (not including) its DT_NULL terminator */
+    /* and hand back the value of the first entry carrying the wanted tag. */
+    /*------------------------------------------------------------------------*/
+    for (entry = dyn_table; entry->d_tag != DT_NULL; entry++)
+    {
+        if (entry->d_tag == tag)
+            return entry->d_un.d_val;
+    }
+
+    /*------------------------------------------------------------------------*/
+    /* No entry matched: report INT_MAX as the "not found" marker. */
+    /*------------------------------------------------------------------------*/
+    return INT_MAX;
+}
+
+/*****************************************************************************/
+/* dload_and_allocate_dependencies() */
+/* */
+/* If not already loaded, load each dependent file identified in the */
+/* dynamic segment with a DT_NEEDED tag. Dependent files are listed in */
+/* order and should be loaded in the same order that they appear in the */
+/* dynamic segment. */
+/* */
+/*****************************************************************************/
+static BOOL dload_and_allocate_dependencies( DLOAD_HANDLE handle,
+                                             DLIMP_Dynamic_Module *dyn_module)
+{
+    struct Elf32_Dyn *entry = dyn_module->dyntab;
+    LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+
+#if LOADER_DEBUG
+    if (debugging_on)
+        DLIF_trace("Starting dload_and_allocate_dependencies() for %s ...\n",
+                   dyn_module->name);
+#endif
+
+    /*------------------------------------------------------------------------*/
+    /* Visit every entry of the dynamic table up to the DT_NULL terminator; */
+    /* only DT_NEEDED entries are of interest here. */
+    /*------------------------------------------------------------------------*/
+    for (; entry->d_tag != DT_NULL; entry++)
+    {
+        loaded_module_ptr_Queue_Node *node;
+
+        if (entry->d_tag != DT_NEEDED) continue;
+
+#if LOADER_DEBUG
+        if (debugging_on)
+            DLIF_trace("Found DT_NEEDED: %s\n",
+                       dyn_module->strtab+entry->d_un.d_val);
+#endif
+
+        /*---------------------------------------------------------------------*/
+        /* The entry's value is an offset into the module's string table */
+        /* giving the dependent's name. Look for an already-loaded module */
+        /* with that name. */
+        /*---------------------------------------------------------------------*/
+        node = loader->DLIMP_loaded_objects.front_ptr;
+        while (node != NULL &&
+               strcmp(node->value->name,
+                      dyn_module->strtab + entry->d_un.d_val) != 0)
+            node = node->next_ptr;
+
+        if (node != NULL)
+        {
+            /*------------------------------------------------------------------*/
+            /* Already loaded: bump its use count and record its file handle */
+            /* as one of this module's dependencies. */
+            /*------------------------------------------------------------------*/
+            node->value->use_count++;
+            AL_append(&(dyn_module->loaded_module->dependencies),
+                      &(node->value->file_handle));
+        }
+        else
+        {
+            /*------------------------------------------------------------------*/
+            /* Not loaded yet: ask the client to load it on our behalf. The */
+            /* returned handle is recorded before it is checked; a handle of */
+            /* 0 makes this function fail. */
+            /*------------------------------------------------------------------*/
+            int32_t dependent_handle = DLIF_load_dependent(
+                                           loader->client_handle,
+                                           dyn_module->strtab +
+                                           entry->d_un.d_val);
+            AL_append(&(dyn_module->loaded_module->dependencies),
+                      &dependent_handle);
+            if (dependent_handle == 0) return FALSE;
+        }
+    }
+
+#if LOADER_DEBUG
+    if (debugging_on)
+        DLIF_trace("Finished dload_and_allocate_dependencies() for %s\n",
+                   dyn_module->name);
+#endif
+
+    return TRUE;
+}
+
+/*****************************************************************************/
+/* load_object() */
+/* */
+/* Finish the process of loading an object file. */
+/* */
+/*****************************************************************************/
+static int load_object(LOADER_FILE_DESC *fd, DLIMP_Dynamic_Module *dyn_module)
+{
+    /*------------------------------------------------------------------------*/
+    /* Neither argument is consulted: the loader runs on the target and */
+    /* relocates straight into target memory, so no further work remains */
+    /* here (at least in the bare-metal dynamic linking ABI model). */
+    /*------------------------------------------------------------------------*/
+    (void)fd;
+    (void)dyn_module;
+
+    /*------------------------------------------------------------------------*/
+    /* Always returns 1. */
+    /*------------------------------------------------------------------------*/
+    return 1;
+}
+
+/*****************************************************************************/
+/* write_arguments_to_args_section() */
+/* */
+/* Write argv and argc to .args section. */
+/* */
+/*****************************************************************************/
+static BOOL write_arguments_to_args_section(DLOAD_HANDLE handle,
+                                            int argc, char** argv,
+                                            DLIMP_Loaded_Module *ep_module)
+{
+    /*-----------------------------------------------------------------------*/
+    /* handle    : loader instance; its client_handle is passed to */
+    /*             DLIF_memcpy(). */
+    /* argc/argv : host-side argument count and strings to write out. */
+    /* ep_module : loaded module whose c_args member gives the address of */
+    /*             the .args section. */
+    /* Returns TRUE when there is nothing to write (argc == 0) or when the */
+    /* image was written; FALSE, after a DLIF_warning(), when c_args is */
+    /* invalid or misaligned, the segment is too small, or the temporary */
+    /* pointer table cannot be allocated. */
+    /*-----------------------------------------------------------------------*/
+    int mem_inc = MEM_INC;
+    int ptr_sz = PTR_SZ;
+    int p_size = ptr_sz / mem_inc;
+    int i_size = T_INTSZ / mem_inc;
+    int c_size = T_CHARSZ /mem_inc;
+    int argv_offset = 0;
+    int str_offset = 0;
+    int size = 0;
+    int arg;
+    int *targ_argv_pointers = NULL;
+    LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+
+    uint8_t *c_args = NULL;
+
+#if LOADER_DEBUG
+    if (debugging_on)
+        DLIF_trace("Write_arguments_to_args_section:\n");
+#endif
+
+    /*-----------------------------------------------------------------------*/
+    /* IF NO ARGUMENTS, ABORT QUIETLY, WITH a SUCCESSFUL CODE. */
+    /*-----------------------------------------------------------------------*/
+    if (argc == 0) return TRUE;
+
+    /*-----------------------------------------------------------------------*/
+    /* __c_args__ points to the beginning of the .args section, if there */
+    /* is one. This is stored in the Loaded Module, and must have a */
+    /* legitimate address. If not, abort with Warning. */
+    /*-----------------------------------------------------------------------*/
+    c_args = ep_module->c_args;
+    if (!c_args || c_args == (uint8_t *)0xFFFFFFFF)
+    {
+        DLIF_warning(DLWT_MISC, "__c_args__ does not have valid value.\n");
+        return FALSE;
+    }
+
+    /*-----------------------------------------------------------------------*/
+    /* WE OUGHT TO WORRY ABOUT ALIGNMENT: IF SECTION ISN'T PROPERLY ALIGNED, */
+    /* ABORT THE PROCESSING OF ARGUMENTS WITH A NICE ERROR MESSAGE. */
+    /*-----------------------------------------------------------------------*/
+    if ((Elf32_Addr)c_args & (MAX(p_size, i_size) - 1))
+    {
+        DLIF_warning(DLWT_MISC, ".args section not properly aligned\n");
+        return FALSE;
+    }
+
+    /*-----------------------------------------------------------------------*/
+    /* CALCULATE OFFSET IN TABLE WHERE ARGV AND THE STRINGS WILL BE STORED. */
+    /* NOTE THAT argv MAY NEED MORE ALIGNMENT THAN AN INTEGER, SO ITS OFFSET */
+    /* IS REALLY THE MAXIMUM OF A POINTER SIZE AND INTEGER SIZE. ALSO NOTE */
+    /* WE NEED TO ALLOCATE AN EXTRA POINTER FOR argv[argc]. */
+    /*-----------------------------------------------------------------------*/
+    argv_offset = MAX(p_size, i_size);
+    str_offset = argv_offset + (argc * p_size) + p_size ;
+
+    /*-----------------------------------------------------------------------*/
+    /* CALCULATE SPACE REQUIRED FOR WRITING OUT .args SECTION. CHECK IF THE */
+    /* SEGMENT HAS ENOUGH SPACE AVAILABLE. IF NOT, RETURN WITH ERROR CODE. */
+    /*-----------------------------------------------------------------------*/
+    size = str_offset;
+
+    for (arg = 0; arg < argc; arg++)
+        size += (c_size * (strlen(argv[arg]) + 1));
+
+    if (!seg_has_space_for_write(ep_module, size))
+    {
+        DLIF_warning(DLWT_MISC,
+                     "Segment has insufficient space for .args contents\n");
+        return FALSE;
+    }
+
+    /*-----------------------------------------------------------------------*/
+    /* ALLOCATE THE INTERNAL ARRAY OF ARGV POINTER VALUES BEFORE ANYTHING IS */
+    /* WRITTEN TO THE TARGET, SO AN ALLOCATION FAILURE LEAVES THE .args */
+    /* SECTION UNTOUCHED. */
+    /*-----------------------------------------------------------------------*/
+    targ_argv_pointers = (int *)DLIF_malloc((argc + 1) * sizeof(int));
+    if (!targ_argv_pointers)
+    {
+        DLIF_warning(DLWT_MISC,
+                     "Unable to allocate memory for .args pointer table\n");
+        return FALSE;
+    }
+
+    /*-----------------------------------------------------------------------*/
+    /* OVERALL, WE NEED TO CREATE A TARGET IMAGE THAT CORRESPONDS TO: */
+    /* int argc; */
+    /* char *argv[argc]; */
+    /* <strings pointed to by argv> */
+    /* So say, for C6x, for "-v -d", we would need 22 bytes: */
+    /* 4 bytes // argc */
+    /* 4 bytes // argv[0] pointer value */
+    /* 4 bytes // argv[1] pointer value */
+    /* 4 bytes // argv[argc] end of pointer value array, normally 0 */
+    /* 3 bytes // "-v" */
+    /* 3 bytes // "-d" */
+    /*-----------------------------------------------------------------------*/
+
+    /*-----------------------------------------------------------------------*/
+    /* FIRST WRITE OUT ARGC. */
+    /*-----------------------------------------------------------------------*/
+#if LOADER_DEBUG
+    if (debugging_on)
+        DLIF_trace ("-- Copy %d bytes from 0x%x to 0x%x\n",
+                    i_size, (uint32_t) &argc, (uint32_t) c_args);
+#endif
+
+    DLIF_memcpy(pHandle->client_handle, c_args, &argc, i_size);
+
+    /*-----------------------------------------------------------------------*/
+    /* FILL THE INTERNAL ARRAY OF ARGV POINTER VALUES: EACH ENTRY IS THE */
+    /* ADDRESS THE CORRESPONDING STRING WILL OCCUPY IN THE .args SECTION. */
+    /*-----------------------------------------------------------------------*/
+    for (arg = 0; arg < argc ; arg++)
+    {
+        targ_argv_pointers[arg] = (int)(str_offset + c_args);
+        str_offset += (strlen(argv[arg]) + 1) * c_size;
+
+#if LOADER_DEBUG
+        if (debugging_on)
+            DLIF_trace ("\t\ttarg_argv_pointers[%d] : 0x%x\n",
+                        arg, targ_argv_pointers[arg]);
+#endif
+    }
+
+    targ_argv_pointers[argc] = 0;
+
+    /*-----------------------------------------------------------------------*/
+    /* WRITE OUT THIS INTERNAL ARRAY OF ARGV POINTER VALUES */
+    /*-----------------------------------------------------------------------*/
+    for (arg = 0; arg <= argc; arg++)
+    {
+#if LOADER_DEBUG
+        if (debugging_on)
+            DLIF_trace ("-- Copy %d bytes from 0x%x to 0x%x\n",
+                        p_size, (uint32_t) &targ_argv_pointers[arg],
+                        (uint32_t) (c_args + argv_offset));
+#endif
+        DLIF_memcpy(pHandle->client_handle,
+                    (void *)(c_args + argv_offset),
+                    &targ_argv_pointers[arg],
+                    p_size);
+        argv_offset += p_size;
+    }
+
+#if LOADER_DEBUG
+    if (debugging_on)
+    {
+        DLIF_trace ("\t\targv being copied : 0x%x\n",(uint32_t)argv);
+        for (arg = 0; arg < argc; arg++)
+        {
+            DLIF_trace ("\t\t---\n\t\t&argv[%d] being copied : 0x%x\n", arg,
+                        (uint32_t)&argv[arg]);
+            DLIF_trace ("\t\targv[%d] being copied : 0x%x\n",arg,
+                        (uint32_t)argv[arg]);
+            DLIF_trace ("\t\targv[%d] being copied : %s\n",arg, (char *)argv[arg]);
+        }
+    }
+#endif
+
+    /*-----------------------------------------------------------------------*/
+    /* LASTLY WRITE OUT ALL THE STRINGS. */
+    /*-----------------------------------------------------------------------*/
+    for (arg = 0; arg < argc; arg++)
+    {
+#if LOADER_DEBUG
+        if (debugging_on)
+            DLIF_trace ("-- Copy %d bytes from 0x%x to 0x%x\n",
+                        (uint32_t)strlen(argv[arg]) + 1,
+                        (uint32_t)&argv[arg],
+                        (uint32_t)(targ_argv_pointers[arg]));
+#endif
+        DLIF_memcpy(pHandle->client_handle,
+                    (void *)(targ_argv_pointers[arg]),
+                    argv[arg],
+                    strlen(argv[arg]) + 1);
+    }
+
+    /*-----------------------------------------------------------------------*/
+    /* THE POINTER TABLE WAS ONLY A HOST-SIDE SCRATCH COPY; RELEASE IT NOW */
+    /* THAT EVERYTHING HAS BEEN WRITTEN OUT. */
+    /*-----------------------------------------------------------------------*/
+    DLIF_free(targ_argv_pointers);
+
+    return TRUE;
+}
+
+
+/*****************************************************************************/
+/* initialize_loaded_module() */
+/* */
+/* Initialize DLIMP_Loaded_Module internal data object associated with a */
+/* dynamic module. This function will also set up a queue of */
+/* DLIMP_Loaded_Segment(s) associated with the loaded module. */
+/* This function is called as we are getting ready to actually load the */
+/* object file contents into target memory. Each segment will get a */
+/* target memory request that it can use to ask the client for target */
+/* memory space. This function will also assign a file handle to the */
+/* loaded module. */
+/* */
+/*---------------------------------------------------------------------------*/
+/* */
+/* In applications that use the DSBT model, this function will also need to */
+/* negotiate the module's DSBT index with the client. */
+/* */
+/*****************************************************************************/
+static void initialize_loaded_module(DLOAD_HANDLE handle,
+                                     DLIMP_Dynamic_Module *dyn_module)
+{
+    /*------------------------------------------------------------------------*/
+    /* handle     : loader instance; supplies the next file handle value. */
+    /* dyn_module : dynamic module being loaded; on return its loaded_module */
+    /*              member points at the newly allocated object (or is NULL */
+    /*              if that allocation failed). */
+    /* Allocation failures are reported through DLIF_error(); the function */
+    /* has no return value. */
+    /*------------------------------------------------------------------------*/
+    int i;
+    LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+
+    /*------------------------------------------------------------------------*/
+    /* Allocate a DLIMP_Loaded_Module data structure for the specified ELF */
+    /* file and assign a file handle for it (bumping the file handle counter */
+    /* as we go). */
+    /*------------------------------------------------------------------------*/
+    DLIMP_Loaded_Module *loaded_module =
+        dyn_module->loaded_module = DLIF_malloc(sizeof(DLIMP_Loaded_Module));
+
+    /*------------------------------------------------------------------------*/
+    /* Without the module object there is nothing to initialize. */
+    /*------------------------------------------------------------------------*/
+    if (loaded_module == NULL)
+    {
+        DLIF_error(DLET_MISC, "Unable to allocate loaded module object.\n");
+        return;
+    }
+
+#if LOADER_DEBUG || LOADER_PROFILE
+    /*------------------------------------------------------------------------*/
+    /* Start clock on initialization of loaded module object. */
+    /*------------------------------------------------------------------------*/
+    if (debugging_on || profiling_on)
+    {
+        DLIF_trace("Starting initialize_loaded_module() ...\n");
+        if (profiling_on) profile_start_clock();
+    }
+#endif
+
+    /*------------------------------------------------------------------------*/
+    /* Give the loaded module its own copy of the name. If there is no name */
+    /* to copy, or the copy cannot be allocated, fall back to the */
+    /* "<unknown>" placeholder rather than writing through a NULL pointer. */
+    /*------------------------------------------------------------------------*/
+    loaded_module->name = NULL;
+    if (dyn_module->name)
+    {
+        loaded_module->name = DLIF_malloc(strlen(dyn_module->name) + 1);
+        if (loaded_module->name)
+            strcpy(loaded_module->name, dyn_module->name);
+        else
+            DLIF_error(DLET_MISC, "Unable to allocate loaded module name.\n");
+    }
+    if (loaded_module->name == NULL)
+        loaded_module->name = "<unknown>";
+
+    loaded_module->file_handle = pHandle->file_handle++;
+    loaded_module->direct_dependent_only = dyn_module->direct_dependent_only;
+    loaded_module->use_count = 1;
+
+    /*------------------------------------------------------------------------*/
+    /* In case we wrapped around the file handle, return error. */
+    /*------------------------------------------------------------------------*/
+    if (pHandle->file_handle == 0)
+        DLIF_error(DLET_MISC, "DLOAD File handle overflowed.\n");
+
+    /*------------------------------------------------------------------------*/
+    /* Initially the loaded module does not have access to its global */
+    /* symbols. These need to be copied from the dynamic module (see call */
+    /* to DLSYM_copy_globals() below). */
+    /* */
+    /* THESE INITIALIZATIONS SHOULD BE MOVED TO AN INIT ROUTINE FOR THE */
+    /* LOADED MODULE */
+    /*------------------------------------------------------------------------*/
+    loaded_module->gsymtab = NULL;
+    loaded_module->gstrtab = NULL;
+    loaded_module->gsymnum = loaded_module->gstrsz = 0;
+
+    /*------------------------------------------------------------------------*/
+    /* Initialize the Array_List of dependencies. */
+    /*------------------------------------------------------------------------*/
+    AL_initialize(&(loaded_module->dependencies), sizeof(int), 1);
+
+    if (dyn_module->symtab)
+        DLSYM_copy_globals(dyn_module);
+
+    /*------------------------------------------------------------------------*/
+    /* Initialize the module loaded segments Array_List. */
+    /*------------------------------------------------------------------------*/
+    AL_initialize(&(loaded_module->loaded_segments),
+                  sizeof(DLIMP_Loaded_Segment), dyn_module->phnum);
+
+    /*------------------------------------------------------------------------*/
+    /* Spin thru segment headers and process each load segment encountered. */
+    /*------------------------------------------------------------------------*/
+    for (i = 0; i < dyn_module->phnum; i++)
+    {
+        /*---------------------------------------------------------------------*/
+        /* Note that this is parallel to and does not supplant the ELF */
+        /* phdr tables. */
+        /*---------------------------------------------------------------------*/
+        DLIMP_Loaded_Segment seg;
+
+        if (dyn_module->phdr[i].p_type != PT_LOAD) continue;
+
+        /*---------------------------------------------------------------------*/
+        /* The segment record is appended by value, so clear it first: any */
+        /* member not explicitly assigned below is then zero instead of */
+        /* whatever happened to be on the stack. */
+        /*---------------------------------------------------------------------*/
+        memset(&seg, 0, sizeof(seg));
+
+        seg.obj_desc = DLIF_malloc(sizeof(struct DLOAD_MEMORY_SEGMENT));
+        if (seg.obj_desc == NULL)
+        {
+            DLIF_error(DLET_MISC,
+                       "Unable to allocate segment descriptor.\n");
+            break;
+        }
+
+        seg.phdr.p_vaddr = dyn_module->phdr[i].p_vaddr;
+        seg.phdr.p_offset = dyn_module->phdr[i].p_offset;
+        seg.obj_desc->target_page = 0; /*not used*/
+        seg.modified = 0;
+        seg.phdr.p_filesz = seg.obj_desc->objsz_in_bytes
+                          = dyn_module->phdr[i].p_filesz;
+        seg.phdr.p_memsz = seg.obj_desc->memsz_in_bytes
+                         = dyn_module->phdr[i].p_memsz;
+        seg.phdr.p_align = dyn_module->phdr[i].p_align;
+        seg.phdr.p_flags = dyn_module->phdr[i].p_flags;
+        AL_append(&(loaded_module->loaded_segments), &seg);
+    }
+
+    /*------------------------------------------------------------------------*/
+    /* Initialize the DSO termination information for this module. */
+    /* It will be copied over from the enclosing dyn_module object when */
+    /* placement is completed and dyn_module's local copy of the dynamic */
+    /* table is updated. */
+    /*------------------------------------------------------------------------*/
+    loaded_module->fini_array = (Elf32_Addr) NULL;
+    loaded_module->fini_arraysz = 0;
+    loaded_module->fini = (Elf32_Addr) NULL;
+
+#if LOADER_DEBUG || LOADER_PROFILE
+    if (debugging_on || profiling_on)
+    {
+        DLIF_trace("Finished initialize_loaded_module()\n");
+        if (profiling_on)
+        {
+            profile_stop_clock();
+            DLIF_trace("Took %lu cycles.\n",
+                       (unsigned long)profile_cycle_count());
+        }
+    }
+#endif
+
+}
+
+/*****************************************************************************/
+/* load_static_segment()                                                     */
+/*                                                                           */
+/*    Load every segment of a statically linked executable at the address    */
+/*    recorded in its program header.  For each loaded segment a non-        */
+/*    relocatable memory request is built and passed to DLIF_allocate();     */
+/*    the first request that is refused ends the load with FALSE.  A         */
+/*    segment that has file content (p_filesz != 0) is then passed to        */
+/*    DLIF_copy() and DLIF_write().  Returns TRUE when all segments have     */
+/*    been processed.                                                        */
+/*                                                                           */
+/*****************************************************************************/
+static BOOL load_static_segment(DLOAD_HANDLE handle, LOADER_FILE_DESC *fd,
+                                DLIMP_Dynamic_Module *dyn_module)
+{
+   LOADER_OBJECT        *loader   = (LOADER_OBJECT *)handle;
+   DLIMP_Loaded_Segment *segments = (DLIMP_Loaded_Segment *)
+                         (dyn_module->loaded_module->loaded_segments.buf);
+   int idx = 0;
+
+   while (idx < dyn_module->loaded_module->loaded_segments.size)
+   {
+      DLIMP_Loaded_Segment        *cur = &segments[idx];
+      struct DLOAD_MEMORY_REQUEST  request;
+
+      /*---------------------------------------------------------------------*/
+      /* Fill in the segment descriptor and the client memory request.       */
+      /* The address requested is the segment's own p_vaddr.                 */
+      /*---------------------------------------------------------------------*/
+      cur->obj_desc->target_page    = 0;
+      cur->obj_desc->target_address = (TARGET_ADDRESS)cur->phdr.p_vaddr;
+
+      request.flags = 0;
+      if (cur->phdr.p_flags & PF_X) { request.flags |= DLOAD_SF_executable; }
+      if (cur->phdr.p_flags & PF_W) { request.flags |= DLOAD_SF_writable;   }
+
+      /*---------------------------------------------------------------------*/
+      /* A static executable cannot move: clear the relocatable flag.        */
+      /*---------------------------------------------------------------------*/
+      request.flags &= ~DLOAD_SF_relocatable;
+
+      request.fp          = fd;
+      request.segment     = cur->obj_desc;
+      request.offset      = cur->phdr.p_offset;
+      request.align       = cur->phdr.p_align;
+      request.flip_endian = dyn_module->wrong_endian;
+
+      /*---------------------------------------------------------------------*/
+      /* Ask the client for the target memory; give up on refusal.           */
+      /*---------------------------------------------------------------------*/
+      if (!DLIF_allocate(loader->client_handle, &request)) { return FALSE; }
+
+      /*---------------------------------------------------------------------*/
+      /* Only segments with file content are copied and written out.         */
+      /* NOTE(review): any status from DLIF_copy()/DLIF_write() is not       */
+      /* examined here.                                                      */
+      /*---------------------------------------------------------------------*/
+      if (cur->phdr.p_filesz != 0)
+      {
+         DLIF_copy(loader->client_handle, &request);
+         DLIF_write(loader->client_handle, &request);
+      }
+
+      idx++;
+   }
+
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* relocate_target_dynamic_tag_info()                                        */
+/*                                                                           */
+/*    Forward the dynamic tag at index i to the current target's             */
+/*    relocate_dynamic_tag_info() handler (via cur_target) and return        */
+/*    that handler's result unchanged.                                       */
+/*****************************************************************************/
+static BOOL relocate_target_dynamic_tag_info(DLIMP_Dynamic_Module *dyn_module,
+                                             int i)
+{
+   BOOL tag_handled = cur_target->relocate_dynamic_tag_info(dyn_module, i);
+
+   return tag_handled;
+}
+
+/*****************************************************************************/
+/* DLIMP_update_dyntag_section_address()                                     */
+/*                                                                           */
+/*    Rebase the d_ptr value of dynamic tag i.  A value of zero is left      */
+/*    alone and reported as success.  Otherwise the loaded segments are      */
+/*    searched for the one whose original range [input_vaddr,                */
+/*    input_vaddr + p_memsz) holds the value; the value is then shifted      */
+/*    by (p_vaddr - input_vaddr) and TRUE is returned.  FALSE means no       */
+/*    segment contained the value.                                           */
+/*                                                                           */
+/*****************************************************************************/
+BOOL DLIMP_update_dyntag_section_address(DLIMP_Dynamic_Module *dyn_module,
+                                         int32_t i)
+{
+   DLIMP_Loaded_Segment *segments = (DLIMP_Loaded_Segment *)
+                         (dyn_module->loaded_module->loaded_segments.buf);
+   Elf32_Addr tag_addr = dyn_module->dyntab[i].d_un.d_ptr;
+   int        idx;
+
+   /*------------------------------------------------------------------------*/
+   /* A zero tag value does not refer to a section: nothing to update.       */
+   /*------------------------------------------------------------------------*/
+   if (tag_addr == (Elf32_Addr)0) return TRUE;
+
+   for (idx = 0; idx < dyn_module->loaded_module->loaded_segments.size; idx++)
+   {
+      /*---------------------------------------------------------------------*/
+      /* Skip segments whose original address range misses the value.        */
+      /*---------------------------------------------------------------------*/
+      if (tag_addr < segments[idx].input_vaddr) continue;
+      if (tag_addr >= (segments[idx].input_vaddr +
+                       segments[idx].phdr.p_memsz)) continue;
+
+      /*---------------------------------------------------------------------*/
+      /* Shift the tag by the distance this segment was moved.               */
+      /*---------------------------------------------------------------------*/
+      dyn_module->dyntab[i].d_un.d_ptr +=
+                  (segments[idx].phdr.p_vaddr - segments[idx].input_vaddr);
+      return TRUE;
+   }
+
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* relocate_dynamic_tag_info()                                               */
+/*                                                                           */
+/*    Walk the dynamic table up to its DT_NULL entry once segments have      */
+/*    been placed.  Tags holding a section address (DT_PREINIT_ARRAY,        */
+/*    DT_INIT, DT_INIT_ARRAY, DT_FINI, DT_FINI_ARRAY) are rebased; the       */
+/*    termination values are also copied into the loaded module.  Tags       */
+/*    not listed here go to the target specific handler.  Returns FALSE      */
+/*    as soon as any update fails, TRUE otherwise.  fd is not used.          */
+/*                                                                           */
+/*****************************************************************************/
+static BOOL relocate_dynamic_tag_info(LOADER_FILE_DESC *fd,
+                                      DLIMP_Dynamic_Module *dyn_module)
+{
+   int tag_idx;
+
+   /*------------------------------------------------------------------------*/
+   /* Visit each dynamic table entry up to the terminating DT_NULL.          */
+   /*------------------------------------------------------------------------*/
+   for (tag_idx = 0; dyn_module->dyntab[tag_idx].d_tag != DT_NULL; tag_idx++)
+   {
+      switch (dyn_module->dyntab[tag_idx].d_tag)
+      {
+         /*------------------------------------------------------------------*/
+         /* Tags whose value is not a section address need no update.        */
+         /*------------------------------------------------------------------*/
+         case DT_NEEDED:      case DT_PLTRELSZ:  case DT_HASH:
+         case DT_STRTAB:      case DT_SYMTAB:    case DT_RELA:
+         case DT_RELASZ:      case DT_RELAENT:   case DT_STRSZ:
+         case DT_SYMENT:      case DT_SONAME:    case DT_RPATH:
+         case DT_SYMBOLIC:    case DT_REL:       case DT_RELSZ:
+         case DT_RELENT:      case DT_PLTREL:    case DT_DEBUG:
+         case DT_TEXTREL:     case DT_BIND_NOW:  case DT_INIT_ARRAYSZ:
+         case DT_RUNPATH:     case DT_FLAGS:     case DT_PREINIT_ARRAYSZ:
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* NOTE: DT_ENCODING shares its id with DT_PREINIT_ARRAY.           */
+         /*------------------------------------------------------------------*/
+
+         /*------------------------------------------------------------------*/
+         /* Generic tags holding a section address: move them to the         */
+         /* section's address in target memory.                              */
+         /*------------------------------------------------------------------*/
+         case DT_PREINIT_ARRAY:
+         case DT_INIT:
+         case DT_INIT_ARRAY:
+            if (!DLIMP_update_dyntag_section_address(dyn_module, tag_idx))
+            {
+               return FALSE;
+            }
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* Termination tags are rebased and then saved in the loaded        */
+         /* module object (dyn_module is deleted before unload time).        */
+         /*------------------------------------------------------------------*/
+         case DT_FINI:
+            if (!DLIMP_update_dyntag_section_address(dyn_module, tag_idx))
+            {
+               return FALSE;
+            }
+            dyn_module->loaded_module->fini =
+                                    dyn_module->dyntab[tag_idx].d_un.d_ptr;
+            break;
+
+         case DT_FINI_ARRAY:
+            if (!DLIMP_update_dyntag_section_address(dyn_module, tag_idx))
+            {
+               return FALSE;
+            }
+            dyn_module->loaded_module->fini_array =
+                                    dyn_module->dyntab[tag_idx].d_un.d_ptr;
+            break;
+
+         case DT_FINI_ARRAYSZ:
+            dyn_module->loaded_module->fini_arraysz =
+                                    dyn_module->dyntab[tag_idx].d_un.d_val;
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* Left untouched: unclear whether this is a virtual address.       */
+         /*------------------------------------------------------------------*/
+         case DT_JMPREL:
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* Everything else is handed to the target specific handler,        */
+         /* which is expected to reject generic tags it does not know.       */
+         /*------------------------------------------------------------------*/
+         default:
+            if (!relocate_target_dynamic_tag_info(dyn_module, tag_idx))
+            {
+               return FALSE;
+            }
+            break;
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Every tag was handled without incident.                                */
+   /*------------------------------------------------------------------------*/
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* allocate_dynamic_segments_and_relocate_symbols()                          */
+/*                                                                           */
+/*    Allocate target memory for each segment in this module through         */
+/*    DLIF_allocate(), requesting a relocatable placement.  The program      */
+/*    entry point, the program header table, the loaded segment record       */
+/*    and the symbol tables are then adjusted by each segment's own          */
+/*    offset, and finally the dynamic table is updated.  Returns FALSE       */
+/*    if an allocation or the dynamic table update fails, else TRUE.         */
+/*                                                                           */
+/*---------------------------------------------------------------------------*/
+/*                                                                           */
+/* The relocation entries in the ELF file do not handle the necessary        */
+/* adjustments to the memory addresses in the program header or symbol       */
+/* tables.  These must be done manually.                                     */
+/*                                                                           */
+/* Because segments can be placed independently of one another, a            */
+/* single load offset is not enough: addresses are adjusted segment          */
+/* by segment.                                                               */
+/*                                                                           */
+/*****************************************************************************/
+static BOOL allocate_dynamic_segments_and_relocate_symbols
+                                             (DLOAD_HANDLE handle,
+                                              LOADER_FILE_DESC *fd,
+                                              DLIMP_Dynamic_Module *dyn_module)
+{
+   int i,j;
+   DLIMP_Loaded_Segment* seg = (DLIMP_Loaded_Segment*)
+                             (dyn_module->loaded_module->loaded_segments.buf);
+   struct Elf32_Ehdr *fhdr = &(dyn_module->fhdr);
+   LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+
+#if LOADER_DEBUG || LOADER_PROFILE
+   if (debugging_on || profiling_on)
+   {
+      DLIF_trace("Dynamic executable found.\n"
+                 "Starting allocate_dynamic_segments_and_relocate_symbols()"
+                 "...\n");
+      if (profiling_on) profile_start_clock();
+   }
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* Spin through the list of loaded segments from the current module.      */
+   /*------------------------------------------------------------------------*/
+   for (i = 0; i < dyn_module->loaded_module->loaded_segments.size; i++)
+   {
+      /*---------------------------------------------------------------------*/
+      /* Allocate target memory for segment via client-provided target       */
+      /* memory API.                                                         */
+      /*---------------------------------------------------------------------*/
+      int32_t addr_offset;
+      struct DLOAD_MEMORY_REQUEST targ_req;
+      seg[i].obj_desc->target_page = 0;
+      targ_req.flags = 0;
+      if (seg[i].phdr.p_flags & PF_X) targ_req.flags |= DLOAD_SF_executable;
+      if (seg[i].phdr.p_flags & PF_W) targ_req.flags |= DLOAD_SF_writable;
+
+      targ_req.align = 0x20;
+      seg[i].obj_desc->target_address = (TARGET_ADDRESS)seg[i].phdr.p_vaddr;
+      targ_req.flags |= DLOAD_SF_relocatable;
+      targ_req.fp = fd;
+      targ_req.segment = seg[i].obj_desc;
+      targ_req.offset = seg[i].phdr.p_offset;
+      targ_req.flip_endian = dyn_module->wrong_endian;
+
+      if (!DLIF_allocate(pHandle->client_handle, &targ_req))
+      {
+         DLIF_error(DLET_MEMORY, "DLIF allocation failure.\n");
+         return FALSE;
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* Calculate the offset we need to adjust segment header and symbol    */
+      /* table addresses.                                                    */
+      /*---------------------------------------------------------------------*/
+      addr_offset = (int32_t)(seg[i].obj_desc->target_address) -
+                                               (int32_t)(seg[i].phdr.p_vaddr);
+
+#if LOADER_DEBUG
+      if (debugging_on)
+      {
+         DLIF_trace("Segment %d (at 0x%x, 0x%x bytes) relocated to 0x%x\n", i,
+                    (int32_t)(seg[i].phdr.p_vaddr),
+                    (int32_t)(seg[i].phdr.p_memsz),
+                    (int32_t)(seg[i].obj_desc->target_address));
+         DLIF_trace( "Addr Offset is 0x%x\n", addr_offset);
+      }
+#endif
+
+      /*---------------------------------------------------------------------*/
+      /* Update program entry point if needed.  Need to replace to deal      */
+      /* with full ELF initialization routine.                               */
+      /*---------------------------------------------------------------------*/
+      if (dyn_module->relocate_entry_point &&
+          fhdr->e_entry >= (Elf32_Addr)(seg[i].phdr.p_vaddr) &&
+          fhdr->e_entry <
+                   (Elf32_Addr)(seg[i].phdr.p_vaddr + seg[i].phdr.p_memsz))
+      {
+#if LOADER_DEBUG
+         if (debugging_on)
+         {
+            DLIF_trace("Entry point 0x%x relocated to 0x%x\n",
+                       fhdr->e_entry, fhdr->e_entry + addr_offset);
+         }
+#endif
+         fhdr->e_entry += addr_offset;
+
+         /*------------------------------------------------------------------*/
+         /* Mark the entry point as being relocated so we will not do it     */
+         /* again.                                                           */
+         /*------------------------------------------------------------------*/
+         dyn_module->relocate_entry_point = FALSE;
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* Fix program header entries in segment and Elf32_Phdr structs.       */
+      /* Both address fields of the matching entry (index j) are moved;      */
+      /* the segment index i is not a program header table index.            */
+      /*---------------------------------------------------------------------*/
+      for (j = 0; j < fhdr->e_phnum; j++)
+         if (dyn_module->phdr[j].p_vaddr == (Elf32_Addr)seg[i].phdr.p_vaddr)
+         {
+            dyn_module->phdr[j].p_vaddr += addr_offset;
+            dyn_module->phdr[j].p_paddr += addr_offset;
+            break;
+         }
+
+      /*---------------------------------------------------------------------*/
+      /* Remember the pre-relocation address, then move the segment.         */
+      /*---------------------------------------------------------------------*/
+      seg[i].input_vaddr = (Elf32_Addr)(seg[i].phdr.p_vaddr);
+      seg[i].phdr.p_vaddr += addr_offset;
+
+      /*---------------------------------------------------------------------*/
+      /* Great, now the hard part: fix offsets in symbols.  It would be nice */
+      /* if there were an easier way to deal with this.                      */
+      /*---------------------------------------------------------------------*/
+      {
+         struct Elf32_Sym *gsymtab =
+                    ((struct Elf32_Sym*)(dyn_module->loaded_module->gsymtab));
+         Elf32_Addr segment_start = (Elf32_Addr)seg[i].phdr.p_vaddr;
+         Elf32_Addr segment_end   = (Elf32_Addr)seg[i].phdr.p_vaddr +
+                                                          seg[i].phdr.p_memsz;
+         Elf32_Word global_index  = dyn_module->symnum -
+                                            dyn_module->loaded_module->gsymnum;
+
+         for (j = 0; j < dyn_module->symnum; j++)
+         {
+            /*---------------------------------------------------------------*/
+            /* Get the relocated symbol value.                               */
+            /*---------------------------------------------------------------*/
+            Elf32_Addr symval_adj = dyn_module->symtab[j].st_value +
+                                                                  addr_offset;
+
+            /*---------------------------------------------------------------*/
+            /* If the symbol is defined in this segment, update the symbol   */
+            /* value and mark the symbol so that we don't relocate it again. */
+            /*---------------------------------------------------------------*/
+            if (symval_adj >= segment_start && symval_adj <  segment_end &&
+                dyn_module->symtab[j].st_shndx != INT16_MAX)
+            {
+               dyn_module->symtab[j].st_value = symval_adj;
+
+               /*------------------------------------------------------------*/
+               /* The module symbol table only has the global symbols.       */
+               /*------------------------------------------------------------*/
+               if (j >= global_index)
+                  gsymtab[j-global_index].st_value = symval_adj;
+
+               /*------------------------------------------------------------*/
+               /* Mark the symbol as relocated.                              */
+               /*------------------------------------------------------------*/
+               dyn_module->symtab[j].st_shndx = INT16_MAX;
+            }
+         }
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Update dynamic tag information.  Some dynamic tags have values which   */
+   /* are virtual addresses of sections.  These values need to be updated    */
+   /* once segment allocation is completed and the new segment addresses are */
+   /* known.                                                                 */
+   /*------------------------------------------------------------------------*/
+   /* We should only traverse through the dynamic table once because we want */
+   /* to avoid the possibility of updating the same tag multiple times (an   */
+   /* error, if it happens).                                                 */
+   /*------------------------------------------------------------------------*/
+   if (!relocate_dynamic_tag_info(fd, dyn_module))
+   {
+      DLIF_error(DLET_MISC, "Failed dynamic table update.\n");
+      return FALSE;
+   }
+
+#if LOADER_DEBUG || LOADER_PROFILE
+   if (debugging_on || profiling_on)
+   {
+      DLIF_trace("Finished allocate_dynamic_segments_and_relocate_symbols()\n");
+      if (profiling_on)
+      {
+         profile_stop_clock();
+         DLIF_trace("Took %lu cycles.\n", (unsigned long) profile_cycle_count());
+      }
+   }
+#endif
+
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* delete_DLIMP_Loaded_Module()                                              */
+/*                                                                           */
+/*    Release the target memory of every segment attached to *pplm via       */
+/*    DLIF_release(), free the host copies of its name, symbol and           */
+/*    string tables and segment/dependency lists, free the module            */
+/*    object itself and set *pplm to NULL.  A NULL pplm or *pplm is          */
+/*    ignored.                                                               */
+/*                                                                           */
+/*****************************************************************************/
+static void delete_DLIMP_Loaded_Module(DLOAD_HANDLE handle,
+                                       DLIMP_Loaded_Module **pplm)
+{
+   DLIMP_Loaded_Module  *loaded_module;
+   DLIMP_Loaded_Segment *segments;
+   LOADER_OBJECT        *pHandle = (LOADER_OBJECT *)handle;
+   int i;
+
+   /*------------------------------------------------------------------------*/
+   /* Nothing to do when there is no loaded module to destroy.               */
+   /*------------------------------------------------------------------------*/
+   if (!pplm || (*pplm == NULL)) return;
+
+   loaded_module = *pplm;
+   segments      = (DLIMP_Loaded_Segment*)(loaded_module->loaded_segments.buf);
+
+   /*------------------------------------------------------------------------*/
+   /* Spin through the segments attached to this loaded module, freeing up   */
+   /* any target memory that was allocated by the client for the segment.    */
+   /*------------------------------------------------------------------------*/
+   for (i = 0; i < loaded_module->loaded_segments.size; i++)
+   {
+      if (!DLIF_release(pHandle->client_handle, segments[i].obj_desc))
+         DLIF_error(DLET_MISC, "Failed call to DLIF_release!\n");
+      DLIF_free(segments[i].obj_desc);
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Clear the application handle if it refers to this module, marking      */
+   /* the base image as no longer available.                                 */
+   /*------------------------------------------------------------------------*/
+   if (loaded_module->file_handle == DLIMP_application_handle)
+      DLIMP_application_handle = 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Free host heap memory that was allocated for the internal loaded       */
+   /* module data structure members.                                         */
+   /*------------------------------------------------------------------------*/
+   if (loaded_module->name)    DLIF_free(loaded_module->name);
+   if (loaded_module->gsymtab) DLIF_free(loaded_module->gsymtab);
+   loaded_module->gsymnum = 0;
+   if (loaded_module->gstrtab) DLIF_free(loaded_module->gstrtab);
+   loaded_module->gstrsz = 0;
+   AL_destroy(&(loaded_module->loaded_segments));
+   AL_destroy(&(loaded_module->dependencies));
+
+   /*------------------------------------------------------------------------*/
+   /* Finally, free the host memory for the loaded module object, then NULL  */
+   /* the pointer that was passed in.                                        */
+   /*------------------------------------------------------------------------*/
+   DLIF_free(loaded_module);
+   *pplm = NULL;
+}
+
+/*****************************************************************************/
+/* new_DLIMP_Dynamic_Module()                                                */
+/*                                                                           */
+/*    Allocate a dynamic module data structure from host memory with         */
+/*    DLIF_malloc() and set its members to their default values; fd is       */
+/*    stored in the new object.  Returns the new object, or NULL when        */
+/*    DLIF_malloc() returns NULL.  The fhdr member is not initialized        */
+/*    here.                                                                  */
+/*                                                                           */
+/*****************************************************************************/
+static DLIMP_Dynamic_Module *new_DLIMP_Dynamic_Module(LOADER_FILE_DESC *fd)
+{
+   /*------------------------------------------------------------------------*/
+   /* Allocate space for dynamic module data structure from host memory.     */
+   /*------------------------------------------------------------------------*/
+   DLIMP_Dynamic_Module *dyn_module =
+          (DLIMP_Dynamic_Module *)DLIF_malloc(sizeof(DLIMP_Dynamic_Module));
+
+   /*------------------------------------------------------------------------*/
+   /* Report an allocation failure to the caller rather than writing         */
+   /* through a NULL pointer below.                                          */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module == NULL) return NULL;
+
+   /*------------------------------------------------------------------------*/
+   /* Initialize data members of the new dynamic module data structure.      */
+   /*------------------------------------------------------------------------*/
+   dyn_module->name = NULL;
+   dyn_module->fd = fd;
+   dyn_module->phdr = NULL;
+   dyn_module->phnum = 0;
+   dyn_module->strtab = NULL;
+   dyn_module->strsz = 0;
+   dyn_module->dyntab = NULL;
+   dyn_module->symtab = NULL;
+   dyn_module->symnum = 0;
+   dyn_module->gsymtab_offset = 0;
+   dyn_module->gstrtab_offset = 0;
+   dyn_module->c_args = NULL;
+   dyn_module->argc = 0;
+   dyn_module->argv = NULL;
+   dyn_module->loaded_module = NULL;
+   dyn_module->wrong_endian = 0;
+   dyn_module->direct_dependent_only = TRUE;
+   dyn_module->relocatable = FALSE;
+   dyn_module->relocate_entry_point = TRUE;
+
+   dyn_module->dsbt_size = 0;
+   dyn_module->dsbt_index = DSBT_INDEX_INVALID;
+   dyn_module->dsbt_base_tagidx = -1;
+
+   dyn_module->preinit_array_idx = -1;
+   dyn_module->preinit_arraysz = 0;
+   dyn_module->init_idx = -1;
+   dyn_module->init_array_idx = -1;
+   dyn_module->init_arraysz = 0;
+
+   return dyn_module;
+}
+
+/*****************************************************************************/
+/* detach_loaded_module()                                                    */
+/*                                                                           */
+/*    Unhook the loaded module from dyn_module and return it.  After         */
+/*    the call dyn_module->loaded_module is NULL, so destroying the          */
+/*    dynamic module no longer destroys the loaded module.  Returns          */
+/*    NULL if dyn_module is NULL or has no loaded module attached.           */
+/*                                                                           */
+/*****************************************************************************/
+static
+DLIMP_Loaded_Module *detach_loaded_module(DLIMP_Dynamic_Module *dyn_module)
+{
+   DLIMP_Loaded_Module *detached;
+
+   if (!dyn_module)                return NULL;
+   if (!dyn_module->loaded_module) return NULL;
+
+   detached = dyn_module->loaded_module;
+   dyn_module->loaded_module = NULL;
+
+   return detached;
+}
+/*****************************************************************************/
+/* delete_DLIMP_Dynamic_Module()                                             */
+/*                                                                           */
+/*    Free the local copies of the name, string table, symbol table,         */
+/*    program header table and dynamic table, delete a loaded module         */
+/*    that is still attached, then free the object and NULL *ppdm.           */
+/*    A NULL ppdm or *ppdm is reported and DLIF_exit(1) is called.           */
+/*                                                                           */
+/*****************************************************************************/
+static void delete_DLIMP_Dynamic_Module(DLOAD_HANDLE handle,
+                                        DLIMP_Dynamic_Module **ppdm)
+{
+   DLIMP_Dynamic_Module *doomed = NULL;
+
+   /*------------------------------------------------------------------------*/
+   /* A missing module pointer is an internal error: report and exit.        */
+   /*------------------------------------------------------------------------*/
+   if (ppdm == NULL || *ppdm == NULL)
+   {
+      DLIF_error(DLET_MISC,
+                 "Internal Error: invalid argument to dynamic module "
+                 "destructor function; aborting loader\n");
+      DLIF_exit(1);
+   }
+
+   doomed = *ppdm;
+
+   /*------------------------------------------------------------------------*/
+   /* Drop the local table copies that were actually allocated.              */
+   /*------------------------------------------------------------------------*/
+   if (doomed->name   != NULL) { DLIF_free(doomed->name);   }
+   if (doomed->strtab != NULL) { DLIF_free(doomed->strtab); }
+   if (doomed->symtab != NULL) { DLIF_free(doomed->symtab); }
+   if (doomed->phdr   != NULL) { DLIF_free(doomed->phdr);   }
+   if (doomed->dyntab != NULL) { DLIF_free(doomed->dyntab); }
+
+   /*------------------------------------------------------------------------*/
+   /* A loaded module that is still attached is destroyed as well;           */
+   /* a successful load is expected to have detached it beforehand.          */
+   /*------------------------------------------------------------------------*/
+   if (doomed->loaded_module != NULL)
+   {
+      delete_DLIMP_Loaded_Module(handle, &(doomed->loaded_module));
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Release the dynamic module object itself and NULL the caller's         */
+   /* pointer.                                                               */
+   /*------------------------------------------------------------------------*/
+   DLIF_free(doomed);
+   *ppdm = NULL;
+}
+
+/*****************************************************************************/
+/* file_header_magic_number_is_valid()                                       */
+/*                                                                           */
+/*    Check the EI_MAG0..EI_MAG3 identification bytes of a file header       */
+/*    against ELFMAG0..ELFMAG3.  Returns TRUE only when all four bytes       */
+/*    match; otherwise an error is reported and FALSE is returned.           */
+/*                                                                           */
+/*****************************************************************************/
+static BOOL file_header_magic_number_is_valid(struct Elf32_Ehdr* header)
+{
+   /*------------------------------------------------------------------------*/
+   /* Each byte is compared with '!='.  Writing '!byte == MAGIC' would       */
+   /* negate the byte first, and the result (0 or 1) can only equal a        */
+   /* magic value of 0 or 1, so such a test cannot reject bad headers.       */
+   /*------------------------------------------------------------------------*/
+   if (header->e_ident[EI_MAG0] != ELFMAG0 ||
+       header->e_ident[EI_MAG1] != ELFMAG1 ||
+       header->e_ident[EI_MAG2] != ELFMAG2 ||
+       header->e_ident[EI_MAG3] != ELFMAG3)
+   {
+      DLIF_error(DLET_FILE, "Invalid ELF magic number.\n");
+      return FALSE;
+   }
+
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* file_header_machine_is_valid()                                            */
+/*                                                                           */
+/*    Report whether e_machine names a target this build supports:           */
+/*    EM_ARM when ARM_TARGET is defined, EM_TI_C6000 when C60_TARGET         */
+/*    is defined.  Returns TRUE for a supported machine, else FALSE.         */
+/*****************************************************************************/
+static int file_header_machine_is_valid(Elf32_Half e_machine)
+{
+#ifdef ARM_TARGET
+   if (e_machine == EM_ARM) { return TRUE; }
+#endif
+#ifdef C60_TARGET
+   if (e_machine == EM_TI_C6000) { return TRUE; }
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* Any other machine value is not supported by this build.                */
+   /*------------------------------------------------------------------------*/
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* is_valid_elf_object_file()                                                */
+/*                                                                           */
+/*    Sanity check the tables read from an ELF file against the file         */
+/*    size: string table size, symbol count, program header table size       */
+/*    and the file extent of every segment.  Also requires a non-empty       */
+/*    name when a dynamic table exists, filesz <= memsz for PT_LOAD          */
+/*    segments, and a symbol table for ET_DYN files.  Returns TRUE if        */
+/*    all checks pass, FALSE otherwise.                                      */
+/*                                                                           */
+/*****************************************************************************/
+static BOOL is_valid_elf_object_file(LOADER_FILE_DESC *fd,
+                                     DLIMP_Dynamic_Module *dyn_module)
+{
+   uint32_t fsz;
+   int i;
+
+   /*------------------------------------------------------------------------*/
+   /* Get file size.                                                         */
+   /*------------------------------------------------------------------------*/
+   DLIF_fseek(fd, 0, LOADER_SEEK_END);
+   fsz = DLIF_ftell(fd);
+
+   /*------------------------------------------------------------------------*/
+   /* Check for invalid table sizes (string table, symbol table, and         */
+   /* program header tables).                                                */
+   /* NOTE(review): symnum is compared as an entry count, not a byte         */
+   /* size -- confirm whether it should be scaled by the entry size.         */
+   /*------------------------------------------------------------------------*/
+   if (!((dyn_module->strsz < fsz) &&
+         (dyn_module->symnum < fsz) &&
+         (dyn_module->phnum * sizeof(struct Elf32_Phdr)) < fsz))
+   {
+      DLIF_error(DLET_FILE, "Invalid ELF table bounds.\n");
+      return FALSE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* A file with dynamic information must carry a non-empty name.  A        */
+   /* NULL name is treated like an empty one instead of being passed         */
+   /* to strcmp().                                                           */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module->dyntab &&
+       (dyn_module->name == NULL || !strcmp(dyn_module->name, "")))
+   {
+      DLIF_error(DLET_MISC, "Dynamic file lacks SO_NAME identifier.\n");
+      return FALSE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Check for invalid program header information.                          */
+   /*------------------------------------------------------------------------*/
+   for (i = 0; i < dyn_module->phnum; i++)
+   {
+      struct Elf32_Phdr* phdr = dyn_module->phdr + i;
+
+      /*---------------------------------------------------------------------*/
+      /* Sanity check for relative sizes of filesz and memsz.                */
+      /*---------------------------------------------------------------------*/
+      if (!(phdr->p_type != PT_LOAD || phdr->p_filesz <= phdr->p_memsz))
+      {
+         DLIF_error(DLET_MISC,
+                    "Invalid file or memory size for segment %d.\n", i);
+         return FALSE;
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* Check that the segment's file image lies inside the file.  The      */
+      /* test is phrased as a subtraction so that a huge p_offset or         */
+      /* p_filesz cannot wrap the sum around and slip past the check.        */
+      /*---------------------------------------------------------------------*/
+      if (!(phdr->p_offset < fsz && phdr->p_filesz < fsz - phdr->p_offset))
+      {
+         DLIF_error(DLET_FILE,
+                  "File location of segment %d is past the end of file.\n", i);
+         return FALSE;
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* An ET_DYN file must come with a symbol table.                          */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module->fhdr.e_type == ET_DYN && !dyn_module->symtab) return FALSE;
+
+   /*------------------------------------------------------------------------*/
+   /* All checks passed.                                                     */
+   /*------------------------------------------------------------------------*/
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* process_eiosabi()                                                         */
+/*                                                                           */
+/*    Pass dyn_module to the current target's process_eiosabi() handler      */
+/*    (via cur_target) and return that handler's result unchanged.           */
+/*****************************************************************************/
+static BOOL process_eiosabi(DLIMP_Dynamic_Module* dyn_module)
+{
+   BOOL osabi_ok = cur_target->process_eiosabi(dyn_module);
+
+   return osabi_ok;
+}
+
+/*****************************************************************************/
+/* dload_file_header()                                                       */
+/*                                                                           */
+/*    Read the ELF file header from fd into dyn_module->fhdr, byte swap      */
+/*    it when the file's data encoding differs from DLIMP_get_endian(),      */
+/*    and check the magic number, machine and file type.  Returns FALSE      */
+/*    if the header cannot be read in full or fails a check, else TRUE.      */
+/*                                                                           */
+/*****************************************************************************/
+static BOOL dload_file_header(LOADER_FILE_DESC *fd,
+                              DLIMP_Dynamic_Module *dyn_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* Read ELF file header from given input file.  A short read would        */
+   /* leave the header only partly filled in, so stop right here.            */
+   /* Assumes DLIF_fread() returns the item count like fread() --            */
+   /* TODO confirm against the client interface.                             */
+   /*------------------------------------------------------------------------*/
+   if (DLIF_fread(&(dyn_module->fhdr), sizeof(struct Elf32_Ehdr), 1, fd) != 1)
+   {
+      DLIF_error(DLET_FILE, "Failed to read ELF file header.\n");
+      return FALSE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Determine target vs. host endian-ness.  Does header data need to be    */
+   /* byte swapped?                                                          */
+   /*------------------------------------------------------------------------*/
+   dyn_module->wrong_endian =
+                     (dyn_module->fhdr.e_ident[EI_DATA] != DLIMP_get_endian());
+
+   /*------------------------------------------------------------------------*/
+   /* Swap file header structures, if needed.                                */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module->wrong_endian)
+      DLIMP_change_ehdr_endian(&(dyn_module->fhdr));
+
+   /*------------------------------------------------------------------------*/
+   /* Write out magic ELF information for debug purposes.                    */
+   /*------------------------------------------------------------------------*/
+#if LOADER_DEBUG
+   if (debugging_on)
+   {
+      DLIF_trace("ELF: %c%c%c\n", dyn_module->fhdr.e_ident[1],
+                                  dyn_module->fhdr.e_ident[2],
+                                  dyn_module->fhdr.e_ident[3]);
+      DLIF_trace("ELF file header entry point: %x\n",
+                 dyn_module->fhdr.e_entry);
+   }
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* Verify magic numbers in ELF file header.                               */
+   /*------------------------------------------------------------------------*/
+   if (!file_header_magic_number_is_valid(&(dyn_module->fhdr)))
+   {
+      DLIF_error(DLET_FILE, "Invalid ELF file header magic number.\n");
+      return FALSE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Verify that the target machine is one this loader supports.            */
+   /*------------------------------------------------------------------------*/
+   if (!file_header_machine_is_valid(dyn_module->fhdr.e_machine))
+   {
+      DLIF_error(DLET_FILE, "Invalid ELF file target machine.\n");
+      return FALSE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Verify file is an executable or dynamic shared object or library.      */
+   /*------------------------------------------------------------------------*/
+   if ((dyn_module->fhdr.e_type != ET_EXEC) &&
+       (dyn_module->fhdr.e_type != ET_DYN))
+   {
+      DLIF_error(DLET_FILE, "Invalid ELF file type.\n");
+      return FALSE;
+   }
+
+#if LOADER_DEBUG || LOADER_PROFILE
+   /*------------------------------------------------------------------------*/
+   /* Stop profiling clock when file header information has finished         */
+   /* loading.  Re-start clock on initialization of symbol table, and        */
+   /* dynamic table pointers.                                                */
+   /*------------------------------------------------------------------------*/
+   if (debugging_on || profiling_on)
+   {
+      DLIF_trace("done.\n");
+      if (profiling_on)
+      {
+         profile_stop_clock();
+         DLIF_trace("Took %lu cycles.\n",
+                    (unsigned long)profile_cycle_count());
+         profile_start_clock();
+      }
+   }
+#endif
+
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* dload_program_header_table() */
+/* */
+/* Make a local copy of the ELF object file's program header table in the */
+/* dynamic module data structure. */
+/* */
+/*****************************************************************************/
+static void dload_program_header_table(LOADER_FILE_DESC *fd,
+                                       DLIMP_Dynamic_Module *dyn_module)
+{
+   struct Elf32_Ehdr *fhdr = &(dyn_module->fhdr);
+   size_t table_size   = (size_t)fhdr->e_phnum * fhdr->e_phentsize;
+   size_t entries_read = 0;
+
+   /*------------------------------------------------------------------------*/
+   /* The table is indexed below as an array of struct Elf32_Phdr, so an     */
+   /* entry size that disagrees with the structure size cannot be handled.   */
+   /* Leave the module with an empty table in that case.                     */
+   /*------------------------------------------------------------------------*/
+   if (fhdr->e_phnum != 0 && fhdr->e_phentsize != sizeof(struct Elf32_Phdr))
+   {
+      DLIF_error(DLET_FILE, "Unexpected ELF program header entry size.\n");
+      dyn_module->phdr  = NULL;
+      dyn_module->phnum = 0;
+      return;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Allocate host memory for the local copy of the table.  This function   */
+   /* cannot report failure to its caller, so on failure it records an empty */
+   /* table; later stages then see no segments instead of dereferencing a    */
+   /* NULL pointer.                                                          */
+   /*------------------------------------------------------------------------*/
+   dyn_module->phdr = (struct Elf32_Phdr*)(DLIF_malloc(table_size));
+   if (dyn_module->phdr == NULL)
+   {
+      if (table_size != 0)
+         DLIF_error(DLET_MEMORY,
+                    "Failed to allocate ELF program header table.\n");
+      dyn_module->phnum = 0;
+      return;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Read the program header table from the object file.  Only entries that */
+   /* were completely read are exposed through phnum, so a truncated file    */
+   /* never causes uninitialized entries to be processed.                    */
+   /*------------------------------------------------------------------------*/
+   DLIF_fseek(fd, fhdr->e_phoff, LOADER_SEEK_SET);
+   entries_read = DLIF_fread(dyn_module->phdr, fhdr->e_phentsize,
+                             fhdr->e_phnum, fd);
+   if (entries_read != fhdr->e_phnum)
+      DLIF_error(DLET_FILE, "Truncated ELF program header table.\n");
+   dyn_module->phnum = entries_read;
+
+   /*------------------------------------------------------------------------*/
+   /* Byte swap the program header tables if the target endian-ness is not   */
+   /* the same as the host endian-ness.                                      */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module->wrong_endian)
+   {
+      int i;
+      for (i = 0; i < dyn_module->phnum; i++)
+         DLIMP_change_phdr_endian(dyn_module->phdr + i);
+   }
+}
+
+/*****************************************************************************/
+/* dload_headers() */
+/* */
+/* Read ELF object file header and program header table information into */
+/* the given dynamic module data structure. If the object file contains */
+/* dynamic information, read in the dynamic tags, dynamic symbol table, */
+/* and global string table. Check to make sure that we are not already */
+/* in the process of loading the module (circular dependencies), then */
+/* perform some level of sanity checking on the content of the file to */
+/* provide some assurance that the file is not corrupted. */
+/* */
+/*****************************************************************************/
+static BOOL dload_headers(LOADER_FILE_DESC *fd,
+                          DLIMP_Dynamic_Module *dyn_module)
+{
+#if LOADER_DEBUG || LOADER_PROFILE
+   /*------------------------------------------------------------------------*/
+   /* Announce this step and, when profiling, start the cycle counter.       */
+   /*------------------------------------------------------------------------*/
+   if (debugging_on || profiling_on)
+   {
+      DLIF_trace("\nReading file headers ...\n");
+      if (profiling_on)
+         profile_start_clock();
+   }
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* The file header must be read and validated before anything else; the   */
+   /* program header table and target selection both depend on its fields.   */
+   /*------------------------------------------------------------------------*/
+   if (dload_file_header(fd, dyn_module))
+   {
+      /*---------------------------------------------------------------------*/
+      /* Keep a local copy of the program header table in the module object. */
+      /*---------------------------------------------------------------------*/
+      dload_program_header_table(fd, dyn_module);
+
+      /*---------------------------------------------------------------------*/
+      /* Select the virtual target from e_machine; it provides access to the */
+      /* target specific functions used during the rest of the load.         */
+      /*---------------------------------------------------------------------*/
+      cur_target = get_vt_obj(dyn_module->fhdr.e_machine);
+      if (cur_target)
+         return TRUE;
+
+      DLIF_error(DLET_FILE, "Attempt to load invalid ELF file, '%s'.\n",
+                 dyn_module->name);
+   }
+
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* find_dynamic_segment() */
+/* */
+/* Find the dynamic segment in the given ELF object file, if there is */
+/* one. If the segment is found, then the segment ID output parameter */
+/* is set to the index of the dynamic segment in the program header */
+/* table. If the dynamic segment is not found, the dynamic module's */
+/* relocatable flag is set to FALSE, and return FALSE. */
+/* */
+/*****************************************************************************/
+static BOOL find_dynamic_segment(DLIMP_Dynamic_Module *dyn_module,
+                                 Elf32_Word *dyn_seg_idx)
+{
+   int seg = 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Both the module and the output slot for the segment index are          */
+   /* required.  A missing argument is an internal error: report it and ask  */
+   /* the client to exit.                                                    */
+   /*------------------------------------------------------------------------*/
+   if (!dyn_module || !dyn_seg_idx)
+   {
+      DLIF_error(DLET_MISC, "Internal error: find_dynamic_segment() needs "
+                            "non-NULL arguments.\n");
+      DLIF_exit(1);
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Walk the program headers; the first PT_DYNAMIC entry wins.  Finding    */
+   /* one marks the module as relocatable.                                   */
+   /*------------------------------------------------------------------------*/
+   while (seg < dyn_module->phnum)
+   {
+      if (dyn_module->phdr[seg].p_type == PT_DYNAMIC)
+      {
+         dyn_module->relocatable = TRUE;
+         *dyn_seg_idx = seg;
+         return TRUE;
+      }
+      seg++;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* No dynamic segment found: mark the object module as not relocatable.   */
+   /* No diagnostic is emitted here.                                         */
+   /*------------------------------------------------------------------------*/
+   dyn_module->relocatable = FALSE;
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* copy_dynamic_table() */
+/* */
+/* Make a local copy of the dynamic table read from the dynamic segment */
+/* in the ELF object file. */
+/* */
+/*****************************************************************************/
+static void copy_dynamic_table(LOADER_FILE_DESC *fd,
+                               DLIMP_Dynamic_Module *dyn_module,
+                               Elf32_Word dyn_seg_idx)
+{
+   const struct Elf32_Phdr *dyn_seg = &dyn_module->phdr[dyn_seg_idx];
+   Elf32_Word num_elem = dyn_seg->p_filesz / sizeof(struct Elf32_Dyn);
+   Elf32_Word i;
+
+   /*------------------------------------------------------------------------*/
+   /* Reserve one entry more than the file provides and zero the whole       */
+   /* buffer.  DT_NULL is tag value 0 in the ELF specification, so the extra */
+   /* entry (and any entry left unread by a short read) acts as a table      */
+   /* terminator even when the file itself lacks one.  The resulting size is */
+   /* never smaller than p_filesz.                                           */
+   /*------------------------------------------------------------------------*/
+   size_t table_bytes = ((size_t)num_elem + 1) * sizeof(struct Elf32_Dyn);
+
+   /*------------------------------------------------------------------------*/
+   /* Allocate space for the dynamic table from host memory.  This function  */
+   /* has no way to return failure; it reports the error and leaves dyntab   */
+   /* NULL.                                                                  */
+   /*------------------------------------------------------------------------*/
+   dyn_module->dyntab = DLIF_malloc(table_bytes);
+   if (dyn_module->dyntab == NULL)
+   {
+      DLIF_error(DLET_MEMORY, "Failed to allocate dynamic table.\n");
+      return;
+   }
+   memset(dyn_module->dyntab, 0, table_bytes);
+
+   /*------------------------------------------------------------------------*/
+   /* Read the table content from the dynamic segment of the ELF file.       */
+   /*------------------------------------------------------------------------*/
+   DLIF_fseek(fd, dyn_seg->p_offset, LOADER_SEEK_SET);
+   DLIF_fread(dyn_module->dyntab, sizeof(struct Elf32_Dyn), num_elem, fd);
+
+   /*------------------------------------------------------------------------*/
+   /* If necessary, byte swap each entry in the dynamic table.  The zeroed   */
+   /* terminator entry is left alone; it is identical in either byte order.  */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module->wrong_endian)
+   {
+      for (i = 0; i < num_elem; i++)
+         DLIMP_change_dynent_endian(&dyn_module->dyntab[i]);
+   }
+}
+
+/*****************************************************************************/
+/* process_target_dynamic_tag() */
+/* */
+/* Process a target specific dynamic tag entry. Returns TRUE if the tag */
+/* was handled and FALSE if it was not recognized. */
+/*****************************************************************************/
+static BOOL process_target_dynamic_tag(DLIMP_Dynamic_Module* dyn_module, int i)
+{
+   /*------------------------------------------------------------------------*/
+   /* Forward the dynamic table entry at index i to the handler supplied by  */
+   /* the currently selected virtual target and pass its verdict back.       */
+   /*------------------------------------------------------------------------*/
+   BOOL tag_was_handled = cur_target->process_dynamic_tag(dyn_module, i);
+
+   return tag_was_handled;
+}
+
+/*****************************************************************************/
+/* process_dynamic_table() */
+/* */
+/* Process dynamic tag entries from the dynamic table. At the conclusion */
+/* of this function, we should have made a copy of the global symbols */
+/* and the global symbol names. */
+/* */
+/*****************************************************************************/
+static BOOL process_dynamic_table(LOADER_FILE_DESC *fd,
+                                  DLIMP_Dynamic_Module *dyn_module)
+{
+   int        i;
+   BOOL       soname_found  = FALSE;
+   Elf32_Addr soname_offset = 0;
+   Elf32_Addr strtab_offset = 0;
+   Elf32_Addr hash_offset   = 0;
+   Elf32_Addr symtab_offset = 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Without a local copy of the dynamic table there is nothing to process. */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module->dyntab == NULL)
+   {
+      DLIF_error(DLET_MEMORY, "Dynamic table is not available.\n");
+      return FALSE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Iterate over the dynamic table in order to process dynamic tags.       */
+   /* See ELF TIS Specification for details on the meaning of each dynamic   */
+   /* tag.  The C6000 ELF ABI Specification provides more details about the  */
+   /* TI specific C6000 ELF ABI tags.  Offsets are only recorded during this */
+   /* pass; the tables themselves are read once the pass is complete, so the */
+   /* order in which the tags appear does not matter.                        */
+   /*------------------------------------------------------------------------*/
+   for (i = 0; dyn_module->dyntab[i].d_tag != DT_NULL; i++)
+   {
+      switch(dyn_module->dyntab[i].d_tag)
+      {
+         /*------------------------------------------------------------------*/
+         /* DT_SONAME: Contains name of dynamic object, used for dependency  */
+         /*            comparisons.  Its value is an offset from the start   */
+         /*            of the string table.  The string at this offset is    */
+         /*            copied into dyn_module->name further below.           */
+         /*------------------------------------------------------------------*/
+         case DT_SONAME:
+#if LOADER_DEBUG
+            if (debugging_on) DLIF_trace("Found SO_NAME.\n");
+#endif
+            /*---------------------------------------------------------------*/
+            /* We store the offset of the so_name in the dynamic string      */
+            /* table so that it doesn't matter which dynamic tag we see      */
+            /* first (DT_SONAME actually is generated before DT_STRTAB).     */
+            /*---------------------------------------------------------------*/
+            soname_found = TRUE;
+            soname_offset = dyn_module->dyntab[i].d_un.d_ptr;
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* DT_STRSZ: Contains the size of the string table.                 */
+         /*------------------------------------------------------------------*/
+         case DT_STRSZ:
+            dyn_module->strsz = dyn_module->dyntab[i].d_un.d_val;
+
+#if LOADER_DEBUG
+            if (debugging_on)
+               DLIF_trace("Found string table Size: 0x%x\n", dyn_module->strsz);
+#endif
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* DT_STRTAB: Contains the file offset of the string table.  The    */
+         /*            offset is recorded here; the table is allocated and   */
+         /*            read from the file after all tags have been seen.     */
+         /*------------------------------------------------------------------*/
+         case DT_STRTAB:
+            strtab_offset = dyn_module->dyntab[i].d_un.d_ptr;
+#if LOADER_DEBUG
+            if (debugging_on)
+               DLIF_trace("Found string table: 0x%x\n", strtab_offset);
+#endif
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* DT_HASH: Contains the file offset of the symbol hash table.      */
+         /*------------------------------------------------------------------*/
+         case DT_HASH:
+            hash_offset = dyn_module->dyntab[i].d_un.d_ptr;
+#if LOADER_DEBUG
+            if (debugging_on)
+               DLIF_trace("Found symbol hash table: 0x%x\n", hash_offset);
+#endif
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* DT_SYMTAB: Contains the file offset of the symbol table.         */
+         /*------------------------------------------------------------------*/
+         case DT_SYMTAB:
+            symtab_offset = dyn_module->dyntab[i].d_un.d_ptr;
+#if LOADER_DEBUG
+            if (debugging_on)
+               DLIF_trace("Found symbol table: 0x%x\n", symtab_offset);
+#endif
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* DSO Initialization / Termination Model Dynamic Tags              */
+         /*------------------------------------------------------------------*/
+         /* For initialization tags, we store indices and array sizes in     */
+         /* the dyn_module.  Termination works a little different, the       */
+         /* indices into the local copy of the dynamic table are stored in   */
+         /* dyn_module, but the DT_FINI_ARRAYSZ value is recorded with the   */
+         /* loaded module.                                                   */
+         /*------------------------------------------------------------------*/
+         /* After placement is done, the DT_FINI and DT_FINI_ARRAY values    */
+         /* need to be copied from the local dynamic table into the loaded   */
+         /* module object.                                                   */
+         /*------------------------------------------------------------------*/
+         case DT_PREINIT_ARRAY:
+            dyn_module->preinit_array_idx = i;
+            break;
+
+         case DT_PREINIT_ARRAYSZ:
+            dyn_module->preinit_arraysz = dyn_module->dyntab[i].d_un.d_val;
+            break;
+
+         case DT_INIT:
+            dyn_module->init_idx = i;
+            break;
+
+         case DT_INIT_ARRAY:
+            dyn_module->init_array_idx = i;
+            break;
+
+         case DT_INIT_ARRAYSZ:
+            dyn_module->init_arraysz = dyn_module->dyntab[i].d_un.d_val;
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* This information will be copied over to the loaded module        */
+         /* object after placement has been completed and the information    */
+         /* in the dynamic table has been relocated.                         */
+         /*------------------------------------------------------------------*/
+         case DT_FINI_ARRAY:
+         case DT_FINI_ARRAYSZ:
+         case DT_FINI:
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* Not a generic tag handled above: give the current target a       */
+         /* chance to process it.  A tag nobody recognizes is not an error;  */
+         /* it is only traced in debug builds.                               */
+         /*------------------------------------------------------------------*/
+         default:
+         {
+            if (!process_target_dynamic_tag(dyn_module, i))
+            {
+#if LOADER_DEBUG
+               if (debugging_on)
+                  DLIF_trace("Unrecognized dynamic tag: 0x%X\n",
+                             dyn_module->dyntab[i].d_tag);
+#endif
+            }
+
+            break;
+         }
+
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* The string table is mandatory: both its offset and size must be known. */
+   /*------------------------------------------------------------------------*/
+   if (!strtab_offset || !dyn_module->strsz)
+   {
+      DLIF_warning(DLWT_MISC,
+                   "Mandatory dynamic tag DT_STRTAB/DT_STRSZ not found!\n");
+      return FALSE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Read the string table from the ELF object file.  The buffer is zeroed  */
+   /* first and its final byte is forced to NUL so that every string lookup  */
+   /* within the table terminates, even for a truncated or malformed file.   */
+   /*------------------------------------------------------------------------*/
+   dyn_module->strtab = DLIF_malloc(dyn_module->strsz);
+   if (dyn_module->strtab == NULL)
+   {
+      DLIF_error(DLET_MEMORY, "Failed to allocate dynamic string table.\n");
+      return FALSE;
+   }
+   memset(dyn_module->strtab, 0, dyn_module->strsz);
+   DLIF_fseek(fd, strtab_offset, LOADER_SEEK_SET);
+   DLIF_fread(dyn_module->strtab, sizeof(uint8_t), dyn_module->strsz, fd);
+   dyn_module->strtab[dyn_module->strsz - 1] = '\0';
+
+   /*------------------------------------------------------------------------*/
+   /* If symbol hash table is found read-in the hash table.                  */
+   /*------------------------------------------------------------------------*/
+   if (hash_offset)
+   {
+      /*---------------------------------------------------------------------*/
+      /* Hash table has the following format.  nchain equals the number of   */
+      /* entries in the symbol table (symnum)                                */
+      /*                                                                     */
+      /*             +----------------------------+                          */
+      /*             |          nbucket           |                          */
+      /*             +----------------------------+                          */
+      /*             |          nchain            |                          */
+      /*             +----------------------------+                          */
+      /*             |         bucket[0]          |                          */
+      /*             |           ...              |                          */
+      /*             |     bucket[nbucket-1]      |                          */
+      /*             +----------------------------+                          */
+      /*             |          chain[0]          |                          */
+      /*             |           ...              |                          */
+      /*             |       chain[nchain-1]      |                          */
+      /*             +----------------------------+                          */
+      /*---------------------------------------------------------------------*/
+      Elf32_Word hash_nbucket = 0;
+      Elf32_Word hash_nchain  = 0;
+
+      /*---------------------------------------------------------------------*/
+      /* Seek to the hash offset and read first two words into nbucket and   */
+      /* nchain.  Both start at zero so a short read yields an empty symbol  */
+      /* table instead of an indeterminate count.                            */
+      /*---------------------------------------------------------------------*/
+      DLIF_fseek(fd, hash_offset, LOADER_SEEK_SET);
+      DLIF_fread(&(hash_nbucket), sizeof(Elf32_Word), 1, fd);
+      DLIF_fread(&(hash_nchain), sizeof(Elf32_Word), 1, fd);
+      if (dyn_module->wrong_endian)
+      {
+         DLIMP_change_endian32((int32_t*)(&(hash_nbucket)));
+         DLIMP_change_endian32((int32_t*)(&(hash_nchain)));
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* The number of entries in the dynamic symbol table is not encoded    */
+      /* anywhere in the elf file.  However, the nchain is guaranteed to be  */
+      /* the same as the number of symbols.  Use nchain to set the symnum.   */
+      /*---------------------------------------------------------------------*/
+      dyn_module->symnum = hash_nchain;
+#if LOADER_DEBUG
+      if (debugging_on) DLIF_trace("symnum=%d\n", hash_nchain);
+#endif
+   }
+   else
+   {
+      DLIF_warning(DLWT_MISC, "Mandatory dynamic tag DT_HASH is not found!\n");
+      return FALSE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Read dynamic symbol table.                                             */
+   /*------------------------------------------------------------------------*/
+   if (symtab_offset)
+   {
+      int j = 0;
+      DLIF_fseek(fd, symtab_offset, LOADER_SEEK_SET);
+      dyn_module->symtab =
+                 DLIF_malloc(dyn_module->symnum * sizeof(struct Elf32_Sym));
+      if (dyn_module->symtab == NULL && dyn_module->symnum != 0)
+      {
+         DLIF_error(DLET_MEMORY, "Failed to allocate dynamic symbol table.\n");
+         return FALSE;
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* Zero the table before reading so that entries missing from a        */
+      /* truncated file are well defined instead of uninitialized.           */
+      /*---------------------------------------------------------------------*/
+      if (dyn_module->symtab != NULL)
+         memset(dyn_module->symtab, 0,
+                dyn_module->symnum * sizeof(struct Elf32_Sym));
+
+      DLIF_fread(dyn_module->symtab, sizeof(struct Elf32_Sym),
+                 dyn_module->symnum, fd);
+      if (dyn_module->wrong_endian)
+      {
+         for (j = 0; j < dyn_module->symnum; j++)
+            DLIMP_change_sym_endian(dyn_module->symtab + j);
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* The st_name field of an Elf32_Sym entity is an offset into the      */
+      /* string table.  Convert it into a pointer to the string.  The string */
+      /* table is known to be present here; its absence returned above.      */
+      /*                                                                     */
+      /* NOTE(review): the pointer is stored in an Elf32_Word, which loses   */
+      /* bits on a host whose pointers are wider than 32 bits.  Fixing that  */
+      /* requires changing how st_name is consumed elsewhere - confirm.      */
+      /*---------------------------------------------------------------------*/
+      for (j = 0; j < dyn_module->symnum; j++)
+         dyn_module->symtab[j].st_name += (Elf32_Word) dyn_module->strtab;
+   }
+   else
+   {
+      DLIF_warning(DLWT_MISC,
+                   "Mandatory dynamic tag DT_SYMTAB is not found!\n");
+      return FALSE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Read the SONAME.  A module without one gets an empty name.             */
+   /*------------------------------------------------------------------------*/
+   if (!soname_found)
+   {
+      DLIF_warning(DLWT_MISC, "Dynamic tag DT_SONAME is not found!\n");
+      dyn_module->name = DLIF_malloc(sizeof(char));
+      if (dyn_module->name == NULL)
+      {
+         DLIF_error(DLET_MEMORY, "Failed to allocate module name.\n");
+         return FALSE;
+      }
+      *dyn_module->name = '\0';
+   }
+   else
+   {
+      /*---------------------------------------------------------------------*/
+      /* The name offset comes straight from the file; reject one that       */
+      /* points outside the string table before it is used as a string.      */
+      /*---------------------------------------------------------------------*/
+      if (soname_offset >= dyn_module->strsz)
+      {
+         DLIF_error(DLET_FILE,
+                    "DT_SONAME offset is outside the dynamic string table.\n");
+         return FALSE;
+      }
+
+      dyn_module->name =
+                DLIF_malloc(strlen(dyn_module->strtab + soname_offset) + 1);
+      if (dyn_module->name == NULL)
+      {
+         DLIF_error(DLET_MEMORY, "Failed to allocate module name.\n");
+         return FALSE;
+      }
+      strcpy(dyn_module->name, dyn_module->strtab + soname_offset);
+
+#if LOADER_DEBUG
+      if (debugging_on)
+         DLIF_trace("Name of dynamic object: %s\n", dyn_module->name);
+#endif
+   }
+
+   return TRUE;
+}
+
+
+/*****************************************************************************/
+/* dload_dynamic_information() */
+/* */
+/* Given a dynamic module with a dynamic segment which is located via */
+/* given dynamic segment index, make a local copy of the dynamic table */
+/* in the dynamic module object, then process the dynamic tag entries in */
+/* the table. */
+/* */
+/*****************************************************************************/
+static BOOL dload_dynamic_information(LOADER_FILE_DESC *fd,
+                                      DLIMP_Dynamic_Module *dyn_module,
+                                      Elf32_Word dyn_seg_idx)
+{
+   BOOL tags_ok;
+
+   /*------------------------------------------------------------------------*/
+   /* Step 1: pull a copy of the dynamic table out of the segment found at   */
+   /* dyn_seg_idx and attach it to the dynamic module object.                */
+   /*------------------------------------------------------------------------*/
+   copy_dynamic_table(fd, dyn_module, dyn_seg_idx);
+
+   /*------------------------------------------------------------------------*/
+   /* Step 2: interpret the entries of that table.  The outcome of the tag   */
+   /* processing is the outcome of this function.                            */
+   /*------------------------------------------------------------------------*/
+   tags_ok = process_dynamic_table(fd, dyn_module);
+
+   return tags_ok;
+}
+
+/*****************************************************************************/
+/* check_circular_dependency() */
+/* */
+/* Determine whether a dynamic module is already in the process of being */
+/* loaded before we try to start loading it again. If it is already */
+/* being loaded, then the dynamic loader has detected a circular */
+/* dependency. An error will be emitted and the load will be aborted. */
+/* */
+/*****************************************************************************/
+static BOOL check_circular_dependency(DLOAD_HANDLE handle,
+                                      const char *dyn_mod_name)
+{
+   /*------------------------------------------------------------------------*/
+   /* The handle's dependency list holds the names of the modules that are   */
+   /* in the middle of being loaded.  Finding dyn_mod_name among them means  */
+   /* a circular dependency: report it and fail.                             */
+   /*------------------------------------------------------------------------*/
+   LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+   char **in_progress = (char**)(loader->DLIMP_module_dependency_list.buf);
+   int idx = 0;
+
+   while (idx < loader->DLIMP_module_dependency_list.size)
+   {
+      if (strcmp(dyn_mod_name, in_progress[idx]) == 0)
+      {
+         DLIF_error(DLET_MISC,
+                    "Circular dependency detected, '%s' is already in the "
+                    "process of loading.\n", dyn_mod_name);
+         return FALSE;
+      }
+      idx++;
+   }
+
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* dload_dynamic_segment() */
+/* */
+/* Find the dynamic segment in the given ELF module, if there is one. */
+/* If there is a dynamic segment, then make a local copy of the dynamic */
+/* table in the dynamic module object provided, then process the dynamic */
+/* tag entries in the table. */
+/* */
+/* If there is no dynamic segment, then we return success from this */
+/* function, marking the dynamic module as "not relocatable". */
+/* */
+/*****************************************************************************/
+static BOOL dload_dynamic_segment(DLOAD_HANDLE handle,
+                                  LOADER_FILE_DESC *fd,
+                                  DLIMP_Dynamic_Module *dyn_module)
+{
+   Elf32_Word dyn_seg_idx = 0;
+
+   /*------------------------------------------------------------------------*/
+   /* A module without a dynamic segment is a static executable.  The search */
+   /* has already cleared the relocatable flag in that case, and returning   */
+   /* TRUE lets the caller carry on with static loading.                     */
+   /*------------------------------------------------------------------------*/
+   if (!find_dynamic_segment(dyn_module, &dyn_seg_idx))
+      return TRUE;
+
+   /*------------------------------------------------------------------------*/
+   /* The OSABI is processed now, after it is known whether the module is    */
+   /* relocatable.                                                           */
+   /*------------------------------------------------------------------------*/
+   if (!process_eiosabi(dyn_module))
+   {
+      DLIF_error(DLET_FILE, "Unsupported EI_OSABI value.\n");
+      return FALSE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Read and process the dynamic table, then make sure this module is not  */
+   /* already being loaded.  If it is, loading it again would introduce a    */
+   /* circular dependency; the check reports that as an error.  The check is */
+   /* only reached when the dynamic information was processed successfully.  */
+   /*------------------------------------------------------------------------*/
+   return (dload_dynamic_information(fd, dyn_module, dyn_seg_idx) &&
+           check_circular_dependency(handle, dyn_module->name)) ? TRUE : FALSE;
+}
+
+/*****************************************************************************/
+/* COPY_SEGMENTS() - */
+/* */
+/* Copy all segments into host memory. */
+/*****************************************************************************/
+static void copy_segments(DLOAD_HANDLE handle, LOADER_FILE_DESC* fp,
+                          DLIMP_Dynamic_Module* dyn_module)
+{
+   LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+   DLIMP_Loaded_Segment *segments =
+       (DLIMP_Loaded_Segment*)(dyn_module->loaded_module->loaded_segments.buf);
+   int count = dyn_module->loaded_module->loaded_segments.size;
+   int idx;
+
+   for (idx = 0; idx < count; idx++)
+   {
+      DLIMP_Loaded_Segment *cur = &segments[idx];
+      struct DLOAD_MEMORY_REQUEST targ_req;
+
+      /*---------------------------------------------------------------------*/
+      /* Describe the segment to the client: where it lives in the file, its */
+      /* alignment, and its access flags derived from the program header.    */
+      /*---------------------------------------------------------------------*/
+      targ_req.fp      = fp;
+      targ_req.segment = cur->obj_desc;
+      targ_req.offset  = cur->phdr.p_offset;
+      targ_req.align   = cur->phdr.p_align;
+
+      targ_req.flags = DLOAD_SF_relocatable;
+      if (cur->phdr.p_flags & PF_X)
+         targ_req.flags |= DLOAD_SF_executable;
+      if (cur->phdr.p_flags & PF_W)
+         targ_req.flags |= DLOAD_SF_writable;
+
+      /*---------------------------------------------------------------------*/
+      /* Ask the client to copy the segment data from the file into a host   */
+      /* buffer where it can be relocated, and remember the host address the */
+      /* client filled into the request.                                     */
+      /* NOTE(review): any status returned by DLIF_copy() is not examined.   */
+      /*---------------------------------------------------------------------*/
+      DLIF_copy(loader->client_handle, &targ_req);
+      cur->host_address = targ_req.host_address;
+   }
+}
+
+/*****************************************************************************/
+/* WRITE_SEGMENTS() - */
+/* */
+/* Write all segments to target memory. */
+/*****************************************************************************/
+static void write_segments(DLOAD_HANDLE handle,
+                           LOADER_FILE_DESC* fp,
+                           DLIMP_Dynamic_Module* dyn_module)
+{
+   LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+   DLIMP_Loaded_Segment *segments =
+       (DLIMP_Loaded_Segment*)(dyn_module->loaded_module->loaded_segments.buf);
+   int count = dyn_module->loaded_module->loaded_segments.size;
+   int idx;
+
+   for (idx = 0; idx < count; idx++)
+   {
+      DLIMP_Loaded_Segment *cur = &segments[idx];
+      struct DLOAD_MEMORY_REQUEST targ_req;
+
+      /*---------------------------------------------------------------------*/
+      /* Build the same request description used when the segment was        */
+      /* copied, this time carrying the host buffer address recorded for it. */
+      /*---------------------------------------------------------------------*/
+      targ_req.fp           = fp;
+      targ_req.segment      = cur->obj_desc;
+      targ_req.offset       = cur->phdr.p_offset;
+      targ_req.align        = cur->phdr.p_align;
+      targ_req.host_address = cur->host_address;
+
+      targ_req.flags = DLOAD_SF_relocatable;
+      if (cur->phdr.p_flags & PF_X)
+         targ_req.flags |= DLOAD_SF_executable;
+      if (cur->phdr.p_flags & PF_W)
+         targ_req.flags |= DLOAD_SF_writable;
+
+      /*---------------------------------------------------------------------*/
+      /* Hand the request to the client so it can write the segment out.     */
+      /*---------------------------------------------------------------------*/
+      DLIF_write(loader->client_handle, &targ_req);
+   }
+}
+
+/*****************************************************************************/
+/* SEG_HAS_SPACE_FOR_WRITE() - */
+/* */
+/* Check if segment has enough space to receive contents of .args section. */
+/*****************************************************************************/
+static BOOL seg_has_space_for_write(DLIMP_Loaded_Module* lmodule, int sz)
+{
+   DLIMP_Loaded_Segment* seg =
+                       (DLIMP_Loaded_Segment*)(lmodule->loaded_segments.buf);
+   int s, seg_size = lmodule->loaded_segments.size;
+
+   /*------------------------------------------------------------------------*/
+   /* The write starts at the module's c_args address.                       */
+   /*------------------------------------------------------------------------*/
+   Elf32_Addr write_address = (Elf32_Addr)lmodule->c_args;
+
+   /*------------------------------------------------------------------------*/
+   /* A negative size can never be satisfied.  Rejecting it here also keeps  */
+   /* the unsigned comparison below meaningful.                              */
+   /*------------------------------------------------------------------------*/
+   if (sz < 0) return FALSE;
+
+   for (s=0; s<seg_size; s++)
+   {
+      Elf32_Addr seg_boundary =
+                     seg[s].phdr.p_vaddr + seg[s].obj_desc->memsz_in_bytes;
+
+      /*---------------------------------------------------------------------*/
+      /* If address to write to is at or above the segment address and below */
+      /* the segment end, it must lie in the current segment.                */
+      /*---------------------------------------------------------------------*/
+      if ((write_address >= seg[s].phdr.p_vaddr) &&
+          (write_address < seg_boundary))
+      {
+         /*------------------------------------------------------------------*/
+         /* Compare the size against the space remaining in the segment.     */
+         /* The subtraction cannot wrap because write_address is below       */
+         /* seg_boundary, whereas write_address + sz could wrap around the   */
+         /* 32-bit address space and wrongly appear to fit.                  */
+         /*------------------------------------------------------------------*/
+         if ((Elf32_Addr)sz > (seg_boundary - write_address))
+         {
+#if LOADER_DEBUG
+            if (debugging_on)
+            {
+               DLIF_trace("Write requires 0x%x bytes\n",write_address + sz);
+               DLIF_trace("Seg boundary at : 0x%x\n",seg_boundary);
+               DLIF_trace("WARNING - Not enough space in segment\n");
+            }
+#endif
+            return FALSE;
+         }
+
+         return TRUE;
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Given address doesn't belong to any known segment.                     */
+   /*------------------------------------------------------------------------*/
+   return FALSE;
+}
+
+
+/*****************************************************************************/
+/* DLOAD_initialize() */
+/* */
+/* Construct and initialize data structures internal to the dynamic */
+/* loader core. */
+/* */
+/*---------------------------------------------------------------------------*/
+/* */
+/* This function is deprecated, replaced by DLOAD_create(). */
+/* */
+/*****************************************************************************/
+void DLOAD_initialize(DLOAD_HANDLE handle)
+{
+   /*------------------------------------------------------------------------*/
+   /* Intentionally a no-op; the handle is not used.                         */
+   /*------------------------------------------------------------------------*/
+   (void)handle;
+}
+
+/*****************************************************************************/
+/* DLOAD_finalize() */
+/* */
+/* Destroy and finalize data structures internal to the dynamic */
+/* loader core. */
+/* */
+/*---------------------------------------------------------------------------*/
+/* */
+/* This function is deprecated, replaced by DLOAD_destroy(). */
+/* */
+/*****************************************************************************/
+void DLOAD_finalize(DLOAD_HANDLE handle)
+{
+   /*------------------------------------------------------------------------*/
+   /* Intentionally a no-op; the handle is not used.                         */
+   /*------------------------------------------------------------------------*/
+   (void)handle;
+}
+
+/*****************************************************************************/
+/* dload_static_executable() */
+/* */
+/* Account for target memory allocated to static executable and wrap up */
+/* loading. No relocation is necessary. */
+/* */
+/*****************************************************************************/
+static int32_t dload_static_executable(DLOAD_HANDLE handle,
+                                       LOADER_FILE_DESC *fd,
+                                       DLIMP_Dynamic_Module *dyn_module)
+{
+   int32_t file_handle = 0;
+   BOOL    loaded;
+
+#if LOADER_DEBUG
+   if (debugging_on) DLIF_trace("Starting dload_static_executable() ...\n");
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* The entry point of a static executable is taken directly from the      */
+   /* file header.  Then allocate target memory for the static segments and, */
+   /* only if that worked, load the object.                                  */
+   /*------------------------------------------------------------------------*/
+   dyn_module->loaded_module->entry_point = dyn_module->fhdr.e_entry;
+   loaded = load_static_segment(handle, fd, dyn_module) &&
+            load_object(fd, dyn_module);
+
+   if (!loaded)
+   {
+      /*---------------------------------------------------------------------*/
+      /* Static load failed.  Flag an error; the returned handle stays 0.    */
+      /*---------------------------------------------------------------------*/
+      DLIF_error(DLET_MEMORY,
+                 "Failed to allocate target memory for static executable.\n");
+   }
+   else
+   {
+      /*---------------------------------------------------------------------*/
+      /* Success: detach the loaded module object from the dynamic module    */
+      /* object that created it and take note of its file handle.            */
+      /*---------------------------------------------------------------------*/
+      DLIMP_Loaded_Module *detached = detach_loaded_module(dyn_module);
+      file_handle = detached->file_handle;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* The dynamic module object is destroyed on both paths.                  */
+   /*------------------------------------------------------------------------*/
+   delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+
+#if LOADER_DEBUG
+   if (debugging_on) DLIF_trace("Finished dload_static_executable()\n");
+#endif
+
+   return file_handle;
+}
+
+#if LOADER_DEBUG || LOADER_PROFILE
+/* Relocation statistics, only present in debug/profile builds.  They are    */
+/* reported as "Number of relocations" and "Total reloc time" and reset to   */
+/* zero by process_dynamic_module_relocations().  Presumably they are        */
+/* updated by the relocation code - confirm.                                 */
+int DLREL_relocations;
+time_t DLREL_total_reloc_time;
+#endif
+
+/*****************************************************************************/
+/* process_dynamic_module_relocations() */
+/* */
+/* Make a host-accessible copy of all of the segments, process all */
+/* relocation entries associated with the given module within that */
+/* space, then write the updated segment buffers back out to target */
+/* memory. */
+/* */
+/* handle - loader instance, passed through to every step. */
+/* fd - file the module's segments are copied from. */
+/* dyn_module - module whose relocation entries are processed. */
+/* */
+/* In LOADER_DEBUG / LOADER_PROFILE builds the relocation statistics are */
+/* traced and then reset to zero. */
+/* */
+/*****************************************************************************/
+static void process_dynamic_module_relocations(DLOAD_HANDLE handle,
+                                               LOADER_FILE_DESC *fd,
+                                               DLIMP_Dynamic_Module *dyn_module)
+{
+#if LOADER_DEBUG || LOADER_PROFILE
+   if (debugging_on || profiling_on)
+   {
+      DLIF_trace("Running relocate()...\n");
+      if (profiling_on) profile_start_clock();
+   }
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* Copy segments from file to host memory */
+   /*------------------------------------------------------------------------*/
+   copy_segments(handle, fd, dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Process dynamic relocations. */
+   /*------------------------------------------------------------------------*/
+   DLREL_relocate(handle, fd, dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Write segments from host memory to target memory */
+   /*------------------------------------------------------------------------*/
+   write_segments(handle, fd, dyn_module);
+
+#if LOADER_DEBUG || LOADER_PROFILE
+   /*------------------------------------------------------------------------*/
+   /* Report timing and progress information for relocation step. */
+   /*------------------------------------------------------------------------*/
+   if (debugging_on || profiling_on)
+   {
+      if (profiling_on)
+      {
+         /*------------------------------------------------------------------*/
+         /* time_t has no portable printf conversion, so every time value */
+         /* is cast to the exact type its conversion specifier expects. */
+         /* The division is guarded against a zero relocation count. */
+         /*------------------------------------------------------------------*/
+         long time_per_reloc = 0L;
+         if (DLREL_relocations)
+            time_per_reloc =
+               (long) (DLREL_total_reloc_time / DLREL_relocations);
+
+         profile_stop_clock();
+         DLIF_trace("Took %lu cycles.\n",
+                    (unsigned long) profile_cycle_count());
+         DLIF_trace("Total reloc time: %lu\n",
+                    (unsigned long) DLREL_total_reloc_time);
+         DLIF_trace("Time per relocation: %ld\n", time_per_reloc);
+      }
+
+      DLIF_trace("Number of relocations: %d\n", DLREL_relocations);
+      DLIF_trace("\nAbout to run load_object()...");
+      DLREL_total_reloc_time = DLREL_relocations = 0;
+      if (profiling_on) profile_start_clock();
+   }
+#endif
+
+}
+
+/*****************************************************************************/
+/* store_preinit_data() */
+/* */
+/* Given a dynamic module object, store pre-initialization function */
+/* information. The user may also provide a custom initialization */
+/* function that needs to be executed before the compiler */
+/* generated static initialization functions are executed. */
+/* The dynamic loader will now create a table TI_init_table to store */
+/* pre-init and init data. This is done because pre-init and */
+/* init functions could reference as-yet unrelocated symbols from other */
+/* modules. As such it is safer to store relevant function addresses and */
+/* execute them only after all modules are relocated (CQ34088). */
+/* */
+/* dyn_module - module whose preinit_arraysz / preinit_array_idx are read.*/
+/* */
+/* If the record cannot be allocated, a DLET_MEMORY error is reported and */
+/* nothing is queued. */
+/* */
+/*****************************************************************************/
+static void store_preinit_data(DLIMP_Dynamic_Module *dyn_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* Check for presence of DT_PREINIT_ARRAY and DT_PREINIT_ARRAYSZ */
+   /* dynamic tags associated with this module. The dyn_module object will */
+   /* hold the relevant indices into the local copy of the dynamic table. */
+   /* The value of the DT_PREINIT_ARRAY tag will have been updated after */
+   /* placement of the module was completed. Arrays of size 0 will be */
+   /* ignored (CQ36935). */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module->preinit_arraysz > 0)
+   {
+      IF_single_record *preinit_rec =
+         (IF_single_record *)DLIF_malloc(sizeof(IF_single_record));
+
+      /*---------------------------------------------------------------------*/
+      /* Never write through a failed allocation. */
+      /*---------------------------------------------------------------------*/
+      if (!preinit_rec)
+      {
+         DLIF_error(DLET_MEMORY,
+                    "Failed to allocate pre-initialization record.\n");
+         return;
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* Retrieve the address of the .preinit_array section from the value */
+      /* of the DT_PREINIT_ARRAY tag, and store it in the TI_init_table. */
+      /*---------------------------------------------------------------------*/
+      preinit_rec->size = dyn_module->preinit_arraysz;
+      preinit_rec->sect_addr = (TARGET_ADDRESS)
+         (dyn_module->dyntab[dyn_module->preinit_array_idx].d_un.d_ptr);
+
+      IF_table_enqueue(&TI_init_table, preinit_rec);
+   }
+}
+
+/*****************************************************************************/
+/* store_init_data() */
+/* */
+/* Given a dynamic module object, save off initialization function(s) for */
+/* all global and static data objects that are defined in the module */
+/* which require construction. The dynamic loader will now create a table */
+/* TI_init_table to store pre-init and init data. This is done because */
+/* pre-init and init functions could reference as-yet unrelocated symbols */
+/* from other modules. As such it is safer to store relevant function */
+/* addresses and execute them only after all modules are relocated. */
+/* */
+/* dyn_module - module whose init_idx, init_arraysz and init_array_idx */
+/* are read. */
+/* */
+/* A DT_INIT record is queued with size 0; a DT_INIT_ARRAY record is */
+/* queued with the array size. If a record cannot be allocated, a */
+/* DLET_MEMORY error is reported and no further records are queued. */
+/* */
+/*****************************************************************************/
+static void store_init_data(DLIMP_Dynamic_Module *dyn_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* Check for presence of a DT_INIT dynamic tag associated with this */
+   /* module. The dynamic module will hold the index into the local copy of */
+   /* the dynamic table. This entry in the dynamic table will have been */
+   /* updated after placement of the module is completed. */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module->init_idx != -1)
+   {
+      IF_single_record *init_rec =
+         (IF_single_record *)DLIF_malloc(sizeof(IF_single_record));
+
+      if (!init_rec)
+      {
+         DLIF_error(DLET_MEMORY,
+                    "Failed to allocate DT_INIT initialization record.\n");
+         return;
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* Retrieve the address of the initialization function from the value */
+      /* of the DT_INIT tag and queue it. A size of 0 marks the record as a */
+      /* single function rather than an array. */
+      /*---------------------------------------------------------------------*/
+      init_rec->size = 0;
+      init_rec->sect_addr = (TARGET_ADDRESS)
+         (dyn_module->dyntab[dyn_module->init_idx].d_un.d_ptr);
+
+      IF_table_enqueue(&TI_init_table, init_rec);
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Check for presence of a DT_INIT_ARRAY and DT_INIT_ARRAYSZ dynamic tags */
+   /* associated with this module. The dyn_module object will hold the */
+   /* relevant indices into the local copy of the dynamic table. The value */
+   /* of the DT_INIT_ARRAY tag will have been updated after placement of the */
+   /* module was completed. Arraysz must be a positive number > 0, else it */
+   /* will be ignored (CQ36935). */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module->init_arraysz > 0)
+   {
+      IF_single_record *arr_rec =
+         (IF_single_record *)DLIF_malloc(sizeof(IF_single_record));
+
+      if (!arr_rec)
+      {
+         DLIF_error(DLET_MEMORY,
+                    "Failed to allocate DT_INIT_ARRAY initialization record.\n");
+         return;
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* Retrieve the address of the .init_array section from the value of */
+      /* DT_INIT_ARRAY tag. */
+      /*---------------------------------------------------------------------*/
+      arr_rec->size = dyn_module->init_arraysz;
+      arr_rec->sect_addr = (TARGET_ADDRESS)
+         (dyn_module->dyntab[dyn_module->init_array_idx].d_un.d_ptr);
+
+      IF_table_enqueue(&TI_init_table, arr_rec);
+   }
+}
+
+/*****************************************************************************/
+/* execute_module_initialization() */
+/* */
+/* Execute the pre-initialization and initialization function(s) that */
+/* were saved off in the TI_init_table by store_preinit_data() and */
+/* store_init_data(). The table is walked from front to back. */
+/* */
+/* handle - loader instance; its client_handle is passed to DLIF_read() */
+/* and DLIF_execute(). */
+/* */
+/* If the host copy of an init array cannot be allocated, a DLET_MEMORY */
+/* error is reported and the remaining records are not executed. */
+/* */
+/* NOTE(review): records are not removed from TI_init_table here; */
+/* confirm the table is emptied elsewhere before the next load. */
+/* */
+/*****************************************************************************/
+static void execute_module_initialization(DLOAD_HANDLE handle)
+{
+   LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+   IF_table_Queue_Node *curr_ptr;
+
+   for (curr_ptr = TI_init_table.front_ptr; curr_ptr;
+        curr_ptr = curr_ptr->next_ptr)
+   {
+      IF_single_record *val = curr_ptr->value;
+
+      /*---------------------------------------------------------------------*/
+      /* A size of 0 indicates DT_INIT, otherwise this is an ARRAY. */
+      /*---------------------------------------------------------------------*/
+      if (val->size != 0)
+      {
+         /*------------------------------------------------------------------*/
+         /* Now make a loader-accessible copy of the .init_array section. */
+         /*------------------------------------------------------------------*/
+         int32_t i;
+         int32_t num_init_fcns = val->size/sizeof(TARGET_ADDRESS);
+         TARGET_ADDRESS *init_array_buf = (TARGET_ADDRESS *)
+                                                      DLIF_malloc(val->size);
+
+         /*------------------------------------------------------------------*/
+         /* Without a buffer there is nothing to read into. Stop rather */
+         /* than run later initializers out of order. */
+         /*------------------------------------------------------------------*/
+         if (!init_array_buf)
+         {
+            DLIF_error(DLET_MEMORY,
+                       "Failed to allocate host copy of initialization array.\n");
+            return;
+         }
+
+         DLIF_read(pHandle->client_handle,
+                   init_array_buf, 1, val->size,
+                   (TARGET_ADDRESS)val->sect_addr);
+
+         /*------------------------------------------------------------------*/
+         /* Call each function whose address occupies an entry in array in */
+         /* the order that they appear in the array. The number of entries */
+         /* is derived from the record size. Make sure that function */
+         /* addresses are valid before execution. */
+         /*------------------------------------------------------------------*/
+         for (i = 0; i < num_init_fcns; i++)
+         {
+            if (init_array_buf[i])
+               DLIF_execute(pHandle->client_handle,
+                            (TARGET_ADDRESS)(init_array_buf[i]));
+            else
+               DLIF_warning(DLWT_MISC,
+                  "DT_INIT_ARRAY/DT_PREINIT_ARRAY function address is NULL!");
+         }
+
+         DLIF_free(init_array_buf);
+      }
+      else
+      {
+         if (val->sect_addr)
+            DLIF_execute(pHandle->client_handle,
+                         (TARGET_ADDRESS)(val->sect_addr));
+         else
+            DLIF_warning(DLWT_MISC, "DT_INIT function address is NULL!");
+      }
+   }
+}
+
+/*****************************************************************************/
+/* adjust_module_init_fini() */
+/* When the loaded application handles its own initialization and */
+/* termination, zero out the module's init, pre-init and fini */
+/* information so that the dynamic loader leaves them alone. */
+/*****************************************************************************/
+static void adjust_module_init_fini(DLIMP_Dynamic_Module *dm)
+{
+   DLIMP_Loaded_Module *loaded;
+
+   /*------------------------------------------------------------------------*/
+   /* Only executables are candidates. */
+   /*------------------------------------------------------------------------*/
+   if (dm->fhdr.e_type != ET_EXEC) return;
+
+   /*------------------------------------------------------------------------*/
+   /* The C6x RTS boot routine _c_int00 walks .init_array itself and deals */
+   /* with termination through the at_exit functionality, so an executable */
+   /* that includes it needs no help from the loader. */
+   /* NOTE: _c_int00 is not in the dynamic symbol table, so the presence of */
+   /* __TI_STACK_SIZE is used as a stand-in. This is a hack; the */
+   /* right fix is for the linker not to generate the init array */
+   /* tags if the build includes the RTS boot routine. */
+   /*------------------------------------------------------------------------*/
+   if (!DLSYM_lookup_local_symtab("__TI_STACK_SIZE", dm->symtab, dm->symnum,
+                                  NULL))
+      return;
+
+   /*------------------------------------------------------------------------*/
+   /* Array sizes drop to zero and the dynamic table indices become -1. */
+   /*------------------------------------------------------------------------*/
+   dm->init_arraysz = 0;
+   dm->init_array_idx = -1;
+
+   dm->preinit_arraysz = 0;
+   dm->preinit_array_idx = -1;
+
+   loaded = dm->loaded_module;
+   loaded->fini_arraysz = 0;
+   loaded->fini_array = (Elf32_Addr) NULL;
+   loaded->fini = (Elf32_Addr) NULL;
+}
+
+/*****************************************************************************/
+/* relocate_dependency_graph_modules() */
+/* */
+/* Returns dyn_module's file handle at once when some other module is at */
+/* the bottom of the dependency stack. Otherwise the stack is emptied */
+/* from the top (LIFO): each dynamic module object is popped, relocated, */
+/* has its __c_args__ address and entry point recorded in its loaded */
+/* module, has its DT_INIT / DT_INIT_ARRAY data queued by */
+/* store_init_data() for later execution, has its file closed unless it */
+/* is fd, and then, after detaching the loaded module object from it, */
+/* is destructed. Returns dyn_module's file handle. */
+/* */
+/*****************************************************************************/
+static
+int32_t relocate_dependency_graph_modules(DLOAD_HANDLE handle,
+ LOADER_FILE_DESC *fd,
+ DLIMP_Dynamic_Module *dyn_module)
+{
+ /*------------------------------------------------------------------------*/
+ /* Processing of relocations will only be triggered when this function */
+ /* is called from the top-level object module (at the bottom of the */
+ /* dependency graph stack). */
+ /*------------------------------------------------------------------------*/
+ int32_t local_file_handle = dyn_module->loaded_module->file_handle; /* saved first: the loop below may delete dyn_module */
+ LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+ dynamic_module_ptr_Stack_Node *ptr =
+ pHandle->DLIMP_dependency_stack.bottom_ptr;
+ if (ptr && (ptr->value != dyn_module)) return local_file_handle;
+
+ if (is_dsbt_module(dyn_module))
+ {
+ /*--------------------------------------------------------------------*/
+ /* Assign DSBT indices. */
+ /*--------------------------------------------------------------------*/
+ DLIF_assign_dsbt_indices();
+
+ /*--------------------------------------------------------------------*/
+ /* Update the content of all DSBTs for any module that uses the */
+ /* DSBT model. */
+ /*--------------------------------------------------------------------*/
+ DLIF_update_all_dsbts();
+ }
+
+ /*------------------------------------------------------------------------*/
+ /* Ok, we are ready to process relocations. The relocation tables */
+ /* associated with dependent files will be processed first. Consume */
+ /* dynamic module objects from the dependency graph stack from dependents */
+ /* to the root of the dependency graph. */
+ /*------------------------------------------------------------------------*/
+ while (pHandle->DLIMP_dependency_stack.size > 0)
+ {
+ DLIMP_Dynamic_Module *dyn_mod_ptr =
+ dynamic_module_ptr_pop(&pHandle->DLIMP_dependency_stack);
+
+ /*---------------------------------------------------------------------*/
+ /* Process dynamic relocations associated with this module. */
+ /*---------------------------------------------------------------------*/
+ process_dynamic_module_relocations(handle, dyn_mod_ptr->fd, dyn_mod_ptr);
+
+ /*---------------------------------------------------------------------*/
+ /* __c_args__ points to the beginning of the .args section, if there */
+ /* is one. Record this pointer in the ELF file internal data object. */
+ /* Also store this in the loaded module, since this will be needed to */
+ /* write argv, argc to .args at execution time. */
+ /*---------------------------------------------------------------------*/
+ DLSYM_lookup_local_symtab("__c_args__", dyn_mod_ptr->symtab,
+ dyn_mod_ptr->symnum,
+ (Elf32_Addr *)&dyn_mod_ptr->c_args);
+ dyn_mod_ptr->loaded_module->c_args = dyn_mod_ptr->c_args;
+
+ /*---------------------------------------------------------------------*/
+ /* Pick up entry point address from ELF file header. */
+ /* We currently only support a single entry point into the ELF file. */
+ /* To support Braveheart notion of nodes, with multiple entry points,*/
+ /* we'll need to get the list of entry points associated with a node,*/
+ /* then add capability to the "execute" command to select the entry */
+ /* point that we want to start executing from. */
+ /*---------------------------------------------------------------------*/
+ dyn_mod_ptr->loaded_module->entry_point = dyn_mod_ptr->fhdr.e_entry;
+
+ /*---------------------------------------------------------------------*/
+ /* Copy command-line arguments into args section and deal with DSBT */
+ /* issues (copy DSBT to its run location). */
+ /* Note that below function is commented out because this doesn't do */
+ /* much as of now. */
+ /*---------------------------------------------------------------------*/
+ //load_object(dyn_mod_ptr->fd, dyn_mod_ptr);
+
+ /*---------------------------------------------------------------------*/
+ /* Queue this module's init data; nothing is executed at this point. */
+ /*---------------------------------------------------------------------*/
+ store_init_data(dyn_mod_ptr);
+
+ /*---------------------------------------------------------------------*/
+ /* Close dependent files; the fd passed to this function stays open. */
+ /*---------------------------------------------------------------------*/
+ if (dyn_mod_ptr->fd != fd)
+ {
+ DLIF_fclose(dyn_mod_ptr->fd);
+ dyn_mod_ptr->fd = NULL;
+ }
+
+ /*---------------------------------------------------------------------*/
+ /* Detach loaded module object from the dynamic module object that */
+ /* created it, then throw away the dynamic module object. */
+ /*---------------------------------------------------------------------*/
+ detach_loaded_module(dyn_mod_ptr);
+ delete_DLIMP_Dynamic_Module(handle, &dyn_mod_ptr);
+ }
+
+ return local_file_handle;
+}
+
+/*****************************************************************************/
+/* DLOAD_load() */
+/* */
+/* Dynamically load the specified file and return a file handle for the */
+/* loaded file. If the load fails, this function will return a value of */
+/* zero (0) for the file handle. */
+/* */
+/* The core loader must have read access to the file pointed to by fd. */
+/* */
+/* handle - loader instance (a LOADER_OBJECT). */
+/* fd - file to load; a NULL fd is reported as a DLET_FILE error. */
+/* */
+/* The dynamic module object is destroyed on every failure that occurs */
+/* before the module is appended to the loaded object list. */
+/* */
+/*****************************************************************************/
+int32_t DLOAD_load(DLOAD_HANDLE handle, LOADER_FILE_DESC *fd)
+{
+   int32_t fl_handle;
+
+   LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+   DLIMP_Dynamic_Module *dyn_module = new_DLIMP_Dynamic_Module(fd);
+
+   if (!dyn_module)
+      return 0;
+
+#if LOADER_DEBUG
+   /*------------------------------------------------------------------------*/
+   /* Spit out some loader progress information when we begin loading an */
+   /* object. */
+   /*------------------------------------------------------------------------*/
+   if (debugging_on) DLIF_trace("Loading file...\n");
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* If no access to a program was provided, there is nothing to do. */
+   /*------------------------------------------------------------------------*/
+   if (!fd)
+   {
+      DLIF_error(DLET_FILE, "Missing file specification.\n");
+      delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+      return 0;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Read file headers and dynamic information into dynamic module. */
+   /*------------------------------------------------------------------------*/
+   if (!dload_headers(fd, dyn_module))
+   {
+      delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+      return 0;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Find the dynamic segment, if there is one, and read dynamic */
+   /* information from the ELF object file into the dynamic module data */
+   /* structure associated with this file. On failure the dynamic module */
+   /* is released, as on the failure paths above. */
+   /*------------------------------------------------------------------------*/
+   if (!dload_dynamic_segment(handle, fd, dyn_module))
+   {
+      delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+      return 0;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Perform sanity checking on the read-in ELF file. The error message */
+   /* uses the module name, so the module is released only afterwards. */
+   /*------------------------------------------------------------------------*/
+   if (!is_valid_elf_object_file(fd, dyn_module))
+   {
+      DLIF_error(DLET_FILE, "Attempt to load invalid ELF file, '%s'.\n",
+                 dyn_module->name);
+      delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+      return 0;
+   }
+
+#if LOADER_DEBUG || LOADER_PROFILE
+   /*------------------------------------------------------------------------*/
+   /* Stop clock on initialization of ELF file information. Start clock on */
+   /* initialization of ELF module. */
+   /*------------------------------------------------------------------------*/
+   if (debugging_on || profiling_on)
+   {
+      DLIF_trace("Finished dload_dynamic_segment.\n");
+      if (profiling_on)
+      {
+         profile_stop_clock();
+         DLIF_trace("Took %lu cycles.\n",
+                    (unsigned long) profile_cycle_count());
+      }
+   }
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* Initialize internal ELF module and segment structures. Sets */
+   /* loaded_module in *dyn_module. This also deals with assigning a file */
+   /* handle and bumping file handle counter. */
+   /*------------------------------------------------------------------------*/
+   initialize_loaded_module(handle, dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Append Module structure to loaded object list. */
+   /* NOTE(review): the failure returns further down leave the loaded */
+   /* module on this list (and, later, dyn_module on the dependency stack). */
+   /* Confirm that unwinding that state is the client's responsibility. */
+   /*------------------------------------------------------------------------*/
+   loaded_module_ptr_enqueue(&pHandle->DLIMP_loaded_objects,
+                             dyn_module->loaded_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Support static loading as special case. */
+   /*------------------------------------------------------------------------*/
+   if (!dyn_module->relocatable)
+      return dload_static_executable(handle, fd, dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Get space & address for segments, and offset symbols and program */
+   /* header table to reflect the relocated address. Also offset the */
+   /* addresses in the internal Segment structures used by the Module */
+   /* structure. Note that this step needs to be performed prior and in */
+   /* addition to the relocation entry processing. */
+   /*------------------------------------------------------------------------*/
+   if (!allocate_dynamic_segments_and_relocate_symbols(handle, fd, dyn_module))
+      return 0;
+
+   /*------------------------------------------------------------------------*/
+   /* __c_args__ points to the beginning of the .args section, if there is */
+   /* one. __TI_STATIC_BASE points to the beginning of the DP-relative data */
+   /* segment (value to initialize DP). Record these addresses in the ELF */
+   /* file internal data object. */
+   /*------------------------------------------------------------------------*/
+   DLSYM_lookup_local_symtab("__c_args__", dyn_module->symtab,
+                             dyn_module->symnum,
+                             (Elf32_Addr *)&dyn_module->c_args);
+
+   DLSYM_lookup_local_symtab("__TI_STATIC_BASE", dyn_module->symtab,
+                             dyn_module->symnum,
+                             (Elf32_Addr *)&dyn_module->static_base);
+   dyn_module->loaded_module->static_base = dyn_module->static_base;
+
+   /*------------------------------------------------------------------------*/
+   /* If the user application performs initialization and termination, */
+   /* the dynamic loader shouldn't process the init/fini sections. */
+   /* Check and adjust the init/fini information accordingly. */
+   /*------------------------------------------------------------------------*/
+   adjust_module_init_fini(dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Save off any user defined pre-initialization functions that may be */
+   /* associated with a dynamic executable module. They are executed at */
+   /* the end of the load. */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module->fhdr.e_type == ET_EXEC)
+      store_preinit_data(dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Append current ELF file to list of objects currently loading. */
+   /* This is used to detect circular dependencies while we are processing */
+   /* the dependents of this file. */
+   /*------------------------------------------------------------------------*/
+   AL_append(&pHandle->DLIMP_module_dependency_list, &dyn_module->name);
+
+   /*------------------------------------------------------------------------*/
+   /* Push this dynamic module object onto the dependency stack. */
+   /* All of the modules on the stack will get relocated after all of the */
+   /* dependent files have been loaded and allocated. */
+   /*------------------------------------------------------------------------*/
+   dynamic_module_ptr_push(&pHandle->DLIMP_dependency_stack, dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* If this object file uses the DSBT model, then register a DSBT index */
+   /* request with the client's DSBT support management. */
+   /*------------------------------------------------------------------------*/
+   if (is_dsbt_module(dyn_module) &&
+       !DLIF_register_dsbt_index_request(handle,
+                                         dyn_module->name,
+                                         dyn_module->loaded_module->file_handle,
+                                         dyn_module->dsbt_index))
+      return 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Load this ELF file's dependees (all files on its DT_NEEDED list). */
+   /* Dependees must be loaded and relocated before processing this module's */
+   /* relocations. */
+   /*------------------------------------------------------------------------*/
+   if (!dload_and_allocate_dependencies(handle, dyn_module))
+      return 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Remove the current ELF file from the list of files that are in the */
+   /* process of loading. */
+   /*------------------------------------------------------------------------*/
+   pHandle->DLIMP_module_dependency_list.size--;
+
+   /*------------------------------------------------------------------------*/
+   /* Process relocation entries. */
+   /*------------------------------------------------------------------------*/
+   fl_handle = relocate_dependency_graph_modules(handle, fd, dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* With initialization complete, and all relocations having been resolved */
+   /* do module initialization. */
+   /*------------------------------------------------------------------------*/
+   execute_module_initialization(handle);
+
+   return fl_handle;
+}
+
+/*****************************************************************************/
+/* DLOAD_get_entry_names() */
+/* */
+/* Build a list of entry point names for a loaded object. Currently, */
+/* any global symbol in the module is considered a valid entry point */
+/* regardless of whether it is defined in code or associated with a */
+/* data object. We would need to process the content of the symbol */
+/* table entry or its debug information to determine whether it is a */
+/* valid entry point or not. */
+/* */
+/* handle - loader instance whose loaded object list is searched. */
+/* file_handle - handle of the module to list. */
+/* entry_pt_cnt - out: number of names returned. */
+/* entry_pt_names - out: DLIF_malloc'd array of DLIF_malloc'd copies of */
+/* the names, one per symbol; NULL when the module has */
+/* no global symbols. The caller owns the memory. */
+/* */
+/* Returns TRUE when the module was found and the list was built. */
+/* Returns FALSE when no module has the given handle, or when an */
+/* allocation fails (nothing is left allocated and the count is 0). */
+/* */
+/*****************************************************************************/
+BOOL DLOAD_get_entry_names(DLOAD_HANDLE handle,
+                           uint32_t file_handle,
+                           int32_t *entry_pt_cnt,
+                           char ***entry_pt_names)
+{
+   /*------------------------------------------------------------------------*/
+   /* Spin through list of loaded files until we find the file handle we */
+   /* are looking for. Then build a list of entry points from that file's */
+   /* symbol table. */
+   /*------------------------------------------------------------------------*/
+   LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+
+   loaded_module_ptr_Queue_Node* ptr;
+   for (ptr = pHandle->DLIMP_loaded_objects.front_ptr; ptr != NULL;
+        ptr = ptr->next_ptr)
+   {
+      if (ptr->value->file_handle == file_handle)
+      {
+         DLIMP_Loaded_Module *module = ptr->value;
+         struct Elf32_Sym *symtab;
+         char **names;
+         int i;
+
+         /*------------------------------------------------------------------*/
+         /* Any symbol in our file's symbol table is considered a valid */
+         /* entry point. */
+         /*------------------------------------------------------------------*/
+         symtab = (struct Elf32_Sym*)module->gsymtab;
+         *entry_pt_cnt = module->gsymnum;
+         *entry_pt_names = NULL;
+
+         /*------------------------------------------------------------------*/
+         /* An empty symbol table yields an empty list. */
+         /*------------------------------------------------------------------*/
+         if (module->gsymnum <= 0) return TRUE;
+
+         names = DLIF_malloc(module->gsymnum * sizeof(char*));
+         if (!names)
+         {
+            *entry_pt_cnt = 0;
+            DLIF_error(DLET_MEMORY,
+                       "Failed to allocate entry point name list.\n");
+            return FALSE;
+         }
+
+         for (i = 0; i < module->gsymnum; i++)
+         {
+            /*---------------------------------------------------------------*/
+            /* Each name goes into its own slot of the array. */
+            /*---------------------------------------------------------------*/
+            const char *sym_name = (const char *)symtab[i].st_name;
+            names[i] = DLIF_malloc(strlen(sym_name) + 1);
+            if (!names[i])
+            {
+               /*------------------------------------------------------------*/
+               /* Release the copies made so far, then the array itself. */
+               /*------------------------------------------------------------*/
+               while (i-- > 0) DLIF_free(names[i]);
+               DLIF_free(names);
+               *entry_pt_cnt = 0;
+               DLIF_error(DLET_MEMORY,
+                          "Failed to allocate entry point name.\n");
+               return FALSE;
+            }
+            strcpy(names[i], sym_name);
+         }
+
+         *entry_pt_names = names;
+         return TRUE;
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* We didn't find the file we were looking for, return false. */
+   /*------------------------------------------------------------------------*/
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* DLOAD_prepare_for_execution() */
+/* */
+/* Get the module with the given file handle ready to run: */
+/* - hand back its entry point through *sym_val, */
+/* - write the supplied argc/argv into the module's .args section, */
+/* - read the arguments back from .args (the Reference Implementation's */
+/* stand-in for the rts boot routine, which doubles as a check of the */
+/* write). */
+/* */
+/* Returns TRUE on success; FALSE when no loaded module has this handle */
+/* or the .args section could not be written. */
+/* */
+/*****************************************************************************/
+BOOL DLOAD_prepare_for_execution(DLOAD_HANDLE handle, uint32_t file_handle,
+                                 TARGET_ADDRESS *sym_val,
+                                 int argc, char** argv)
+{
+   LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+   loaded_module_ptr_Queue_Node *node = loader->DLIMP_loaded_objects.front_ptr;
+   DLIMP_Loaded_Module *target;
+
+   /*------------------------------------------------------------------------*/
+   /* Walk the loaded object list up to the first module whose handle */
+   /* matches. */
+   /*------------------------------------------------------------------------*/
+   while (node != NULL && node->value->file_handle != file_handle)
+      node = node->next_ptr;
+
+   /*------------------------------------------------------------------------*/
+   /* Ran off the end of the list: the handle is unknown. */
+   /*------------------------------------------------------------------------*/
+   if (node == NULL)
+      return FALSE;
+
+   target = node->value;
+   *sym_val = (TARGET_ADDRESS)(target->entry_point);
+
+   /*------------------------------------------------------------------------*/
+   /* Put argc, argv into this module's .args section. */
+   /*------------------------------------------------------------------------*/
+   if (!write_arguments_to_args_section(handle, argc, argv, target))
+   {
+      DLIF_error(DLET_MISC, "Couldn't write to .args section\n");
+      return FALSE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Simulated "boot": read back the values that were just written. */
+   /*------------------------------------------------------------------------*/
+   read_args_from_section(target);
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* DLOAD_load_arguments() */
+/* */
+/* Write out the given arguments to the .args section contained in the */
+/* module identified by file_handle. */
+/* */
+/* handle - loader instance whose loaded object list is searched. */
+/* file_handle - handle of the module to receive the arguments. */
+/* argc, argv - arguments to write. */
+/* */
+/* Returns TRUE when the arguments were written. Returns FALSE when no */
+/* loaded module has the given handle or the write failed. */
+/* */
+/*****************************************************************************/
+BOOL DLOAD_load_arguments(DLOAD_HANDLE handle, uint32_t file_handle,
+                          int argc, char** argv)
+{
+   /*------------------------------------------------------------------------*/
+   /* Spin through list of loaded files until we find the file handle we */
+   /* are looking for. */
+   /*------------------------------------------------------------------------*/
+   DLIMP_Loaded_Module *ep_loaded_module;
+   loaded_module_ptr_Queue_Node* ptr;
+   LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+
+   for (ptr = pHandle->DLIMP_loaded_objects.front_ptr; ptr != NULL;
+        ptr = ptr->next_ptr)
+      if (ptr->value->file_handle == file_handle)
+      {
+         ep_loaded_module = ptr->value;
+
+         /*------------------------------------------------------------------*/
+         /* Write argc, argv to the .args section in this module. */
+         /*------------------------------------------------------------------*/
+         if (!write_arguments_to_args_section(handle, argc, argv,
+                                              ep_loaded_module))
+         {
+            DLIF_error(DLET_MISC, "Couldn't write to .args section\n");
+            return FALSE;
+         }
+
+         /*------------------------------------------------------------------*/
+         /* Report success, as DLOAD_prepare_for_execution() does. */
+         /*------------------------------------------------------------------*/
+         return TRUE;
+      }
+
+   /*------------------------------------------------------------------------*/
+   /* We didn't find the file we were looking for, return false. */
+   /*------------------------------------------------------------------------*/
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* DLOAD_get_entry_point() */
+/* */
+/* Look up the loaded module that owns file_handle and store its entry */
+/* point address in *sym_val. */
+/* */
+/* Returns TRUE when the module was found (and *sym_val was written), */
+/* FALSE otherwise (*sym_val is left untouched). */
+/* */
+/*****************************************************************************/
+BOOL DLOAD_get_entry_point(DLOAD_HANDLE handle, uint32_t file_handle,
+                           TARGET_ADDRESS *sym_val)
+{
+   LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+   loaded_module_ptr_Queue_Node *node = loader->DLIMP_loaded_objects.front_ptr;
+
+   /*------------------------------------------------------------------------*/
+   /* Walk the loaded-objects queue front to back; the first module whose */
+   /* file handle matches supplies the entry point. */
+   /*------------------------------------------------------------------------*/
+   while (node != NULL)
+   {
+      if (node->value->file_handle == file_handle)
+      {
+         *sym_val = (TARGET_ADDRESS)(node->value->entry_point);
+         return TRUE;
+      }
+
+      node = node->next_ptr;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* No loaded module carries the requested file handle. */
+   /*------------------------------------------------------------------------*/
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* DLOAD_query_symbol() */
+/* */
+/* Look up a global symbol by name in the module identified by */
+/* file_handle. On a match the symbol's value is stored in *sym_val and */
+/* TRUE is returned; otherwise FALSE is returned and *sym_val is left */
+/* untouched. */
+/* */
+/*****************************************************************************/
+BOOL DLOAD_query_symbol(DLOAD_HANDLE handle,
+                        uint32_t file_handle,
+                        const char *sym_name,
+                        TARGET_ADDRESS *sym_val)
+{
+   LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+   loaded_module_ptr_Queue_Node *node;
+
+   /*------------------------------------------------------------------------*/
+   /* Visit every loaded module; only those whose file handle matches are */
+   /* searched for the symbol. */
+   /*------------------------------------------------------------------------*/
+   for (node = loader->DLIMP_loaded_objects.front_ptr; node != NULL;
+        node = node->next_ptr)
+   {
+      DLIMP_Loaded_Module *module;
+      struct Elf32_Sym *syms;
+      int idx;
+
+      if (node->value->file_handle != file_handle)
+         continue;
+
+      module = node->value;
+      syms = (struct Elf32_Sym*)module->gsymtab;
+
+      /*---------------------------------------------------------------------*/
+      /* Linear search of the module's global symbol table by name. The */
+      /* st_name field is cast straight to a string pointer here, so it is */
+      /* presumably already patched to address the name - confirm. */
+      /*---------------------------------------------------------------------*/
+      for (idx = 0; idx < module->gsymnum; idx++)
+      {
+         if (strcmp(sym_name, (const char *)syms[idx].st_name) == 0)
+         {
+            *sym_val = (TARGET_ADDRESS) syms[idx].st_value;
+            return TRUE;
+         }
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Neither the module nor the symbol was found. */
+   /*------------------------------------------------------------------------*/
+   return FALSE;
+}
+
+
+
+/*****************************************************************************/
+/* unlink_loaded_module() */
+/* */
+/* Take the module held by lm_node off the list of loaded objects and */
+/* hand it back to the caller so that it can be deconstructed. */
+/* */
+/*****************************************************************************/
+static DLIMP_Loaded_Module *unlink_loaded_module(DLOAD_HANDLE handle,
+                                   loaded_module_ptr_Queue_Node *back_ptr,
+                                   loaded_module_ptr_Queue_Node *lm_node)
+{
+   LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+   DLIMP_Loaded_Module *detached = lm_node->value;
+
+   /*------------------------------------------------------------------------*/
+   /* Removal is keyed on the module pointer; back_ptr is not consulted. */
+   /*------------------------------------------------------------------------*/
+   loaded_module_ptr_remove(&loader->DLIMP_loaded_objects, detached);
+
+   return detached;
+}
+
+/*****************************************************************************/
+/* execute_module_termination() */
+/* */
+/* Execute termination functions associated with this loaded module. */
+/* Termination functions are called in the reverse order as their */
+/* corresponding initialization functions: first the .fini_array entries */
+/* from last to first, then the DT_FINI function if one was recorded. */
+/* */
+/* If the host copy of .fini_array cannot be allocated, an error is */
+/* reported and the array entries are skipped; DT_FINI is still run. */
+/* */
+/*****************************************************************************/
+static void execute_module_termination(DLOAD_HANDLE handle,
+                                       DLIMP_Loaded_Module *loaded_module)
+{
+   LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+
+   /*------------------------------------------------------------------------*/
+   /* If a DT_FINI_ARRAY dynamic tag was encountered for this module, spin */
+   /* through the array in reverse order, calling each function address */
+   /* stored in the array. */
+   /*------------------------------------------------------------------------*/
+   if (loaded_module->fini_arraysz != 0)
+   {
+      /*---------------------------------------------------------------------*/
+      /* Now make a loader-accessible copy of the .fini_array section. */
+      /*---------------------------------------------------------------------*/
+      int32_t i;
+      int32_t num_fini_fcns =
+                     loaded_module->fini_arraysz/sizeof(TARGET_ADDRESS);
+      TARGET_ADDRESS *fini_array_buf = (TARGET_ADDRESS *)
+                                   DLIF_malloc(loaded_module->fini_arraysz);
+
+      if (fini_array_buf == NULL)
+      {
+         /*------------------------------------------------------------------*/
+         /* Without a host buffer there is nowhere to read the array into; */
+         /* report it rather than reading through a NULL pointer. */
+         /*------------------------------------------------------------------*/
+         DLIF_error(DLET_MISC,
+                    "Couldn't allocate host copy of .fini_array\n");
+      }
+      else
+      {
+         DLIF_read(pHandle->client_handle,
+                   fini_array_buf, 1, loaded_module->fini_arraysz,
+                   (TARGET_ADDRESS)loaded_module->fini_array);
+
+         /*------------------------------------------------------------------*/
+         /* Now spin through the array in reverse order, executing each */
+         /* termination function whose address occupies an entry in the */
+         /* array. */
+         /*------------------------------------------------------------------*/
+         for (i = num_fini_fcns - 1; i >= 0; i--)
+            DLIF_execute(pHandle->client_handle,
+                         (TARGET_ADDRESS)(fini_array_buf[i]));
+
+         DLIF_free(fini_array_buf);
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* If a DT_FINI dynamic tag was encountered for this module, call the */
+   /* function indicated by the tag's value to complete the termination */
+   /* process for this module. */
+   /*------------------------------------------------------------------------*/
+   if (loaded_module->fini != (Elf32_Addr) NULL)
+      DLIF_execute(pHandle->client_handle,
+                   (TARGET_ADDRESS)loaded_module->fini);
+}
+
+/*****************************************************************************/
+/* remove_loaded_module() */
+/* */
+/* Unlink the module held by lm_node from the list of loaded objects, */
+/* then run its destructor (delete_DLIMP_Loaded_Module) on it. */
+/* */
+/*****************************************************************************/
+static void remove_loaded_module(DLOAD_HANDLE handle,
+                                 loaded_module_ptr_Queue_Node *lm_node)
+{
+   LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+   loaded_module_ptr_Queue_Node *prev = NULL;
+   DLIMP_Loaded_Module *victim = NULL;
+
+   /*------------------------------------------------------------------------*/
+   /* Locate the node in front of lm_node. It stays NULL when lm_node is */
+   /* the head of the list. */
+   /*------------------------------------------------------------------------*/
+   if (lm_node != loader->DLIMP_loaded_objects.front_ptr)
+   {
+      prev = loader->DLIMP_loaded_objects.front_ptr;
+      while (prev->next_ptr != lm_node)
+         prev = prev->next_ptr;
+   }
+
+   victim = unlink_loaded_module(handle, prev, lm_node);
+
+   delete_DLIMP_Loaded_Module(handle, &victim);
+}
+
+/*****************************************************************************/
+/* DLOAD_unload() */
+/* */
+/* Drop one use of the module identified by file_handle. When the use */
+/* count reaches zero the module's termination functions are run, its */
+/* dependents are unloaded through the client, and the module is removed */
+/* from the list of loaded objects and destroyed. */
+/* */
+/* Return TRUE if program entry is actually destroyed. This is a way of */
+/* communicating to the client when it needs to actually remove debug */
+/* information associated with this module (so that client does not have */
+/* to maintain a use count that mirrors the program entry). Return FALSE */
+/* when the module is still in use or was not found. */
+/* */
+/*****************************************************************************/
+BOOL DLOAD_unload(DLOAD_HANDLE handle, uint32_t file_handle)
+{
+   LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+   loaded_module_ptr_Queue_Node *node;
+
+   for (node = loader->DLIMP_loaded_objects.front_ptr; node != NULL;
+        node = node->next_ptr)
+   {
+      DLIMP_Loaded_Module *module;
+      int *dep_handles;
+      int dep;
+
+      if (node->value->file_handle != file_handle)
+         continue;
+
+      /*---------------------------------------------------------------------*/
+      /* One fewer user of this module; keep it if others remain. */
+      /*---------------------------------------------------------------------*/
+      node->value->use_count -= 1;
+      if (node->value->use_count != 0)
+         continue;
+
+      module = (DLIMP_Loaded_Module *)node->value;
+
+      /*---------------------------------------------------------------------*/
+      /* Termination functions must run in the reverse order of the */
+      /* initialization functions, so this module's own termination runs */
+      /* before any of its dependents are unloaded. */
+      /*---------------------------------------------------------------------*/
+      execute_module_termination(handle, module);
+
+      /*---------------------------------------------------------------------*/
+      /* Unload dependent modules via the client. Client needs to know when */
+      /* a dependent gets unloaded so that it can update debug information. */
+      /*---------------------------------------------------------------------*/
+      dep_handles = (int*)(module->dependencies.buf);
+      for (dep = 0; dep < module->dependencies.size; dep++)
+         DLIF_unload_dependent(loader->client_handle, dep_handles[dep]);
+
+      /*---------------------------------------------------------------------*/
+      /* The predecessor of this node is looked up only now, inside */
+      /* remove_loaded_module(): DLIF_unload_dependent() might free that */
+      /* node, so a back pointer captured earlier could be invalid. Turn */
+      /* the Queue template into a doubly linked list if this overhead */
+      /* becomes a problem. */
+      /*---------------------------------------------------------------------*/
+      remove_loaded_module(handle, node);
+
+      /*---------------------------------------------------------------------*/
+      /* Once unloading is done, reset virtual target to NULL. */
+      /*---------------------------------------------------------------------*/
+      cur_target = NULL;
+
+      return TRUE;
+   }
+
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* DLOAD_load_symbols() */
+/* */
+/* Load the symbols from the given file and make symbols available for */
+/* global symbol linkage. */
+/* */
+/* Returns the file handle assigned to the module. Returns 0 if fd is */
+/* NULL, if the dynamic module object cannot be created, if reading the */
+/* headers or the dynamic segment fails, if the ELF file is rejected as */
+/* invalid, or if no loaded module could be detached. */
+/* */
+/*****************************************************************************/
+int32_t DLOAD_load_symbols(DLOAD_HANDLE handle, LOADER_FILE_DESC *fd)
+{
+   DLIMP_Dynamic_Module *dyn_module = NULL;
+   DLIMP_Loaded_Module *loaded_module = NULL;
+   LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+
+   /*------------------------------------------------------------------------*/
+   /* If no access to a program was provided, there is nothing to do. This */
+   /* is checked before the dynamic module is constructed so that nothing */
+   /* has been allocated yet when we bail out. */
+   /*------------------------------------------------------------------------*/
+   if (!fd)
+   {
+      DLIF_error(DLET_FILE, "Missing file specification.\n");
+      return 0;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Ensure we have a valid dynamic module object from the constructor. */
+   /*------------------------------------------------------------------------*/
+   dyn_module = new_DLIMP_Dynamic_Module(fd);
+   if (!dyn_module)
+      return 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Symbol-only loads carry no program arguments. */
+   /*------------------------------------------------------------------------*/
+   dyn_module->argc = 0;
+   dyn_module->argv = NULL;
+
+   /*------------------------------------------------------------------------*/
+   /* Read file headers and dynamic information into dynamic module. */
+   /*------------------------------------------------------------------------*/
+   if (!dload_headers(fd, dyn_module))
+   {
+      delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+      return 0;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Find the dynamic segment, if there is one, and read dynamic */
+   /* information from the ELF object file into the dynamic module data */
+   /* structure associated with this file. */
+   /*------------------------------------------------------------------------*/
+   if (!dload_dynamic_segment(handle, fd, dyn_module))
+   {
+      delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+      return 0;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Perform sanity checking on the read-in ELF file. */
+   /*------------------------------------------------------------------------*/
+   if (!is_valid_elf_object_file(fd, dyn_module))
+   {
+      DLIF_error(DLET_FILE, "Attempt to load invalid ELF file, '%s'.\n",
+                 dyn_module->name);
+      delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+      return 0;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Initialize internal ELF module and segment structures. Sets */
+   /* loaded_module in *dyn_module. This also deals with assigning a file */
+   /* handle and bumping file handle counter. */
+   /*------------------------------------------------------------------------*/
+   initialize_loaded_module(handle, dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Add this module to the loaded module queue. */
+   /*------------------------------------------------------------------------*/
+   loaded_module_ptr_enqueue(&pHandle->DLIMP_loaded_objects,
+                             dyn_module->loaded_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Register a DSBT index request for this module and update its own copy */
+   /* of the DSBT with the contents of the client's master DSBT. The module */
+   /* sits on the dependency stack only for the duration of these calls. */
+   /*------------------------------------------------------------------------*/
+   if (is_dsbt_module(dyn_module))
+   {
+      dynamic_module_ptr_push(&pHandle->DLIMP_dependency_stack, dyn_module);
+      DLIF_register_dsbt_index_request(handle,
+                                       dyn_module->name,
+                                       dyn_module->loaded_module->file_handle,
+                                       dyn_module->dsbt_index);
+      DLIF_assign_dsbt_indices();
+      DLIF_update_all_dsbts();
+      dynamic_module_ptr_pop(&pHandle->DLIMP_dependency_stack);
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Detach the loaded module object from the dynamic module that created */
+   /* it; ownership of its host memory now belongs to the */
+   /* DLIMP_loaded_objects list. The dynamic module itself is no longer */
+   /* needed whether or not the detach produced a module, so its destructor */
+   /* is called in both cases. */
+   /*------------------------------------------------------------------------*/
+   loaded_module = detach_loaded_module(dyn_module);
+   delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+   if (loaded_module == NULL)
+      return 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Return a file handle so that the client can match this file to an ID. */
+   /*------------------------------------------------------------------------*/
+   return loaded_module->file_handle;
+}
+
+/*****************************************************************************/
+/* DSBT Support Functions */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* DLOAD_get_dsbt_size() */
+/* */
+/* Report the DSBT size recorded for the module identified by */
+/* file_handle. Only modules currently on the dependency stack are */
+/* considered; 0 is returned when none of them matches. Modules whose */
+/* DSBT size is zero are assumed to not be using the DSBT model. */
+/* */
+/*****************************************************************************/
+uint32_t DLOAD_get_dsbt_size(DLOAD_HANDLE handle, int32_t file_handle)
+{
+   LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+   dynamic_module_ptr_Stack_Node *node = loader->DLIMP_dependency_stack.top_ptr;
+
+   /*------------------------------------------------------------------------*/
+   /* Walk the dependency stack from the top down. */
+   /*------------------------------------------------------------------------*/
+   while (node != NULL)
+   {
+      DLIMP_Dynamic_Module *candidate = node->value;
+
+      if (candidate->loaded_module->file_handle == file_handle)
+         return candidate->dsbt_size;
+
+      node = node->next_ptr;
+   }
+
+   return 0;
+}
+
+/*****************************************************************************/
+/* DLOAD_get_static_base() */
+/* */
+/* Store the static base recorded for the loaded module identified by */
+/* file_handle in *static_base. Returns TRUE when the module was found, */
+/* FALSE otherwise (*static_base is left untouched). */
+/* */
+/*****************************************************************************/
+BOOL DLOAD_get_static_base(DLOAD_HANDLE handle, int32_t file_handle,
+                           TARGET_ADDRESS *static_base)
+{
+   LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+   loaded_module_ptr_Queue_Node *node = loader->DLIMP_loaded_objects.front_ptr;
+
+   /*------------------------------------------------------------------------*/
+   /* Walk the loaded-objects queue looking for the matching file handle. */
+   /*------------------------------------------------------------------------*/
+   while (node != NULL)
+   {
+      DLIMP_Loaded_Module *candidate = node->value;
+
+      if (candidate->file_handle == file_handle)
+      {
+         *static_base = (TARGET_ADDRESS)candidate->static_base;
+         return TRUE;
+      }
+
+      node = node->next_ptr;
+   }
+
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* DLOAD_get_dsbt_base() */
+/* */
+/* Store the DSBT address of the module identified by file_handle in */
+/* *dsbt_base. The address is taken from the dynamic table entry at the */
+/* module's dsbt_base_tagidx. Only modules on the dependency stack are */
+/* searched. Returns TRUE on a match, FALSE otherwise. */
+/* */
+/*****************************************************************************/
+BOOL DLOAD_get_dsbt_base(DLOAD_HANDLE handle, int32_t file_handle, TARGET_ADDRESS *dsbt_base)
+{
+   LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+   dynamic_module_ptr_Stack_Node *node = loader->DLIMP_dependency_stack.top_ptr;
+
+   /*------------------------------------------------------------------------*/
+   /* Walk the dependency stack from the top down. */
+   /*------------------------------------------------------------------------*/
+   while (node != NULL)
+   {
+      DLIMP_Dynamic_Module *candidate = node->value;
+
+      if (candidate->loaded_module->file_handle == file_handle)
+      {
+         *dsbt_base = (TARGET_ADDRESS)
+                  candidate->dyntab[candidate->dsbt_base_tagidx].d_un.d_ptr;
+         return TRUE;
+      }
+
+      node = node->next_ptr;
+   }
+
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* DLREL_relocate() */
+/* */
+/* Forward a relocation request for the ELF object file being loaded to */
+/* the relocate routine of the current virtual target (cur_target). */
+/* */
+/*****************************************************************************/
+void DLREL_relocate(DLOAD_HANDLE handle, LOADER_FILE_DESC* elf_file,
+                    DLIMP_Dynamic_Module* dyn_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* cur_target is dereferenced unconditionally here. */
+   /*------------------------------------------------------------------------*/
+   cur_target->relocate(handle, elf_file, dyn_module);
+}
+
+/*****************************************************************************/
+/* GET_VT_OBJ() - Map a machine id onto its virtual target object so that */
+/* target specific functions can be reached. The vt_arr table is scanned */
+/* up to its EM_NONE terminator; NULL is returned if no entry matches. */
+/*****************************************************************************/
+static VIRTUAL_TARGET *get_vt_obj(int given_id)
+{
+   VIRTUAL_TARGET *candidate = vt_arr;
+
+   while (candidate->machine_id != EM_NONE)
+   {
+      if (candidate->machine_id == given_id)
+         return candidate;
+
+      candidate++;
+   }
+
+   return NULL;
+}
+
+#if 0 && LOADER_DEBUG // enable to make available in debugger
+/*****************************************************************************/
+/* DEBUG_QUEUE() - Debug helper. Traces the name of every module on the */
+/* loaded-objects queue, tagged with the caller supplied position string. */
+/* Does nothing unless debugging_on is set. */
+/*****************************************************************************/
+static void debug_queue(LOADER_OBJECT *pHandle, char* position)
+{
+   loaded_module_ptr_Queue_Node *node;
+
+   if (!debugging_on)
+      return;
+
+   DLIF_trace ("\nDEBUG QUEUE : %s, pHandle : 0x%x\n\n", position,
+               (uint32_t)pHandle);
+
+   node = pHandle->DLIMP_loaded_objects.front_ptr;
+   while (node != NULL)
+   {
+      DLIF_trace ("ptr->value->name : %s\n",node->value->name);
+      node = node->next_ptr;
+   }
+
+   DLIF_trace ("\n");
+}
+#endif
+
+/*****************************************************************************/
+/* READ_ARGS_FROM_SECTION() - Read argc, argv back from the module's .args */
+/* section into global_argc / global_argv. Used to test the Reference */
+/* implementation. */
+/*****************************************************************************/
+static void read_args_from_section(DLIMP_Loaded_Module* ep_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* Before this function is called, the loader has gotten argv/argc from */
+   /* the module and written it out to the .args section. c_args points to */
+   /* the .args section. */
+   /*------------------------------------------------------------------------*/
+   ARGS_CONTAINER *args = (ARGS_CONTAINER *)(ep_module->c_args);
+
+   /*------------------------------------------------------------------------*/
+   /* A usable section pointer is neither NULL nor 0xFFFFFFFF. */
+   /*------------------------------------------------------------------------*/
+   if (args != NULL && args != (ARGS_CONTAINER *)0xFFFFFFFF)
+   {
+      global_argc = args->argc;
+      global_argv = args->argv;
+      return;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* No .args section to read from: publish an empty argument list. */
+   /*------------------------------------------------------------------------*/
+   global_argc = 0;
+   global_argv = NULL;
+}
diff --git a/src/core/dsp/ocl_load/DLOAD/dload.h b/src/core/dsp/ocl_load/DLOAD/dload.h
new file mode 100644
index 0000000..bb7d427
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/dload.h
@@ -0,0 +1,334 @@
+/*
+* dload.h
+*
+* Define internal data structures used by core dynamic loader.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef DLOAD_H
+#define DLOAD_H
+
+#include "ArrayList.h"
+#include "Queue.h"
+#include "Stack.h"
+#include "elf32.h"
+#include "dload_api.h"
+#include "util.h"
+
+/*---------------------------------------------------------------------------*/
+/* Contains strings with names of files the loader is in process of loading. */
+/* This list is used to keep track of what objects are in the process of */
+/* loading while their dependents are being loaded so that we can detect */
+/* circular dependencies. */
+/*---------------------------------------------------------------------------*/
+extern Array_List DLIMP_module_dependency_list; /* NOTE(review): LOADER_OBJECT also has a member of this name */
+
+/*---------------------------------------------------------------------------*/
+/* DLIMP_Loaded_Segment */
+/* */
+/* This structure represents a segment loaded on memory. */
+/* */
+/* This data structure should be created using host memory when a module */
+/* is being loaded into target memory. The data structure should persist */
+/* as long as the module stays resident in target memory. It should be */
+/* removed when the last use of the module is unloaded from the target. */
+/*---------------------------------------------------------------------------*/
+typedef struct
+{
+ struct Elf32_Phdr phdr; /* ELF program header kept for the segment */
+ Elf32_Addr input_vaddr; /* original segment load addr */
+ BOOL modified; /* NOTE(review): meaning not visible here */
+ struct DLOAD_MEMORY_SEGMENT *obj_desc; /* presumably client memory descriptor - confirm */
+ void * host_address; /* presumably host-side address of segment - confirm */
+} DLIMP_Loaded_Segment;
+
+/*---------------------------------------------------------------------------*/
+/* DLIMP_Loaded_Module */
+/* */
+/* This structure contains all the information the dynamic loader needs */
+/* to retain after loading an object file's segments into target memory. */
+/* The data structure is created while the object file is being loaded, */
+/* and should persist until the last use of the module is unloaded from */
+/* target memory. */
+/* */
+/* The information contained here is used by the dynamic loader to */
+/* perform dynamic symbol resolution, to track the use count, and to */
+/* finally deallocate the module's segments when the module is unloaded. */
+/*---------------------------------------------------------------------------*/
+typedef struct
+{
+ char *name; /* Local copy of so_name */
+ int32_t file_handle; /* ID matched by the DLOAD_* lookups */
+ int32_t use_count; /* DLOAD_unload() removes module at 0 */
+ Elf32_Addr entry_point; /* Entry point address into module */
+ struct Elf32_Sym *gsymtab; /* Module's global symbol table */
+ Elf32_Word gsymnum; /* # global symbols */
+ char *gstrtab; /* Module's global symbol names */
+ Elf32_Word gstrsz; /* Size of global string table */
+ Array_List loaded_segments; /* List of DLIMP_Loaded_Segment(s) */
+ Array_List dependencies; /* List of dependent file handles */
+ BOOL direct_dependent_only; /* NOTE(review): semantics not shown here */
+
+ Elf32_Addr fini; /* .fini function/section address */
+ Elf32_Addr fini_array; /* .fini_array term fcn ary addr */
+ int32_t fini_arraysz; /* sizeof .fini_array */
+ uint8_t *c_args; /* address of module's .args sect */
+ uint8_t *static_base; /* address of module's STATIC_BASE */
+
+} DLIMP_Loaded_Module;
+
+/*---------------------------------------------------------------------------*/
+/* DLIMP_loaded_objects */
+/* */
+/* A list of loaded module objects (DLIMP_Loaded_Module *) that the */
+/* loader has placed into target memory. */
+/*---------------------------------------------------------------------------*/
+TYPE_QUEUE_DEFINITION(DLIMP_Loaded_Module*, loaded_module_ptr) /* queue of DLIMP_Loaded_Module pointers */
+extern loaded_module_ptr_Queue DLIMP_loaded_objects; /* NOTE(review): LOADER_OBJECT also has a member of this name */
+
+/*---------------------------------------------------------------------------*/
+/* DLIMP_Dynamic_Module */
+/* */
+/* This structure represents a dynamic module to be loaded by the dynamic */
+/* loader. It contains all the information necessary to load and relocate */
+/* the module. It actually contains most of the headers, dynamic info, */
+/* dynamic symbol table, string table etc. */
+/* */
+/* This structure is allocated in host memory while an ELF object file is */
+/* being loaded and will be destructed after the file has been */
+/* successfully loaded. To simplify loading and relocation of the object */
+/* file's segments, this data structure maintains a link to the loaded */
+/* module. This link is severed when the load is successfully completed. */
+/* The loaded module data structure will persist until the module is */
+/* actually unloaded from target memory, but this data structure will be */
+/* freed. */
+/* */
+/* If the load of the object file is not successful for any reason, then */
+/* the loaded module will not be detached from the dynamic module. In */
+/* such case, the destructor for the dynamic module will assume */
+/* responsibility for freeing any host memory associated with the loaded */
+/* module and its segments. */
+/*---------------------------------------------------------------------------*/
+typedef struct
+{
+ char *name; /* Local copy of so_name */
+ LOADER_FILE_DESC *fd; /* Access to ELF object file */
+ struct Elf32_Ehdr fhdr; /* ELF Object File Header */
+ struct Elf32_Phdr *phdr; /* ELF Program Header Table */
+ Elf32_Word phnum; /* # entries in program header table */
+ char* strtab; /* String Table */
+ Elf32_Word strsz; /* String Table size in bytes */
+ struct Elf32_Dyn *dyntab; /* Elf Dynamic Table (.dynamic scn) */
+ /* This contains a list of dynamic */
+ /* tags which is terminated by a NULL */
+ /* record. */
+ struct Elf32_Sym *symtab; /* Elf Dynamic Symbol Table */
+ Elf32_Word symnum; /* # symbols in dynamic symbol table */
+ Elf32_Word gsymtab_offset;/* Offset into symbol table where */
+ /* global symbols start. */
+ Elf32_Word gstrtab_offset;/* Offset into string table where */
+ /* global symbol names start. */
+
+ uint8_t *c_args; /* presumably address of the .args sect - confirm */
+ uint8_t *static_base; /* address of module's STATIC_BASE */
+ int32_t argc; /* argument count recorded for module */
+ char **argv; /* argument vector recorded for module */
+ DLIMP_Loaded_Module *loaded_module; /* link severed once load succeeds */
+ int32_t wrong_endian; /* NOTE(review): presumably endianness mismatch flag - confirm */
+ BOOL direct_dependent_only; /* NOTE(review): semantics not shown here */
+ BOOL relocatable; /* TRUE if module can be relocated */
+ /* at load-time. FALSE if module is */
+ /* a static executable. */
+ BOOL relocate_entry_point; /* TRUE if the entry point has */
+ /* not been relocated */
+
+ int32_t dsbt_index; /* DSBT index requested/assigned */
+ uint32_t dsbt_size; /* DSBT size for this module */
+ int32_t dsbt_base_tagidx;/* Location of DSBT base dyn tag */
+
+ int32_t preinit_array_idx; /* DT_PREINIT_ARRAY dyn tag loc */
+ int32_t preinit_arraysz; /* sizeof pre-init array */
+ int32_t init_idx; /* DT_INIT dynamic tag location */
+ int32_t init_array_idx; /* DT_INIT_ARRAY dyn tag location */
+ int32_t init_arraysz; /* sizeof init array */
+
+} DLIMP_Dynamic_Module;
+
+/*---------------------------------------------------------------------------*/
+/* DLIMP_dependency_stack */
+/* */
+/* A LIFO stack of dynamic module objects (DLIMP_Dynamic_Module *) that */
+/* is retained while dependent files are being loaded and allocated. It */
+/* is used to guide which dynamic modules need to be relocated after all */
+/* items in the dependency graph have been allocated. The stack is only */
+/* used when the client asks the core loader to load a dynamic executable */
+/* or library. When relocation is completed, this stack should be empty. */
+/*---------------------------------------------------------------------------*/
+TYPE_STACK_DEFINITION(DLIMP_Dynamic_Module*, dynamic_module_ptr) /* stack of DLIMP_Dynamic_Module pointers */
+extern dynamic_module_ptr_Stack DLIMP_dependency_stack; /* NOTE(review): LOADER_OBJECT also has a member of this name */
+
+/*---------------------------------------------------------------------------*/
+/* Private Loader Object instance. */
+/*---------------------------------------------------------------------------*/
+typedef struct
+{
+ /*-----------------------------------------------------------------------*/
+ /* Contains filenames (type const char*) the system is in the process of */
+ /* loading. Used to detect cycles in incorrectly compiled ELF binaries. */
+ /*-----------------------------------------------------------------------*/
+ Array_List DLIMP_module_dependency_list;
+
+ /*-----------------------------------------------------------------------*/
+ /* Contains objects (type DLIMP_Loaded_Module) that the system has loaded*/
+ /* into target memory. */
+ /*-----------------------------------------------------------------------*/
+ loaded_module_ptr_Queue DLIMP_loaded_objects;
+
+ /*-----------------------------------------------------------------------*/
+ /* Dependency stack - LIFO stack of dynamic modules that are loaded */
+ /* when client asks to load a dynamic executable or library. Note that */
+ /* dependents that have already been loaded with another module will not */
+ /* appear on this stack. */
+ /*-----------------------------------------------------------------------*/
+ dynamic_module_ptr_Stack DLIMP_dependency_stack;
+
+ /*-----------------------------------------------------------------------*/
+ /* Counter for generating unique IDs for file handles. */
+ /* NOTE: File handle is assigned sequentially but is never reclaimed */
+ /* when the modules are unloaded. It is conceivable that a loader*/
+ /* running for a long time and loading and unloading modules */
+ /* could wrap-around. The loader generates error in this case. */
+ /* Presumably each loader instance has a list of file handles, one for */
+ /* each file that it loads, and the file handle serves as an index into */
+ /* the list. Therefore even if the same file is loaded by two loader */
+ /* instances, both loader instances have a different file handle for the */
+ /* file - the file is mapped uniquely to its appropriate file handle per */
+ /* loader instance. */
+ /*-----------------------------------------------------------------------*/
+ int32_t file_handle;
+
+ /*-----------------------------------------------------------------------*/
+ /* Client token, passed in via DLOAD_create() */
+ /*-----------------------------------------------------------------------*/
+ void * client_handle;
+} LOADER_OBJECT;
+
+
+/*****************************************************************************/
+/* IF data : Below are the data structures used to store init-fini data. */
+/* */
+/* IF_single_record pairs a target address with a size; records are kept by */
+/* pointer in a queue type instantiated through TYPE_QUEUE_DEFINITION. */
+/*****************************************************************************/
+typedef struct
+{
+ TARGET_ADDRESS sect_addr; /* presumably where the section sits on target */
+ int32_t size; /* presumably the section size - units: TODO confirm */
+}
+IF_single_record;
+
+/* The macro is presumably what declares IF_table_Queue (TODO confirm). */
+TYPE_QUEUE_DEFINITION(IF_single_record*, IF_table)
+extern IF_table_Queue TI_init_table;
+
+
+/*****************************************************************************/
+/* Container used to read in argc, argv from the .args section. */
+/* NOTE(review): this comment originally read ".srgs", presumably a typo for */
+/* ".args" - confirm the section name. */
+/* NOTE(review): argv is declared with a single element; presumably it is */
+/* used as a variable-length trailing array - confirm before resizing. */
+/*****************************************************************************/
+typedef struct { int argc; char *argv[1]; } ARGS_CONTAINER;
+
+
+/*****************************************************************************/
+/* is_dsbt_module() */
+/* */
+/* Tell whether a dynamic module uses the DSBT model. The module is treated */
+/* as a DSBT module exactly when its dsbt_size field is non-zero. */
+/*****************************************************************************/
+static inline BOOL is_dsbt_module(DLIMP_Dynamic_Module *dyn_module)
+{
+   return !(dyn_module->dsbt_size == 0);
+}
+
+/*****************************************************************************/
+/* is_arm_module() */
+/* */
+/* Tell whether the ELF file header describes an ARM module, i.e. whether */
+/* its e_machine field equals EM_ARM. */
+/*****************************************************************************/
+static inline BOOL is_arm_module(struct Elf32_Ehdr* fhdr)
+{
+   const int machine_is_arm = (fhdr->e_machine == EM_ARM);
+   return machine_is_arm;
+}
+
+/*****************************************************************************/
+/* is_c60_module() */
+/* */
+/* Tell whether the ELF file header describes a C60 module, i.e. whether */
+/* its e_machine field equals EM_TI_C6000. */
+/*****************************************************************************/
+static inline BOOL is_c60_module(struct Elf32_Ehdr* fhdr)
+{
+   const int machine_is_c6000 = (fhdr->e_machine == EM_TI_C6000);
+   return machine_is_c6000;
+}
+
+/*---------------------------------------------------------------------------*/
+/* DLIMP_update_dyntag_section_address() */
+/* */
+/* Given the index of a dynamic tag which we happen to know points to a */
+/* section address, find the program header table entry associated with */
+/* the specified address and update the tag value with the real address */
+/* of the section. */
+/* */
+/* dyn_module - module whose dynamic tag is to be updated. */
+/* i - index of the dynamic tag. */
+/* The BOOL result presumably reports whether a matching program header */
+/* entry was found (TODO confirm against the definition). */
+/*---------------------------------------------------------------------------*/
+extern BOOL DLIMP_update_dyntag_section_address(DLIMP_Dynamic_Module *dyn_module,
+ int32_t i);
+
+/*---------------------------------------------------------------------------*/
+/* DLIMP_get_first_dyntag() */
+/* */
+/* Presumably looks up the first entry of dyn_table carrying the given tag */
+/* and returns its value - the definition is not in this header, so confirm */
+/* the return value used when the tag is absent. */
+/*---------------------------------------------------------------------------*/
+extern uint32_t DLIMP_get_first_dyntag(int tag, struct Elf32_Dyn* dyn_table);
+
+/*---------------------------------------------------------------------------*/
+/* Global flags to help manage internal debug and profiling efforts. */
+/*---------------------------------------------------------------------------*/
+/* Initial choice: debug support on unless building with the TI compiler. */
+#ifndef __TI_COMPILER_VERSION__
+#define LOADER_DEBUG 1
+#else
+#define LOADER_DEBUG 0
+#endif
+
+/* NOTE(review): the choice made above is discarded here and both flags are */
+/* then forced to 1 for every compiler - confirm this override is intended. */
+#undef LOADER_DEBUG
+
+#define LOADER_DEBUG 1
+#define LOADER_PROFILE 1
+
+/* debugging_on is only declared when debug support is compiled in. */
+#if LOADER_DEBUG
+extern BOOL debugging_on;
+#endif
+
+/* profiling_on is declared when either debug or profile support is enabled. */
+#if LOADER_DEBUG || LOADER_PROFILE
+extern BOOL profiling_on;
+#endif
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/dload_endian.c b/src/core/dsp/ocl_load/DLOAD/dload_endian.c
new file mode 100644
index 0000000..ac6413b
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/dload_endian.c
@@ -0,0 +1,151 @@
+/*
+* dload_endian.c
+*
+* Simple helper functions to assist core loader with endian-ness issues
+* when the host endian-ness may be opposite the endian-ness of the target.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#include "dload_endian.h"
+
+/*****************************************************************************/
+/* DLIMP_GET_ENDIAN() - Determine endianness of the host. Uses ELF */
+/* endianness constants. */
+/* */
+/* Returns ELFDATA2LSB when the lowest-addressed byte of a 32-bit integer */
+/* holds its least significant byte, and ELFDATA2MSB otherwise. */
+/*****************************************************************************/
+int DLIMP_get_endian(void)
+{
+   const int32_t probe = 0x1;
+
+   /*------------------------------------------------------------------------*/
+   /* Look at the first byte through an unsigned char pointer. Character */
+   /* types may alias any object, whereas reading an int32_t object through */
+   /* an int16_t pointer breaks the strict aliasing rule. */
+   /*------------------------------------------------------------------------*/
+   if (*(const unsigned char*)(&probe) != 0) return ELFDATA2LSB;
+
+   return ELFDATA2MSB;
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_ENDIAN32() - Swap endianness of a 32-bit integer. */
+/* */
+/* to_change - integer whose four bytes are reversed in place. */
+/* */
+/* The swap is carried out on an unsigned copy: shifting a signed int so */
+/* that a set bit lands in the sign position (0xFF << 24) is undefined. */
+/*****************************************************************************/
+void DLIMP_change_endian32(int32_t* to_change)
+{
+   const uint32_t value = (uint32_t)(*to_change);
+   const uint32_t swapped = ((value & 0x000000FFu) << 24)
+                          | ((value & 0x0000FF00u) << 8)
+                          | ((value & 0x00FF0000u) >> 8)
+                          | ((value & 0xFF000000u) >> 24);
+
+   /* Store the bit pattern back into the caller's signed object. */
+   *to_change = (int32_t)swapped;
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_ENDIAN16() - Swap endianness of a 16-bit integer. */
+/* */
+/* to_change - integer whose two bytes are exchanged in place. */
+/*****************************************************************************/
+void DLIMP_change_endian16(int16_t* to_change)
+{
+   const uint16_t original = (uint16_t)(*to_change);
+   const uint16_t low_byte = (uint16_t)(original & 0x00FFu);
+   const uint16_t high_byte = (uint16_t)(original >> 8);
+
+   /* The old low byte becomes the high byte and vice versa. */
+   *to_change = (int16_t)(uint16_t)((low_byte << 8) | high_byte);
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_EHDR_ENDIAN() - Swap endianness of an ELF file header. */
+/* */
+/* Each multi-byte field listed below is byte-swapped in place; the fields */
+/* are gathered by width so one loop handles each group. */
+/*****************************************************************************/
+void DLIMP_change_ehdr_endian(struct Elf32_Ehdr* ehdr)
+{
+   int16_t* const half_fields[] =
+   {
+      (int16_t*)(&ehdr->e_type),
+      (int16_t*)(&ehdr->e_machine),
+      (int16_t*)(&ehdr->e_ehsize),
+      (int16_t*)(&ehdr->e_phentsize),
+      (int16_t*)(&ehdr->e_phnum),
+      (int16_t*)(&ehdr->e_shentsize),
+      (int16_t*)(&ehdr->e_shnum),
+      (int16_t*)(&ehdr->e_shstrndx)
+   };
+   int32_t* const word_fields[] =
+   {
+      (int32_t*)(&ehdr->e_version),
+      (int32_t*)(&ehdr->e_entry),
+      (int32_t*)(&ehdr->e_phoff),
+      (int32_t*)(&ehdr->e_shoff),
+      (int32_t*)(&ehdr->e_flags)
+   };
+   const unsigned int half_count = sizeof(half_fields) / sizeof(half_fields[0]);
+   const unsigned int word_count = sizeof(word_fields) / sizeof(word_fields[0]);
+   unsigned int idx;
+
+   for (idx = 0; idx < half_count; idx++)
+      DLIMP_change_endian16(half_fields[idx]);
+
+   for (idx = 0; idx < word_count; idx++)
+      DLIMP_change_endian32(word_fields[idx]);
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_PHDR_ENDIAN() - Swap endianness of an ELF program header. */
+/* */
+/* All eight fields handled here are 32 bits wide and are swapped in place. */
+/*****************************************************************************/
+void DLIMP_change_phdr_endian(struct Elf32_Phdr* phdr)
+{
+   int32_t* const word_fields[] =
+   {
+      (int32_t*)(&phdr->p_type),
+      (int32_t*)(&phdr->p_offset),
+      (int32_t*)(&phdr->p_vaddr),
+      (int32_t*)(&phdr->p_paddr),
+      (int32_t*)(&phdr->p_filesz),
+      (int32_t*)(&phdr->p_memsz),
+      (int32_t*)(&phdr->p_flags),
+      (int32_t*)(&phdr->p_align)
+   };
+   const unsigned int word_count = sizeof(word_fields) / sizeof(word_fields[0]);
+   unsigned int idx;
+
+   for (idx = 0; idx < word_count; idx++)
+      DLIMP_change_endian32(word_fields[idx]);
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_DYNENT_ENDIAN() - Swap endianness of a dynamic table entry. */
+/* */
+/* Both the tag and the d_un.d_val member are swapped in place. */
+/*****************************************************************************/
+void DLIMP_change_dynent_endian(struct Elf32_Dyn* dyn)
+{
+   int32_t* const tag_word = (int32_t*)(&dyn->d_tag);
+   int32_t* const value_word = (int32_t*)(&dyn->d_un.d_val);
+
+   DLIMP_change_endian32(tag_word);
+   DLIMP_change_endian32(value_word);
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_SYM_ENDIAN() - Swap endianness of an ELF symbol table entry. */
+/* */
+/* Swaps st_name, st_value and st_size as 32-bit values and st_shndx as a */
+/* 16-bit value, all in place. */
+/*****************************************************************************/
+void DLIMP_change_sym_endian(struct Elf32_Sym* sym)
+{
+   int32_t* const name_word = (int32_t*)(&sym->st_name);
+   int32_t* const value_word = (int32_t*)(&sym->st_value);
+   int32_t* const size_word = (int32_t*)(&sym->st_size);
+   int16_t* const shndx_half = (int16_t*)(&sym->st_shndx);
+
+   DLIMP_change_endian32(name_word);
+   DLIMP_change_endian32(value_word);
+   DLIMP_change_endian32(size_word);
+   DLIMP_change_endian16(shndx_half);
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_RELA_ENDIAN() - Swap endianness of a RELA-type relocation. */
+/* */
+/* Swaps r_offset, r_info and r_addend in place as 32-bit values. */
+/*****************************************************************************/
+void DLIMP_change_rela_endian(struct Elf32_Rela* ra)
+{
+   int32_t* const offset_word = (int32_t*)(&ra->r_offset);
+   int32_t* const info_word = (int32_t*)(&ra->r_info);
+   int32_t* const addend_word = (int32_t*)(&ra->r_addend);
+
+   DLIMP_change_endian32(offset_word);
+   DLIMP_change_endian32(info_word);
+   DLIMP_change_endian32(addend_word);
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_REL_ENDIAN() - Swap endianness of a REL-type relocation. */
+/* */
+/* Swaps r_offset and r_info in place as 32-bit values. */
+/*****************************************************************************/
+void DLIMP_change_rel_endian(struct Elf32_Rel* r)
+{
+   int32_t* const offset_word = (int32_t*)(&r->r_offset);
+   int32_t* const info_word = (int32_t*)(&r->r_info);
+
+   DLIMP_change_endian32(offset_word);
+   DLIMP_change_endian32(info_word);
+}
diff --git a/src/core/dsp/ocl_load/DLOAD/dload_endian.h b/src/core/dsp/ocl_load/DLOAD/dload_endian.h
new file mode 100644
index 0000000..ee74e11
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/dload_endian.h
@@ -0,0 +1,58 @@
+/*
+* dload_endian.h
+*
+* Specification of functions used to assist loader with endian-ness issues.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef DLOAD_ENDIAN_H
+#define DLOAD_ENDIAN_H
+
+#include "elf32.h"
+
+/*---------------------------------------------------------------------------*/
+/* Prototypes for ELF file object reader endianness swap routines. */
+/* Every DLIMP_change_* routine byte-swaps the object it is given in place. */
+/*---------------------------------------------------------------------------*/
+
+/* Host byte order, reported as ELFDATA2LSB or ELFDATA2MSB. */
+int DLIMP_get_endian(void);
+/* Primitive swaps for a single 32-bit and a single 16-bit integer. */
+void DLIMP_change_endian32(int32_t* to_change);
+void DLIMP_change_endian16(int16_t* to_change);
+/* Record swaps: file header, program header, dynamic table entry, symbol */
+/* table entry, RELA relocation and REL relocation, respectively. */
+void DLIMP_change_ehdr_endian(struct Elf32_Ehdr* to_change);
+void DLIMP_change_phdr_endian(struct Elf32_Phdr* to_change);
+void DLIMP_change_dynent_endian(struct Elf32_Dyn* to_change);
+void DLIMP_change_sym_endian(struct Elf32_Sym* to_change);
+void DLIMP_change_rela_endian(struct Elf32_Rela* to_change);
+void DLIMP_change_rel_endian(struct Elf32_Rel* to_change);
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/elf32.c b/src/core/dsp/ocl_load/DLOAD/elf32.c
new file mode 100644
index 0000000..082ba01
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/elf32.c
@@ -0,0 +1,652 @@
+/*
+* elf32.c
+*
+* Basic Data Structures for 32-Bit ELF Object Format Files
+*
+* The data structures in this file come primarily from this specification:
+*
+* Tool Interface Standard (TIS)
+* Executable and Linking Format (ELF) Specification
+* Version 1.2
+*
+* TIS Committee
+* May 1995
+*
+* Additions and enhancements from this specification are also included:
+*
+* System V Application Binary Interface
+* DRAFT 17
+* December 2003
+*
+* http://sco.com/developers/gabi/2003-12-17/contents.html
+*
+* This is a C implementation of the data base objects that are commonly
+* used in the source for TI development tools that support ELF.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#include "elf32.h"
+
+/*---------------------------------------------------------------------------*/
+/* Dynamic Tag Database */
+/* */
+/* One entry per dynamic tag. Going by the inline field labels, an entry */
+/* lists in order: the tag name string, the DT_* tag value, an EDYN_UNTYPE_* */
+/* code (presumably how the d_un union is to be read) and two EDYN_TAGREQ_* */
+/* codes (presumably the requirement level for executables and for shared */
+/* objects). The table ends with a sentinel entry whose tag value is -1. */
+/*---------------------------------------------------------------------------*/
+
+const struct EDYN_TAG EDYN_TAG_DB[] =
+{
+ /* EDYN_TAG_NULL */
+ {
+ /* d_tag_name */ "DT_NULL",
+ /* d_tag_value */ DT_NULL,
+ /* d_untype */ EDYN_UNTYPE_IGNORED,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_MANDATORY
+ },
+
+ /* EDYN_TAG_NEEDED */
+ {
+ /* d_tag_name */ "DT_NEEDED",
+ /* d_tag_value */ DT_NEEDED,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_PLTRELSZ */
+ {
+ /* d_tag_name */ "DT_PLTRELSZ",
+ /* d_tag_value */ DT_PLTRELSZ,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_PLTGOT */
+ {
+ /* d_tag_name */ "DT_PLTGOT",
+ /* d_tag_value */ DT_PLTGOT,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_HASH */
+ {
+ /* d_tag_name */ "DT_HASH",
+ /* d_tag_value */ DT_HASH,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_MANDATORY
+ },
+
+ /* EDYN_TAG_STRTAB */
+ {
+ /* d_tag_name */ "DT_STRTAB",
+ /* d_tag_value */ DT_STRTAB,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_MANDATORY
+ },
+
+ /* EDYN_TAG_SYMTAB */
+ {
+ /* d_tag_name */ "DT_SYMTAB",
+ /* d_tag_value */ DT_SYMTAB,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_MANDATORY
+ },
+
+ /* EDYN_TAG_RELA */
+ {
+ /* d_tag_name */ "DT_RELA",
+ /* d_tag_value */ DT_RELA,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_RELASZ */
+ {
+ /* d_tag_name */ "DT_RELASZ",
+ /* d_tag_value */ DT_RELASZ,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_RELAENT */
+ {
+ /* d_tag_name */ "DT_RELAENT",
+ /* d_tag_value */ DT_RELAENT,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_STRSZ */
+ {
+ /* d_tag_name */ "DT_STRSZ",
+ /* d_tag_value */ DT_STRSZ,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_MANDATORY
+ },
+
+ /* EDYN_TAG_SYMENT */
+ {
+ /* d_tag_name */ "DT_SYMENT",
+ /* d_tag_value */ DT_SYMENT,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_MANDATORY
+ },
+
+ /* EDYN_TAG_INIT */
+ {
+ /* d_tag_name */ "DT_INIT",
+ /* d_tag_value */ DT_INIT,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_FINI */
+ {
+ /* d_tag_name */ "DT_FINI",
+ /* d_tag_value */ DT_FINI,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_SONAME */
+ {
+ /* d_tag_name */ "DT_SONAME",
+ /* d_tag_value */ DT_SONAME,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_IGNORED,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_RPATH */
+ {
+ /* d_tag_name */ "DT_RPATH",
+ /* d_tag_value */ DT_RPATH,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_IGNORED
+ },
+
+ /* EDYN_TAG_SYMBOLIC */
+ {
+ /* d_tag_name */ "DT_SYMBOLIC",
+ /* d_tag_value */ DT_SYMBOLIC,
+ /* d_untype */ EDYN_UNTYPE_IGNORED,
+ /* d_exec_req */ EDYN_TAGREQ_IGNORED,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_REL */
+ {
+ /* d_tag_name */ "DT_REL",
+ /* d_tag_value */ DT_REL,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_RELSZ */
+ {
+ /* d_tag_name */ "DT_RELSZ",
+ /* d_tag_value */ DT_RELSZ,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_RELENT */
+ {
+ /* d_tag_name */ "DT_RELENT",
+ /* d_tag_value */ DT_RELENT,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_PLTREL */
+ {
+ /* d_tag_name */ "DT_PLTREL",
+ /* d_tag_value */ DT_PLTREL,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_DEBUG */
+ {
+ /* d_tag_name */ "DT_DEBUG",
+ /* d_tag_value */ DT_DEBUG,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_IGNORED
+ },
+
+ /* EDYN_TAG_TEXTREL */
+ {
+ /* d_tag_name */ "DT_TEXTREL",
+ /* d_tag_value */ DT_TEXTREL,
+ /* d_untype */ EDYN_UNTYPE_IGNORED,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_JMPREL */
+ {
+ /* d_tag_name */ "DT_JMPREL",
+ /* d_tag_value */ DT_JMPREL,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_BIND_NOW */
+ {
+ /* d_tag_name */ "DT_BIND_NOW",
+ /* d_tag_value */ DT_BIND_NOW,
+ /* d_untype */ EDYN_UNTYPE_IGNORED,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_INIT_ARRAY */
+ {
+ /* d_tag_name */ "DT_INIT_ARRAY",
+ /* d_tag_value */ DT_INIT_ARRAY,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_FINI_ARRAY */
+ {
+ /* d_tag_name */ "DT_FINI_ARRAY",
+ /* d_tag_value */ DT_FINI_ARRAY,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_INIT_ARRAYSZ */
+ {
+ /* d_tag_name */ "DT_INIT_ARRAYSZ",
+ /* d_tag_value */ DT_INIT_ARRAYSZ,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_FINI_ARRAYSZ */
+ {
+ /* d_tag_name */ "DT_FINI_ARRAYSZ",
+ /* d_tag_value */ DT_FINI_ARRAYSZ,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_RUNPATH */
+ {
+ /* d_tag_name */ "DT_RUNPATH",
+ /* d_tag_value */ DT_RUNPATH,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_FLAGS */
+ {
+ /* d_tag_name */ "DT_FLAGS",
+ /* d_tag_value */ DT_FLAGS,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_ENCODING */
+ {
+ /* d_tag_name */ "DT_ENCODING",
+ /* d_tag_value */ DT_ENCODING,
+ /* d_untype */ EDYN_UNTYPE_UNSPECIFIED,
+ /* d_exec_req */ EDYN_TAGREQ_UNSPECIFIED,
+ /* d_shared_req */ EDYN_TAGREQ_UNSPECIFIED
+ },
+
+ /* EDYN_TAG_PREINIT_ARRAY */
+ {
+ /* d_tag_name */ "DT_PREINIT_ARRAY",
+ /* d_tag_value */ DT_PREINIT_ARRAY,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_IGNORED
+ },
+
+ /* EDYN_TAG_PREINIT_ARRAYSZ */
+ {
+ /* d_tag_name */ "DT_PREINIT_ARRAYSZ",
+ /* d_tag_value */ DT_PREINIT_ARRAYSZ,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_IGNORED
+ },
+
+ /* Sentinel: terminate array with a tag value of -1 and an empty name */
+ {
+ /* d_tag_name */ "",
+ /* d_tag_value */ -1,
+ /* d_untype */ EDYN_UNTYPE_UNSPECIFIED,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_IGNORED
+ }
+};
+
+/*---------------------------------------------------------------------------*/
+/* Special Section Database */
+/* */
+/* One entry per special section. Going by the inline field labels, an */
+/* entry lists in order: the section name, its sh_type, its sh_entsize and */
+/* its sh_flags. The table ends with a sentinel entry whose name is a null */
+/* pointer. */
+/*---------------------------------------------------------------------------*/
+const struct ESCN ESCN_DB[] =
+{
+ /* .bss */
+ {
+ /* name */ ESCN_BSS_name,
+ /* sh_type */ SHT_NOBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_WRITE
+ },
+
+ /* .comment */
+ {
+ /* name */ ESCN_COMMENT_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ 0
+ },
+
+ /* .data */
+ {
+ /* name */ ESCN_DATA_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_WRITE
+ },
+
+ /* .data1 */
+ {
+ /* name */ ESCN_DATA1_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_WRITE
+ },
+
+ /* .debug */
+ {
+ /* name */ ESCN_DEBUG_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ 0
+ },
+
+ /* .dynamic */
+ {
+ /* name */ ESCN_DYNAMIC_name,
+ /* sh_type */ SHT_DYNAMIC,
+ /* sh_entsize */ sizeof(struct Elf32_Dyn),
+ /* sh_flags */ SHF_ALLOC
+ },
+
+ /* .dynstr */
+ {
+ /* name */ ESCN_DYNSTR_name,
+ /* sh_type */ SHT_STRTAB,
+ /* sh_entsize */ sizeof(char),
+ /* sh_flags */ SHF_ALLOC + SHF_STRINGS
+ },
+
+ /* .dynsym */
+ {
+ /* name */ ESCN_DYNSYM_name,
+ /* sh_type */ SHT_DYNSYM,
+ /* sh_entsize */ sizeof(struct Elf32_Sym),
+ /* sh_flags */ SHF_ALLOC
+ },
+
+ /* .fini */
+ {
+ /* name */ ESCN_FINI_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_EXECINSTR
+ },
+
+ /* .fini_array */
+ {
+ /* name */ ESCN_FINI_ARRAY_name,
+ /* sh_type */ SHT_FINI_ARRAY,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_WRITE
+ },
+
+ /* .got */
+ {
+ /* name */ ESCN_GOT_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ 0
+ },
+
+ /* .hash */
+ {
+ /* name */ ESCN_HASH_name,
+ /* sh_type */ SHT_HASH,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC
+ },
+
+ /* .init */
+ {
+ /* name */ ESCN_INIT_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_EXECINSTR
+ },
+
+ /* .init_array */
+ {
+ /* name */ ESCN_INIT_ARRAY_name,
+ /* sh_type */ SHT_INIT_ARRAY,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_WRITE
+ },
+
+ /* .interp */
+ {
+ /* name */ ESCN_INTERP_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ 0
+ },
+
+ /* .line */
+ {
+ /* name */ ESCN_LINE_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ 0
+ },
+
+ /* .note */
+ {
+ /* name */ ESCN_NOTE_name,
+ /* sh_type */ SHT_NOTE,
+ /* sh_entsize */ 0,
+ /* sh_flags */ 0
+ },
+
+ /* .plt */
+ {
+ /* name */ ESCN_PLT_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ 0
+ },
+
+ /* .preinit_array */
+ {
+ /* name */ ESCN_PREINIT_ARRAY_name,
+ /* sh_type */ SHT_PREINIT_ARRAY,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_WRITE
+ },
+
+ /* .rel */
+ {
+ /* name */ ESCN_REL_name,
+ /* sh_type */ SHT_REL,
+ /* sh_entsize */ sizeof(struct Elf32_Rel),
+ /* sh_flags */ 0
+ },
+
+ /* .rela */
+ {
+ /* name */ ESCN_RELA_name,
+ /* sh_type */ SHT_RELA,
+ /* sh_entsize */ sizeof(struct Elf32_Rela),
+ /* sh_flags */ 0
+ },
+
+ /* .rodata */
+ {
+ /* name */ ESCN_RODATA_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC
+ },
+
+ /* .rodata1 */
+ {
+ /* name */ ESCN_RODATA1_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC
+ },
+
+ /* .shstrtab */
+ {
+ /* name */ ESCN_SHSTRTAB_name,
+ /* sh_type */ SHT_STRTAB,
+ /* sh_entsize */ sizeof(char),
+ /* sh_flags */ SHF_STRINGS
+ },
+
+ /* .strtab */
+ {
+ /* name */ ESCN_STRTAB_name,
+ /* sh_type */ SHT_STRTAB,
+ /* sh_entsize */ sizeof(char),
+ /* sh_flags */ SHF_STRINGS
+ },
+
+ /* .symtab */
+ {
+ /* name */ ESCN_SYMTAB_name,
+ /* sh_type */ SHT_SYMTAB,
+ /* sh_entsize */ sizeof(struct Elf32_Sym),
+ /* sh_flags */ 0
+ },
+
+ /* .symtab_shndx */
+ {
+ /* name */ ESCN_SYMTAB_SHNDX_name,
+ /* sh_type */ SHT_SYMTAB_SHNDX,
+ /* sh_entsize */ sizeof(Elf32_Word),
+ /* sh_flags */ 0
+ },
+
+ /* .tbss */
+ {
+ /* name */ ESCN_TBSS_name,
+ /* sh_type */ SHT_NOBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_WRITE + SHF_TLS
+ },
+
+ /* .tdata */
+ {
+ /* name */ ESCN_TDATA_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_WRITE + SHF_TLS
+ },
+
+ /* .tdata1 */
+ {
+ /* name */ ESCN_TDATA1_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_WRITE + SHF_TLS
+ },
+
+ /* .text */
+ {
+ /* name */ ESCN_TEXT_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_EXECINSTR
+ },
+ /* The .build.attributes entry below is compiled out by the #if 0. */
+#if 0
+ /* .build.attributes */
+ {
+ /* name */ ESCN_ATTRIBUTES_name,
+ /* sh_type */ SHT_ATTRIBUTES,
+ /* sh_entsize */ 0,
+ /* sh_flags */ 0
+ },
+#endif
+ /* Sentinel: terminate array with a NULL name field */
+ {
+ /* name */ (const char*)0,
+ /* sh_type */ 0,
+ /* sh_entsize */ 0,
+ /* sh_flags */ 0
+ }
+};
+
diff --git a/src/core/dsp/ocl_load/DLOAD/elf32.h b/src/core/dsp/ocl_load/DLOAD/elf32.h
new file mode 100644
index 0000000..67358d6
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/elf32.h
@@ -0,0 +1,756 @@
+/*
+* elf32.h
+*
+* Basic Data Structures for 32-bit ELF Object Format Files
+*
+* The data structures in this file come primarily from this specification:
+*
+* Tool Interface Standard (TIS)
+* Executable and Linking Format (ELF) Specification
+* Version 1.2
+*
+* TIS Committee
+* May 1995
+*
+* Additions and enhancements from this specification are also included:
+*
+* System V Application Binary Interface
+* DRAFT 17
+* December 2003
+*
+* http://sco.com/developers/gabi/2003-12-17/contents.html
+*
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef ELF32_H
+#define ELF32_H
+
+#include <inttypes.h>
+
+/*---------------------------------------------------------------------------*/
+/* 32-Bit Data Types (Figure 1-2, page 1-2): fixed-width ELF32 scalar types */
+/*---------------------------------------------------------------------------*/
+typedef uint32_t Elf32_Addr; /* Unsigned 32-bit program address */
+typedef uint16_t Elf32_Half; /* Unsigned 16-bit integer */
+typedef uint32_t Elf32_Off; /* Unsigned 32-bit file offset */
+typedef int32_t Elf32_Sword; /* Signed 32-bit integer */
+typedef uint32_t Elf32_Word; /* Unsigned 32-bit integer */
+
+
+/*****************************************************************************/
+/* ELF Header */
+/* PP. 1-4 */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* ELF Identification Indexes (indexes into Elf32_Ehdr.e_ident[] below) */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ EI_MAG0 = 0, /* File identification (expected value: ELFMAG0) */
+ EI_MAG1 = 1, /* File identification (expected value: ELFMAG1) */
+ EI_MAG2 = 2, /* File identification (expected value: ELFMAG2) */
+ EI_MAG3 = 3, /* File identification (expected value: ELFMAG3) */
+ EI_CLASS = 4, /* File class (ELFCLASS* values) */
+ EI_DATA = 5, /* Data encoding (ELFDATA* values) */
+ EI_VERSION = 6, /* File version */
+ EI_OSABI = 7, /* Operating system / ABI (ELFOSABI_* values) */
+ EI_ABIVERSION = 8, /* ABI version */
+ EI_PAD = 9, /* Start of padding bytes (indexes 9 .. EI_NIDENT-1) */
+ EI_NIDENT = 16 /* Size of Elf32_Ehdr.e_ident[]; not itself an index */
+};
+
+
+/*---------------------------------------------------------------------------*/
+/* ELF Header Data Structure (field order and widths define the file layout) */
+/*---------------------------------------------------------------------------*/
+struct Elf32_Ehdr
+{
+ uint8_t e_ident[EI_NIDENT]; /* Identification bytes, indexed by EI_* (magic, class, ...) */
+ Elf32_Half e_type; /* Object File Type (ET_* value) */
+ Elf32_Half e_machine; /* Target Processor (EM_* value) */
+ Elf32_Word e_version; /* Object File Version (EV_* value) */
+ Elf32_Addr e_entry; /* Entry Point */
+ Elf32_Off e_phoff; /* Program Header Table Offset */
+ Elf32_Off e_shoff; /* Section Header Table Offset */
+ Elf32_Word e_flags; /* Processor-Specific Flags */
+ Elf32_Half e_ehsize; /* Size of ELF header */
+ Elf32_Half e_phentsize; /* Size of a Program Header */
+ Elf32_Half e_phnum; /* # Entries in Program Header Table */
+ Elf32_Half e_shentsize; /* Size of a Section Header */
+ Elf32_Half e_shnum; /* # Entries in Section Header Table */
+ Elf32_Half e_shstrndx; /* Section Name String Table Section (section index) */
+};
+
+
+/*---------------------------------------------------------------------------*/
+/* Object File Types (value of "e_type"); ET_LOOS..ET_HIOS is the OS range */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ ET_NONE = 0, /* No file type */
+ ET_REL = 1, /* Relocatable file */
+ ET_EXEC = 2, /* Executable file */
+ ET_DYN = 3, /* Shared object file */
+ ET_CORE = 4, /* Core file */
+ ET_LOOS = 0xfe00, /* First OS-specific value */
+ ET_HIOS = 0xfeff, ET_HIPS = ET_HIOS, /* Last OS-specific value; ET_HIPS is a misspelled legacy alias kept for existing users */
+ ET_LOPROC = 0xff00, /* First processor-specific value */
+ ET_HIPROC = 0xffff /* Last processor-specific value */
+};
+
+
+/*---------------------------------------------------------------------------*/
+/* Target Processors (value of "e_machine") */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ EM_NONE = 0, /* No machine */
+ EM_M32 = 1, /* AT&T WE 32100 */
+ EM_SPARC = 2, /* SPARC */
+ EM_386 = 3, /* Intel 80386 */
+ EM_68K = 4, /* Motorola 68000 */
+ EM_88K = 5, /* Motorola 88000 */
+ EM_860 = 7, /* Intel 80860 */
+ EM_MIPS = 8, /* MIPS I Architecture */
+ EM_S370 = 9, /* IBM System/370 Processor */
+ EM_MIPS_RS3_LE = 10, /* MIPS RS3000 Little-endian */
+ EM_PARISC = 15, /* Hewlett-Packard PA-RISC */
+ EM_VPP500 = 17, /* Fujitsu VPP500 */
+ EM_SPARC32PLUS = 18, /* Enhanced instruction set SPARC */
+ EM_960 = 19, /* Intel 80960 */
+ EM_PPC = 20, /* PowerPC */
+ EM_PPC64 = 21, /* 64-bit PowerPC */
+ EM_S390 = 22, /* IBM System/390 Processor */
+ EM_V800 = 36, /* NEC V800 */
+ EM_FR20 = 37, /* Fujitsu FR20 */
+ EM_RH32 = 38, /* TRW RH-32 */
+ EM_RCE = 39, /* Motorola RCE */
+ EM_ARM = 40, /* Advanced RISC Machines ARM */
+ EM_ALPHA = 41, /* Digital Alpha */
+ EM_SH = 42, /* Hitachi SH */
+ EM_SPARCV9 = 43, /* SPARC Version 9 */
+ EM_TRICORE = 44, /* Siemens TriCore embedded processor */
+ EM_ARC = 45, /* Argonaut RISC Core, Argonaut Technologies Inc. */
+ EM_H8_300 = 46, /* Hitachi H8/300 */
+ EM_H8_300H = 47, /* Hitachi H8/300H */
+ EM_H8S = 48, /* Hitachi H8S */
+ EM_H8_500 = 49, /* Hitachi H8/500 */
+ EM_IA_64 = 50, /* Intel IA-64 processor architecture */
+ EM_MIPS_X = 51, /* Stanford MIPS-X */
+ EM_COLDFIRE = 52, /* Motorola ColdFire */
+ EM_68HC12 = 53, /* Motorola M68HC12 */
+ EM_MMA = 54, /* Fujitsu MMA Multimedia Accelerator */
+ EM_PCP = 55, /* Siemens PCP */
+ EM_NCPU = 56, /* Sony nCPU embedded RISC processor */
+ EM_NDR1 = 57, /* Denso NDR1 microprocessor */
+ EM_STARCORE = 58, /* Motorola Star*Core processor */
+ EM_ME16 = 59, /* Toyota ME16 processor */
+ EM_ST100 = 60, /* STMicroelectronics ST100 processor */
+ EM_TINYJ = 61, /* Advanced Logic Corp. TinyJ embedded processor family */
+ EM_X86_64 = 62, /* AMD x86-64 architecture */
+ EM_PDSP = 63, /* Sony DSP Processor */
+ EM_PDP10 = 64, /* Digital Equipment Corp. PDP-10 */
+ EM_PDP11 = 65, /* Digital Equipment Corp. PDP-11 */
+ EM_FX66 = 66, /* Siemens FX66 microcontroller */
+ EM_ST9PLUS = 67, /* STMicroelectronics ST9+ 8/16 bit microcontroller */
+ EM_ST7 = 68, /* STMicroelectronics ST7 8-bit microcontroller */
+ EM_68HC16 = 69, /* Motorola MC68HC16 Microcontroller */
+ EM_68HC11 = 70, /* Motorola MC68HC11 Microcontroller */
+ EM_68HC08 = 71, /* Motorola MC68HC08 Microcontroller */
+ EM_68HC05 = 72, /* Motorola MC68HC05 Microcontroller */
+ EM_SVX = 73, /* Silicon Graphics SVx */
+ EM_ST19 = 74, /* STMicroelectronics ST19 8-bit microcontroller */
+ EM_VAX = 75, /* Digital VAX */
+ EM_CRIS = 76, /* Axis Communications 32-bit embedded processor */
+ EM_JAVELIN = 77, /* Infineon Technologies 32-bit embedded processor */
+ EM_FIREPATH = 78, /* Element 14 64-bit DSP Processor */
+ EM_ZSP = 79, /* LSI Logic 16-bit DSP Processor */
+ EM_MMIX = 80, /* Donald Knuth's educational 64-bit processor */
+ EM_HUANY = 81, /* Harvard University machine-independent object files */
+ EM_PRISM = 82, /* SiTera Prism */
+ EM_AVR = 83, /* Atmel AVR 8-bit microcontroller */
+ EM_FR30 = 84, /* Fujitsu FR30 */
+ EM_D10V = 85, /* Mitsubishi D10V */
+ EM_D30V = 86, /* Mitsubishi D30V */
+ EM_V850 = 87, /* NEC v850 */
+ EM_M32R = 88, /* Mitsubishi M32R */
+ EM_MN10300 = 89, /* Matsushita MN10300 */
+ EM_MN10200 = 90, /* Matsushita MN10200 */
+ EM_PJ = 91, /* picoJava */
+ EM_OPENRISC = 92, /* OpenRISC 32-bit embedded processor */
+ EM_ARC_A5 = 93, /* ARC Cores Tangent-A5 */
+ EM_XTENSA = 94, /* Tensilica Xtensa Architecture */
+ EM_VIDEOCORE = 95, /* Alphamosaic VideoCore processor */
+ EM_TMM_GPP = 96, /* Thompson Multimedia General Purpose Processor */
+ EM_NS32K = 97, /* National Semiconductor 32000 series */
+ EM_TPC = 98, /* Tenor Network TPC processor */
+ EM_SNP1K = 99, /* Trebia SNP 1000 processor */
+ EM_ST200 = 100, /* STMicroelectronics (www.st.com) ST200 microcontroller */
+ EM_IP2K = 101, /* Ubicom IP2xxx microcontroller family */
+ EM_MAX = 102, /* MAX Processor */
+ EM_CR = 103, /* National Semiconductor CompactRISC microprocessor */
+ EM_F2MC16 = 104, /* Fujitsu F2MC16 */
+ EM_MSP430 = 105, /* Texas Instruments embedded microcontroller msp430 */
+ EM_BLACKFIN = 106, /* Analog Devices Blackfin (DSP) processor */
+ EM_SE_C33 = 107, /* S1C33 Family of Seiko Epson processors */
+ EM_SEP = 108, /* Sharp embedded microprocessor */
+ EM_ARCA = 109, /* Arca RISC Microprocessor */
+ EM_UNICORE = 110, /* Microprocessor series from PKU-Unity Ltd. and MPRC of Peking University */
+
+ /*------------------------------------------------------------------------*/
+ /* ELF Magic Numbers Reserved For Texas Instruments */
+ /* */
+ /* The magic numbers 140-159 were reserved through SCO to be included */
+ /* in the official ELF specification. Please see Don Darling */
+ /* regarding any changes or allocation of the numbers below. */
+ /* */
+ /* When we allocate a number for use, SCO needs to be notified so they */
+ /* can update the ELF specification accordingly. */
+ /*------------------------------------------------------------------------*/
+ EM_TI_C6000 = 140, /* Texas Instruments TMS320C6000 (C6x) DSP family; allocated, in use */
+ EM_TI_UNUSED02 = 141, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED03 = 142, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED04 = 143, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED05 = 144, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED06 = 145, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED07 = 146, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED08 = 147, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED09 = 148, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED10 = 149, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED11 = 150, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED12 = 151, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED13 = 152, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED14 = 153, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED15 = 154, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED16 = 155, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED17 = 156, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED18 = 157, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED19 = 158, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED20 = 159 /* Reserved for Texas Instruments; unused */
+};
+
+
+/*---------------------------------------------------------------------------*/
+/* Object File Version (value of "e_version"; presumably also of e_ident[EI_VERSION] - confirm) */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ EV_NONE = 0, /* Invalid version */
+ EV_CURRENT = 1 /* Current version */
+};
+
+
+/*****************************************************************************/
+/* ELF Identification */
+/* PP. 1-6 */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* Identification Values for ELF Files */
+/*---------------------------------------------------------------------------*/
+
+/* EI_MAG0 to EI_MAG3: the magic bytes 0x7f 'E' 'L' 'F' expected in e_ident[0..3] */
+enum
+{
+ ELFMAG0 = 0x7f, /* e_ident[EI_MAG0] */
+ ELFMAG1 = 'E', /* e_ident[EI_MAG1] */
+ ELFMAG2 = 'L', /* e_ident[EI_MAG2] */
+ ELFMAG3 = 'F' /* e_ident[EI_MAG3] */
+};
+
+/* EI_CLASS: values of e_ident[EI_CLASS] (file class) */
+enum
+{
+ ELFCLASSNONE = 0, /* Invalid class */
+ ELFCLASS32 = 1, /* 32-bit objects */
+ ELFCLASS64 = 2 /* 64-bit objects */
+};
+
+/* EI_DATA: values of e_ident[EI_DATA] (data encoding / byte order) */
+enum
+{
+ ELFDATANONE = 0, /* Invalid data encoding */
+ ELFDATA2LSB = 1, /* Little-endian data */
+ ELFDATA2MSB = 2 /* Big-endian data */
+};
+
+/* EI_OSABI: values of e_ident[EI_OSABI]; values 4 and 5 are not defined here */
+enum
+{
+ ELFOSABI_NONE = 0, /* No extensions or unspecified */
+ ELFOSABI_HPUX = 1, /* Hewlett-Packard HP-UX */
+ ELFOSABI_NETBSD = 2, /* NetBSD */
+ ELFOSABI_LINUX = 3, /* Linux */
+ ELFOSABI_SOLARIS = 6, /* Sun Solaris */
+ ELFOSABI_AIX = 7, /* AIX */
+ ELFOSABI_IRIX = 8, /* IRIX */
+ ELFOSABI_FREEBSD = 9, /* FreeBSD */
+ ELFOSABI_TRU64 = 10, /* Compaq TRU64 UNIX */
+ ELFOSABI_MODESTO = 11, /* Novell Modesto */
+ ELFOSABI_OPENBSD = 12, /* Open BSD */
+ ELFOSABI_OPENVMS = 13, /* Open VMS */
+ ELFOSABI_NSK = 14, /* Hewlett-Packard Non-Stop Kernel */
+ ELFOSABI_AROS = 15 /* Amiga Research OS */
+};
+
+/*****************************************************************************/
+/* Program Header */
+/* PP. 2-2 */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* Program Header Data Structure (one entry per segment) */
+/*---------------------------------------------------------------------------*/
+struct Elf32_Phdr
+{
+ Elf32_Word p_type; /* Segment type (PT_* value) */
+ Elf32_Off p_offset; /* Segment file offset */
+ Elf32_Addr p_vaddr; /* Segment virtual address */
+ Elf32_Addr p_paddr; /* Segment physical address */
+ Elf32_Word p_filesz; /* Segment file image size */
+ Elf32_Word p_memsz; /* Segment memory image size */
+ Elf32_Word p_flags; /* Segment flags (PF_* bits) */
+ Elf32_Word p_align; /* Segment alignment */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Segment Types (value of "p_type"); LOOS..HIOS and LOPROC..HIPROC are reserved ranges */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ PT_NULL = 0, /* Unused table entry */
+ PT_LOAD = 1, /* Loadable segment */
+ PT_DYNAMIC = 2, /* Dynamic linking information */
+ PT_INTERP = 3, /* Interpreter path string location */
+ PT_NOTE = 4, /* Location and size of auxiliary information */
+ PT_SHLIB = 5, /* Shared library information */
+ PT_PHDR = 6, /* Location and size of program header table */
+ PT_TLS = 7, /* Specifies the Thread-Local Storage template */
+ PT_LOOS = 0x60000000, /* First OS-specific value */
+ PT_HIOS = 0x6fffffff, /* Last OS-specific value */
+ PT_LOPROC = 0x70000000, /* First processor-specific value */
+ PT_HIPROC = 0x7fffffff /* Last processor-specific value */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Segment Permissions (value of "p_flags"); X/W/R are distinct single bits */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ PF_X = 0x1, /* Execute */
+ PF_W = 0x2, /* Write */
+ PF_R = 0x4, /* Read */
+ PF_MASKOS = 0x0ff00000, /* OS-specific mask */
+ PF_MASKPROC = 0xf0000000 /* Processor-specific mask. NOTE(review): exceeds INT_MAX; ISO C before C23 wants enumerators representable as int - confirm build language/extensions */
+};
+
+/*****************************************************************************/
+/* Sections */
+/* PP. 1-9 */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* Section Header Data Structure (one entry per section) */
+/*---------------------------------------------------------------------------*/
+struct Elf32_Shdr
+{
+ Elf32_Word sh_name; /* Section name (offset into string section) */
+ Elf32_Word sh_type; /* Section type (SHT_* value) */
+ Elf32_Word sh_flags; /* Section flags (SHF_* bits) */
+ Elf32_Addr sh_addr; /* Address in memory image */
+ Elf32_Off sh_offset; /* File offset of section data */
+ Elf32_Word sh_size; /* Size of the section in bytes */
+ Elf32_Word sh_link; /* Link to the section header table */
+ Elf32_Word sh_info; /* Extra information depending on section type */
+ Elf32_Word sh_addralign; /* Address alignment constraints */
+ Elf32_Word sh_entsize; /* Size of fixed-size entries in section (0 used by ESCN_DB for none) */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Special Section Indexes (some names deliberately share a value, see below) */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ SHN_UNDEF = 0, /* Referenced by undefined values */
+ SHN_LORESERVE = 0xff00, /* First reserved index (same value as SHN_LOPROC) */
+ SHN_LOPROC = 0xff00, /* First processor-specific index */
+ SHN_HIPROC = 0xff1f, /* Last processor-specific index */
+ SHN_LOOS = 0xff20, /* First OS-specific index */
+ SHN_HIOS = 0xff3f, /* Last OS-specific index */
+ SHN_ABS = 0xfff1, /* Referenced by absolute values */
+ SHN_COMMON = 0xfff2, /* Referenced by common values */
+ SHN_XINDEX = 0xffff, /* Indirect index reference (escape value) */
+ SHN_HIRESERVE = 0xffff /* Last reserved index (same value as SHN_XINDEX) */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Section Types (value of "sh_type"); values 12 and 13 are not defined here */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ SHT_NULL = 0, /* Inactive section */
+ SHT_PROGBITS = 1, /* Application-specific information */
+ SHT_SYMTAB = 2, /* Symbol table */
+ SHT_STRTAB = 3, /* String table */
+ SHT_RELA = 4, /* Relocation entries (explicit addends) */
+ SHT_HASH = 5, /* Symbol hash table */
+ SHT_DYNAMIC = 6, /* Dynamic linking information */
+ SHT_NOTE = 7, /* Miscellaneous information */
+ SHT_NOBITS = 8, /* Contains no data in file */
+ SHT_REL = 9, /* Relocation entries (no expl. addends) */
+ SHT_SHLIB = 10, /* Shared library */
+ SHT_DYNSYM = 11, /* Dynamic symbol table */
+ SHT_INIT_ARRAY = 14, /* Pointers to initialization functions */
+ SHT_FINI_ARRAY = 15, /* Pointers to termination functions */
+ SHT_PREINIT_ARRAY = 16, /* Pointers to pre-init functions */
+ SHT_GROUP = 17, /* Section group */
+ SHT_SYMTAB_SHNDX = 18, /* Section indexes for SHN_XINDEX refs. */
+ SHT_LOOS = 0x60000000, /* First OS-specific type */
+ SHT_HIOS = 0x6fffffff, /* Last OS-specific type */
+ SHT_LOPROC = 0x70000000, /* First processor-specific type */
+ SHT_HIPROC = 0x7fffffff, /* Last processor-specific type */
+ SHT_LOUSER = 0x80000000, /* First application-specific type. NOTE(review): exceeds INT_MAX; ISO C before C23 wants enumerators representable as int - confirm */
+ SHT_HIUSER = 0xffffffff /* Last application-specific type. NOTE(review): exceeds INT_MAX, same caveat as SHT_LOUSER */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Section Attribute Flags (value of "sh_flags"); ESCN_DB combines them with + */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ SHF_WRITE = 0x1, /* Writable during process execution */
+ SHF_ALLOC = 0x2, /* Loaded into processor memory */
+ SHF_EXECINSTR = 0x4, /* Contains executable instructions */
+ SHF_MERGE = 0x10, /* Can be merged */
+ SHF_STRINGS = 0x20, /* Contains null-terminated strings */
+ SHF_INFO_LINK = 0x40, /* sh_info contains a section index */
+ SHF_LINK_ORDER = 0x80, /* Maintain section ordering */
+ SHF_OS_NONCONFORMING = 0x100, /* OS-specific processing required */
+ SHF_GROUP = 0x200, /* Member of a section group */
+ SHF_TLS = 0x400, /* Contains Thread-Local Storage */
+ SHF_MASKOS = 0x0ff00000, /* Mask of OS-specific flags */
+ SHF_MASKPROC = 0xf0000000 /* Mask for processor-specific flags. NOTE(review): exceeds INT_MAX; ISO C before C23 wants enumerators representable as int - confirm */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Section Group Flags (presumably the flag word of an SHT_GROUP section - confirm) */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ GRP_COMDAT = 0x1, /* Common data; only one is kept by linker */
+ GRP_MASKOS = 0x0ff00000, /* Mask for OS-specific group flags */
+ GRP_MASKPROC = 0xf0000000 /* Mask for processor-specific group flags. NOTE(review): exceeds INT_MAX; ISO C before C23 wants enumerators representable as int - confirm */
+};
+
+
+/*****************************************************************************/
+/* Symbol Table */
+/* PP. 1-18 */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* Symbol Table Entry Data Structure (sizeof is the .symtab sh_entsize in ESCN_DB) */
+/*---------------------------------------------------------------------------*/
+struct Elf32_Sym
+{
+ Elf32_Word st_name; /* String table offset for symbol name */
+ Elf32_Addr st_value; /* Symbol value */
+ Elf32_Word st_size; /* Symbol size */
+ uint8_t st_info; /* Symbol type and binding (see ELF32_ST_TYPE / ELF32_ST_BIND) */
+ uint8_t st_other; /* Symbol visibility (see ELF32_ST_VISIBILITY) */
+ Elf32_Half st_shndx; /* Symbol type / defining section */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Undefined Symbol Index (symbol table index 0) */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ STN_UNDEF = 0 /* First symbol table entry is always undefined */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Symbol info helpers: binding = bits 7..4, type = bits 3..0 of the info byte; visibility = low 2 bits of the "other" byte */
+/*---------------------------------------------------------------------------*/
+static inline uint8_t ELF32_ST_BIND(uint8_t i) { return (uint8_t)(i >> 4); }
+static inline uint8_t ELF32_ST_TYPE(uint8_t i) { return (uint8_t)(i & 0x0f); }
+static inline uint8_t ELF32_ST_INFO(uint8_t b, uint8_t t)
+ { return (uint8_t)((b << 4) | (t & 0x0f)); } /* fields do not overlap, so | equals + */
+static inline uint8_t ELF32_ST_VISIBILITY(uint8_t o) { return (uint8_t)(o & 0x03); }
+
+
+/*---------------------------------------------------------------------------*/
+/* Symbol Binding (value returned by ELF32_ST_BIND(); fits in 4 bits) */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ STB_LOCAL = 0, /* Symbol does not have external linkage */
+ STB_GLOBAL = 1, /* Symbol has external linkage */
+ STB_WEAK = 2, /* Symbol has weak external linkage */
+ STB_LOOS = 10, /* First OS-specific binding */
+ STB_HIOS = 12, /* Last OS-specific binding */
+ STB_LOPROC = 13, /* First processor-specific binding */
+ STB_HIPROC = 15 /* Last processor-specific binding */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Symbol Types (value returned by ELF32_ST_TYPE(); fits in 4 bits) */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ STT_NOTYPE = 0, /* Unspecified type */
+ STT_OBJECT = 1, /* Associated with a data object */
+ STT_FUNC = 2, /* Associated with executable code */
+ STT_SECTION = 3, /* Associated with a section */
+ STT_FILE = 4, /* Associated with a source file */
+ STT_COMMON = 5, /* Labels an uninitialized common block */
+ STT_TLS = 6, /* Specifies a thread-local storage entity */
+ STT_LOOS = 10, /* First OS-specific type */
+ STT_HIOS = 12, /* Last OS-specific type */
+ STT_LOPROC = 13, /* First processor-specific type */
+ STT_HIPROC = 15 /* Last processor-specific type */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Symbol Visibility (value returned by ELF32_ST_VISIBILITY(); fits in 2 bits) */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ STV_DEFAULT = 0, /* Visibility specified by binding type */
+ STV_INTERNAL = 1, /* Like STV_HIDDEN, with processor-specific semantics */
+ STV_HIDDEN = 2, /* Not visible to other components */
+ STV_PROTECTED = 3 /* Visible in other components but not preemptable */
+};
+
+/*****************************************************************************/
+/* Relocation */
+/* PP. 1-22 */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* Relocation Entries Data Structures (Elf32_Rel: entry without an addend field) */
+/*---------------------------------------------------------------------------*/
+struct Elf32_Rel
+{
+ Elf32_Addr r_offset; /* Offset of the relocatable value in the section */
+ Elf32_Word r_info; /* Symbol table index and relocation type (see ELF32_R_SYM / ELF32_R_TYPE) */
+};
+
+struct Elf32_Rela /* Relocation entry carrying an explicit addend */
+{
+ Elf32_Addr r_offset; /* Offset of the relocatable value in the section */
+ Elf32_Word r_info; /* Symbol table index and relocation type (see ELF32_R_SYM / ELF32_R_TYPE) */
+ Elf32_Sword r_addend; /* Constant addend used to compute new value */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Relocation info helpers: symbol index = bits 31..8, relocation type = bits 7..0 of the info word */
+/*---------------------------------------------------------------------------*/
+static inline uint32_t ELF32_R_SYM(uint32_t i) { return i >> 8; }
+static inline uint8_t ELF32_R_TYPE(uint32_t i) { return (uint8_t)(i & 0xFFu); }
+static inline uint32_t ELF32_R_INFO(uint32_t s, uint8_t t)
+ { return (s << 8) | (uint32_t)t; } /* low byte of (s << 8) is zero, so | equals + */
+
+
+/*****************************************************************************/
+/* Dynamic Section */
+/* PP. 2-8 */
+/*****************************************************************************/
+struct Elf32_Dyn /* One dynamic section entry: a tag plus a value or address */
+{
+ Elf32_Sword d_tag; /* DT_* tag; the DT_* table lists which d_un member each tag uses */
+ union
+ {
+ Elf32_Word d_val; /* Integer value */
+ Elf32_Addr d_ptr; /* Address value */
+ } d_un;
+};
+
+/* Name Value d_un Executable Shared Obj. */
+/* ---- ----- ---- ---------- ----------- */
+enum
+{
+ DT_NULL = 0, /* ignored mandatory mandatory */
+ DT_NEEDED = 1, /* d_val optional optional */
+ DT_PLTRELSZ = 2, /* d_val optional optional */
+ DT_PLTGOT = 3, /* d_ptr optional optional */
+ DT_HASH = 4, /* d_ptr mandatory mandatory */
+ DT_STRTAB = 5, /* d_ptr mandatory mandatory */
+ DT_SYMTAB = 6, /* d_ptr mandatory mandatory */
+ DT_RELA = 7, /* d_ptr mandatory optional */
+ DT_RELASZ = 8, /* d_val mandatory optional */
+ DT_RELAENT = 9, /* d_val mandatory optional */
+ DT_STRSZ = 10, /* d_val mandatory mandatory */
+ DT_SYMENT = 11, /* d_val mandatory mandatory */
+ DT_INIT = 12, /* d_ptr optional optional */
+ DT_FINI = 13, /* d_ptr optional optional */
+ DT_SONAME = 14, /* d_val ignored optional */
+ DT_RPATH = 15, /* d_val optional ignored */
+ DT_SYMBOLIC = 16, /* ignored ignored optional */
+ DT_REL = 17, /* d_ptr mandatory optional */
+ DT_RELSZ = 18, /* d_val mandatory optional */
+ DT_RELENT = 19, /* d_val mandatory optional */
+ DT_PLTREL = 20, /* d_val optional optional */
+ DT_DEBUG = 21, /* d_ptr optional ignored */
+ DT_TEXTREL = 22, /* ignored optional optional */
+ DT_JMPREL = 23, /* d_ptr optional optional */
+ DT_BIND_NOW = 24, /* ignored optional optional */
+ DT_INIT_ARRAY = 25, /* d_ptr optional optional */
+ DT_FINI_ARRAY = 26, /* d_ptr optional optional */
+ DT_INIT_ARRAYSZ = 27, /* d_val optional optional */
+ DT_FINI_ARRAYSZ = 28, /* d_val optional optional */
+ DT_RUNPATH = 29, /* d_val optional optional */
+ DT_FLAGS = 30, /* d_val optional optional (DF_* bits) */
+ DT_ENCODING = 32, /* unspecified unspecified unspecified (shares value 32 with DT_PREINIT_ARRAY) */
+ DT_PREINIT_ARRAY = 32, /* d_ptr optional ignored */
+ DT_PREINIT_ARRAYSZ = 33, /* d_val optional ignored */
+ DT_LOOS = 0x60000000, /* unspecified unspecified unspecified */
+ DT_HIOS = 0x6ffff000, /* unspecified unspecified unspecified */
+ DT_LOPROC = 0x70000000, /* unspecified unspecified unspecified */
+ DT_HIPROC = 0x7fffffff /* unspecified unspecified unspecified */
+};
+
+
+/*---------------------------------------------------------------------------*/
+/* DT_FLAGS values: distinct single-bit flags for the d_val of a DT_FLAGS entry. */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ DF_ORIGIN = 0x01, /* loaded object may reference $ORIGIN subst. string */
+ DF_SYMBOLIC = 0x02, /* changes dynamic linker symbol resolution */
+ DF_TEXTREL = 0x04, /* do not allow relocation of non-writable segments */
+ DF_BIND_NOW = 0x08, /* don't use lazy binding */
+ DF_STATIC_TLS = 0x10, /* do not load this file dynamically */
+ DF_DIRECT_DEPENDENT = 0x20, /* limit global sym lookup to dependent list */
+ DF_WORLD = 0x40 /* Linux style global sym lookup, breadth-first */
+};
+
+
+/*---------------------------------------------------------------------------*/
+/* Dynamic Tag Database. */
+/*---------------------------------------------------------------------------*/
+
+/* Specifiers for which d_un union member to use (stored in EDYN_TAG.d_untype) */
+
+enum
+{
+ EDYN_UNTYPE_IGNORED, /* = 0: d_un is ignored for this tag */
+ EDYN_UNTYPE_VAL, /* = 1: use d_un.d_val */
+ EDYN_UNTYPE_PTR, /* = 2: use d_un.d_ptr */
+ EDYN_UNTYPE_UNSPECIFIED /* = 3: d_un usage is unspecified */
+};
+
+
+/* Specifiers for executable/shared object file requirements (EDYN_TAG.d_exec_req / d_shared_req) */
+
+enum
+{
+ EDYN_TAGREQ_IGNORED, /* = 0: tag is ignored in this file type */
+ EDYN_TAGREQ_MANDATORY, /* = 1: tag is mandatory */
+ EDYN_TAGREQ_OPTIONAL, /* = 2: tag is optional */
+ EDYN_TAGREQ_UNSPECIFIED /* = 3: requirement is unspecified */
+};
+
+
+/* Data structure for one dynamic tag database entry (a row of the DT_* table) */
+
+struct EDYN_TAG
+{
+ const char* d_tag_name; /* tag name string */
+ Elf32_Sword d_tag_value; /* DT_* tag value */
+ Elf32_Word d_untype; /* which d_un union member to use (EDYN_UNTYPE_*) */
+ Elf32_Word d_exec_req; /* requirement for executable files (EDYN_TAGREQ_*) */
+ Elf32_Word d_shared_req; /* requirement for shared object files (EDYN_TAGREQ_*) */
+};
+
+extern const struct EDYN_TAG EDYN_TAG_DB[]; /* defined elsewhere; size and termination convention are not visible in this header */
+
+/*****************************************************************************/
+/* Special Section Database */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* Special Section Names (string literals used as the "name" field of ESCN_DB entries) */
+/*---------------------------------------------------------------------------*/
+#define ESCN_BSS_name ".bss"
+#define ESCN_COMMENT_name ".comment"
+#define ESCN_DATA1_name ".data1"
+#define ESCN_DATA_name ".data"
+#define ESCN_DEBUG_name ".debug"
+#define ESCN_DYNAMIC_name ".dynamic"
+#define ESCN_DYNSTR_name ".dynstr"
+#define ESCN_DYNSYM_name ".dynsym"
+#define ESCN_FINI_ARRAY_name ".fini_array"
+#define ESCN_FINI_name ".fini"
+#define ESCN_GOT_name ".got"
+#define ESCN_HASH_name ".hash"
+#define ESCN_INIT_ARRAY_name ".init_array"
+#define ESCN_INIT_name ".init"
+#define ESCN_INTERP_name ".interp"
+#define ESCN_LINE_name ".line"
+#define ESCN_NOTE_name ".note"
+#define ESCN_PLT_name ".plt"
+#define ESCN_PREINIT_ARRAY_name ".preinit_array"
+#define ESCN_RELA_name ".rela"
+#define ESCN_REL_name ".rel"
+#define ESCN_RODATA1_name ".rodata1"
+#define ESCN_RODATA_name ".rodata"
+#define ESCN_SHSTRTAB_name ".shstrtab"
+#define ESCN_STRTAB_name ".strtab"
+#define ESCN_SYMTAB_SHNDX_name ".symtab_shndx"
+#define ESCN_SYMTAB_name ".symtab"
+#define ESCN_TBSS_name ".tbss"
+#define ESCN_TDATA1_name ".tdata1"
+#define ESCN_TDATA_name ".tdata"
+#define ESCN_TEXT_name ".text"
+#define ESCN_ATTRIBUTES_name "__TI_build_attributes" /* TI-specific name; its ESCN_DB entry is compiled out with #if 0 */
+#define ESCN_ICODE_name "__TI_ICODE" /* TI-specific name */
+#define ESCN_XREF_name "__TI_XREF" /* TI-specific name */
+
+/*---------------------------------------------------------------------------*/
+/* Special Section Information Data Structure: expected header attributes of one well-known section. */
+/*---------------------------------------------------------------------------*/
+struct ESCN
+{
+ const char *name; /* section name (an ESCN_*_name string); NULL in the terminating entry */
+ Elf32_Word sh_type; /* section type (SHT_* value) */
+ Elf32_Word sh_entsize; /* entry size in bytes; 0 where the table gives none */
+ Elf32_Word sh_flags; /* section flags (sum of SHF_* bits) */
+};
+
+extern const struct ESCN ESCN_DB[]; /* table of special sections, terminated by an entry whose name is NULL */
+
+#endif /* ELF32_H */
diff --git a/src/core/dsp/ocl_load/DLOAD/relocate.h b/src/core/dsp/ocl_load/DLOAD/relocate.h
new file mode 100644
index 0000000..ee21aa9
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/relocate.h
@@ -0,0 +1,64 @@
+/*
+* relocate.h
+*
+* Declare names and IDs of all C6x-specific relocation types supported
+* in the dynamic loader.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef RELOCATE_H
+#define RELOCATE_H
+
+#include <inttypes.h>
+#include "elf32.h"
+#include "dload.h"
+#include "dload_api.h"
+
+/*---------------------------------------------------------------------------*/
+/* Declare some globals that are used for internal debugging and profiling. */
+/*---------------------------------------------------------------------------*/
+#if LOADER_DEBUG || LOADER_PROFILE
+#include <time.h>
+extern int DLREL_relocations;
+extern time_t DLREL_total_reloc_time;
+#endif
+
+
+/*---------------------------------------------------------------------------*/
+/* Landing point for core loader's relocation processor. */
+/*---------------------------------------------------------------------------*/
+void DLREL_relocate(DLOAD_HANDLE handle, LOADER_FILE_DESC *fd,
+ DLIMP_Dynamic_Module *dyn_module);
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/symtab.h b/src/core/dsp/ocl_load/DLOAD/symtab.h
new file mode 100644
index 0000000..1f06584
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/symtab.h
@@ -0,0 +1,72 @@
+/*
+* symtab.h
+*
+* Specification of functions used by the core loader to create, maintain,
+* and destroy internal symbol tables.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef SYMTAB_H
+#define SYMTAB_H
+
+#include "ArrayList.h"
+#include "dload.h"
+
+/*****************************************************************************/
+/* This is the top-level application file handle. It should only be needed */
+/* under the Linux and DSBT models. */
+/*****************************************************************************/
+extern int32_t DLIMP_application_handle;
+
+/*---------------------------------------------------------------------------*/
+/* Core Loader Symbol Table Management Functions */
+/*---------------------------------------------------------------------------*/
+BOOL DLSYM_canonical_lookup(DLOAD_HANDLE handle,
+ int32_t sym_index,
+ DLIMP_Dynamic_Module *dyn_module,
+ Elf32_Addr *sym_value);
+
+BOOL DLSYM_global_lookup(DLOAD_HANDLE handle,
+ const char *sym_name,
+ DLIMP_Loaded_Module *pentry,
+ Elf32_Addr *sym_value);
+
+BOOL DLSYM_lookup_local_symtab(const char *sym_name,
+ struct Elf32_Sym *symtab,
+ Elf32_Word symnum,
+ Elf32_Addr *sym_value);
+
+void DLSYM_copy_globals(DLIMP_Dynamic_Module *dyn_module);
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/util.h b/src/core/dsp/ocl_load/DLOAD/util.h
new file mode 100644
index 0000000..24c5b3f
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/util.h
@@ -0,0 +1,89 @@
+/*
+* util.h
+*
+* Definition of some useful string comparison routines (not
+* provided on all platforms) and a few generic macros.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef UTIL_H
+#define UTIL_H
+
+#include <ctype.h>
+
+#if !defined(__linux)
+
+/*****************************************************************************/
+/* STRCASECMP() - Case-insensitive strcmp. */
+/*****************************************************************************/
+/* Compare the NUL-terminated strings s1 and s2, ignoring letter case.       */
+/* Returns 0 when they match, a negative value when s1 sorts before s2 and   */
+/* a positive value when s1 sorts after s2 (difference of the lower-cased    */
+/* bytes at the first mismatch).                                             */
+/*                                                                           */
+/* Bytes are read as unsigned char: passing a negative plain-char value to   */
+/* tolower() is undefined behaviour, which matters on signed-char targets    */
+/* for any byte above 0x7F.                                                  */
+static int strcasecmp(const char* s1, const char* s2)
+{
+   unsigned char c1, c2;
+
+   do
+   {
+      c1 = (unsigned char)*s1++;
+      c2 = (unsigned char)*s2++;
+   }
+   while (c1 && c2 && (tolower(c1) == tolower(c2)));
+
+   return tolower(c1) - tolower(c2);
+}
+
+/*****************************************************************************/
+/* STRNCASECMP() - Case-insensitive strncmp. */
+/*****************************************************************************/
+/* Compare at most n bytes of the NUL-terminated strings s1 and s2,          */
+/* ignoring letter case. Returns 0 when n is 0 or the compared prefixes      */
+/* match; otherwise the difference of the lower-cased bytes at the first     */
+/* mismatch (negative when s1 sorts first, positive when s2 does).           */
+/*                                                                           */
+/* Bytes are read as unsigned char: passing a negative plain-char value to   */
+/* tolower() is undefined behaviour, which matters on signed-char targets    */
+/* for any byte above 0x7F.                                                  */
+static int strncasecmp(const char* s1, const char* s2, size_t n)
+{
+   unsigned char c1, c2;
+
+   /* Nothing to compare: zero-length prefixes are always equal.             */
+   if (!n) return 0;
+
+   do
+   {
+      c1 = (unsigned char)*s1++;
+      c2 = (unsigned char)*s2++;
+   }
+   while (--n && c1 && c2 && (tolower(c1) == tolower(c2)));
+
+   return tolower(c1) - tolower(c2);
+}
+
+#endif
+
+/*****************************************************************************/
+/* Define MIN and MAX macros. */
+/*****************************************************************************/
+/* Both macros are plain textual expansions: the argument that is selected   */
+/* is evaluated twice, so do not pass expressions with side effects          */
+/* (e.g. MIN(i++, n)). When the two arguments compare equal, both MIN and    */
+/* MAX yield x.                                                              */
+#define MIN(x,y) (((x) > (y)) ? (y) : (x))
+#define MAX(x,y) (((x) >= (y)) ? (x) : (y))
+
+/*****************************************************************************/
+/* C implementation of 'bool' type. */
+/*****************************************************************************/
+/* BOOL is a plain int, so any non-zero value is truthy; test results with   */
+/* "if (x)" or "x != FALSE" rather than "x == TRUE".                         */
+typedef int BOOL;
+#define TRUE 1
+#define FALSE 0
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/version.h b/src/core/dsp/ocl_load/DLOAD/version.h
new file mode 100644
index 0000000..e36d1a9
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/version.h
@@ -0,0 +1,63 @@
+/*
+* version.h
+*
+* Dynamic Loader source version identification.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef _VERSION_H_
+#define _VERSION_H_
+
+/*****************************************************************************/
+/* VERSION NUMBER COMPONENTS - ALWAYS INCREASING!! */
+/* Initial version ID is 1.0.0. Successive version ID's will be incremented */
+/* by automated processes during release port. */
+/*****************************************************************************/
+/* NOTE(review): api_version_change.log in DLOAD_API lists a 2.0.0 entry,    */
+/* while these components still read 1.0.0 - confirm which one is current.   */
+#define VERSION_MAJOR 1
+#define VERSION_MINOR 0
+#define VERSION_PATCH 0
+
+/******************************************************************************/
+/* Macros used to convert version macros into strings.                        */
+/* MKCSTR stringizes its argument verbatim; MKMSTR adds one level of          */
+/* indirection so that a macro argument is expanded before being stringized.  */
+/******************************************************************************/
+#define MKCSTR(_str) #_str
+#define MKMSTR(_str) MKCSTR(_str)
+
+/******************************************************************************/
+/* VERSION string construction macros.                                        */
+/* VERSTR expands to "<major>.<minor>.<patch>"; VERSION prefixes it with the  */
+/* product name via adjacent string literal concatenation.                    */
+/* The space before VERSTR is required: C++11 and later lex an identifier     */
+/* glued to a string literal as a literal suffix, so "..."VERSTR would not    */
+/* expand the macro when this header is included from C++ code.               */
+/******************************************************************************/
+#define VERSTR MKMSTR(VERSION_MAJOR) "." MKMSTR(VERSION_MINOR) "." MKMSTR(VERSION_PATCH)
+#define VERSION "Texas Instruments Dynamic Loader API/Core v" VERSTR
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/virtual_targets.h b/src/core/dsp/ocl_load/DLOAD/virtual_targets.h
new file mode 100644
index 0000000..1d44b4d
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/virtual_targets.h
@@ -0,0 +1,90 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "dload.h"
+#include "elf32.h"
+
+#ifdef C60_TARGET
+#include "c60_dynamic.h"
+#include "c60_reloc.h"
+#endif
+
+#ifdef ARM_TARGET
+#include "arm_dynamic.h"
+#include "arm_reloc.h"
+#endif
+
+/*****************************************************************************/
+/* Define a virtual target class to give access to target specific functions */
+/*****************************************************************************/
+/* Per-target dispatch record: a machine identifier plus the function        */
+/* pointers that implement the target-specific steps. Member order matters:  */
+/* vt_arr initialises these records positionally.                            */
+typedef struct vtarget
+{
+/* Machine identifier; vt_arr stores EM_TI_C6000, EM_ARM or EM_NONE here.    */
+ int machine_id;
+
+/* Target hooks. The int argument 'i' is presumably an index into the        */
+/* module's dynamic table - TODO confirm against the c60/arm sources.        */
+ BOOL (*relocate_dynamic_tag_info)(DLIMP_Dynamic_Module *dyn_module, int i);
+ BOOL (*process_eiosabi)(DLIMP_Dynamic_Module* dyn_module);
+ BOOL (*process_dynamic_tag)(DLIMP_Dynamic_Module *dyn_module, int i);
+/* Target-specific relocation entry point.                                   */
+ void (*relocate)(DLOAD_HANDLE handle, LOADER_FILE_DESC *elf_file,
+ DLIMP_Dynamic_Module *dyn_module);
+
+} VIRTUAL_TARGET;
+
+
+
+/*****************************************************************************/
+/* Populate this for each target supported. */
+/*****************************************************************************/
+/* Table of supported targets. Each entry is compiled in only when the       */
+/* matching C60_TARGET / ARM_TARGET macro is defined; initialisers are       */
+/* positional and follow the member order of VIRTUAL_TARGET.                 */
+/* NOTE(review): this is a definition (not an extern declaration) with       */
+/* external linkage, and this header has no include guard, so including it   */
+/* from more than one translation unit would define vt_arr more than once -  */
+/* confirm it is included from exactly one source file.                      */
+VIRTUAL_TARGET vt_arr[] = {
+
+#ifdef C60_TARGET
+ {
+ EM_TI_C6000,
+ DLDYN_c60_relocate_dynamic_tag_info,
+ DLDYN_c60_process_eiosabi,
+ DLDYN_c60_process_dynamic_tag,
+ DLREL_c60_relocate
+ },
+#endif
+#ifdef ARM_TARGET
+ {
+ EM_ARM,
+ DLDYN_arm_relocate_dynamic_tag_info,
+ DLDYN_arm_process_eiosabi,
+ DLDYN_arm_process_dynamic_tag,
+ DLREL_arm_relocate
+ },
+#endif
+/* Final entry: EM_NONE with all function pointers null. It is always        */
+/* present, so the array is never empty; presumably it also serves as the    */
+/* end-of-table sentinel for lookups - verify against the users of vt_arr.   */
+ {
+ EM_NONE,
+ 0,
+ 0,
+ 0,
+ 0
+ }
+};
+
+
diff --git a/src/core/dsp/ocl_load/DLOAD_API/api_version_change.log b/src/core/dsp/ocl_load/DLOAD_API/api_version_change.log
new file mode 100644
index 0000000..689cfe6
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD_API/api_version_change.log
@@ -0,0 +1,33 @@
+
+ Dynamic Loader API and Loader Core - Version Number Change Log
+ ==============================================================
+
+ Version Number Description
+ --------------------------
+
+ The version number associated with the Dynamic Loader API and the Loader Core
+ sources has three components:
+
+ <major version>.<minor version>.<patch version>
+
+ major version - is incremented if there is a change to the API that creates a
+ compatibility discontinuity.
+
+ minor version - is incremented if functionality is added to the API without
+ causing a compatibility discontinuity.
+
+ patch version - is incremented if a defect has been repaired, a performance
+ enhancement has been added, or the source code has been
+ refactored in some way. There should not be a compatibility
+ discontinuity created by an increment to the patch version.
+
+ Version Number Change Log
+ -------------------------
+
+ 1.0.0 - 17 July 2009 - Initial release of dynamic loader API and loader
+ core sources.
+
+ 2.0.0 - 1 Feb 2013 - Add client handle to several DLIF functions.
+ - Add DLIF_exit() for loader abort.
+
+
diff --git a/src/core/dsp/ocl_load/DLOAD_API/dload_api.h b/src/core/dsp/ocl_load/DLOAD_API/dload_api.h
new file mode 100644
index 0000000..95de10f
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD_API/dload_api.h
@@ -0,0 +1,700 @@
+/*
+* dload_api.h
+*
+* Dynamic Loader API Specification
+* --------------------------------
+*
+* Client-side of API is assumed to be platform dependent, but object file
+* format independent.
+*
+* Core Loader side of API is assumed to be platform independent, but
+* object file format dependent and target dependent.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef DLOAD_API_H
+#define DLOAD_API_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include "util.h"
+
+extern int debugging_on;
+
+/*****************************************************************************/
+/* Specification of Loader File Descriptor. If client side of the loader */
+/* supports virtual memory, this may need to be updated to facilitate the */
+/* use of mmap(). */
+/*****************************************************************************/
+typedef FILE LOADER_FILE_DESC;
+
+/* Loader-side aliases for the <stdio.h> SEEK_SET / SEEK_CUR / SEEK_END      */
+/* constants; presumably the 'origin' values passed to DLIF_fseek().         */
+static const int LOADER_SEEK_SET = SEEK_SET;
+static const int LOADER_SEEK_CUR = SEEK_CUR;
+static const int LOADER_SEEK_END = SEEK_END;
+
+/*****************************************************************************/
+/* TARGET_ADDRESS - type suitable for storing target memory address values. */
+/*****************************************************************************/
+typedef uint32_t TARGET_ADDRESS;
+
+/*****************************************************************************/
+/* Define DLOAD Object Handle */
+/*****************************************************************************/
+typedef void * DLOAD_HANDLE;
+
+/*****************************************************************************/
+/* Core Loader Provided API Functions (Core Loader Entry Points) */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_version() */
+/* */
+/* Return a string constant representation for the version ID of the */
+/* dynamic loader's core loader source code. */
+/* */
+/*---------------------------------------------------------------------------*/
+#include "version.h"
+#define DLOAD_version() VERSION
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_create() */
+/* */
+/* Construct and initialize the dynamic loader core's handle. */
+/* */
+/*---------------------------------------------------------------------------*/
+DLOAD_HANDLE DLOAD_create(void * client_handle);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_destroy() */
+/* */
+/* Destroy and finalize the dynamic loader core's handle. */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLOAD_destroy(DLOAD_HANDLE handle);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_initialize() */
+/* */
+/* Construct and initialize data structures internal to the dynamic */
+/* loader core. */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLOAD_initialize(DLOAD_HANDLE handle);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_finalize() */
+/* */
+/* Destroy and finalize data structures internal to the dynamic */
+/* loader core. */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLOAD_finalize(DLOAD_HANDLE handle);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_load_symbols() */
+/* */
+/* Load externally visible symbols from the specified file so that they */
+/* can be linked against when another object file is subsequently loaded. */
+/* External symbols will be made available for global symbol linkage. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_load_symbols(DLOAD_HANDLE handle, LOADER_FILE_DESC* fp);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_load() */
+/* */
+/* Dynamically load the specified file and return a file handle for the */
+/* loaded file. If the load fails, this function will return a value */
+/* zero (0). */
+/* */
+/* The core loader must have read access to the file pointed by fp. */
+/* */
+/*---------------------------------------------------------------------------*/
+int DLOAD_load(DLOAD_HANDLE handle, LOADER_FILE_DESC* fp);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_unload() */
+/* */
+/* Given a file handle ID, unload all object segments associated with */
+/* the identified file and any of its dependents that are not still in */
+/* use. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_unload(DLOAD_HANDLE handle, uint32_t pseudopid);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_get_entry_names_info() */
+/* */
+/* Given a file handle, return the number of entry points that are */
+/* available in the specified file as well as the max name length. This */
+/* can then be used by the client to allocate the appropriate amount of */
+/* memory needed to call DLOAD_get_entry_names() */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_get_entry_names_info(DLOAD_HANDLE handle, uint32_t file_handle,
+ int32_t *entry_pt_cnt,
+ int32_t *entry_pt_max_name_len);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_get_entry_names() */
+/* */
+/* Given a file handle, build a list of entry point names that are */
+/* available in the specified file. This can be used when querying */
+/* the list of global functions available in a shared library. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_get_entry_names(DLOAD_HANDLE handle, uint32_t file_handle,
+ int32_t* entry_pt_cnt, char*** entry_pt_names);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_query_symbol() */
+/* */
+/* Query the value of a symbol that is defined by an object file that */
+/* has previously been loaded. Boolean return value will be false if */
+/* the symbol is not found. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_query_symbol(DLOAD_HANDLE handle, uint32_t file_handle,
+ const char *sym_name, TARGET_ADDRESS *sym_val);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_get_entry_point() */
+/* */
+/* Given a file handle, return the entry point target address associated */
+/* with that object file. The entry point address value is written to */
+/* *sym_val. The return value of the function indicates whether the */
+/* file with the specified handle was found or not. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_get_entry_point(DLOAD_HANDLE handle, uint32_t file_handle,
+ TARGET_ADDRESS *sym_val);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_load_arguments() */
+/* */
+/* Given a file handle, find the object file associated with that handle */
+/* and copy the argc/argv information from the client into that object */
+/* file's .args section. The return value indicates whether the operation */
+/* was successful. If there are no loaded object files which match the */
+/* handle or if there is insufficient space in the .args section to hold */
+/* the specified argc/argv information, the function will return false. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_load_arguments(DLOAD_HANDLE handle, uint32_t file_handle,
+ int argc, char** argv);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_prepare_for_execution() */
+/* */
+/* Given a file handle, prepare for execution : */
+/* - Return entry point associated with that module in the *sym_val */
+/* output parameter. */
+/* - Write out the given arguments to the .args section contained in the */
+/* same module. */
+/* - As a test (for the Reference implementation) read the arguments */
+/* using the DLIF_read_arguments() function and set global argc,argv. */
+/* */
+/* The return value of the function indicates whether the file with the */
+/* specified handle was found or not. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_prepare_for_execution(DLOAD_HANDLE handle, uint32_t file_handle,
+ TARGET_ADDRESS *sym_val,
+ int argc, char** argv);
+
+
+/*****************************************************************************/
+/* Client Provided API Functions */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* File I/O */
+/* */
+/* The client side of the dynamic loader must provide basic file I/O */
+/* capabilities so that the core loader has random access into any */
+/* object file that it is asked to load. */
+/* */
+/* The client side of the dynamic loader must provide a definition of */
+/* the LOADER_FILE_DESC in dload_filedefs.h. This allows the core loader */
+/* to be independent of how the client accesses raw data in an object */
+/* file. */
+/* */
+/*---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_fseek() */
+/* */
+/* Seek to a position in a file (accessed via 'stream') based on the */
+/* values for offset and origin. */
+/* */
+/*---------------------------------------------------------------------------*/
+int DLIF_fseek(LOADER_FILE_DESC *stream, int32_t offset, int origin);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_ftell() */
+/* */
+/* Return the current file position in the file identified in the */
+/* LOADER_FILE_DESC pointed to by 'stream'. */
+/* */
+/*---------------------------------------------------------------------------*/
+int32_t DLIF_ftell(LOADER_FILE_DESC *stream);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_fread() */
+/* */
+/* Read 'size' * 'nmemb' bytes of data from the file identified in the */
+/* LOADER_FILE_DESC object pointed to by 'stream', and write that data */
+/* into the memory accessed via 'ptr'. */
+/* */
+/*---------------------------------------------------------------------------*/
+size_t DLIF_fread(void *ptr, size_t size, size_t nmemb,
+ LOADER_FILE_DESC *stream);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_fclose() */
+/* */
+/* Close a file that was opened on behalf of the core loader. Ownership */
+/* of the file pointer in question belongs to the core loader, but the */
+/* client has exclusive access to the file system. */
+/* */
+/*---------------------------------------------------------------------------*/
+int DLIF_fclose(LOADER_FILE_DESC *fd);
+
+/*---------------------------------------------------------------------------*/
+/* Host Memory Management */
+/* */
+/* Allocate and free host memory as needed for the dynamic loader's */
+/* internal data structures. If the dynamic loader resides on the */
+/* target architecture, then this memory is allocated from a target */
+/* memory heap that must be managed separately from memory that is */
+/* allocated for a dynamically loaded object file. */
+/* */
+/*---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_malloc() */
+/* */
+/* Allocate 'size' bytes of memory space that is usable as scratch space */
+/* (appropriate for the loader's internal data structures) by the dynamic */
+/* loader. */
+/* */
+/* If allocation fails, this function must not return. */
+/* */
+/*---------------------------------------------------------------------------*/
+void* DLIF_malloc(size_t size);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_free() */
+/* */
+/* Free memory space that was previously allocated by DLIF_malloc(). */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLIF_free(void* ptr);
+
+/*---------------------------------------------------------------------------*/
+/* Target Memory Allocator Interface */
+/* */
+/* The client side of the dynamic loader must create and maintain an */
+/* infrastructure to manage target memory. The client must keep track */
+/* of what target memory is associated with each object segment, */
+/* allocating target memory for newly loaded objects and release target */
+/* memory that is associated with objects that are being unloaded from */
+/* the target architecture. */
+/* */
+/* The two client-supplied functions, DLIF_allocate() and DLIF_release(), */
+/* are used by the core loader to interface into the client side's */
+/* target memory allocator infrastructure. */
+/* */
+/*---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_SEGMENT_FLAGS - segment characteristics. */
+/* */
+/* Each DLOAD_SF_* constant below occupies a distinct bit. A value of this */
+/* type is carried in the 'flags' member of DLOAD_MEMORY_REQUEST. */
+/*---------------------------------------------------------------------------*/
+typedef uint32_t DLOAD_SEGMENT_FLAGS;
+static const int DLOAD_SF_executable = 0x1; /* Memory must be executable */
+static const int DLOAD_SF_relocatable = 0x2; /* Segment must be relocatable */
+static const int DLOAD_SF_writable = 0x4; /* Memory must be writable */
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_MEMORY_SEGMENT - Define structure to represent placement and size */
+/* details of a segment to be loaded. */
+/* */
+/* The page and address members are both inputs (what is requested) and */
+/* outputs (what was actually allocated). */
+/*---------------------------------------------------------------------------*/
+struct DLOAD_MEMORY_SEGMENT
+{
+ uint32_t target_page; /* requested/returned memory page */
+ TARGET_ADDRESS target_address; /* requested/returned address */
+ uint32_t objsz_in_bytes; /* size of init'd part of segment */
+ uint32_t memsz_in_bytes; /* size of memory block for segment */
+/* NOTE: a per-segment 'flags' member is disabled here; the allocation */
+/* request flags are carried in DLOAD_MEMORY_REQUEST.flags instead. */
+// DLOAD_SEGMENT_FLAGS flags; /* allocation request flags */
+};
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_MEMORY_REQUEST - Define structure to represent a target memory */
+/* request made by the core loader on behalf of a segment that the */
+/* loader needs to relocate and write into target memory. */
+/* */
+/* The same request object is handed to DLIF_allocate(), DLIF_copy() and */
+/* DLIF_write(). */
+/*---------------------------------------------------------------------------*/
+struct DLOAD_MEMORY_REQUEST
+{
+ LOADER_FILE_DESC *fp; /* file being loaded */
+ struct DLOAD_MEMORY_SEGMENT *segment; /* segment whose page/address are */
+ /* requested and returned */
+ void *host_address; /* host pointer returned from */
+ /* DLIF_copy() */
+ BOOL is_loaded; /* returned as true if segment */
+ /* is already in target memory */
+ uint32_t offset; /* file offset of segment's */
+ /* raw data */
+ uint32_t flip_endian; /* target endianness is opposite */
+ /* to the host's */
+ DLOAD_SEGMENT_FLAGS flags; /* allocation request flags */
+ uint32_t align; /* alignment of target memory */
+ /* block */
+};
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_initMem() */
+/* */
+/* Given an address and size, initialize the memory used to load the */
+/* dynamic segments. This should be called by the client before */
+/* beginning dynamic loading. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_initMem(void* client_handle, uint32_t dynMemAddr, uint32_t size);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_deinitMem() */
+/* */
+/* De-initialize the memory used to load the dynamic segments. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_deinitMem(void* client_handle);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_allocate() */
+/* */
+/* Given a DLOAD_MEMORY_REQUEST created by the core loader, allocate */
+/* target memory to fulfill the request using the target memory */
+/* management infrastructure on the client side of the dynamic loader. */
+/* The contents of the DLOAD_MEMORY_REQUEST will be updated per the */
+/* details of a successful allocation. The allocated page and address */
+/* can be found in the DLOAD_MEMORY_SEGMENT attached to the request. */
+/* The boolean return value reflects whether the allocation was */
+/* successful or not. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_allocate(void* client_handle, struct DLOAD_MEMORY_REQUEST *req);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_release() */
+/* */
+/* Given a DLOAD_MEMORY_SEGMENT description, free the target memory */
+/* associated with the segment using the target memory management */
+/* infrastructure on the client side of the dynamic loader. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_release(void* client_handle, struct DLOAD_MEMORY_SEGMENT* ptr);
+
+/*---------------------------------------------------------------------------*/
+/* Target Memory Access / Write Services */
+/* */
+/* The client side's target memory allocator infrastructure communicates */
+/* with the core loader through the DLOAD_MEMORY_REQUEST and */
+/* DLOAD_MEMORY_SEGMENT data structures defined above. To complete the */
+/* loading of an object segment, the segment may need to be relocated */
+/* before it is actually written to target memory in the space that was */
+/* allocated for it by DLIF_allocate(). */
+/* */
+/* The client side of the dynamic loader provides two functions to help */
+/* complete the process of loading an object segment, DLIF_copy() and */
+/* DLIF_write(). */
+/* */
+/* These functions help to make the core loader truly independent of */
+/* whether it is running on the host or target architecture and how the */
+/* client provides for reading/writing from/to target memory. */
+/* */
+/*---------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
+/* DLIF_copy() */
+/* */
+/* Copy segment data from the object file described in the 'fp' and */
+/* 'offset' of the DLOAD_MEMORY_REQUEST into host accessible memory so */
+/* that it can be relocated or otherwise manipulated by the core loader. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_copy(void* client_handle, struct DLOAD_MEMORY_REQUEST* req);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_write() */
+/* */
+/* Once the segment data described in the DLOAD_MEMORY_REQUEST is ready */
+/* (relocated, if needed), write the segment contents to the target */
+/* memory identified in the DLOAD_MEMORY_SEGMENT attached to the request. */
+/* */
+/* After the segment contents have been written to target memory, the */
+/* core loader should discard the DLOAD_MEMORY_REQUEST object, but retain */
+/* the DLOAD_MEMORY_SEGMENT object so that the target memory associated */
+/* with the segment can be released when the segment is unloaded. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_write(void* client_handle, struct DLOAD_MEMORY_REQUEST* req);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_read() */
+/* */
+/* Given a host accessible buffer, read content of indicated target */
+/* memory address into the buffer. */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_read(void* client_handle,
+ void *ptr, size_t size, size_t nmemb, TARGET_ADDRESS src);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_memcpy() */
+/* */
+/* Given a host accessible buffer, copy content from specified buffer */
+/* into target memory. */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_memcpy(void* client_handle, void *to, void *from, size_t size);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_execute() */
+/* */
+/* Start execution on the target architecture from given 'exec_addr'. */
+/* If the dynamic loader is running on the target architecture, this can */
+/* be effected as a simple function call. */
+/* */
+/*---------------------------------------------------------------------------*/
+int32_t DLIF_execute(void* client_handle, TARGET_ADDRESS exec_addr);
+
+/*---------------------------------------------------------------------------*/
+/* Loading and Unloading of Dependent Files */
+/* */
+/* The dynamic loader's core loader must coordinate loading and unloading */
+/* dependent object files with the client side of the dynamic loader. */
+/* This allows the client to keep its bookkeeping information up to date */
+/* with what is currently loaded on the target architecture. */
+/* */
+/* For instance, the client may need to interact with a file system or */
+/* registry. The client may also need to update debug information in */
+/* synch with the loading and unloading of shared objects. */
+/* */
+/*---------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
+/* DLIF_load_dependent() */
+/* */
+/* Ask client to find and open a dependent file identified by the */
+/* 'so_name' parameter, then, if necessary, initiate a DLOAD_load() */
+/* call to actually load the shared object onto the target. A */
+/* successful load will return a file handle ID that the client can */
+/* associate with the newly loaded file. */
+/* */
+/*---------------------------------------------------------------------------*/
+int DLIF_load_dependent(void* client_handle, const char* so_name);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_unload_dependent() */
+/* */
+/* Ask client to unload a dependent file identified by the 'file_handle' */
+/* parameter. Initiate a call to DLOAD_unload() to actually free up */
+/* the target memory that was occupied by the object file. */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLIF_unload_dependent(void* client_handle, uint32_t file_handle);
+
+/*---------------------------------------------------------------------------*/
+/* Error/Warning Registration Functions */
+/* */
+/* The client will maintain an error/warning log. This will allow the */
+/* core loader to register errors and warnings in the log during a */
+/* given dynamic load. The client is required to check the log after */
+/* each load attempt to report any problems. */
+/* */
+/*---------------------------------------------------------------------------*/
+
+
+/*---------------------------------------------------------------------------*/
+/* Loader Warning Types */
+/* */
+/* Category passed as the first argument of DLIF_warning(). */
+/*---------------------------------------------------------------------------*/
+typedef enum {
+ DLWT_MISC = 0, /* Miscellaneous warning */
+ DLWT_FILE /* Warning missing/invalid file information */
+} LOADER_WARNING_TYPE;
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_warning() */
+/* */
+/* Log a warning message with the client's error/warning handling */
+/* infrastructure. */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLIF_warning(LOADER_WARNING_TYPE wtype, const char *fmt, ...);
+
+/*---------------------------------------------------------------------------*/
+/* Loader Error Types */
+/* */
+/* Category passed as the first argument of DLIF_error(). */
+/*---------------------------------------------------------------------------*/
+typedef enum {
+ DLET_MISC = 0, /* Miscellaneous error */
+ DLET_FILE, /* Error reading/processing file */
+ DLET_SYMBOL, /* Symbol resolution error */
+ DLET_RELOC, /* Relocation error */
+ DLET_MEMORY, /* Host memory allocation/free error */
+ DLET_TRGMEM, /* Target memory allocation/free error */
+ DLET_DEBUG /* Shared object or DLL debug error */
+} LOADER_ERROR_TYPE;
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_error() */
+/* */
+/* Log an error message with the client's error/warning handling */
+/* infrastructure. */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLIF_error(LOADER_ERROR_TYPE etype, const char *fmt, ...);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_exit() */
+/* */
+/* Abort the loader following a fatal error. */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLIF_exit(int code);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_trace() */
+/* */
+/* Log a message with the client's trace handling infrastructure. */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLIF_trace(const char *fmt, ...);
+
+/*---------------------------------------------------------------------------*/
+/* Dynamic Static Base Table (DSBT) Support Functions */
+/* */
+/* DSBT_INDEX_INVALID (-1) is the "no/invalid DSBT index" value; the same */
+/* value (-1) is what a module passes as requested_dsbt_index to let the */
+/* client pick an index. The two *_BASE_INVALID values are presumably the */
+/* sentinels for an unset DSBT base / static base address - TODO confirm */
+/* against the users of these macros. */
+/*---------------------------------------------------------------------------*/
+#define DSBT_INDEX_INVALID -1
+#define DSBT_DSBT_BASE_INVALID 0
+#define DSBT_STATIC_BASE_INVALID 0
+
+/*****************************************************************************/
+/* Core Loader Side of DSBT Support */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_get_dsbt_size() */
+/* */
+/* Query the size of the DSBT associated with a specified file. The */
+/* client will check the size of a module's DSBT before it writes a copy */
+/* of the master DSBT to the module's DSBT. If the module's DSBT is not */
+/* big enough, an error will be emitted and the load will fail. */
+/* */
+/*---------------------------------------------------------------------------*/
+uint32_t DLOAD_get_dsbt_size(DLOAD_HANDLE handle, int32_t file_handle);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_get_dsbt_base() */
+/* */
+/* Find DSBT address for specified file. The client will query for this */
+/* address after allocation and symbol relocation has been completed. */
+/* The client will write a copy of the master DSBT to the returned DSBT */
+/* address if the module's DSBT size is big enough. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_get_dsbt_base(DLOAD_HANDLE handle, int32_t file_handle,
+ TARGET_ADDRESS *dsbt_base);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_get_static_base() */
+/* */
+/* Find static base for a specified file. The client will query for this */
+/* address after allocation and symbol relocation has been completed. */
+/* The client will use the returned static base value to fill the slot */
+/* in the master DSBT that is associated with this module. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_get_static_base(DLOAD_HANDLE handle, int32_t file_handle,
+ TARGET_ADDRESS *static_base);
+
+
+/*****************************************************************************/
+/* Client Side of DSBT Support */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_register_dsbt_index_request() */
+/* */
+/* Register a request for a DSBT index with the client. A module can */
+/* make a specific DSBT index request or it can allow the client to */
+/* assign a DSBT index on its behalf (requested_dsbt_index == -1). The */
+/* client implementation of this function must check that a specific DSBT */
+/* index request does not conflict with a previous specific DSBT index */
+/* request. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_register_dsbt_index_request(DLOAD_HANDLE handle,
+ const char *requestor_name,
+ int32_t requestor_file_handle,
+ int32_t requested_dsbt_index);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_assign_dsbt_indices() */
+/* */
+/* Bind each module that registered a request for a DSBT index to a */
+/* specific slot in the DSBT. Specific requests for DSBT indices will be */
+/* honored first. Any general requests that remain will be assigned to */
+/* the first available slot in the DSBT. */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLIF_assign_dsbt_indices(void);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_get_dsbt_index() */
+/* */
+/* Given a module that uses the DSBT model, return the identity of the */
+/* DSBT slot that was assigned to it by the client. This function can */
+/* only be called after the client has assigned DSBT indices to all */
+/* loaded object modules that use the DSBT model. The implementation of */
+/* this function will check that a proper DSBT index has been assigned to */
+/* the specified module and return an invalid index (-1) if there is a */
+/* problem. */
+/* */
+/*---------------------------------------------------------------------------*/
+int32_t DLIF_get_dsbt_index(int32_t file_handle);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_update_all_dsbts() */
+/* */
+/* Populate the client's model of the master DSBT with the static base */
+/* for each assigned slot in the DSBT, then write a copy of the master */
+/* DSBT to each module's DSBT location. The implementation of this */
+/* function must check the size of each module's DSBT to make sure that */
+/* it is large enough to hold a copy of the master DSBT. The function */
+/* will return FALSE if there is a problem. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_update_all_dsbts(void);
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD_SYM/symtab.c b/src/core/dsp/ocl_load/DLOAD_SYM/symtab.c
new file mode 100644
index 0000000..fbcdbeb
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD_SYM/symtab.c
@@ -0,0 +1,417 @@
+/*
+* symtab.c
+*
+* Symbol table creation, maintenance, and management. This module also
+* contains implementations of local and global symbol table lookup
+* algorithms, as appropriate for the platform that we are running on
+* (assumed to be DSP Bridge or Linux model, indicated by
+* direct_dependent_only flag in a given Module).
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#include "elf32.h"
+#include "ArrayList.h"
+
+/*---------------------------------------------------------------------------*/
+/* Set up a Queue of Int32 type data objects. */
+/*---------------------------------------------------------------------------*/
+#include "Queue.h"
+TYPE_QUEUE_DEFINITION(int32_t, Int32)
+TYPE_QUEUE_IMPLEMENTATION(int32_t, Int32)
+
+#include "symtab.h"
+#include "dload_api.h"
+#include <string.h>
+
+/*---------------------------------------------------------------------------*/
+/* Holds the handle of the ET_EXEC-type module loaded, if any. */
+/* */
+/* DLSYM_global_lookup() starts its breadth-first search from this file */
+/* handle; it is up to the client of this module to set it. */
+/*---------------------------------------------------------------------------*/
+int32_t DLIMP_application_handle = 0;
+
+/*---------------------------------------------------------------------------*/
+/* Function prototypes */
+/*---------------------------------------------------------------------------*/
+BOOL DLSYM_lookup_global_symtab(const char *sym_name, struct Elf32_Sym *symtab,
+ Elf32_Word symnum, Elf32_Addr *sym_value);
+
+/*****************************************************************************/
+/* DLSYM_COPY_GLOBALS() - Snapshot the global part of a dynamic module's */
+/* symbol table and string table into its loaded module record */
+/* (gsymtab/gsymnum and gstrtab/gstrsz), replacing any previous copy. */
+/* */
+/* dyn_module - module whose symtab/strtab are copied; its loaded_module */
+/* member receives the copies. */
+/* */
+/* NOTE(review): st_name in both tables is treated as a host address */
+/* squeezed through Elf32_Addr, not as a string table offset. This */
+/* presumably relies on host pointers fitting in Elf32_Addr - confirm */
+/* before building for hosts with wider pointers. */
+/*****************************************************************************/
+void DLSYM_copy_globals(DLIMP_Dynamic_Module *dyn_module)
+{
+   DLIMP_Loaded_Module *module = dyn_module->loaded_module;
+   Elf32_Word first_global;
+   Elf32_Word num_globals;
+   Elf32_Word n;
+
+#if LOADER_DEBUG
+   if (debugging_on)
+      DLIF_trace("DLSYM_copy_globals:\n");
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* Locals precede globals in the dynamic symbol table, and gsymtab_offset */
+   /* is the byte offset of the first global entry. Convert it to an index */
+   /* and count what follows it. */
+   /*------------------------------------------------------------------------*/
+   first_global = dyn_module->gsymtab_offset / sizeof(struct Elf32_Sym);
+   num_globals = dyn_module->symnum - first_global;
+
+   /*------------------------------------------------------------------------*/
+   /* Drop any stale symbol copy, then duplicate the global entries. */
+   /*------------------------------------------------------------------------*/
+   if (module->gsymtab != NULL)
+   {
+      DLIF_free(module->gsymtab);
+      module->gsymtab = NULL;
+   }
+
+   if (num_globals > 0)
+   {
+      module->gsymtab = DLIF_malloc(sizeof(struct Elf32_Sym) * num_globals);
+
+      memcpy(module->gsymtab,
+             &dyn_module->symtab[first_global],
+             sizeof(struct Elf32_Sym) * num_globals);
+   }
+   module->gsymnum = num_globals;
+
+   /*------------------------------------------------------------------------*/
+   /* Same again for the tail of the string table, which holds the names of */
+   /* the global symbols. */
+   /*------------------------------------------------------------------------*/
+   if (module->gstrtab != NULL)
+   {
+      DLIF_free(module->gstrtab);
+      module->gstrtab = NULL;
+   }
+
+   module->gstrsz = dyn_module->strsz - dyn_module->gstrtab_offset;
+   if (module->gstrsz != 0)
+   {
+      module->gstrtab = DLIF_malloc(module->gstrsz);
+
+      memcpy(module->gstrtab,
+             dyn_module->strtab + dyn_module->gstrtab_offset,
+             module->gstrsz);
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Rebase every copied st_name: it currently addresses a name inside the */
+   /* dynamic module's string table and must address the same name inside */
+   /* the copy made above. The full address is stored (not an offset) so */
+   /* that st_name can be read back as a char *. */
+   /*------------------------------------------------------------------------*/
+   for (n = 0; n < num_globals; n++)
+   {
+      struct Elf32_Sym *src = &dyn_module->symtab[first_global + n];
+      struct Elf32_Sym *dst = (struct Elf32_Sym *)(module->gsymtab) + n;
+
+      /* Offset of the name within the original string table ... */
+      Elf32_Word strtab_off = src->st_name - (Elf32_Addr)dyn_module->strtab;
+
+      /* ... and within the copied (global-only) part of it. */
+      Elf32_Word gstrtab_off = strtab_off - dyn_module->gstrtab_offset;
+
+      dst->st_name = gstrtab_off + (Elf32_Addr)module->gstrtab;
+
+#if LOADER_DEBUG
+      if (debugging_on) DLIF_trace("Copying symbol: %s\n",
+                                   (char*)src->st_name);
+#endif
+   }
+}
+
+/*****************************************************************************/
+/* BREADTH_FIRST_LOOKUP() - Perform a breadth-first search of the Module */
+/* dependency graph to find specified symbol name (sym_name). */
+/* */
+/* phandle - loader instance; cast to LOADER_OBJECT to reach its list */
+/* of loaded objects. */
+/* sym_name - name of the symbol to find. */
+/* handle - file handle of the module at which the search starts. */
+/* sym_value - passed through to DLSYM_lookup_global_symtab(), which */
+/* stores the symbol value through it. */
+/* */
+/* Returns TRUE as soon as a module's global symbol table yields the */
+/* symbol; FALSE once every reachable module has been searched. */
+/*****************************************************************************/
+static BOOL breadth_first_lookup(DLOAD_HANDLE phandle,
+                                 const char* sym_name,
+                                 int handle,
+                                 Elf32_Addr *sym_value)
+{
+   /*------------------------------------------------------------------------*/
+   /* We start this function by putting the specified file handle on the */
+   /* file_handle_queue. */
+   /*------------------------------------------------------------------------*/
+   LOADER_OBJECT *dHandle = (LOADER_OBJECT *)phandle;
+   Int32_Queue file_handle_queue = TYPE_QUEUE_INITIALIZER;
+   Int32_enqueue(&file_handle_queue, handle);
+
+   /*------------------------------------------------------------------------*/
+   /* While the queue is not empty, keep looking for the symbol. */
+   /*------------------------------------------------------------------------*/
+   while (file_handle_queue.size)
+   {
+      int i;
+      int *dependencies;
+      loaded_module_ptr_Queue_Node *mod_node;
+
+      /*---------------------------------------------------------------------*/
+      /* Pluck off the file handle at the front of the file_handle_queue. */
+      /* We will search this file next. */
+      /*---------------------------------------------------------------------*/
+      handle = Int32_dequeue(&file_handle_queue);
+
+      /*---------------------------------------------------------------------*/
+      /* Locate the Module associated with the current file handle. The */
+      /* loaded objects are kept in a linked list, so the walk must follow */
+      /* next_ptr; stepping the node pointer arithmetically would read */
+      /* memory that is not a list node. Starting from the front keeps the */
+      /* search in load order. */
+      /* NOTE(review): assumes the last node's next_ptr is NULL - confirm */
+      /* against Queue.h. */
+      /*---------------------------------------------------------------------*/
+      for (mod_node = dHandle->DLIMP_loaded_objects.front_ptr;
+           mod_node != NULL && mod_node->value->file_handle != handle;
+           mod_node = mod_node->next_ptr)
+         ;
+
+      /*---------------------------------------------------------------------*/
+      /* A handle with no loaded module has nothing to search and no */
+      /* dependents to add; move on to the next queued handle. */
+      /*---------------------------------------------------------------------*/
+      if (mod_node == NULL) continue;
+
+      /*---------------------------------------------------------------------*/
+      /* Search the symbol table of the current file handle's Module. */
+      /* If the symbol was found, then we're finished. */
+      /*---------------------------------------------------------------------*/
+      if (DLSYM_lookup_global_symtab(sym_name,
+                                     mod_node->value->gsymtab,
+                                     mod_node->value->gsymnum,
+                                     sym_value))
+      {
+         /*------------------------------------------------------------------*/
+         /* Empty the work queue before leaving so that handles still */
+         /* waiting to be searched are not left behind (presumably dequeue */
+         /* releases the node storage - confirm against Queue.h). */
+         /*------------------------------------------------------------------*/
+         while (file_handle_queue.size)
+            Int32_dequeue(&file_handle_queue);
+
+         return TRUE;
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* If our symbol was not in the current Module, then add this Module's */
+      /* dependents to the end of the file_handle_queue. The dependency */
+      /* list must be the one that belongs to the Module just searched, so */
+      /* it is fetched only after that Module has been located. */
+      /*---------------------------------------------------------------------*/
+      dependencies = (int*)(mod_node->value->dependencies.buf);
+      for (i = 0; i < mod_node->value->dependencies.size; i++)
+         Int32_enqueue(&file_handle_queue, dependencies[i]);
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* We didn't find our symbol; return FALSE. */
+   /*------------------------------------------------------------------------*/
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* DLSYM_global_lookup() - Search the global symbol table to find the */
+/* definition of the given symbol name. */
+/* */
+/* handle - loader instance (cast to LOADER_OBJECT). */
+/* sym_name - name of the symbol to resolve. */
+/* loaded_module - module on whose behalf the lookup is made; its */
+/* direct_dependent_only flag selects the algorithm. */
+/* sym_value - receives the symbol value on success. */
+/* */
+/* Returns TRUE if the symbol was found. Otherwise logs a DLET_SYMBOL */
+/* error through DLIF_error() and returns FALSE. */
+/*****************************************************************************/
+BOOL DLSYM_global_lookup(DLOAD_HANDLE handle,
+                         const char *sym_name,
+                         DLIMP_Loaded_Module *loaded_module,
+                         Elf32_Addr *sym_value)
+{
+   int i = 0;
+   loaded_module_ptr_Queue_Node* node;
+   LOADER_OBJECT *dHandle = (LOADER_OBJECT *)handle;
+
+#if LOADER_DEBUG
+   if (debugging_on)
+      DLIF_trace("DLSYM_global_lookup: %s\n", sym_name);
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* We will choose a different lookup algorithm based on what kind of */
+   /* platform we are supporting. In the Braveheart case, the global symbol */
+   /* lookup algorithm searches the explicit children of the specified */
+   /* Module. */
+   /* NOTE(review): earlier commentary said the base image is searched */
+   /* first; no such search is performed here - confirm intended behavior. */
+   /*------------------------------------------------------------------------*/
+   if (loaded_module->direct_dependent_only)
+   {
+      int* child_handle = (int*)(loaded_module->dependencies.buf);
+
+      /*---------------------------------------------------------------------*/
+      /* Spin through list of this Module's dependencies (anything on its */
+      /* DT_NEEDED list), searching through each dependent's symbol table */
+      /* to find the symbol we are after. */
+      /*---------------------------------------------------------------------*/
+      for (i = 0; i < loaded_module->dependencies.size; i++)
+      {
+         /*------------------------------------------------------------------*/
+         /* Find the loaded module that carries this dependency's handle. */
+         /* Stop at the end of the list instead of walking off it when the */
+         /* handle is not present. */
+         /* NOTE(review): assumes the last node's next_ptr is NULL - */
+         /* confirm against Queue.h. */
+         /*------------------------------------------------------------------*/
+         for (node = dHandle->DLIMP_loaded_objects.front_ptr;
+              node != NULL && node->value->file_handle != child_handle[i];
+              node = node->next_ptr)
+            ;
+
+         /*------------------------------------------------------------------*/
+         /* A dependency that is not in the loaded list has no symbol table */
+         /* to search; try the next one. */
+         /*------------------------------------------------------------------*/
+         if (node == NULL) continue;
+
+         /*------------------------------------------------------------------*/
+         /* Return true if we find the symbol. */
+         /*------------------------------------------------------------------*/
+         if (DLSYM_lookup_global_symtab(sym_name,
+                                        node->value->gsymtab,
+                                        node->value->gsymnum,
+                                        sym_value))
+            return TRUE;
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* In the LINUX model, we will use a breadth-first global symbol lookup */
+   /* algorithm. First, the application's global symbol table is searched, */
+   /* followed by its children, followed by their children, and so on. */
+   /* It is up to the client of this module to set the application handle. */
+   /*------------------------------------------------------------------------*/
+   else
+   {
+      if (breadth_first_lookup(handle, sym_name, DLIMP_application_handle,
+                               sym_value))
+         return TRUE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* If we got this far, then symbol was not found. */
+   /*------------------------------------------------------------------------*/
+   DLIF_error(DLET_SYMBOL, "Could not resolve symbol %s!\n", sym_name);
+
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* DLSYM_lookup_symtab() - Linear search of a symbol table for a defined */
+/* symbol with the given name and binding class. */
+/* */
+/* sym_name - name to match (compared with strcmp). */
+/* symtab - table to search; each st_name is read as a char *. */
+/* symnum - number of entries in symtab. */
+/* sym_value - if non-NULL, receives st_value of the match, or 0 */
+/* when nothing matches. */
+/* require_local_binding - non-zero: only STB_LOCAL entries qualify; */
+/* zero: only non-STB_LOCAL entries qualify. */
+/* */
+/* Returns TRUE for the first qualifying entry, FALSE if there is none. */
+/*****************************************************************************/
+static BOOL DLSYM_lookup_symtab(const char *sym_name, struct Elf32_Sym *symtab,
+                                Elf32_Word symnum, Elf32_Addr *sym_value,
+                                BOOL require_local_binding)
+{
+   Elf32_Addr pos;
+
+#if LOADER_DEBUG
+   if (debugging_on)
+      DLIF_trace("DLSYM_lookup_symtab, sym to find : %s\n", sym_name);
+#endif
+
+   for (pos = 0; pos < symnum; pos++)
+   {
+      struct Elf32_Sym *candidate = &symtab[pos];
+      int bound_locally;
+
+#if LOADER_DEBUG
+      if (debugging_on)
+         DLIF_trace("\tPotential symbol match : %s\n",
+                    (char*)candidate->st_name);
+#endif
+
+      /* Undefined entries are references, never definitions. */
+      if (candidate->st_shndx == SHN_UNDEF)
+         continue;
+
+      /* The entry's binding class must be the one the caller asked for. */
+      bound_locally = (ELF32_ST_BIND(candidate->st_info) == STB_LOCAL);
+      if (require_local_binding ? !bound_locally : bound_locally)
+         continue;
+
+      if (strcmp(sym_name, (char*)(candidate->st_name)) != 0)
+         continue;
+
+      if (sym_value) *sym_value = candidate->st_value;
+      return TRUE;
+   }
+
+   /* Nothing matched: report a zero value along with the failure. */
+   if (sym_value) *sym_value = 0;
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* DLSYM_lookup_global_symtab() - Look the symbol name up in the given */
+/* symbol table, accepting only entries whose binding is not STB_LOCAL. */
+/* The value is returned through sym_value; the result is TRUE when the */
+/* lookup succeeds. */
+/*****************************************************************************/
+BOOL DLSYM_lookup_global_symtab(const char *sym_name, struct Elf32_Sym *symtab,
+                                Elf32_Word symnum, Elf32_Addr *sym_value)
+{
+   /* FALSE selects the non-local half of the shared lookup routine. */
+   BOOL found = DLSYM_lookup_symtab(sym_name, symtab, symnum, sym_value,
+                                    FALSE);
+   return found;
+}
+
+/*****************************************************************************/
+/* DLSYM_lookup_local_symtab() - Look the symbol name up in the given */
+/* symbol table, accepting only entries whose binding is STB_LOCAL. */
+/* The value is returned through sym_value; the result is TRUE when the */
+/* lookup succeeds. */
+/*****************************************************************************/
+BOOL DLSYM_lookup_local_symtab(const char *sym_name, struct Elf32_Sym *symtab,
+                               Elf32_Word symnum, Elf32_Addr *sym_value)
+{
+   /* TRUE selects the local-binding half of the shared lookup routine. */
+   BOOL found = DLSYM_lookup_symtab(sym_name, symtab, symnum, sym_value,
+                                    TRUE);
+   return found;
+}
+
+/*****************************************************************************/
+/* CANONICAL_SYMBOL_LOOKUP() - Resolve the symbol at sym_index of the given */
+/* dynamic module's symbol table. */
+/* */
+/* A symbol with local binding, or with any visibility other than */
+/* STV_DEFAULT, cannot be pre-empted and must be defined in this module: */
+/* its own st_value is returned, or an error is logged if it is not a */
+/* definition. Every other symbol is resolved through */
+/* DLSYM_global_lookup(). */
+/* */
+/* handle - loader instance, forwarded to the global lookup. */
+/* sym_index - index of the symbol in dyn_module->symtab. */
+/* dyn_module - module that owns the symbol table. */
+/* sym_value - receives the resolved value (may be NULL on the local */
+/* path). */
+/* */
+/* Returns TRUE if the symbol was resolved, FALSE otherwise. */
+/*****************************************************************************/
+BOOL DLSYM_canonical_lookup(DLOAD_HANDLE handle, int sym_index,
+                            DLIMP_Dynamic_Module *dyn_module,
+                            Elf32_Addr *sym_value)
+{
+   /*------------------------------------------------------------------------*/
+   /* Pull the characteristics we need out of the symbol table entry. */
+   /* st_name is read as a pointer to the symbol's name. */
+   /*------------------------------------------------------------------------*/
+   struct Elf32_Sym *entry = &dyn_module->symtab[sym_index];
+   const char *sym_name = (char *)entry->st_name;
+   int32_t binding = ELF32_ST_BIND(entry->st_info);
+   int32_t visibility = ELF32_ST_VISIBILITY(entry->st_other);
+   BOOL defined_here;
+
+#if LOADER_DEBUG
+   if (debugging_on)
+      DLIF_trace("DLSYM_canonical_lookup: %d, %s\n", sym_index, sym_name);
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* A pre-emptable definition or an undefined symbol has to go through */
+   /* the global lookup. */
+   /*------------------------------------------------------------------------*/
+   if (binding != STB_LOCAL && visibility == STV_DEFAULT)
+   {
+      return DLSYM_global_lookup(handle, sym_name, dyn_module->loaded_module,
+                                 sym_value);
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* From here on the symbol is local or cannot be pre-empted, so its */
+   /* definition should be in this very module. An entry counts as a */
+   /* definition when its section index is an ordinary section or one of */
+   /* SHN_ABS, SHN_COMMON or SHN_XINDEX. */
+   /*------------------------------------------------------------------------*/
+   defined_here = (entry->st_shndx != SHN_UNDEF &&
+                   (entry->st_shndx < SHN_LORESERVE ||
+                    entry->st_shndx == SHN_ABS ||
+                    entry->st_shndx == SHN_COMMON ||
+                    entry->st_shndx == SHN_XINDEX));
+
+   if (!defined_here)
+   {
+      DLIF_error(DLET_SYMBOL,
+                 "Local/non-imported symbol %s definition is not found "
+                 "in module %s!\n", sym_name, dyn_module->name);
+      return FALSE;
+   }
+
+   if (sym_value) *sym_value = entry->st_value;
+   return TRUE;
+}
+
diff --git a/src/core/dsp/ocl_load/README b/src/core/dsp/ocl_load/README
new file mode 100644
index 0000000..19165f6
--- /dev/null
+++ b/src/core/dsp/ocl_load/README
@@ -0,0 +1,8 @@
+
+This program is dependent on these Standard CVS modules
+
+C60_DLOAD_DYN:
+C60_DLOAD_REL:
+DLOAD:
+DLOAD_API:
+DLOAD_SYM:
diff --git a/src/core/dsp/ocl_load/Stack.h b/src/core/dsp/ocl_load/Stack.h
new file mode 100644
index 0000000..e958674
--- /dev/null
+++ b/src/core/dsp/ocl_load/Stack.h
@@ -0,0 +1,182 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+/*
+* Stack.h
+*
+* Interface to Stack
+* ------------------
+*
+* This is an implementation of a type-independent stack implemented as
+* a singly linked list class for C.  It's basically a template class, but
+* uses macros instead, so that it can be compiled with a C-only compiler.
+*
+* To define a Stack class:
+* #include "Stack.h"
+* TYPE_STACK_DEFINITION(object_type,Class_Identifier)
+*
+* In a separate C file:
+* #include "Stack.h"
+* TYPE_STACK_DEFINITION(object_type,Class_Identifier)
+* TYPE_STACK_IMPLEMENTATION(object_type,Class_Identifier)
+*
+* Now, to create a stack:
+*    Class_Identifier_Stack name;
+*  Get it initialized to zero everywhere somehow, maybe like this:
+*    Class_Identifier_initialize_stack(&name);
+*
+* To add to the stack:
+*    Class_Identifier_push(&name, object);
+*
+* To access the top of the stack:
+*    Class_Identifier_Stack_Node *tos = name.top_ptr;
+*    do_something_to_(tos->value);
+*
+* To delete from the stack:
+*    if (name.size > 0) Class_Identifier_pop(&name);
+*
+* Copyright (C) 2009 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef STACK_H
+#define STACK_H
+
+#include <inttypes.h>
+#include "dload_api.h"
+
+/*****************************************************************************/
+/* TYPE_STACK_DEFINITION() - Define structure specifications for a last-in,  */
+/*      first-out linked list of t_name objects.                             */
+/*                                                                           */
+/*      t      - type of the value stored in each node                       */
+/*      t_name - identifier used as the prefix of every generated name       */
+/*                                                                           */
+/*      Expands to:                                                          */
+/*        - struct t_name_Stack_Node_ / typedef t_name_Stack_Node, a node    */
+/*          holding one value and a pointer to the next node                 */
+/*        - typedef t_name_Stack, holding the top and bottom node pointers   */
+/*          and the element count                                            */
+/*        - extern prototypes for t_name_initialize_stack(), t_name_push()   */
+/*          and t_name_pop(), which TYPE_STACK_IMPLEMENTATION() defines      */
+/*****************************************************************************/
+#define TYPE_STACK_DEFINITION(t, t_name)                                      \
+struct t_name##_Stack_Node_                                                   \
+{                                                                             \
+     t value;                                                                 \
+     struct t_name##_Stack_Node_* next_ptr;                                   \
+};                                                                            \
+typedef struct t_name##_Stack_Node_ t_name##_Stack_Node;                      \
+                                                                              \
+typedef struct                                                                \
+{                                                                             \
+     t_name##_Stack_Node* top_ptr;                                            \
+     t_name##_Stack_Node* bottom_ptr;                                         \
+     int size;                                                                \
+} t_name##_Stack;                                                             \
+                                                                              \
+extern void t_name##_initialize_stack(t_name##_Stack* stack);                 \
+extern void t_name##_push(t_name##_Stack* stack, t to_push);                  \
+extern t    t_name##_pop(t_name##_Stack* stack);
+
+/*****************************************************************************/
+/* TYPE_STACK_INITIALIZER - Static initializer for any generated stack type: */
+/*      NULL top pointer, NULL bottom pointer and a size of zero.            */
+/*****************************************************************************/
+#define TYPE_STACK_INITIALIZER {NULL, NULL, 0 }
+
+/*****************************************************************************/
+/* TYPE_STACK_IMPLEMENTATION() - Define member functions of new LIFO linked  */
+/*      list "class" of t_name objects.                                      */
+/*                                                                           */
+/* <type>_initialize_stack() - clears the stack (does not free any nodes)    */
+/* <type>_push() - pushes a <t> type object to the top of the stack.  The    */
+/*                 node is allocated with DLIF_malloc(); if that allocation  */
+/*                 fails the stack is left completely unchanged, which the   */
+/*                 caller can detect because size did not grow.              */
+/* <type>_pop() - pop a <t> type object from the top of the stack            */
+/*                and provide access to it to the caller.  The stack must    */
+/*                not be empty: there is no value of type <t> that could be  */
+/*                returned to signal an empty stack, so the caller checks    */
+/*                size > 0 first.                                            */
+/*****************************************************************************/
+#define TYPE_STACK_IMPLEMENTATION(t, t_name)                                  \
+void t_name##_initialize_stack (t_name##_Stack* stack)                        \
+{                                                                             \
+     stack->top_ptr = stack->bottom_ptr = NULL;                               \
+     stack->size = 0;                                                         \
+}                                                                             \
+void t_name##_push(t_name##_Stack* stack, t to_push)                          \
+{                                                                             \
+     /* Allocate before touching the stack so a failure changes nothing. */   \
+     t_name##_Stack_Node* node =                                              \
+        (t_name##_Stack_Node*)(DLIF_malloc(sizeof(t_name##_Stack_Node)));     \
+                                                                              \
+     if (!node) return;                                                       \
+                                                                              \
+     node->value    = to_push;                                                \
+     node->next_ptr = stack->top_ptr; /* NULL when the stack was empty */     \
+                                                                              \
+     if (!stack->top_ptr) stack->bottom_ptr = node;                           \
+     stack->top_ptr = node;                                                   \
+     stack->size++;                                                           \
+}                                                                             \
+                                                                              \
+t t_name##_pop(t_name##_Stack* stack)                                         \
+{                                                                             \
+     t to_ret;                                                                \
+     t_name##_Stack_Node* next_ptr = stack->top_ptr->next_ptr;                \
+                                                                              \
+     stack->size--;                                                           \
+     to_ret = stack->top_ptr->value;                                          \
+     DLIF_free((void*)(stack->top_ptr));                                      \
+                                                                              \
+     if(!stack->size)                                                         \
+        stack->top_ptr = stack->bottom_ptr = NULL;                            \
+     else                                                                     \
+        stack->top_ptr = next_ptr;                                            \
+                                                                              \
+     return to_ret;                                                           \
+}
+
+#endif
diff --git a/src/core/dsp/ocl_load/ocl_load.c b/src/core/dsp/ocl_load/ocl_load.c
new file mode 100644
index 0000000..c53a137
--- /dev/null
+++ b/src/core/dsp/ocl_load/ocl_load.c
@@ -0,0 +1,139 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "dload_api.h"
+
+/*---------------------------------------------------------------------------*/
+/* Define the stack "template" macros so that they expand to nothing.        */
+/* NOTE(review): no use of either macro is visible in this file; presumably  */
+/* this guards against headers included elsewhere - confirm the intent.      */
+/*---------------------------------------------------------------------------*/
+#define TYPE_STACK_DEFINITION(t, t_name)
+#define TYPE_STACK_IMPLEMENTATION(t, t_name)
+
+/*---------------------------------------------------------------------------*/
+/* Global flags, both off by default.  They are not read in this file;       */
+/* presumably the loader core tests them - confirm.                          */
+/*---------------------------------------------------------------------------*/
+int debugging_on = FALSE;
+int profiling_on = FALSE;
+
+/*---------------------------------------------------------------------------*/
+/* Globals for a saved argument count/vector.  They are not assigned or read */
+/* in this file.                                                             */
+/*---------------------------------------------------------------------------*/
+int global_argc;
+char **global_argv;
+
+/*****************************************************************************/
+/* DLIF_fseek() - Reposition a loader file stream; forwards to fseek().      */
+/*****************************************************************************/
+int DLIF_fseek(LOADER_FILE_DESC *stream, int32_t offset, int origin)
+{
+   return fseek(stream, offset, origin);
+}
+
+/*****************************************************************************/
+/* DLIF_fread() - Read nmemb items of size bytes; forwards to fread().       */
+/*****************************************************************************/
+size_t DLIF_fread(void *ptr, size_t size, size_t nmemb,
+                  LOADER_FILE_DESC *stream)
+{
+   return fread(ptr, size, nmemb, stream);
+}
+
+/*****************************************************************************/
+/* DLIF_ftell() - Report the current stream position; forwards to ftell().   */
+/*****************************************************************************/
+int32_t DLIF_ftell(LOADER_FILE_DESC *stream)
+{
+   return ftell(stream);
+}
+
+/*****************************************************************************/
+/* DLIF_fclose() - Close a loader file stream; forwards to fclose().         */
+/*****************************************************************************/
+int32_t DLIF_fclose(LOADER_FILE_DESC *fd)
+{
+   return fclose(fd);
+}
+
+/*****************************************************************************/
+/* DLIF_malloc() - Allocate host memory; forwards to malloc().               */
+/*****************************************************************************/
+void* DLIF_malloc(size_t size)
+{
+   return malloc(size);
+}
+
+/*****************************************************************************/
+/* DLIF_free() - Release memory obtained from DLIF_malloc(); forwards to     */
+/*      free().                                                              */
+/*****************************************************************************/
+void DLIF_free(void* ptr)
+{
+   free(ptr);
+}
+
+/*****************************************************************************/
+/* DLIF_COPY() - Copy data from file to host-accessible memory.              */
+/*      Returns a host pointer to the data in the host_address field of the  */
+/*      DLOAD_MEMORY_REQUEST object.                                         */
+/*                                                                           */
+/*      A zero-filled buffer of memsz_in_bytes is allocated and the first    */
+/*      objsz_in_bytes of it are read from the file at targ_req->offset.     */
+/*      The buffer is handed to the caller through host_address and is not   */
+/*      freed here.                                                          */
+/*                                                                           */
+/*      Returns TRUE on success.  Returns FALSE, with host_address left      */
+/*      untouched and nothing leaked, when the segment's file size exceeds   */
+/*      its memory size, the allocation fails, or the file cannot be read.   */
+/*      client_handle is unused.                                             */
+/*****************************************************************************/
+BOOL DLIF_copy(void* client_handle, struct DLOAD_MEMORY_REQUEST* targ_req)
+{
+   struct DLOAD_MEMORY_SEGMENT* obj_desc = targ_req->segment;
+   LOADER_FILE_DESC*            f        = targ_req->fp;
+   void*                        buf;
+   int                          seek_failed;
+
+   (void)client_handle;
+
+   /*------------------------------------------------------------------------*/
+   /* Reading more bytes than were allocated would overrun the buffer.       */
+   /*------------------------------------------------------------------------*/
+   if (obj_desc->objsz_in_bytes > obj_desc->memsz_in_bytes) return FALSE;
+
+   /*------------------------------------------------------------------------*/
+   /* calloc() may legitimately return NULL for a zero-sized request, so     */
+   /* NULL only counts as a failure when memory was actually asked for.      */
+   /*------------------------------------------------------------------------*/
+   buf = calloc(obj_desc->memsz_in_bytes, 1);
+   if (!buf && obj_desc->memsz_in_bytes) return FALSE;
+
+   /*------------------------------------------------------------------------*/
+   /* Always reposition the file, but a seek failure only matters when there */
+   /* is file data to read.                                                  */
+   /*------------------------------------------------------------------------*/
+   seek_failed = (fseek(f, targ_req->offset, SEEK_SET) != 0);
+
+   if (obj_desc->objsz_in_bytes &&
+       (seek_failed || fread(buf, obj_desc->objsz_in_bytes, 1, f) != 1))
+   {
+      free(buf);
+      return FALSE;
+   }
+
+   targ_req->host_address = buf;
+
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* DLIF_read() - Not supported by this client.  Asserts when called; in a    */
+/*      build with assertions disabled it reports failure by returning       */
+/*      FALSE instead of falling off the end of a non-void function.         */
+/*****************************************************************************/
+BOOL DLIF_read(void* client_handle,
+               void *ptr, size_t size, size_t nmemb, TARGET_ADDRESS src)
+{
+   assert(0);
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* DLIF_memcpy() - Copy size bytes from 'from' to 'to' with memcpy().        */
+/*      Returns 1 when memcpy() hands back a non-NULL pointer, 0 otherwise.  */
+/*****************************************************************************/
+BOOL DLIF_memcpy(void* client_handle,
+                 void *to, void *from, size_t size)
+{
+   void *copied = memcpy(to, from, size);
+
+   if (copied == NULL) return 0;
+   return 1;
+}
+
+/*****************************************************************************/
+/* DLIF_execute() - Not supported by this client.  Asserts when called and   */
+/*      returns 1 if assertions are disabled.                                */
+/*****************************************************************************/
+int32_t DLIF_execute(void* client_handle, TARGET_ADDRESS exec_addr)
+{
+   assert(0);
+   return 1;
+}
+
+
+
+
+/*****************************************************************************/
+/* DLIF_register_dsbt_index_request() - Not supported by this client.        */
+/*      Asserts when called; in a build with assertions disabled it reports  */
+/*      failure by returning FALSE instead of falling off the end of a       */
+/*      non-void function.                                                   */
+/*****************************************************************************/
+BOOL DLIF_register_dsbt_index_request(DLOAD_HANDLE handle,
+                                      const char *requestor_name,
+                                      int32_t     requestor_file_handle,
+                                      int32_t     requested_dsbt_index)
+{
+   assert(0);
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* DLIF_assign_dsbt_indices() - Not supported by this client; asserts.       */
+/*****************************************************************************/
+void DLIF_assign_dsbt_indices(void)
+{
+   assert(0);
+}
+
+/*****************************************************************************/
+/* DLIF_get_dsbt_index() - Not supported by this client; asserts, and        */
+/*      returns DSBT_INDEX_INVALID if assertions are disabled.               */
+/*****************************************************************************/
+int32_t DLIF_get_dsbt_index(int32_t file_handle)
+{
+   assert(0);
+   return DSBT_INDEX_INVALID;
+}
+
+/*****************************************************************************/
+/* DLIF_update_all_dsbts() - Not supported by this client; asserts, and      */
+/*      returns TRUE if assertions are disabled.                             */
+/*****************************************************************************/
+BOOL DLIF_update_all_dsbts()
+{
+   assert(0);
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* DLIF_warning() - Print a loader warning on stdout: a fixed banner and     */
+/*      then the printf-style message.  wtype is not used.                   */
+/*****************************************************************************/
+void DLIF_warning(LOADER_WARNING_TYPE wtype, const char *fmt, ...)
+{
+   va_list args;
+
+   fputs("<< D L O A D >> WARNING: ", stdout);
+
+   va_start(args, fmt);
+   vfprintf(stdout, fmt, args);
+   va_end(args);
+}
+
+/*****************************************************************************/
+/* DLIF_error() - Print a loader error on stdout: a fixed banner and then    */
+/*      the printf-style message.  etype is not used.                        */
+/*****************************************************************************/
+void DLIF_error(LOADER_ERROR_TYPE etype, const char *fmt, ...)
+{
+   va_list args;
+
+   fputs("<< D L O A D >> ERROR: ", stdout);
+
+   va_start(args, fmt);
+   vfprintf(stdout, fmt, args);
+   va_end(args);
+}
+
+/*****************************************************************************/
+/* DLIF_trace() - Print a printf-style trace message on stdout, with no      */
+/*      banner.                                                              */
+/*****************************************************************************/
+void DLIF_trace(const char *fmt, ...)
+{
+   va_list args;
+
+   va_start(args, fmt);
+   vfprintf(stdout, fmt, args);
+   va_end(args);
+}
+
+/*****************************************************************************/
+/* DLIF_exit() - Terminate the process with exit status ecode.               */
+/*      The parameter type is spelled out: an untyped (implicit int)         */
+/*      parameter is not valid C since C99.                                  */
+/*****************************************************************************/
+void DLIF_exit(int ecode)
+{
+   exit(ecode);
+}
+
diff --git a/src/core/dsp/program.cpp b/src/core/dsp/program.cpp
new file mode 100644
index 0000000..6495ec9
--- /dev/null
+++ b/src/core/dsp/program.cpp
@@ -0,0 +1,633 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "program.h"
+#include "device.h"
+#include "kernel.h"
+
+#include "../program.h"
+
+#include <llvm/PassManager.h>
+#include <llvm/Analysis/Passes.h>
+#include <llvm/Analysis/Verifier.h>
+#include <llvm/Transforms/Scalar.h>
+#include <llvm/Transforms/IPO.h>
+#include <llvm/Transforms/Utils/UnifyFunctionExitNodes.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+#include "wga.h"
+
+#include <llvm/LinkAllPasses.h>
+#include <WorkitemHandlerChooser.h>
+#include <BreakConstantGEPs.h>
+#include <Flatten.h>
+#include <PHIsToAllocas.h>
+#include <IsolateRegions.h>
+#include <VariableUniformityAnalysis.h>
+#include <ImplicitLoopBarriers.h>
+#include <LoopBarriers.h>
+#include <BarrierTailReplication.h>
+#include <CanonicalizeBarriers.h>
+#include <WorkItemAliasAnalysis.h>
+#include <WorkitemReplication.h>
+#include <WorkitemLoops.h>
+#include <AllocasToEntry.h>
+#include <Workgroup.h>
+#include <TargetAddressSpaces.h>
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <elf.h>
+
+#include "genfile_cache.h"
+
+genfile_cache * genfile_cache::pInstance = 0;
+
+/******************************************************************************
+* getTime - read the monotonic clock, falling back to the realtime clock when
+*           the monotonic read fails.
+******************************************************************************/
+timespec getTime()
+{
+    struct timespec now;
+
+    const int rc = clock_gettime(CLOCK_MONOTONIC, &now);
+    if (rc != 0)
+    {
+        clock_gettime(CLOCK_REALTIME, &now);
+    }
+
+    return now;
+}
+
+/******************************************************************************
+* ts_to_double - convert a timespec to seconds as a double
+******************************************************************************/
+double ts_to_double(const timespec &t)
+{
+    const double fraction = ((double)t.tv_nsec) / 1000000000.0;
+    const double whole    = (double)t.tv_sec;
+
+    return fraction + whole;
+}
+
+/******************************************************************************
+* tsdiff - elapsed seconds from start to end
+******************************************************************************/
+double tsdiff (const timespec& start, const timespec& end)
+{
+    const double end_secs   = ts_to_double(end);
+    const double start_secs = ts_to_double(start);
+
+    return end_secs - start_secs;
+}
+
+
+using namespace Coal;
+
+/******************************************************************************
+* DSPProgram constructor
+*
+* Starts with no program handle (-1), not loaded, temporary files removed and
+* kernel caching enabled.  Two environment variables override the defaults
+* simply by being set (their value is not inspected):
+*   TI_OCL_KEEP_FILES        - keep the generated files
+*   TI_OCL_CACHE_KERNELS_OFF - disable the kernel cache
+*
+* p_outfile is set to the empty string so that the destructor never passes an
+* uninitialized name to unlink() when the program was never built.
+* NOTE(review): this assumes p_outfile is a char array member, as its use with
+* strcpy()/strcat() elsewhere in this file suggests - confirm in program.h.
+******************************************************************************/
+DSPProgram::DSPProgram(DSPDevice *device, Program *program)
+: DeviceProgram(), p_device(device), p_program(program), p_program_handle(-1),
+  p_loaded(false), p_keep_files(false), p_cache_kernels(true)
+{
+    p_outfile[0] = '\0';
+
+    if (getenv("TI_OCL_KEEP_FILES"))        p_keep_files    = true;
+    if (getenv("TI_OCL_CACHE_KERNELS_OFF")) p_cache_kernels = false;
+}
+
+/******************************************************************************
+* DSPProgram destructor
+*
+* Unloads the program from the device, then deletes the output file unless
+* files are being kept or the kernel cache is enabled.
+******************************************************************************/
+DSPProgram::~DSPProgram()
+{
+    p_device->unload(p_program_handle);
+
+    const bool outfile_still_wanted = p_keep_files || p_cache_kernels;
+    if (!outfile_still_wanted)
+    {
+        unlink(p_outfile);
+    }
+}
+
+/*-----------------------------------------------------------------------------
+* Segment list of the program currently inside DSPProgram::load().  It is
+* only non-NULL while the device loader is running.
+* NOTE(review): presumably read by the loader's write callbacks to record the
+* segments they place - confirm against the device code.
+*----------------------------------------------------------------------------*/
+DSPProgram::segment_list *segments;
+
+/******************************************************************************
+* DSPProgram::load
+*
+* Loads the output file onto the device and posts a cache-invalidate message
+* covering every segment that was written.
+*
+* Returns false when the device loader returns a zero handle, true otherwise.
+* The file-scope 'segments' pointer is reset on both paths, so it never keeps
+* pointing into this object after the call.
+******************************************************************************/
+bool DSPProgram::load()
+{
+    segments = &p_segments_written;
+    p_program_handle = p_device->load(p_outfile);
+    segments = NULL;
+
+    if (!p_program_handle) return false;
+
+    p_loaded = true;
+
+    char *debug_kernel = getenv("TI_OCL_DEBUG_KERNEL");
+
+    /*-------------------------------------------------------------------------
+    * ensure that the newly populated areas are not stale in device caches
+    *------------------------------------------------------------------------*/
+    Msg_t msg;
+    int   segNum = p_segments_written.size();
+
+    /*-------------------------------------------------------------------------
+    * Each segment takes two slots in the flush buffer: address, then size.
+    *------------------------------------------------------------------------*/
+    assert(segNum <= MAX_FLUSH_BUF_SIZE/2);
+
+    msg.command              = CACHEINV;
+    msg.u.k.flush.numBuffers = segNum;
+    msg.u.k.flush.num_mpaxs  = 0;
+    for (int i=0; i < segNum; ++i)
+    {
+        msg.u.k.flush.buffers[2*i]   = p_segments_written[i].ptr;
+        msg.u.k.flush.buffers[2*i+1] = p_segments_written[i].size;
+
+        uint32_t flags = p_segments_written[i].flags &
+                           (DLOAD_SF_executable | DLOAD_SF_writable);
+
+        const char *seg_desc;
+        switch (flags)
+        {
+            case 0:                   seg_desc = "Read Only";             break;
+            case DLOAD_SF_executable: seg_desc = "Executable";            break;
+            case DLOAD_SF_writable:   seg_desc = "Writable";              break;
+            default:                  seg_desc = "Writable & Executable"; break;
+        }
+
+        if (debug_kernel)
+            printf("%s segment loaded to 0x%08x with size 0x%x\n",
+               seg_desc, p_segments_written[i].ptr, p_segments_written[i].size);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Send the command.  We do not wait for the response here; the wait is
+    * handled by the standard wait loop in the worker thread.
+    *------------------------------------------------------------------------*/
+    p_device->mail_to(msg);
+
+    return true;
+}
+
+/******************************************************************************
+* is_loaded - true once load() has succeeded
+******************************************************************************/
+bool DSPProgram::is_loaded() const { return p_loaded; }
+
+/******************************************************************************
+* linkStdLib - always false for the DSP device
+******************************************************************************/
+bool DSPProgram::linkStdLib() const { return false; }
+
+/******************************************************************************
+* outfile_name - name of the output file for this program
+******************************************************************************/
+const char* DSPProgram::outfile_name() const { return p_outfile; }
+
+/******************************************************************************
+* data_page_ptr
+*
+* Returns the static base address that the dynamic loader reports for this
+* program, loading the program first if that has not happened yet.
+*
+* Returns 0 when the program cannot be loaded.  The result also starts at 0,
+* so a lookup that writes nothing yields 0 rather than an indeterminate value.
+******************************************************************************/
+DSPDevicePtr DSPProgram::data_page_ptr()
+{
+    DSPDevicePtr p = 0;
+
+    if (!is_loaded() && !load()) return p;
+
+    DLOAD_get_static_base(p_device->dload_handle(), p_program_handle, &p);
+    return p;
+}
+
+/******************************************************************************
+* createOptimizationPasses
+*
+* Populates 'manager' with the LLVM passes to run on a kernel module.  The
+* passes are appended in three groups, in this order:
+*   1. when hasBarrier is set, the pocl barrier and work-item handling passes
+*   2. when optimize is set, a general optimization pipeline
+*   3. always, exit-node unification followed by the TI work-group
+*      aggregation pass; without barriers the pocl alloca hoister is appended
+*      after that as well
+* The order in which passes are added is the order they are given to the pass
+* manager, so statements here must not be reordered casually.
+******************************************************************************/
+void DSPProgram::createOptimizationPasses(llvm::PassManager *manager,
+                                          bool optimize, bool hasBarrier)
+{
+    if (hasBarrier)
+    {
+        manager->add(new llvm::DominatorTree());
+        manager->add(new pocl::WorkitemHandlerChooser());
+        manager->add(new BreakConstantGEPs());    // from pocl
+        // add(new GenerateHeader());  // no need
+        manager->add(new pocl::Flatten());
+        manager->add(    llvm::createAlwaysInlinerPass());
+        manager->add(    llvm::createGlobalDCEPass());
+        manager->add(    llvm::createCFGSimplificationPass());
+        manager->add(    llvm::createLoopSimplifyPass());
+        manager->add(new pocl::PHIsToAllocas());
+        manager->add(    llvm::createRegionInfoPass());
+        manager->add(new pocl::IsolateRegions());
+        manager->add(new pocl::VariableUniformityAnalysis());  // TODO
+        manager->add(new pocl::ImplicitLoopBarriers());
+        manager->add(new pocl::LoopBarriers());
+        manager->add(new pocl::BarrierTailReplication());
+        manager->add(new pocl::CanonicalizeBarriers());
+        manager->add(new pocl::IsolateRegions());
+        manager->add(new pocl::WorkItemAliasAnalysis());
+        // add(new pocl::WorkitemReplication());  // no need
+        manager->add(new pocl::WorkitemLoops());
+        manager->add(new pocl::AllocasToEntry());
+        // add(new pocl::Workgroup());  // no need
+        manager->add(new pocl::TargetAddressSpaces());
+    }
+
+    if (optimize)
+    {
+        /*
+         * Inspired by code from "The LLVM Compiler Infrastructure"
+         */
+        manager->add(llvm::createDeadArgEliminationPass());
+        manager->add(llvm::createInstructionCombiningPass());
+        manager->add(llvm::createFunctionInliningPass());
+        manager->add(llvm::createPruneEHPass());   // Remove dead EH info.
+        manager->add(llvm::createGlobalOptimizerPass());
+        manager->add(llvm::createGlobalDCEPass()); // Remove dead functions.
+        manager->add(llvm::createArgumentPromotionPass());
+        manager->add(llvm::createInstructionCombiningPass());
+        manager->add(llvm::createJumpThreadingPass());
+
+        //ASW TODO maybe turn off re: pete. might gen bad xlator input
+        //manager->add(llvm::createScalarReplAggregatesPass());
+
+        manager->add(llvm::createFunctionAttrsPass()); // Add nocapture.
+        manager->add(llvm::createGlobalsModRefPass()); // IP alias analysis.
+        manager->add(llvm::createLICMPass());      // Hoist loop invariants.
+        manager->add(llvm::createGVNPass());       // Remove redundancies.
+        manager->add(llvm::createMemCpyOptPass()); // Remove dead memcpys.
+        manager->add(llvm::createDeadStoreEliminationPass());
+        manager->add(llvm::createInstructionCombiningPass());
+        manager->add(llvm::createJumpThreadingPass());
+        manager->add(llvm::createCFGSimplificationPass());
+    }
+
+    // Added for every module, whatever the two flags say.
+    manager->add(llvm::createUnifyFunctionExitNodesPass());
+    manager->add(llvm::createTIOpenclWorkGroupAggregationPass(hasBarrier));
+
+    /*-------------------------------------------------------------------------
+    * Borrow the pocl alloca hoister for the TI simplistic WGA pass as well
+    *------------------------------------------------------------------------*/
+    if (!hasBarrier)
+        manager->add(new pocl::AllocasToEntry());
+}
+
+
+/******************************************************************************
+* process_cl6x_options
+*
+* Splits 'options' on whitespace and keeps only the tokens that look like
+* linker inputs: a token is kept when it contains any of the markers listed
+* below anywhere in its text.  The kept tokens are returned in their original
+* order, each followed by a single space.
+******************************************************************************/
+std::string process_cl6x_options(std::string options)
+{
+    static const char *const link_input_markers[] =
+    {
+        ".obj", ".dll", ".ae66", ".a66", ".out", ".lib",
+        ".o",   ".o66", ".oe66", ".a",   ".cmd"
+    };
+    const size_t n_markers =
+        sizeof(link_input_markers) / sizeof(link_input_markers[0]);
+
+    std::istringstream words(options);
+    std::string        word;
+    std::string        kept;
+
+    while (words >> word)
+    {
+        bool is_link_input = false;
+        for (size_t m = 0; m < n_markers && !is_link_input; ++m)
+            is_link_input =
+                (word.find(link_input_markers[m]) != std::string::npos);
+
+        if (is_link_input)
+        {
+            kept += word;
+            kept += " ";
+        }
+    }
+
+    return kept;
+}
+
+/******************************************************************************
+* Find the C6000 CGT installation
+*
+* Returns the value of the TI_OCL_CGT_INSTALL environment variable.  When the
+* variable is not set, a two-line explanation is printed on stdout and the
+* process is aborted.
+******************************************************************************/
+char *get_cgt_install()
+{
+    char *cgt_dir = getenv("TI_OCL_CGT_INSTALL");
+    if (cgt_dir) return cgt_dir;
+
+    std::cout
+        << "The environment variable TI_OCL_CGT_INSTALL must be set to a "
+        << std::endl;
+    std::cout
+        << "directory path where the C6000 compiler tools are installed. "
+        << std::endl;
+
+    abort();
+
+    return cgt_dir;
+}
+
+/******************************************************************************
+* Find the OpenCL installation
+*
+* Returns the value of the TI_OCL_INSTALL environment variable.  When the
+* variable is not set, a two-line explanation is printed on stdout and the
+* process is aborted.
+******************************************************************************/
+char *get_ocl_install()
+{
+    char *ocl_dir = getenv("TI_OCL_INSTALL");
+    if (ocl_dir) return ocl_dir;
+
+    std::cout
+        << "The environment variable TI_OCL_INSTALL must be set to a "
+        << std::endl;
+    std::cout
+        << "directory path where the TI OpenCL product is installed. "
+        << std::endl;
+
+    abort();
+
+    return ocl_dir;
+}
+
+/******************************************************************************
+* get_ocl_dsp
+*
+* Returns the directory holding the DSP side of the OpenCL installation.  The
+* standard location is used when it exists; otherwise the "dsp" directory
+* under the TI_OCL_INSTALL location is used.  The answer is computed on the
+* first call and remembered for later calls.
+******************************************************************************/
+std::string get_ocl_dsp()
+{
+    static std::string dsp_dir;
+
+    if (!dsp_dir.empty()) return dsp_dir;
+
+    const char *stdpath = "/usr/share/ti/opencl/dsp";
+    struct stat st;
+
+    if (stat(stdpath, &st) != 0)
+        dsp_dir = std::string(get_ocl_install()) + "/dsp";
+    else
+        dsp_dir = std::string(stdpath);
+
+    return dsp_dir;
+}
+
+/******************************************************************************
+* run_cl6x
+*
+* Builds and runs the cl6x command line that compiles and links the bitcode
+* file 'filename' into '<filename>.out', then strips the result unless
+* TI_OCL_CL6X_DEBUG is set.
+*
+*   filename     - path of the bitcode file; also the stem of every generated
+*                  file name
+*   llvm_bitcode - when non-NULL, its bytes are emitted as '.byte' directives
+*                  in '<filename>_bc.asm' (section ".llvmir") and that file is
+*                  added to the compile
+*   keep_files   - ask the tools to keep intermediate files and emit a map
+*   options      - user options; only tokens that look like linker inputs are
+*                  forwarded, and they are placed last on the command line
+*
+* Returns the status reported by system() for cl6x when it is non-zero,
+* otherwise the status of the strip step (0 when no strip was run).  The
+* function used to fall off its end without returning a value.
+******************************************************************************/
+static int run_cl6x(char *filename, std::string *llvm_bitcode,
+                    bool keep_files, std::string options)
+{
+    std::string command("cl6x --f -q --abi=eabi --use_g3 -mv6600 -mt -mo "
+                        "-ft=/tmp -fs=/tmp -fr=/tmp ");
+
+    if (keep_files) command += "-mw -k --z ";
+
+    /*-------------------------------------------------------------------------
+    * Turned off for now to workaround a timing bug.  Plan to re-enable later
+    *------------------------------------------------------------------------*/
+    command += "--disable:sploop ";
+
+    char *cl6x_debug = getenv("TI_OCL_CL6X_DEBUG");
+
+    if (cl6x_debug) command += "-g -o0 ";
+    else            command += "-o3 ";
+
+    char *no_sp = getenv("TI_OCL_SOFTWARE_PIPELINE_OFF");
+    if (no_sp) command += "-mu ";
+
+    char *cgt_install = get_cgt_install();
+
+    command += "-I"; command += cgt_install;   command += "/include ";
+    command += "-I"; command += cgt_install;   command += "/lib ";
+    command += "-I"; command += get_ocl_dsp(); command += " ";
+
+    command += "--bc_file="; command += filename; command += " ";
+
+    /*-------------------------------------------------------------------------
+    * Encode LLVM bitcode as bytes in the .llvmir section of the .asm file.
+    * The file name is built in a std::string so that no fixed-size buffer
+    * limits the length of 'filename'.
+    *------------------------------------------------------------------------*/
+    if (llvm_bitcode != NULL)
+    {
+        std::string bitasm_name(filename);
+        bitasm_name += "_bc.asm";
+
+        std::ofstream outasmfile(bitasm_name.c_str(), std::ios::out);
+        outasmfile << "\t.sect \".llvmir\"\n" << "\t.retain";
+        int nbytes = llvm_bitcode->size();
+        for (int i = 0; i < nbytes; i++)
+            if (i % 10 == 0)
+                outasmfile << "\n\t.byte " << (int) llvm_bitcode->at(i);
+            else
+                outasmfile << ", " << (int) llvm_bitcode->at(i);
+        outasmfile.close();
+
+        command += bitasm_name; command += " ";
+    }
+
+    command += "-z -ldsp.syms -o ";
+    command += filename; command += ".out ";
+
+    if (keep_files)
+       { command += "-m "; command += filename; command += ".map "; }
+
+    /*-------------------------------------------------------------------------
+    * Any libraries or object files need to go last to resolve references
+    *------------------------------------------------------------------------*/
+    command += process_cl6x_options(options);
+
+    const int compile_status = system(command.c_str());
+
+    /*-------------------------------------------------------------------------
+    * The strip step runs exactly as before, whatever the compile reported.
+    *------------------------------------------------------------------------*/
+    int strip_status = 0;
+    if (!cl6x_debug)
+    {
+        std::string strip_command("strip6x ");
+        strip_command += filename; strip_command += ".out";
+        strip_status = system(strip_command.c_str());
+    }
+
+    return (compile_status != 0) ? compile_status : strip_status;
+}
+
+/**
+ * Extract llvm bitcode and native binary from MixedBinary
+ *
+ * binary_str - candidate mixed binary; NULL, empty or non-ELF input yields
+ *              false without throwing
+ * bitcode    - when non-NULL, receives the contents of the ".llvmir" section
+ * native     - when non-NULL, receives a copy of the whole ELF file
+ *
+ * Returns true when binary_str starts with the ELF magic and, if bitcode was
+ * requested, a ".llvmir" section was found.  Every offset and size taken from
+ * the file is checked against the size of binary_str before it is used, so a
+ * truncated or malformed file makes the function return false instead of
+ * throwing std::out_of_range or reading outside the string.
+ */
+bool DSPProgram::ExtractMixedBinary(std::string *binary_str,
+                                    std::string *bitcode, std::string *native)
+{
+    if (binary_str == NULL) return false;
+
+    const size_t total = binary_str->size();
+    const char  *image = binary_str->data();
+
+    if (total < (size_t)SELFMAG || memcmp(image, ELFMAG, SELFMAG) != 0)
+        return false;
+
+    /*-------------------------------------------------------------------------
+    * Parse ELF file format, extract ".llvmir" section into bitcode
+    * Valid Assumptions: 1. cl6x only creates 32-bit ELF files (for now)
+    *                    2. cl6x ELF file has the same endianness as the host
+    *------------------------------------------------------------------------*/
+    if (bitcode != NULL)
+    {
+        if (total < sizeof(Elf32_Ehdr)) return false;
+
+        Elf32_Ehdr ehdr; /* memcpy into here to guarantee proper alignment */
+        memcpy(&ehdr, image, sizeof(Elf32_Ehdr));
+        const size_t n_sects    = ehdr.e_shnum;
+        const size_t shoff      = ehdr.e_shoff;
+        const size_t shstr_sect = ehdr.e_shstrndx;
+        const size_t shsize     = sizeof(Elf32_Shdr);
+
+        /*---------------------------------------------------------------------
+        * The whole section header table, including the string table's own
+        * header, must lie inside the image.
+        *--------------------------------------------------------------------*/
+        if (shstr_sect >= n_sects)                 return false;
+        if (shoff > total)                         return false;
+        if (n_sects > (total - shoff) / shsize)    return false;
+
+        Elf32_Shdr shdr; /* memcpy into here to guarantee proper alignment */
+        memcpy(&shdr, image + shoff + shstr_sect * shsize, shsize);
+        if (shdr.sh_offset > total || shdr.sh_size > total - shdr.sh_offset)
+            return false;
+
+        const char  *strtab      = image + shdr.sh_offset;
+        const size_t strtab_size = shdr.sh_size;
+
+        /*---------------------------------------------------------------------
+        * Compare names including the terminating NUL, and only when the whole
+        * candidate name lies inside the string table.
+        *--------------------------------------------------------------------*/
+        static const char wanted[] = ".llvmir";
+
+        size_t i;
+        for (i = 0; i < n_sects; i++)
+        {
+            if (i == shstr_sect) continue;
+            memcpy(&shdr, image + shoff + i * shsize, shsize);
+            if (shdr.sh_name < strtab_size                  &&
+                strtab_size - shdr.sh_name >= sizeof(wanted) &&
+                memcmp(strtab + shdr.sh_name, wanted, sizeof(wanted)) == 0)
+                break;
+        }
+        if (i >= n_sects) return false;
+
+        if (shdr.sh_offset > total || shdr.sh_size > total - shdr.sh_offset)
+            return false;
+
+        bitcode->assign(image + shdr.sh_offset, shdr.sh_size);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Return the c6x ELF file in binary_str as native binary
+    *------------------------------------------------------------------------*/
+    if (native != NULL)
+    {
+        native->clear();
+        native->append(*binary_str);
+    }
+
+    return true;
+}
+
+
+/**
+ * Write native binary into file, create temporary filename in p_outfile
+ *
+ * A unique name is reserved with mkstemp() and the binary is written to that
+ * name with ".out" appended; p_outfile is set to the ".out" name.
+ *
+ * Failures are reported on stdout.  When no temporary name can be created
+ * the function returns without touching p_outfile.  A failed write is
+ * detected from the stream state, because the stream does not throw.
+ */
+void DSPProgram::WriteNativeOut(std::string *native)
+{
+    try
+    {
+        char name_out[] = "/tmp/openclXXXXXX";
+        int  fOutfile   = mkstemp(name_out);
+        if (fOutfile == -1)
+        {
+            std::cout << "ERROR: Binary write out failure" << std::endl;
+            return;
+        }
+
+        strcpy(p_outfile, name_out);
+        strcat(p_outfile, ".out");
+
+        std::ofstream outfile(p_outfile, std::ios::out | std::ios::binary);
+        outfile.write(native->data(), native->size());
+        outfile.close();
+        close(fOutfile);
+
+        if (!outfile)
+            std::cout << "ERROR: Binary write out failure" << std::endl;
+    }
+    catch(...) { std::cout << "ERROR: Binary write out failure" << std::endl; }
+}
+
+/**
+ * Native binary is stored in file, filename in p_outfile
+ * Input:  binary_str contains only the bitcode
+ * Output: binary_str contains c6x ELF file with bitcode in ".llvmir" section
+ *
+ * The file is read into a local string first and swapped into binary_str
+ * only after the whole file was read, so binary_str is left untouched when
+ * the file cannot be opened or read.  Failures are reported on stdout.
+ * A NULL binary_str makes the call a no-op.
+ */
+void DSPProgram::ReadEmbeddedBinary(std::string *binary_str)
+{
+    if (binary_str == NULL) return;
+
+    bool ok = false;
+
+    try
+    {
+        std::ifstream is(p_outfile, std::ios::in | std::ios::binary);
+        if (is)
+        {
+            is.seekg(0, std::ios::end);
+            const std::streamoff length = is.tellg();
+            is.seekg(0, std::ios::beg);
+
+            /* tellg() reports -1 on failure */
+            if (length >= 0)
+            {
+                std::string contents((size_t)length, '\0');
+                if (length > 0) is.read(&contents[0], length);
+
+                /* the stream is only still good if all bytes arrived */
+                if (is)
+                {
+                    binary_str->swap(contents);
+                    ok = true;
+                }
+            }
+        }
+    }
+    catch(...) { ok = false; }
+
+    if (!ok) std::cout << "ERROR: Binary read in failure" << std::endl;
+}
+
+/******************************************************************************
+* DSPProgram::build
+*
+* Produces the c6x binary for 'module' and leaves its file name in p_outfile.
+*
+*   module     - LLVM module to build from
+*   binary_str - on input either a mixed c6x binary or LLVM bitcode; on a
+*                successful rebuild or cache hit it receives the c6x binary
+*
+* Returns true when a binary is available.  Returns false when no temporary
+* file can be created or when the compiler run leaves no output file behind;
+* in the latter case nothing is entered into the kernel cache.
+******************************************************************************/
+bool DSPProgram::build(llvm::Module *module, std::string *binary_str)
+{
+    p_module = module;
+
+    /*------------------------------------------------------------------------
+    * The input binary_str could be any of the following:
+    * 1. Mixed C6x binary embedded with LLVM bitcode, extract C6x native
+    *    binary and return.  There is no need to rebuild from LLVM module.
+    * 2. LLVM bitcode, proceed to the regular build:
+    *    2.1 return a corresponding cached c6x binary, if found
+    *    2.2 invoke c6x compiler toolchain, embed LLVM bitcode, build
+    *    In either case, put c6x binary in binary_str when return
+    *------------------------------------------------------------------------*/
+    std::string native;
+    if (ExtractMixedBinary(binary_str, NULL, &native))
+    {
+        WriteNativeOut(&native);
+        return true;
+    }
+
+    if (p_cache_kernels)
+    {
+        std::string cached_outfile = genfile_cache::instance()->lookup
+            (p_module, p_program->deviceDependentCompilerOptions(p_device));
+
+        if (!cached_outfile.empty())
+        {
+            strcpy(p_outfile, cached_outfile.c_str());
+            ReadEmbeddedBinary(binary_str);
+            return true;
+        }
+    }
+
+    char name_template[] = "/tmp/openclXXXXXX";
+    int  pFile = mkstemp(name_template);
+
+    /*------------------------------------------------------------------------
+    * Without a temporary file there is nowhere to put the bitcode.
+    *------------------------------------------------------------------------*/
+    if (pFile == -1) return false;
+
+    strcpy(p_outfile, name_template);
+    strcat(p_outfile, ".out");
+
+    const std::string stem(name_template);
+
+    if (p_keep_files)
+    {
+        //write out the source as well
+        std::string filename = stem + ".cl";
+        std::ofstream out(filename.c_str());
+        out << p_program->source();
+        out.close();
+    }
+
+    {
+        /* the stream does not own pFile; it is closed explicitly below */
+        llvm::raw_fd_ostream ostream(pFile, false);
+        llvm::WriteBitcodeToFile(p_module, ostream);
+        ostream.flush();
+    }
+
+    run_cl6x(name_template, binary_str, p_keep_files,
+             p_program->deviceDependentCompilerOptions(p_device));
+
+    if (!p_keep_files)
+    {
+        unlink(name_template);
+        unlink((stem + ".obj").c_str());
+
+        if (binary_str != NULL)
+        {
+            unlink((stem + "_bc.asm").c_str());
+            unlink((stem + "_bc.obj").c_str());
+        }
+    }
+
+    /*------------------------------------------------------------------------
+    * The build only succeeded if the toolchain left the output file behind.
+    *------------------------------------------------------------------------*/
+    struct stat st;
+    const bool built = (stat(p_outfile, &st) == 0);
+
+    if (built)
+    {
+        if (p_cache_kernels)
+            genfile_cache::instance()->remember(p_outfile, p_module,
+                 p_program->deviceDependentCompilerOptions(p_device));
+
+        ReadEmbeddedBinary(binary_str);
+    }
+
+    close(pFile);
+
+    return built;
+}
+
+/******************************************************************************
+* query_symbol
+*
+* Asks the dynamic loader for the address of 'symname' in this program.
+* Returns that address, or 0 when the loader does not find the symbol.
+******************************************************************************/
+DSPDevicePtr DSPProgram::query_symbol(const char *symname)
+{
+    DSPDevicePtr sym_addr;
+
+    const bool known = DLOAD_query_symbol(p_device->dload_handle(),
+                                          p_program_handle, symname,
+                                          &sym_addr);
+    if (!known) return 0;
+
+    return sym_addr;
+}
+
diff --git a/src/core/dsp/program.h b/src/core/dsp/program.h
new file mode 100644
index 0000000..63c1858
--- /dev/null
+++ b/src/core/dsp/program.h
@@ -0,0 +1,92 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef __DSP_PROGRAM_H__
+#define __DSP_PROGRAM_H__
+
+#include "device.h"
+#include "../deviceinterface.h"
+#include <vector>
+
+/* Forward declarations of LLVM classes; llvm::Module is used below only as
+ * a pointer type, so the full LLVM headers are not needed for it.
+ * NOTE(review): llvm::PassManager is also named below but is not declared
+ * here -- presumably it arrives through one of the includes above; confirm. */
+namespace llvm
+{
+    class ExecutionEngine;
+    class Module;
+}
+
+namespace Coal
+{
+
+class DSPDevice;
+class Program;
+
+/*=============================================================================
+* DSPProgram: DeviceProgram subclass that pairs a Program with a DSPDevice.
+* It declares the build step (LLVM module -> output file), helpers for
+* handling embedded/native binaries, and queries against the loaded image.
+*============================================================================*/
+class DSPProgram : public DeviceProgram
+{
+    public:
+        /*---------------------------------------------------------------------
+        * Record describing one segment: device pointer, size and flags.
+        * NOTE(review): the constructor takes the size as int but stores it
+        * as unsigned -- confirm callers never pass a negative value.
+        *--------------------------------------------------------------------*/
+        struct seg_desc
+        {
+            seg_desc(DSPDevicePtr p, int s, uint32_t f) :
+                ptr(p), size(s), flags(f) {}
+            DSPDevicePtr ptr;
+            unsigned size;
+            uint32_t flags;
+        };
+
+        typedef std::vector<seg_desc> segment_list;
+
+    public:
+        DSPProgram(DSPDevice *device, Program *program);
+        ~DSPProgram();
+
+        bool linkStdLib() const;
+        const char* outfile_name() const;
+        void createOptimizationPasses(llvm::PassManager *manager,
+                                      bool optimize, bool hasBarrier=false);
+        bool build(llvm::Module *module, std::string *binary_str);
+        bool ExtractMixedBinary(std::string *binary_str,
+                                std::string *bitcode, std::string *native);
+        void WriteNativeOut(std::string *native);
+        void ReadEmbeddedBinary(std::string *binary_str);
+
+        // Returns the device address of symname, or 0 when the loader does
+        // not find the symbol.
+        DSPDevicePtr query_symbol(const char *symname);
+        DSPDevicePtr data_page_ptr();
+        bool load();
+        bool is_loaded() const;
+
+    private:
+        DSPDevice *p_device;
+        Program *p_program;
+        llvm::Module *p_module;
+        int p_program_handle;      // handle passed to DLOAD_query_symbol
+        // Output file name.  NOTE(review): build() fills this 32-byte buffer
+        // with strcpy/strcat; a longer name (e.g. a cached path) would
+        // overflow it -- confirm the maximum length.
+        char p_outfile[32];
+        bool p_loaded;
+        segment_list p_segments_written;
+        bool p_keep_files;         // build() keeps its temporary files when set
+        bool p_cache_kernels;      // build() consults/updates genfile_cache when set
+};
+}
+#endif
diff --git a/src/core/dsp/shmem.cpp b/src/core/dsp/shmem.cpp
new file mode 100644
index 0000000..6aec2f8
--- /dev/null
+++ b/src/core/dsp/shmem.cpp
@@ -0,0 +1,539 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "shmem.h"
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <ti/cmem.h>
+
+#define REPORT(x) printf(x "\n")
+#define ERR(status, msg) if (status) { printf("ERROR: %s\n", msg); exit(-1); }
+
+/******************************************************************************
+* shmem::shmem
+*   Starts unconfigured: no DSP range, no mmap descriptor and no mpm
+*   transport handle.  The page size is read from sysconf(_SC_PAGE_SIZE).
+******************************************************************************/
+shmem::shmem()
+    : p_dsp_addr            (0),
+      p_size                (0),
+      p_page_size           (sysconf(_SC_PAGE_SIZE)),
+      p_mmap_fd             (-1),
+      p_mpm_transport_handle(NULL)
+{
+}
+
+/******************************************************************************
+* shmem::~shmem
+*   Releases what the base class owns: the mpm transport handle opened by
+*   configure_base (if any) and the legacy mmap file descriptor (if any).
+*   Derived destructors run first, so any munmap that needs the handle has
+*   already happened by the time it is closed here.
+******************************************************************************/
+shmem::~shmem()
+{
+    if (p_mpm_transport_handle != NULL)
+    {
+        mpm_transport_close(p_mpm_transport_handle);
+        p_mpm_transport_handle = NULL;
+    }
+
+    if (p_mmap_fd != -1) close(p_mmap_fd);
+}
+
+/******************************************************************************
+* shmem::configure_base
+*   Opens the mpm transport device that serves dsp_addr and, on success,
+*   records dsp_addr/size as the region described by this object.
+*   On failure a message is printed and p_dsp_addr/p_size are left untouched.
+******************************************************************************/
+void shmem::configure_base(DSPDevicePtr64 dsp_addr, uint64_t size)
+{
+    /*-------------------------------------------------------------------------
+    * If the sysconf for the page size failed.  p_page_size is unsigned, so
+    * the -1 that sysconf returns on error is stored as all-ones and has to
+    * be tested for explicitly.
+    *------------------------------------------------------------------------*/
+    if (p_page_size == 0 || p_page_size == (uint32_t)-1)
+    {
+        REPORT("Failed to get PAGE_SIZE");
+        return;
+    }
+
+    // p_mmap_fd = open("/dev/mem", (O_RDWR | O_SYNC));
+    // Now we use mpm_transport_{open, mmap, munmap, close}
+    /*-------------------------------------------------------------------------
+    * core1-core7's l2 go through /dev/dsp{1-7}
+    * everything else (core0's l2, msmc, global addr) go through /dev/dsp0
+    *------------------------------------------------------------------------*/
+    char devname[16] = "dsp0";
+    if (0x11800000 <= dsp_addr && dsp_addr < 0x17900000)
+        devname[3] = ((dsp_addr >> 24) - 0x10) + '0';
+
+    mpm_transport_open_t mpm_transport_open_cfg;
+    mpm_transport_open_cfg.open_mode = (O_SYNC|O_RDWR);
+    p_mpm_transport_handle = mpm_transport_open(devname,
+                                                &mpm_transport_open_cfg);
+
+    /*-------------------------------------------------------------------------
+    * If the open failed
+    *------------------------------------------------------------------------*/
+    if (p_mpm_transport_handle == NULL)
+    {
+        printf("Failed to open /dev/%s\n", devname);
+        return;
+    }
+
+    p_dsp_addr = dsp_addr;
+    p_size     = size;
+}
+
+
+/******************************************************************************
+* shmem_persistent::shmem_persistent
+******************************************************************************/
+/* True when x is a multiple of y; only valid when y is a power of two,
+ * because the test masks x with (y - 1). */
+#define MULTIPLE_OF_POW2(x, y) (((x) & ((y) - 1)) == 0)
+
+/* Starts with no host mapping and a zero DSP-to-host translation offset. */
+shmem_persistent::shmem_persistent()
+    : p_host_addr               (0),
+      p_xlate_dsp_to_host_offset(0)
+{
+}
+
+/******************************************************************************
+* shmem_persistent::configure
+*   Opens the transport through configure_base, then maps the whole
+*   [dsp_addr, dsp_addr + size) region once and remembers the offset that
+*   converts a DSP address into a host pointer.  Address and size must both
+*   be multiples of the page size.  On any failure a message is printed and
+*   p_host_addr stays 0.
+******************************************************************************/
+void shmem_persistent::configure(DSPDevicePtr64 dsp_addr, uint64_t size)
+{
+    configure_base(dsp_addr, size);
+
+    /*-------------------------------------------------------------------------
+    * Nothing to map when the base class did not obtain a transport handle
+    *------------------------------------------------------------------------*/
+    if (p_mpm_transport_handle == NULL) return;
+
+    const bool addr_aligned = MULTIPLE_OF_POW2(dsp_addr, p_page_size);
+    if (!addr_aligned)
+    {
+        REPORT("Mapped region addr is not a multiple of page size");
+        return;
+    }
+
+    const bool size_aligned = MULTIPLE_OF_POW2(size, p_page_size);
+    if (!size_aligned)
+    {
+        REPORT("Mapped region size is not a multiple of page size");
+        return;
+    }
+
+    mpm_transport_mmap_t map_cfg;
+    map_cfg.mmap_prot  = (PROT_READ|PROT_WRITE);
+    map_cfg.mmap_flags = MAP_SHARED;
+
+    void *mapped = (void *)mpm_transport_mmap(p_mpm_transport_handle,
+                                              dsp_addr, size, &map_cfg);
+
+    /*-------------------------------------------------------------------------
+    * (void *) -1 is the failure value, as with mmap's MAP_FAILED
+    *------------------------------------------------------------------------*/
+    if (mapped == (void *) -1)
+    {
+        REPORT("Failed to mmap");
+        p_host_addr = 0;
+        return;
+    }
+
+    p_host_addr                = mapped;
+    p_xlate_dsp_to_host_offset = (void*)((int64_t)p_host_addr - dsp_addr);
+}
+
+/******************************************************************************
+* shmem_persistent::~shmem_persistent
+*   Removes the mapping created by configure, if there is one.
+******************************************************************************/
+shmem_persistent::~shmem_persistent()
+{
+    if (!p_host_addr) return;
+
+    mpm_transport_munmap(p_mpm_transport_handle, p_host_addr, p_size);
+}
+
+/******************************************************************************
+* shmem_persistent::map
+*   Translates a DSP address inside the configured region into a host
+*   pointer using the offset computed by configure.  Returns 0 when the
+*   region is not mapped or when [dsp_addr, dsp_addr + size) is not fully
+*   inside it (the latter also prints a message).  is_read is not used.
+******************************************************************************/
+void *shmem_persistent::map(DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read)
+{
+    if (!p_host_addr) return 0;
+
+    const bool starts_below = dsp_addr < p_dsp_addr;
+    const bool ends_above   = dsp_addr + size > p_dsp_addr + p_size;
+
+    if (starts_below || ends_above)
+    {
+        REPORT("Attempting to map a region outside a defined area");
+        return 0;
+    }
+
+    return dsp_addr + (char*)p_xlate_dsp_to_host_offset;
+}
+
+/******************************************************************************
+* shmem_persistent::unmap
+*   Intentionally does nothing: the mapping made by configure stays in place
+*   until the destructor removes it.  The msync call below is left disabled.
+******************************************************************************/
+void shmem_persistent::unmap(void* host_addr, uint32_t size, bool is_write)
+{
+    // if (host_addr) msync(host_addr, size, MS_SYNC);
+}
+
+
+
+/******************************************************************************
+* shmem_ondemand::shmem_ondemand
+*   Nothing to set up beyond the base class constructor.
+******************************************************************************/
+shmem_ondemand::shmem_ondemand()
+{ }
+
+/******************************************************************************
+* shmem_ondemand::~shmem_ondemand
+*   Empty: this class holds no state of its own.
+******************************************************************************/
+shmem_ondemand::~shmem_ondemand()
+{
+}
+
+/******************************************************************************
+* shmem_ondemand::configure
+*   Only records the region and opens the transport via configure_base;
+*   no mapping is created here (map() creates one per request).
+******************************************************************************/
+void shmem_ondemand::configure(DSPDevicePtr64 dsp_addr, uint64_t size)
+{
+    configure_base(dsp_addr, size);
+}
+
+
+/******************************************************************************
+* shmem_ondemand::map
+*   Creates a fresh host mapping for [dsp_addr, dsp_addr + size).
+*   dsp_addr and size must both be multiples of the page size and the range
+*   must lie inside the configured region.  Returns the host address, or 0
+*   after printing a message when a check or the mmap itself fails.
+*   Returns 0 silently when the transport was never opened (configure_base
+*   already reported that failure).  is_read is not used.
+******************************************************************************/
+void *shmem_ondemand::map(DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read)
+{
+    if (p_mpm_transport_handle == NULL) return 0;
+
+    if (!MULTIPLE_OF_POW2(dsp_addr, p_page_size))
+    {
+        REPORT("Mapped region addr is not a multiple of page size");
+        return 0;
+    }
+
+    if (!MULTIPLE_OF_POW2(size, p_page_size))
+    {
+        REPORT("Mapped region size is not a multiple of page size");
+        return 0;
+    }
+
+    if (dsp_addr < p_dsp_addr || dsp_addr + size > p_dsp_addr + p_size)
+    {
+        REPORT("Attempting to map a region outside a defined area");
+        return 0;
+    }
+
+    mpm_transport_mmap_t mpm_transport_mmap_cfg;
+    mpm_transport_mmap_cfg.mmap_prot  = (PROT_READ|PROT_WRITE);
+    mpm_transport_mmap_cfg.mmap_flags = MAP_SHARED;
+
+    void * host_addr = mpm_transport_mmap(p_mpm_transport_handle,
+                                          dsp_addr, size,
+                                          &mpm_transport_mmap_cfg);
+
+    /*-------------------------------------------------------------------------
+    * (void *) -1 is the failure value, as with mmap's MAP_FAILED
+    *------------------------------------------------------------------------*/
+    if (host_addr == (void *) -1)
+    {
+        REPORT("Failed to mmap");
+        return 0;
+    }
+
+    return host_addr;
+}
+
+/******************************************************************************
+* shmem_ondemand::unmap
+*   Releases a mapping previously returned by map().  Without this every
+*   map() call would leave its mapping behind for the life of the process.
+*   A null host_addr is ignored.  is_write is not used.
+******************************************************************************/
+void shmem_ondemand::unmap(void* host_addr, uint32_t size, bool is_write)
+{
+    if (host_addr)
+        mpm_transport_munmap(p_mpm_transport_handle, host_addr, size);
+}
+
+/******************************************************************************
+* shmem_cmem_persistent::shmem_cmem_persistent
+*   Remembers the CMEM block index; no mapping exists until configure runs.
+******************************************************************************/
+shmem_cmem_persistent::shmem_cmem_persistent(int cmem_block)
+    : p_host_addr               (0),
+      p_xlate_dsp_to_host_offset(0),
+      p_cmem_block              (cmem_block)
+{
+}
+
+/******************************************************************************
+* shmem_cmem_persistent::cmem_init
+*   Initializes CMEM, verifies that at least two blocks exist and claims
+*   them for OpenCL.
+*     addr1/size1 (out): physical base and size of CMEM block 0
+*     addr2/size2 (out): physical base and size of CMEM block 1
+*     addr3/size3 (out): physical base and size of CMEM block 2, or 0/0 when
+*                        fewer than three blocks are present
+*   Every failure prints a message and terminates the process.
+* TODO: remove addr3, size3 once uboot is updated, so that we don't
+*       have fragmented CMEM blocks for DDR
+******************************************************************************/
+void shmem_cmem_persistent::cmem_init(DSPDevicePtr64 *addr1, uint64_t *size1,
+                                      DSPDevicePtr   *addr2, uint32_t *size2,
+                                      DSPDevicePtr64 *addr3, uint64_t *size3)
+{
+    /*-------------------------------------------------------------------------
+    * Assume this is the only use of CMEM, so we reset everything
+    *------------------------------------------------------------------------*/
+#if 0
+    const char *cmem_command = "modprobe -r cmemk; modprobe cmemk "
+        "phys_start=0xa2000000 phys_end=0x100000000 pools=1x1577058304 "
+        "phys_start_1=0x0c000000 phys_end_1=0x0c500000 pools_1=1x5242880 "
+        "allowOverlap=1";
+
+    int result = system(cmem_command);
+#endif
+
+    /*-------------------------------------------------------------------------
+    * Example modprobe lines shown to the user when CMEM is missing or
+    * misconfigured; they are only printed, never executed.
+    *------------------------------------------------------------------------*/
+    const char *cmem_command = "For available CMEM DDR block size: ~1.5GB:\n"
+        "modprobe cmemk "
+        "phys_start=0x823000000 phys_end=0x880000000 pools=1x1560281088 "
+        "phys_start_1=0x0C040000 phys_end_1=0x0C500000 "
+        "allowOverlap=1";
+    const char *cmem_command2 = "For available CMEM DDR block size: ~3.5GB:\n"
+        "modprobe cmemk "
+        "phys_start=0x823000000 phys_end=0x900000000 pools=1x3707764736 "
+        "phys_start_1=0x0C040000 phys_end_1=0x0C500000 "
+        "allowOverlap=1";
+    const char *cmem_command3 = "For available CMEM DDR block size: ~7.5GB:\n"
+        "modprobe cmemk "
+        "phys_start=0x823000000 phys_end=0xA00000000 pools=1x8002732032 "
+        "phys_start_1=0x0C040000 phys_end_1=0x0C500000 "
+        "allowOverlap=1";
+
+    /*-------------------------------------------------------------------------
+    * First initialize the CMEM module
+    *------------------------------------------------------------------------*/
+    if (CMEM_init() == -1)
+    {
+        printf("\nThe cmemk kernel module does not appear to be installed.\n\n"
+               "Commands such as the following run as root would "
+               "install cmemk\n"
+               "and allow OpenCL to proceed properly. The actual memory "
+               "address values for\n"
+               "your system may differ.\n\n");
+        printf("%s\n\n", cmem_command);
+        printf("%s\n\n", cmem_command2);
+        printf("%s\n\n", cmem_command3);
+        exit(-1);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Check that the cmem init produced the blocks we need
+    *------------------------------------------------------------------------*/
+    int num_Blocks = 0;
+    CMEM_getNumBlocks(&num_Blocks);
+    if (num_Blocks < 2)
+    {
+        printf("\nOpenCL needs at least two CMEM blocks to operate properly.\n"
+               "One for DDR, the other for MSMC. Example commands:\n");
+        printf("%s\n\n", cmem_command);
+        printf("%s\n\n", cmem_command2);
+        printf("%s\n\n", cmem_command3);
+        exit(-1);
+    }
+
+    CMEM_BlockAttrs pattrs0 = {0, 0};
+    CMEM_BlockAttrs pattrs1 = {0, 0};
+    CMEM_BlockAttrs pattrs2 = {0, 0};
+
+    CMEM_getBlockAttrs(0, &pattrs0);
+    CMEM_getBlockAttrs(1, &pattrs1);
+    if (num_Blocks > 2)
+        CMEM_getBlockAttrs(2, &pattrs2);
+
+    /*-------------------------------------------------------------------------
+    * Return 36-bit addr, and up to 7.5G memory size
+    *------------------------------------------------------------------------*/
+    *addr1 = (DSPDevicePtr64) pattrs0.phys_base;
+    *size1 = (uint64_t) pattrs0.size;
+    // Persistent CMEM should start within 0x8:2200_0000 - 0x8:4000_0000
+    if (*addr1 >= MPAX_USER_MAPPED_DSP_ADDR)
+    {
+        printf("Unable to allocate OCL persistent CMem from 0x%llx\n",
+               (unsigned long long) pattrs0.phys_base);
+        exit(EXIT_FAILURE);
+    }
+
+    *addr2 = pattrs1.phys_base;
+    *size2 = pattrs1.size;
+    if (*addr2 < MSMC_OCL_START_ADDR || *addr2 >= MSMC_OCL_END_ADDR)
+    {
+        printf("Unable to allocate OCL MSMC memory from 0x%llx\n",
+               (unsigned long long) pattrs1.phys_base);
+        exit(EXIT_FAILURE);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Grab all available CMEM physical address, to be managed by OCL.
+    * Each allocation must come back at the block's base address; anything
+    * else means the block is not entirely ours.
+    *------------------------------------------------------------------------*/
+    DSPDevicePtr64 alloc_dsp_addr = 0;
+    CMEM_AllocParams params = CMEM_DEFAULTPARAMS;
+    params.flags = CMEM_CACHED;
+    params.type  = CMEM_POOL;
+    alloc_dsp_addr = CMEM_allocPoolPhys2(0, 0, &params);
+    if (!alloc_dsp_addr || alloc_dsp_addr != *addr1)
+    {
+        printf("Failed to allocate 0x%llx from CMem 0, allocated=0x%llx\n",
+               (unsigned long long) *size1,
+               (unsigned long long) alloc_dsp_addr);
+        exit(EXIT_FAILURE);
+    }
+
+    params.type = CMEM_HEAP;
+    alloc_dsp_addr = CMEM_allocPhys2(1, *size2, &params);
+    if (!alloc_dsp_addr || alloc_dsp_addr != *addr2)
+    {
+        printf("Failed to allocate 0x%x from CMem 1, allocated=0x%llx\n",
+               (unsigned) *size2,
+               (unsigned long long) alloc_dsp_addr);
+        exit(EXIT_FAILURE);
+    }
+
+    if (num_Blocks > 2)
+    {
+        *addr3 = pattrs2.phys_base;
+        *size3 = pattrs2.size;
+        params.type = CMEM_POOL;
+        alloc_dsp_addr = CMEM_allocPoolPhys2(2, 0, &params);
+        if (!alloc_dsp_addr || alloc_dsp_addr != *addr3)
+        {
+            printf("Failed to allocate 0x%llx from CMem 2, allocated=0x%llx\n",
+                   (unsigned long long) *size3,
+                   (unsigned long long) alloc_dsp_addr);
+            exit(EXIT_FAILURE);
+        }
+    }
+    else
+    {
+        *addr3 = 0;
+        *size3 = 0;
+    }
+}
+
+/******************************************************************************
+* shmem_cmem_persistent::cmem_exit
+*   Finalizes the CMEM module; a -1 result from CMEM_exit is fatal.
+******************************************************************************/
+void shmem_cmem_persistent::cmem_exit()
+{
+    ERR(CMEM_exit() == -1, "Failed to finalize CMEM");
+}
+
+/******************************************************************************
+* shmem_cmem_persistent::configure
+*   Records the region, maps it into the host address space with CMEM_map
+*   and stores the DSP-to-host translation offset.  DSP addresses in
+*   [0xA0000000, 0xFFFFFFFF) are rebased onto 0x820000000 before mapping.
+*   A failed CMEM_map is fatal.
+******************************************************************************/
+void shmem_cmem_persistent::configure(DSPDevicePtr64 dsp_addr, uint64_t size)
+{
+    p_size     = size;
+    p_dsp_addr = dsp_addr;
+
+    const bool in_alias_window = (p_dsp_addr >= 0xA0000000 &&
+                                  p_dsp_addr <  0xFFFFFFFF);
+    DSPDevicePtr64 cmem_addr = p_dsp_addr;
+    if (in_alias_window)
+        cmem_addr = p_dsp_addr - 0xA0000000 + 0x820000000ULL;
+
+    p_host_addr = CMEM_map(cmem_addr, size);
+    ERR(!p_host_addr,
+        "Cannot map CMEM physical memory into the Host virtual address space.\n"
+        " This is typically due to Linux system memory being near capacity.");
+
+    p_xlate_dsp_to_host_offset = (int64_t)p_host_addr - dsp_addr;
+}
+
+/******************************************************************************
+* shmem_cmem_persistent::~shmem_cmem_persistent
+*   Unmaps the host mapping (if any) and frees the CMEM physical region.
+*   Does nothing for an object that was never configured (p_dsp_addr == 0).
+*   The address rebasing must match configure exactly, so the window test
+*   uses the same inclusive lower bound: otherwise a region starting at
+*   exactly 0xA0000000 would be mapped at the rebased address but freed at
+*   the original one.
+******************************************************************************/
+shmem_cmem_persistent::~shmem_cmem_persistent()
+{
+    if (p_dsp_addr == 0) return;
+
+    if (p_host_addr != NULL) CMEM_unmap(p_host_addr, p_size);
+
+    CMEM_AllocParams params = CMEM_DEFAULTPARAMS;
+    params.flags = CMEM_CACHED;
+
+    DSPDevicePtr64 cmem_addr = p_dsp_addr;
+    if (p_dsp_addr >= 0xA0000000 && p_dsp_addr < 0xFFFFFFFF)
+        cmem_addr = p_dsp_addr - 0xA0000000 + 0x820000000ULL;
+
+    CMEM_freePhys(cmem_addr, &params);
+}
+
+/******************************************************************************
+* shmem_cmem_persistent::map: dsp_addr (phys) -> host_addr (virt)
+*   Translates an address inside the configured region with the stored
+*   offset.  When is_read is set the host cache is invalidated for the range
+*   before the pointer is handed out.  A request outside the region (or on
+*   an unmapped object) is reported through ERR, which exits the process.
+******************************************************************************/
+void *shmem_cmem_persistent::map(DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read)
+{
+    const bool inside = p_host_addr &&
+                        dsp_addr >= p_dsp_addr &&
+                        dsp_addr + size <= p_dsp_addr + p_size;
+    if (!inside)
+    {
+        ERR(1, "Attempting to cmem_map a region outside a defined area");
+        return NULL;
+    }
+
+    void *translated = dsp_addr + (char*)p_xlate_dsp_to_host_offset;
+
+    if (is_read)
+        CMEM_cacheInv(translated, size);
+
+    return translated;
+}
+
+/******************************************************************************
+* shmem_cmem_persistent::unmap: flush host side writes
+*   Writes back the host cache for the range when the caller wrote to it.
+*   The mapping itself is kept; it is removed by the destructor.
+******************************************************************************/
+void shmem_cmem_persistent::unmap(void* host_addr, uint32_t size, bool is_write)
+{
+    if (!host_addr || !is_write) return;
+
+    CMEM_cacheWb(host_addr, size);
+}
+
+
+/******************************************************************************
+* shmem_cmem_ondemand::configure
+*   Only records the region; mappings are created per request by map().
+******************************************************************************/
+void shmem_cmem_ondemand::configure(DSPDevicePtr64 dsp_addr, uint64_t size)
+{
+    p_size     = size;
+    p_dsp_addr = dsp_addr;
+}
+
+/******************************************************************************
+* shmem_cmem_ondemand::map: dsp_addr (phys) -> host_addr (virt)
+*   Creates a new CMEM mapping for the range; a failed CMEM_map is fatal.
+*   When is_read is set the host cache is invalidated for the new mapping.
+******************************************************************************/
+void *shmem_cmem_ondemand::map(DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read)
+{
+    void *mapped = CMEM_map(dsp_addr, size);
+    ERR(!mapped, "Failed to map CMEM address (ondemand)");
+
+    if (is_read)
+        CMEM_cacheInv(mapped, size);
+
+    return mapped;
+}
+
+/******************************************************************************
+* shmem_cmem_ondemand::unmap: flush host side writes, then drop the mapping
+*   A null host_addr is ignored.  The cache write-back happens only when
+*   is_write is set; the CMEM_unmap happens either way.
+******************************************************************************/
+void shmem_cmem_ondemand::unmap(void* host_addr, uint32_t size, bool is_write)
+{
+    if (!host_addr) return;
+
+    if (is_write)
+        CMEM_cacheWb(host_addr, size);
+
+    CMEM_unmap(host_addr, size);
+}
+
+/******************************************************************************
+* shmem_cmem_ondemand::cmem_malloc: allocate CMEM physical address
+*   Allocates size bytes, cached, from the heap of CMEM block 0 and returns
+*   the physical address.  A failed allocation is fatal.
+*   64-bit size: could be allocating a buffer, then accessing smaller
+*   subbuffers
+******************************************************************************/
+DSPDevicePtr64 shmem_cmem_ondemand::cmem_malloc(uint64_t size)
+{
+    CMEM_AllocParams params = CMEM_DEFAULTPARAMS;
+    params.flags = CMEM_CACHED;
+    params.type  = CMEM_HEAP;
+
+    DSPDevicePtr64 addr = CMEM_allocPhys2(0, size, &params);
+    if (!addr)
+    {
+        /* cast so the argument always matches %llx, whatever type
+         * uint64_t is built from on this platform */
+        printf("Failed to allocate space 0x%llx from CMem\n",
+               (unsigned long long) size);
+        exit(EXIT_FAILURE);
+    }
+    return addr;
+}
+
+/******************************************************************************
+* shmem_cmem_ondemand::cmem_free: free allocated CMEM physical address
+*   Uses the same cached/heap parameters that cmem_malloc allocates with.
+******************************************************************************/
+void shmem_cmem_ondemand::cmem_free(DSPDevicePtr64 addr)
+{
+    CMEM_AllocParams free_params = CMEM_DEFAULTPARAMS;
+    free_params.type  = CMEM_HEAP;
+    free_params.flags = CMEM_CACHED;
+
+    CMEM_freePhys(addr, &free_params);
+}
+
diff --git a/src/core/dsp/shmem.h b/src/core/dsp/shmem.h
new file mode 100644
index 0000000..03504a0
--- /dev/null
+++ b/src/core/dsp/shmem.h
@@ -0,0 +1,134 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdint.h>
+#ifndef _SHMEM_H
+#define _SHMEM_H
+
+extern "C"
+{
+ #include <mpm_transport.h>
+}
+#include "dspmem.h"
+
+/*=============================================================================
+* Abstract class for Shared memory
+*
+* Describes one region of DSP address space, [start(), start() + size()),
+* and the interface for turning DSP addresses inside it into host pointers:
+*   configure  - define the region (implemented by each subclass)
+*   map        - DSP address range -> host pointer, 0/NULL on failure
+*   unmap      - release or flush a pointer obtained from map
+* configure_base opens the mpm transport device for the region; subclasses
+* that use it call it from their configure.
+*============================================================================*/
+class shmem
+{
+    public:
+        shmem          ();
+        virtual ~shmem ();
+        virtual void  configure_base(DSPDevicePtr64 dsp_addr, uint64_t size);
+        virtual void  configure     (DSPDevicePtr64 dsp_addr, uint64_t size) = 0;
+        virtual void *map           (DSPDevicePtr64 dsp_addr, uint32_t size,
+                                     bool is_read=false) = 0;
+        virtual void  unmap         (void* host_addr, uint32_t size,
+                                     bool is_write=false) = 0;
+        // Defined inline: the accessor was declared but had no definition
+        // in shmem.cpp, so any caller would have failed to link.
+        uint32_t       page_size    () { return p_page_size; }
+        DSPDevicePtr64 start        () { return p_dsp_addr; }
+        int64_t        size         () { return p_size; }
+
+    protected:
+        DSPDevicePtr64  p_dsp_addr;              // first DSP address of the region
+        int64_t         p_size;                  // region size in bytes
+        uint32_t        p_page_size;             // from sysconf(_SC_PAGE_SIZE)
+        int32_t         p_mmap_fd;               // legacy descriptor, -1 when unused
+        mpm_transport_h p_mpm_transport_handle;  // NULL until configure_base succeeds
+
+};
+
+/*=============================================================================
+* Persistent implementation of shmem
+*
+* configure maps the whole region once through the mpm transport; map then
+* only adds a stored offset to the DSP address, and unmap does nothing.
+* The mapping is removed in the destructor.
+*============================================================================*/
+class shmem_persistent : public shmem
+{
+    public:
+        shmem_persistent ();
+        ~shmem_persistent ();
+        void configure(DSPDevicePtr64 dsp_addr, uint64_t size);
+        virtual void *map (DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read=false);
+        virtual void unmap (void* host_addr, uint32_t size, bool is_write=false);
+
+    private:
+        void * p_host_addr;                  // host address of the mapping, 0 if none
+        void * p_xlate_dsp_to_host_offset;   // host address minus DSP address
+};
+
+/*=============================================================================
+* On Demand implementation of shmem
+*
+* configure only opens the transport and records the region; each map call
+* creates its own page-aligned mapping through the mpm transport.
+*============================================================================*/
+class shmem_ondemand : public shmem
+{
+    public:
+        shmem_ondemand ();
+        ~shmem_ondemand ();
+        void configure(DSPDevicePtr64 dsp_addr, uint64_t size);
+        virtual void *map (DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read=false);
+        virtual void unmap (void* host_addr, uint32_t size, bool is_write=false);
+};
+
+/*=============================================================================
+* Persistent implementation of shmem using CMem
+*
+* configure maps the whole region once with CMEM_map; map translates with a
+* stored offset and invalidates the host cache for reads; unmap writes the
+* host cache back for writes.  cmem_init/cmem_exit are process-wide CMEM
+* setup and teardown and do not act on a particular object.
+*============================================================================*/
+class shmem_cmem_persistent : public shmem
+{
+    public:
+        shmem_cmem_persistent (int cmem_block);
+        ~shmem_cmem_persistent ();
+        void configure(DSPDevicePtr64 dsp_addr, uint64_t size);
+        virtual void *map (DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read=false);
+        virtual void unmap (void* host_addr, uint32_t size, bool is_write=false);
+
+        // Outputs: base/size of CMEM blocks 0, 1 and (if present, else 0) 2.
+        static void cmem_init(DSPDevicePtr64* addr1, uint64_t* size1,
+                              DSPDevicePtr* addr2, uint32_t* size2,
+                              DSPDevicePtr64* addr3, uint64_t* size3);
+        static void cmem_exit();
+
+    private:
+        void * p_host_addr;                  // host address of the mapping, 0 if none
+        int64_t p_xlate_dsp_to_host_offset;  // host address minus DSP address
+        int p_cmem_block;                    // block index given to the constructor
+};
+
+/*=============================================================================
+* Ondemand implementation of shmem using CMem
+*
+* configure only records the region; map creates a CMEM mapping per request
+* and unmap writes back (for writes) and removes it.  cmem_malloc/cmem_free
+* allocate and release CMEM physical memory independently of any object.
+*============================================================================*/
+class shmem_cmem_ondemand : public shmem
+{
+    public:
+        shmem_cmem_ondemand () {}
+        ~shmem_cmem_ondemand () {}
+        void configure(DSPDevicePtr64 dsp_addr, uint64_t size);
+        virtual void *map (DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read=false);
+        virtual void unmap (void* host_addr, uint32_t size, bool is_write=false);
+
+        static DSPDevicePtr64 cmem_malloc(uint64_t size);
+        static void cmem_free (DSPDevicePtr64 addr);
+};
+
+#endif // _SHMEM_H
diff --git a/src/core/dsp/source_cache.h b/src/core/dsp/source_cache.h
new file mode 100644
index 0000000..66b4400
--- /dev/null
+++ b/src/core/dsp/source_cache.h
@@ -0,0 +1,114 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef _source_cache_
+#define _source_cache_
+
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+
+#include <boost/lexical_cast.hpp>
+#include <boost/crc.hpp>
+
+#include <sys/stat.h>
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <stdint.h>
+#include "u_locks_pthread.h"
+#include "database.h"
+
+/*=============================================================================
+* source_cache: process-wide singleton that records program sources in a
+* database table "programs(hash, source)", keyed by the CRC-32 of the text.
+* The database file is /tmp/opencl_source_<USER>.
+*============================================================================*/
+class source_cache
+{
+    public:
+        /*---------------------------------------------------------------------
+        * Insert one program source together with its CRC-32 hash.
+        * The source is arbitrary user text, so it is embedded as a standard
+        * single-quoted SQL string literal with every embedded single quote
+        * doubled; otherwise any quote in the program would terminate the
+        * literal and corrupt (or alter) the statement.
+        *--------------------------------------------------------------------*/
+        void remember(std::string source)
+        {
+            uint32_t hash = get_crc(source);
+
+            std::string escaped;
+            escaped.reserve(source.length());
+            for (std::string::size_type i = 0; i < source.length(); ++i)
+            {
+                if (source[i] == '\'') escaped += '\'';
+                escaped += source[i];
+            }
+
+            std::string query("insert into programs(hash, source) values("
+                              + boost::lexical_cast<std::string>(hash)
+                              + ", '"
+                              + escaped
+                              + "');");
+
+            p_database.query(query.c_str());
+        }
+
+        /*---------------------------------------------------------------------
+        * Thread safe instance function for singleton behavior.
+        * The instance pointer is read once without the lock; only when it is
+        * still null is the mutex taken and the pointer re-checked before the
+        * object is created.  The barriers order the construction before the
+        * pointer becomes visible.
+        *--------------------------------------------------------------------*/
+        static source_cache* instance ()
+        {
+            static Mutex Cache_instance_mutex;
+            source_cache* tmp = pInstance;
+
+            __sync_synchronize();
+
+            if (tmp == 0)
+            {
+                ScopedLock lck(Cache_instance_mutex);
+
+                tmp = pInstance;
+                if (tmp == 0)
+                {
+                    /* USER may be unset (e.g. under some daemons); building
+                     * a std::string from a null pointer is undefined. */
+                    const char *user = getenv("USER");
+                    std::string owner(user ? user : "unknown");
+
+                    tmp = new source_cache("/tmp/opencl_source_" + owner);
+                    __sync_synchronize();
+                    pInstance = tmp;
+                }
+            }
+            return tmp;
+        }
+
+
+    private:
+        static source_cache* pInstance;   // the singleton, null until first use
+        std::string p_dbname;             // database file name
+        Database    p_database;           // open database connection
+
+    private:
+        /* Opens the database and makes sure the programs table exists. */
+        source_cache(std::string db_name) : p_dbname(db_name), p_database(db_name.c_str())
+        {
+            p_database.query("create table if not exists "
+                             "programs(hash integer, source string);");
+        }
+
+        /* CRC-32 of the string's bytes. */
+        uint32_t get_crc(std::string& my_string)
+        {
+            boost::crc_32_type result;
+            result.process_bytes(my_string.data(), my_string.length());
+            return result.checksum();
+        }
+
+        source_cache(const source_cache&);            // copy ctor disallowed
+        source_cache& operator=(const source_cache&); // assignment disallowed
+};
+
+#endif // _source_cache_
+
+
diff --git a/src/core/dsp/u_concurrent_map.h b/src/core/dsp/u_concurrent_map.h
new file mode 100644
index 0000000..014c0b6
--- /dev/null
+++ b/src/core/dsp/u_concurrent_map.h
@@ -0,0 +1,137 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+/**************************************************************************//**
+*
+* @file u_concurrent_map.h
+* @brief TI implementation class that implements a thread safe map.
+*
+******************************************************************************/
+#ifndef _U_CONCURRENT_MAP_H_
+#define _U_CONCURRENT_MAP_H_
+
+#include <iostream>
+#include <map>
+#include "u_lockable.h"
+
+/**************************************************************************//**
+* @class concurrent_map
+*
+* @brief A thread safe map implementation
+*
+* @details This implementation wraps a standard stl map with some locking
+* capability to make the member functions mutually exclusive
+* regions. It derives from the class Lockable, which defines a type
+* Lock that can be instantiated in a scope. The result is that the
+* remainder of the scope (or until unlock is called) is a critical
+* section protected by the object's mutex.
+*
+******************************************************************************/
+template<typename I, typename T>
+class concurrent_map : public Lockable
+{
+public:
+    concurrent_map() : M() {}
+    ~concurrent_map() {}
+
+    /**********************************************************************//**
+    * @brief Place an object in the map.
+    * @param index is the key under which the item is stored
+    * @param data  is the item to store; it replaces any item already stored
+    *              under the same key, in which case the size is unchanged
+    ***************************************************************************/
+    void push(I index, T const data)
+    {
+        Lock lock(this);
+        M[index] = data;
+    }
+
+    /**********************************************************************//**
+    * @brief How many elements are in the map.
+    * @returns The number of elements in the map.
+    ***************************************************************************/
+    int size() const
+    {
+        Lock lock(this);
+        // The count is taken from the map itself so that it can never
+        // disagree with the stored contents.
+        return static_cast<int>(M.size());
+    }
+
+    /**********************************************************************//**
+    * @brief Determine if the map is empty.
+    * @returns true if the map is empty, otherwise false.
+    ***************************************************************************/
+    bool empty() const
+    {
+        Lock lock(this);
+        return M.empty();
+    }
+
+    /**********************************************************************//**
+    * @brief Attempt to remove the item stored under a key.
+    * @param idx is the key to look up
+    * @param popped_value is an output parameter that receives the item
+    *        removed; it is left untouched when the key is not present.
+    * @returns true if a value is popped, otherwise false
+    ***************************************************************************/
+    bool try_pop(I idx, T& popped_value)
+    {
+        Lock lock(this);
+
+        typename std::map<I,T>::iterator it = M.find(idx);
+        if (it == M.end()) return false;
+
+        popped_value = it->second;
+        M.erase(it);
+        return true;
+    }
+
+    /**********************************************************************//**
+    * @brief Print every "key ==> value" pair to std::cout, one per line.
+    ***************************************************************************/
+    void dump()
+    {
+        // Hold the lock so the map cannot be modified while it is walked.
+        Lock lock(this);
+        for (typename std::map<I,T>::const_iterator i = M.begin(); i != M.end(); ++i)
+            std::cout << i->first << " ==> " << i->second << std::endl;
+    }
+
+    /*-------------------------------------------------------------------------
+    * The class's data
+    *------------------------------------------------------------------------*/
+private:
+    std::map<I,T> M; //!< standard stl map
+
+    /*-------------------------------------------------------------------------
+    * Prevent copy construction and assignment
+    *------------------------------------------------------------------------*/
+private:
+    concurrent_map(const concurrent_map&);
+    concurrent_map& operator=(const concurrent_map&);
+};
+
+#endif //_U_CONCURRENT_MAP_H_
diff --git a/src/core/dsp/u_concurrent_stack.h b/src/core/dsp/u_concurrent_stack.h
new file mode 100644
index 0000000..6e9755b
--- /dev/null
+++ b/src/core/dsp/u_concurrent_stack.h
@@ -0,0 +1,124 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+/**************************************************************************//**
+*
+* @file u_concurrent_stack.h
+* @brief TI implementation class that implements a thread safe stack.
+*
+******************************************************************************/
+#ifndef _U_CONCURRENT_STACK_H_
+#define _U_CONCURRENT_STACK_H_
+
+#include <iostream>
+#include <stack>
+#include "u_lockable.h"
+
+/**************************************************************************//**
+* @class concurrent_stack
+*
+* @brief A thread safe stack implementation
+*
+* @details This implementation wraps a standard stl stack with some locking
+* capability to make the member functions mutually exclusive
+* regions. It derives from the class Lockable, which defines a type
+* Lock that can be instantiated in a scope. The result is that the
+* remainder of the scope (or until unlock is called) is a critical
+* section protected by the object's mutex.
+*
+******************************************************************************/
+template<typename T>
+class concurrent_stack : public Lockable
+{
+public:
+    concurrent_stack() : S(), num_elements(0) {}
+    ~concurrent_stack() {}
+
+    /**********************************************************************//**
+    * @brief Push an object onto the stack.
+    * @param data is the item to push; a copy of it is stored
+    ***************************************************************************/
+    void push(T const data)
+    {
+        Lock guard(this);
+        S.push(data);
+        ++num_elements;
+    }
+
+    /**********************************************************************//**
+    * @brief Report the current element count.
+    * @returns The number of elements in the stack.
+    ***************************************************************************/
+    int size() const
+    {
+        Lock guard(this);
+        const int count = num_elements;
+        return count;
+    }
+
+    /**********************************************************************//**
+    * @brief Report whether the stack holds no elements.
+    * @returns true if the stack is empty, otherwise false.
+    ***************************************************************************/
+    bool empty() const
+    {
+        Lock guard(this);
+        return !(num_elements != 0);
+    }
+
+    /**********************************************************************//**
+    * @brief Attempt to remove the top item of the stack.
+    * @param popped_value is an output parameter that receives the removed
+    *        item; it is left untouched when the stack is empty.
+    * @returns true if a value is popped, otherwise false
+    ***************************************************************************/
+    bool pop(T& popped_value)
+    {
+        Lock guard(this);
+
+        if (num_elements != 0)
+        {
+            popped_value = S.top();
+            S.pop();
+            --num_elements;
+            return true;
+        }
+
+        return false;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Storage
+    *------------------------------------------------------------------------*/
+private:
+    std::stack<T> S;      //!< underlying stl stack
+    int num_elements;     //!< count of items currently held in S
+
+    /*-------------------------------------------------------------------------
+    * Copying is disabled: declared private and never defined
+    *------------------------------------------------------------------------*/
+private:
+    concurrent_stack(const concurrent_stack&);
+    concurrent_stack& operator=(const concurrent_stack&);
+};
+
+#endif //_U_CONCURRENT_STACK_H_
diff --git a/src/core/dsp/u_lockable.h b/src/core/dsp/u_lockable.h
new file mode 100644
index 0000000..803197f
--- /dev/null
+++ b/src/core/dsp/u_lockable.h
@@ -0,0 +1,109 @@
+/******************************************************************************
+* The Loki Library
+* Copyright (c) 2001 by Andrei Alexandrescu
+* Copyright (c) 2010-2014, Texas Instruments Incorporated
+*
+* This code accompanies the book:
+* Alexandrescu, Andrei. "Modern C++ Design: Generic Programming and Design
+* Patterns Applied". Copyright (c) 2001. Addison-Wesley.
+* Permission to use, copy, modify, distribute and sell this software for any
+* purpose is hereby granted without fee, provided that the above copyright
+* notice appear in all copies and that both that copyright notice and this
+* permission notice appear in supporting documentation.
+* The author or Addison-Wesley Longman make no representations about the
+* suitability of this software for any purpose. It is provided "as is"
+* without express or implied warranty.
+******************************************************************************/
+
+/**************************************************************************//**
+*
+* @file u_lockable.h
+*
+* @brief Defines a base class that provides a derived class with a Lock type.
+*
+* @version 1.00.00
+*
+* @note The Lockable class is a modified version of the ObjectLevelLockable
+* class from the LOKI library. The copyright from that library is
+* included at the top of this file.
+*
+******************************************************************************/
+#ifndef _U_LOCKABLE_H_
+#define _U_LOCKABLE_H_
+#include "u_locks_pthread.h"
+
+/**************************************************************************//**
+* @brief used as a base class to give your derived class a Lock type.
+* @details Have a class derive from this class and you can lock member
+* functions of your class by defining a lock like this
+* Lock lock(this);
+******************************************************************************/
+class Lockable
+{
+  public:
+    Lockable() : mutex() {}                 //!< Default Constructor
+    Lockable(const Lockable&) : mutex() {}  //!< Copy Constructor: the copy gets its own new mutex
+    ~Lockable() {}                          //!< Destructor
+
+    /**********************************************************************//**
+    * @brief The Lock type defined by inheriting from Lockable.
+    *
+    * @details A Lock remembers whether it still holds the host's mutex, so
+    *          calling unlock() early and then letting the Lock go out of
+    *          scope releases the mutex exactly once.
+    **************************************************************************/
+    class Lock
+    {
+      public:
+
+        /*******************************************************************//**
+        * @brief Constructing a Lock object will lock the parent object's mutex
+        * @param host_ the object whose mutex is locked; must not be null
+        ***********************************************************************/
+        explicit Lock(const Lockable* host_) : host(*host_), locked(true)
+        { host.mutex.Lock(); }
+
+        /*******************************************************************//**
+        * @brief Destructing a Lock object will unlock the parent object's
+        *        mutex, unless unlock() has already released it
+        ***********************************************************************/
+        ~Lock() { if (locked) host.mutex.Unlock(); }
+
+        /*******************************************************************//**
+        * @brief Unlock the parent object's mutex before the end of the scope.
+        *        Calling it again has no effect.
+        ***********************************************************************/
+        void unlock()
+        {
+            if (!locked) return;
+            locked = false;
+            host.mutex.Unlock();
+        }
+
+        /*******************************************************************//**
+        * @brief Return a raw pointer to the parent object's mutex
+        ***********************************************************************/
+        Mutex* raw() { return &host.mutex; }
+
+      private:
+        const Lockable& host; //!< a reference back to the parent object
+        bool locked;          //!< true while this Lock still owns the mutex
+
+      private: // prevent copy construction and assignment
+        Lock(const Lock&);
+        Lock& operator=(const Lock&);
+    };
+
+  protected:
+    mutable Mutex mutex; //!< mutable so that const member functions can lock
+};
+
+/*-----------------------------------------------------------------------------
+* Can be used to turn off locking without changing client code using Lockable
+*----------------------------------------------------------------------------*/
+class Lockable_off
+{
+  public:
+    Lockable_off() {}
+
+    /*-------------------------------------------------------------------------
+    * Lock type whose constructor and unlock() have empty bodies; it accepts
+    * the same "Lock lock(this);" usage as Lockable::Lock.
+    *------------------------------------------------------------------------*/
+    class Lock
+    {
+      public:
+        explicit Lock(const Lockable_off*) {}   // host is ignored
+
+        void unlock() {}
+
+      private: // copying is disabled: declared but never defined
+        Lock(const Lock&);
+        Lock& operator=(const Lock&);
+    };
+};
+
+#endif
diff --git a/src/core/dsp/u_locks_pthread.h b/src/core/dsp/u_locks_pthread.h
new file mode 100644
index 0000000..4663a57
--- /dev/null
+++ b/src/core/dsp/u_locks_pthread.h
@@ -0,0 +1,137 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+/**************************************************************************//**
+*
+* @file u_locks_pthread.h
+*
+* @brief TI implementation classes for mutual exclusion and locking.
+*
+* @ingroup Utilities
+*
+* @version 1.00.00
+*
+******************************************************************************/
+#ifndef _U_LOCKS_PTHREAD_H_
+#define _U_LOCKS_PTHREAD_H_
+
+#include <pthread.h>
+
+/**************************************************************************//**
+* @brief Simple mutex implemented using the pthreads library
+*
+* @details This mutex is simply a wrapper around a pthread mutex. Two regions
+* of code cannot have the mutex locked at the same time.
+*
+******************************************************************************/
+class Mutex
+{
+  public:
+    /** Initialise the underlying pthread mutex with default attributes. */
+    Mutex()
+    {
+        pthread_mutex_init(&mutex, 0);
+    }
+
+    /** Destroy the underlying pthread mutex. */
+    ~Mutex()
+    {
+        pthread_mutex_destroy(&mutex);
+    }
+
+    /** Acquire the mutex via pthread_mutex_lock. */
+    void Lock()
+    {
+        pthread_mutex_lock(&mutex);
+    }
+
+    /** Release the mutex via pthread_mutex_unlock. */
+    void Unlock()
+    {
+        pthread_mutex_unlock(&mutex);
+    }
+
+    /** @returns the address of the wrapped pthread_mutex_t. */
+    pthread_mutex_t* raw()
+    {
+        return &mutex;
+    }
+
+  private:
+    pthread_mutex_t mutex; //!< the wrapped pthread mutex
+
+  private: // copying is disabled: declared but never defined
+    Mutex(const Mutex &);
+    Mutex & operator = (const Mutex &);
+};
+
+/**************************************************************************//**
+* @brief Simple condition variable implemented using the pthreads library.
+*
+* @details Condition variables are synchronization primitives that enable
+* threads to wait until a particular condition occurs. Condition
+* variables enable threads to atomically release a lock and sleep.
+* Condition variables support operations that "wake one" or
+* "wake all" waiting threads. After a thread is woken, it
+* re-acquires the lock it released when the thread entered the
+* sleeping state.
+*
+******************************************************************************/
+class CondVar
+{
+  public:
+
+    /** Initialise the underlying pthread condition variable (default attributes). */
+    CondVar()
+    {
+        pthread_cond_init(&cond, 0);
+    }
+
+    /** Destroy the underlying pthread condition variable. */
+    ~CondVar()
+    {
+        pthread_cond_destroy(&cond);
+    }
+
+    /**********************************************************************//**
+    * @brief Wake one thread waiting on the condition variable
+    *        (pthread_cond_signal)
+    **************************************************************************/
+    void notify_one()
+    {
+        pthread_cond_signal(&cond);
+    }
+
+    /**********************************************************************//**
+    * @brief Wake every thread waiting on the condition variable
+    *        (pthread_cond_broadcast)
+    **************************************************************************/
+    void notify_all()
+    {
+        pthread_cond_broadcast(&cond);
+    }
+
+    /**********************************************************************//**
+    * @brief Block on the condition variable via pthread_cond_wait.
+    * @param m mutex handed to pthread_cond_wait through its raw pointer
+    **************************************************************************/
+    void wait(Mutex* m)
+    {
+        pthread_cond_wait(&cond, m->raw());
+    }
+
+  private:
+    pthread_cond_t cond; //!< the wrapped pthread condition variable
+
+  private: // copying is disabled: declared but never defined
+    CondVar(CondVar&);
+    CondVar& operator=(CondVar&);
+};
+
+/**************************************************************************//**
+* @brief Objects of this type lock the remainder of the enclosing scope.
+*
+* @details Declare one of these in a scope and pass a mutex reference and the
+* mutex will be locked for the remainder of the scope. This is a
+* safer way to lock and unlock a mutex, because the mutex will
+* automatically be unlocked when the scope level is exited. This
+* helps prevent a mutex from being left locked when an exception is
+* thrown or when an early function return forgets to unlock it.
+*
+******************************************************************************/
+class ScopedLock
+{
+  public:
+    /** Lock m and keep a reference to it for the lifetime of this object. */
+    ScopedLock(Mutex &m)
+        : mutex(m)
+    {
+        mutex.Lock();
+    }
+
+    /** Unlock the mutex that was locked by the constructor. */
+    ~ScopedLock()
+    {
+        mutex.Unlock();
+    }
+
+  private:
+    Mutex& mutex; //!< the mutex held by this object
+
+  private: // copying is disabled: declared but never defined
+    ScopedLock(const ScopedLock&);
+    ScopedLock& operator=(const ScopedLock&);
+};
+
+#endif
diff --git a/src/core/dsp/utils.h b/src/core/dsp/utils.h
new file mode 100644
index 0000000..f125ebd
--- /dev/null
+++ b/src/core/dsp/utils.h
@@ -0,0 +1,85 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef __UTILS_H
+#define __UTILS_H
+
+/**
+ * \brief Increment a n-component vector given a maximum value
+ *
+ * The vector is advanced like a counter in which each element has its own
+ * (inclusive) maximum. Element 0 is incremented first; when an element would
+ * exceed its maximum it wraps to 0 and the increment carries into the next
+ * element.
+ *
+ * For example, if \p dims is \c 3, \p vec starts at <tt>{0, 0, 0}</tt> and
+ * \p maxs is <tt>{2, 3, 1}</tt>, repeatedly calling this function with the
+ * same vector will produce the following results :
+ *
+ * \code
+ * {1, 0, 0}
+ * {2, 0, 0}
+ * {0, 1, 0}
+ * {1, 1, 0}
+ * {2, 1, 0}
+ * {0, 2, 0}
+ * {1, 2, 0}
+ * {2, 2, 0}
+ * ...
+ * \endcode
+ *
+ * Until \p vec reaches <tt>{2, 3, 1}</tt>. One more call then wraps every
+ * element, leaving \p vec at <tt>{0, 0, 0}</tt>, and returns true.
+ *
+ * \param dims number of elements in the vectors
+ * \param vec vector whose elements will be incremented
+ * \param maxs vector containing a maximum value above which each corresponding
+ *        element of \p vec cannot go.
+ * \return false if the increment was ok (also when \p dims is 0), true if
+ *         \p vec was already at its maximum value and wrapped back to zero.
+ */
+template<typename T>
+bool incVec(unsigned long dims, T *vec, T *maxs)
+{
+    // The index has the same type as dims so the comparison cannot wrap.
+    for (unsigned long i = 0; i < dims; ++i)
+    {
+        vec[i] += 1;
+
+        // No carry needed: the increment is complete.
+        if (!(vec[i] > maxs[i]))
+            return false;
+
+        // This element overflowed; reset it and carry into the next one.
+        vec[i] = 0;
+    }
+
+    // Every element wrapped (or there were no elements at all).
+    return dims != 0;
+}
+#endif
diff --git a/src/core/dsp/wga.cpp b/src/core/dsp/wga.cpp
new file mode 100644
index 0000000..8269898
--- /dev/null
+++ b/src/core/dsp/wga.cpp
@@ -0,0 +1,464 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "wga.h"
+#include <iostream>
+#include <llvm/Pass.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Module.h>
+#include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/DataLayout.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Support/InstIterator.h>
+#include <llvm/IR/IntrinsicInst.h>
+#include "llvm/Support/CFG.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "boost/assign/std/set.hpp"
+#include <stdio.h>
+
+using namespace std;
+using namespace boost::assign;
+
+namespace llvm
+{
+
+/******************************************************************************
+* createTIOpenclWorkGroupAggregationPass
+*
+* Factory: heap-allocates a TIOpenclWorkGroupAggregation pass configured for
+* the requested mode and returns it through the generic Pass interface.
+******************************************************************************/
+Pass *createTIOpenclWorkGroupAggregationPass(bool is_pocl_mode)
+{
+    return new TIOpenclWorkGroupAggregation(is_pocl_mode);
+}
+
+/**************************************************************************
+* Constructor
+*
+* Records the operating mode and clears the per-dimension induction
+* variable table (IVPhi), which add_loop() fills in later.
+**************************************************************************/
+TIOpenclWorkGroupAggregation::TIOpenclWorkGroupAggregation(bool pocl_mode)
+    : FunctionPass(ID), is_pocl_mode(pocl_mode)
+{
+    for (unsigned int d = 0; d != MAX_DIMENSIONS; ++d)
+        IVPhi[d] = 0;
+}
+
+/**************************************************************************
+* Get index variable
+* 1. Original mode, only one loop inserted: return IVPhi[]
+* 2. pocl mode, multiple loops inserted: return a new LoadInst
+*
+* F    : function being rewritten (used to reach its Module and context)
+* call : the get_local_id(dim) call whose replacement value is wanted
+*
+* Insertion contract relied upon by rewrite_ocl_funcs():
+*   - original mode: the returned instruction is already part of the
+*     function (a loop PHI, or a select inserted before 'call'), so the
+*     caller substitutes it with ReplaceInstWithValue.
+*   - pocl mode: the returned LoadInst is NOT inserted anywhere; the caller
+*     inserts it in place of 'call' with ReplaceInstWithInst.
+*
+* A constant dimension above MAX_DIMENSIONS-1 is clamped to
+* MAX_DIMENSIONS-1, the same clamping findNeededLoopNest() applies, so
+* that IVPhi[] is never indexed out of bounds.
+**************************************************************************/
+llvm::Instruction* TIOpenclWorkGroupAggregation::get_IV(Function &F,
+                                                         CallInst *call)
+{
+    static const char *const local_id_names[MAX_DIMENSIONS] =
+        { "_local_id_x", "_local_id_y", "_local_id_z" };
+
+    Module *M   = F.getParent();
+    Value  *arg = call->getArgOperand(0);
+
+    /*---------------------------------------------------------------------
+    * Constant dimension argument: the induction variable is known statically
+    *--------------------------------------------------------------------*/
+    if (ConstantInt *constInt = dyn_cast<ConstantInt>(arg))
+    {
+        unsigned int dim = constInt->getSExtValue();
+        if (dim > MAX_DIMENSIONS-1) dim = MAX_DIMENSIONS-1;
+
+        if (!is_pocl_mode) return IVPhi[dim];
+        return new LoadInst(M->getNamedGlobal(local_id_names[dim]));
+    }
+
+    /*---------------------------------------------------------------------
+    * Non constant dimension argument: select at run time.  In pocl mode the
+    * candidates are the addresses of the _local_id_* globals (the chosen
+    * address is loaded below); otherwise they are the loop PHI values.
+    *--------------------------------------------------------------------*/
+    llvm::Value *ivx, *ivy, *ivz;
+    if (is_pocl_mode)
+    {
+        ivx = M->getNamedGlobal(local_id_names[0]);
+        ivy = M->getNamedGlobal(local_id_names[1]);
+        ivz = M->getNamedGlobal(local_id_names[2]);
+    }
+    else
+    {
+        ivx = IVPhi[0];
+        ivy = IVPhi[1];
+        ivz = IVPhi[2];
+    }
+
+    // (arg == 2) ? ivz : (arg == 1 ? ivy : ivx)
+    Type  *Int32 = Type::getInt32Ty(F.getContext());
+    Value *one   = ConstantInt::get(Int32, 1);
+    Value *two   = ConstantInt::get(Int32, 2);
+
+    llvm::Value *is_y = new ICmpInst(call, ICmpInst::ICMP_EQ, arg, one);
+    llvm::Value *yx   = SelectInst::Create(is_y, ivy, ivx, "", call);
+    llvm::Value *is_z = new ICmpInst(call, ICmpInst::ICMP_EQ, arg, two);
+    Instruction *zyx  = SelectInst::Create(is_z, ivz, yx, "", call);
+
+    // pocl mode: zyx is the selected global's address, so hand back an
+    // uninserted load of it; original mode: zyx is already the value.
+    if (is_pocl_mode) return new LoadInst(zyx);
+    return zyx;
+}
+
+/**************************************************************************
+* runOnFunction(Function &F)
+*
+* Entry point of the pass for one function.  Returns true if F was modified
+* by any of the steps below (loop insertion, alloca rewriting, or rewriting
+* of the OpenCL work-item query calls), false if F was left untouched.
+**************************************************************************/
+bool TIOpenclWorkGroupAggregation::runOnFunction(Function &F)
+{
+    bool changed = false;
+
+    if (!is_pocl_mode)
+    {
+        /*---------------------------------------------------------------------
+        * Determine how many dimensions are referenced using OpenCL getXXX
+        * functions, then add a loop nest for each dimension referenced that
+        * requires a workitem id.
+        *--------------------------------------------------------------------*/
+        unsigned int dims = findNeededLoopNest(F);
+        for (unsigned int i = 0; i < dims; ++i) add_loop(F, i);
+
+        // add_loop() always restructures the CFG, so any loop is a change.
+        if (dims > 0) changed = true;
+    }
+    else
+    {
+        /*---------------------------------------------------------------------
+        * rewrite the alloca() generated during pocl llvm work-group
+        * aggregation
+        *--------------------------------------------------------------------*/
+        if (rewrite_allocas(F)) changed = true;
+    }
+
+    /*-------------------------------------------------------------------------
+    * rewrite the OpenCL getXXX dimension query functions to reference the info
+    * packet for the workgroup.
+    *------------------------------------------------------------------------*/
+    if (rewrite_ocl_funcs(F)) changed = true;
+
+    return changed;
+}
+
+/******************************************************************************
+* getAnalysisUsage(AnalysisUsage &Info) const
+*
+* Declares the passes this pass depends on.  Info is the usage record that
+* is filled in; the only requirement added is UnifyFunctionExitNodes.
+******************************************************************************/
+void TIOpenclWorkGroupAggregation::getAnalysisUsage(AnalysisUsage &Info) const
+{
+ /*-------------------------------------------------------------------------
+ * This will ensure that all returns go through a single exit node, which
+ * our WGA loop generation algorithm depends on (see findExitBlock, which
+ * asserts that only one block without successors exists).
+ *------------------------------------------------------------------------*/
+ Info.addRequired<UnifyFunctionExitNodes>();
+}
+
+/**************************************************************************
+* findNeededLoopNest(Function &F)
+*
+* Scans every call to get_local_id / get_global_id in F and returns the
+* number of loop levels needed to cover the highest dimension referenced
+* (0 when neither function is called).  Constant dimension arguments are
+* clamped to MAX_DIMENSIONS-1.
+**************************************************************************/
+unsigned int TIOpenclWorkGroupAggregation::findNeededLoopNest(Function &F)
+{
+    unsigned int loopLevels = 0;
+
+    for (inst_iterator it = inst_begin(&F), end = inst_end(&F); it != end; ++it)
+    {
+        CallInst *ci = dyn_cast<CallInst>(&*it);
+        if (!ci) continue;
+
+        // indirect calls have no statically known callee
+        if (!ci->getCalledFunction()) continue;
+
+        string calleeName(ci->getCalledFunction()->getName());
+        if (calleeName != "get_local_id" && calleeName != "get_global_id")
+            continue;
+
+        ConstantInt *dimConst = dyn_cast<ConstantInt>(ci->getArgOperand(0));
+
+        /*-----------------------------------------------------------------
+        * if the work group function has a variable argument, then
+        * assume worst case and return 3 loop levels are needed.
+        *----------------------------------------------------------------*/
+        if (!dimConst) return 3;
+
+        unsigned int dimIdx = dimConst->getSExtValue();
+        if (dimIdx > MAX_DIMENSIONS-1) dimIdx = MAX_DIMENSIONS-1;
+        if (dimIdx + 1 > loopLevels)   loopLevels = dimIdx + 1;
+    }
+
+    return loopLevels;
+}
+
+/**************************************************************************
+* createLoadGlobal
+* Create an aligned 32 bit load from a global address.
+*
+* The address is element 'idx' of the module-level global
+* "kernel_config_l2", which is declared as an array of 64 i32 if the module
+* does not have it yet.
+*
+* idx    : element index into kernel_config_l2
+* M      : module that owns (or receives) the global
+* before : instruction to insert the load in front of; when null the load
+*          is created but not inserted, and the caller must place it
+* name   : optional name for the load, may be null
+*
+* All types and constants are created in M's own LLVMContext so the
+* generated IR never mixes contexts.
+**************************************************************************/
+Instruction* TIOpenclWorkGroupAggregation::createLoadGlobal
+    (int32_t idx, Module* M, Instruction *before, const char *name)
+{
+    Type *Int32 = IntegerType::getInt32Ty(M->getContext());
+
+    // Make sure the global exists, then fetch it as a GlobalVariable.
+    M->getOrInsertGlobal("kernel_config_l2", ArrayType::get(Int32, 64));
+    GlobalVariable* global = M->getNamedGlobal("kernel_config_l2");
+
+    // &kernel_config_l2[0][idx] as a constant expression
+    std::vector<Value*> indices;
+    indices.push_back(ConstantInt::get(Int32, 0));
+    indices.push_back(ConstantInt::get(Int32, idx));
+    Constant* gep = ConstantExpr::getInBoundsGetElementPtr(global, indices);
+
+    LoadInst* ld = new LoadInst(gep, name, before);
+    ld->setAlignment(4);
+    return ld;
+}
+
+/******************************************************************************
+* findDim
+*
+* Returns the dimension index passed as the first argument of 'call' when it
+* is a compile-time constant, or the sentinel value 100 when it is not.
+******************************************************************************/
+unsigned int TIOpenclWorkGroupAggregation::findDim(class CallInst* call)
+{
+    ConstantInt *dimConst = dyn_cast<ConstantInt>(call->getArgOperand(0));
+
+    if (!dimConst) return 100; // who knows
+    return dimConst->getSExtValue();
+}
+
+/**************************************************************************
+* rewrite allocas to _wg_alloca(sizeinbytes)
+*
+* Collects every AllocaInst in F and replaces each one with a call to
+* _wg_alloca(element_store_size * array_size); the i32 result of the call
+* is cast (inttoptr) to the pointer type of the alloca it replaces.
+*
+* It then emits, in front of the first non-PHI instruction of the entry
+* block, code that stores
+*     kernel_config_l2[16] + __core_num() * kernel_config_l2[17]
+* into the global _wg_alloca_start (placed in section "far"), and attaches
+* a "_wi_alloca_size=<n>" string attribute to F, where n is the aligned sum
+* accumulated over the rewritten allocas.
+*
+* Returns false, leaving F untouched, when F contains no alloca; returns
+* true otherwise.
+*
+* NOTE(review): the accumulated size adds one element store size per alloca
+* and does not multiply by the alloca's array size -- confirm this is what
+* is intended for array allocas.
+* NOTE(review): the dyn_cast<Function> results (wg_alloca, core_num) are
+* used without a null check -- presumably the symbols never pre-exist with
+* a different type; verify.
+**************************************************************************/
+bool TIOpenclWorkGroupAggregation::rewrite_allocas(Function &F)
+{
+ int wi_alloca_size = 0;
+ Module *M = F.getParent();
+ AllocaInst *alloca;
+
+ std::vector<AllocaInst *> allocas; // gathered first: the loop below erases them
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I)
+ if ((alloca = dyn_cast<AllocaInst>(&*I)) != NULL)
+ allocas.push_back(alloca);
+ if (allocas.empty()) return false;
+
+ DataLayout dataLayout(M);
+ FunctionType *ft = FunctionType::get
+ (/*Result=*/ IntegerType::get(M->getContext(), 32),
+ /*Params=*/ IntegerType::get(M->getContext(), 32),
+ /*isVarArg=*/ false);
+ Function *wg_alloca = dyn_cast<Function>(
+ M->getOrInsertFunction("_wg_alloca", ft));
+ Type *Int32 = Type::getInt32Ty(M->getContext());
+
+ for (std::vector<AllocaInst *>::iterator I = allocas.begin();
+ I != allocas.end(); ++I)
+ {
+ alloca = *I;
+
+ // get number of elements, element type size, compute total size
+ Value *numElems = alloca->getArraySize();
+ // YUAN TODO: skip regular constant numElems?
+
+ Type *elementType = alloca->getAllocatedType();
+ // getTypeSizeInBits(), what about uchar3 type?
+ uint64_t esBytes = dataLayout.getTypeStoreSize(elementType);
+ Value *esize = ConstantInt::get(Int32, (uint32_t) esBytes);
+ Instruction *alloca_size = BinaryOperator::Create(
+ Instruction::Mul, esize, numElems, "", alloca);
+ SmallVector<Value *, 4> args;
+ args.push_back(alloca_size);
+
+ // create function call: _wg_alloca(alloca_size)
+ CallInst *f_alloca = CallInst::Create(
+ wg_alloca, ArrayRef<Value *>(args), "", alloca);
+
+ // cast to alloca type
+ Instruction * new_alloca = new IntToPtrInst(
+ f_alloca, alloca->getType());
+
+ // replace AllocaInst with new _wg_alloca()
+ ReplaceInstWithInst(alloca, new_alloca);
+
+ // accumulate element type size, rounding the running total up to the
+ // element's preferred alignment first
+ unsigned align = dataLayout.getPrefTypeAlignment(elementType);
+ wi_alloca_size = (wi_alloca_size + align - 1) & (~(align-1));
+ wi_alloca_size += esBytes;
+ }
+
+ // initialize _wg_alloca_start and _wg_alloca_size
+ // _wg_alloca_size = load(packetaddr+offset);
+ // _wg_alloca_start = load(packetaddr+offset) + __core_num() * _wg_alloca_size;
+ Instruction *inspt = F.getEntryBlock().getFirstNonPHI();
+ FunctionType *core_num_ft = FunctionType::get
+ (/*Result=*/ IntegerType::get(M->getContext(), 32),
+ /*isVarArg=*/ false);
+ Function *core_num = dyn_cast<Function>(
+ M->getOrInsertFunction("__core_num", core_num_ft));
+ Instruction *f_core_num = CallInst::Create(core_num, "", inspt);
+
+ Instruction *wg_alloca_size = createLoadGlobal(17, M, inspt); // kernel_config_l2[17]
+
+ Instruction *shift = BinaryOperator::Create(Instruction::Mul, f_core_num,
+ wg_alloca_size, "", inspt);
+
+ Instruction *start = createLoadGlobal(16, M, inspt); // kernel_config_l2[16]
+
+ Instruction *core_start = BinaryOperator::Create(
+ Instruction::Add, start, shift, "", inspt);
+ Value *gv = M->getOrInsertGlobal("_wg_alloca_start", Int32);
+ GlobalVariable *wg_gv = M->getNamedGlobal("_wg_alloca_start");
+ wg_gv->setSection(StringRef("far"));
+ Instruction *store = new StoreInst(core_start, gv, inspt); // result unused; the store is inserted by the constructor
+
+ // put total orig_wi_size into attributes data in the function
+ char *s_wi_alloca_size = new char[32]; // we have to leak this
+ snprintf(s_wi_alloca_size, 32, "_wi_alloca_size=%d", wi_alloca_size);
+ F.addFnAttr(StringRef(s_wi_alloca_size));
+
+ return true;
+}
+
+/**************************************************************************
+* rewrite_ocl_funcs
+*
+* Replaces every direct call to get_local_id / get_local_size in F:
+*   get_local_id   -> the induction variable supplied by get_IV()
+*   get_local_size -> a load of kernel_config_l2[4 + dim]
+* Returns true if at least one call was rewritten, false otherwise.
+**************************************************************************/
+bool TIOpenclWorkGroupAggregation::rewrite_ocl_funcs(Function &F)
+{
+    Module *module = F.getParent();
+
+    /*---------------------------------------------------------------------
+    * Gather the calls first; replacing them while walking the instruction
+    * list would invalidate the iterator.
+    *--------------------------------------------------------------------*/
+    std::vector<CallInst *> pending;
+    for (inst_iterator it = inst_begin(&F), end = inst_end(&F); it != end; ++it)
+    {
+        CallInst *ci = dyn_cast<CallInst>(&*it);
+        if (ci == NULL || ci->getCalledFunction() == NULL) continue;
+
+        string callee(ci->getCalledFunction()->getName());
+        if (callee == "get_local_id" || callee == "get_local_size")
+            pending.push_back(ci);
+    }
+
+    if (pending.empty()) return false;
+
+    for (size_t n = 0; n < pending.size(); ++n)
+    {
+        CallInst *ci = pending[n];
+        string callee(ci->getCalledFunction()->getName());
+
+        if (callee == "get_local_size")
+        {
+            // remaining get_local_size() are generated by pocl,
+            // arguments guaranteed to be constants: 0, 1, or 2
+            ReplaceInstWithInst(ci, createLoadGlobal(4+findDim(ci), module));
+        }
+        else if (callee == "get_local_id")
+        {
+            if (is_pocl_mode)
+            {
+                // get_IV() hands back a not yet inserted instruction
+                ReplaceInstWithInst(ci, get_IV(F, ci));
+            }
+            else
+            {
+                // get_IV() hands back a value already in the function
+                BasicBlock::iterator pos(ci);
+                ReplaceInstWithValue(ci->getParent()->getInstList(), pos,
+                                     get_IV(F, ci));
+            }
+        }
+    }
+
+    return true;
+}
+
+/**************************************************************************
+* findExitBlock(Function &F)
+*
+* Locates the block of F whose terminator has no successors and returns a
+* block that contains only that terminator, splitting it off when needed.
+**************************************************************************/
+BasicBlock* TIOpenclWorkGroupAggregation::findExitBlock(Function &F)
+{
+    BasicBlock *exitBlock = 0;
+
+    /*-------------------------------------------------------------------------
+    * Find the one block with no successors; a second one is unexpected
+    *------------------------------------------------------------------------*/
+    for (Function::iterator bb = F.begin(), end = F.end(); bb != end; ++bb)
+    {
+        if (bb->getTerminator()->getNumSuccessors() != 0) continue;
+
+        if (exitBlock) { assert(false); }
+        else           { exitBlock = &(*bb); }
+    }
+
+    /*-------------------------------------------------------------------------
+    * Split the return off into its own block, unless it already is alone
+    *------------------------------------------------------------------------*/
+    Instruction *ret = exitBlock->getTerminator();
+
+    if (ret == &exitBlock->front()) return exitBlock;
+    return SplitBlock(exitBlock, ret, this);
+}
+
+/**************************************************************************
+* add_loop(Function &F)
+*
+* Wraps the whole body of F in one counted loop for dimension dimIdx.
+*
+* The entry block is split at its first instruction into ".entry" and
+* ".bodyTop"; the exit block returned by findExitBlock() is split into
+* ".bodyEnd" and ".exit".  The resulting control flow is:
+*
+*   .entry   : if (kernel_config_l2[4+dimIdx] > 0) goto .bodyTop
+*              else goto .exit
+*   .bodyTop : phi = [0 from .entry, inc from .bodyEnd]; original body
+*   .bodyEnd : inc = phi + 1;
+*              if (inc < kernel_config_l2[4+dimIdx]) goto .bodyTop
+*              else goto .exit
+*
+* Both comparisons are signed.  The phi is recorded in IVPhi[dimIdx] so
+* get_IV() can substitute it for get_local_id(dimIdx).
+**************************************************************************/
+void TIOpenclWorkGroupAggregation::add_loop(Function &F, int dimIdx)
+{
+ LLVMContext &ctx = F.getContext();
+ Type *Int32 = Type::getInt32Ty(ctx);
+ Value *zero = ConstantInt::get(Int32, 0);
+ Value *one = ConstantInt::get(Int32, 1);
+ Module *M = F.getParent();
+
+ BasicBlock* exit = findExitBlock(F);
+ BasicBlock* entry = &(F.getEntryBlock());
+
+ BasicBlock* bodytop = SplitBlock(entry, &entry->front(), this);
+ BasicBlock* bodyend = exit;
+ exit = SplitBlock(bodyend, &exit->front(), this);
+
+ exit->setName(".exit");
+ entry->setName(".entry");
+ bodytop->setName(".bodyTop");
+ bodyend->setName(".bodyEnd");
+
+ /*----------------------------------------------------------------------
+ * Populate the branch around: skip the loop entirely when the upper
+ * bound loaded from kernel_config_l2[4+dimIdx] is not positive
+ *---------------------------------------------------------------------*/
+ Instruction *branch = entry->getTerminator();
+ Instruction *ld_upper_bnd = createLoadGlobal(4+dimIdx, M, branch);
+
+ Instruction *cmp = CmpInst::Create (Instruction::ICmp, CmpInst::ICMP_SGT,
+ ld_upper_bnd, zero, "", branch);
+
+ Instruction *cbr = BranchInst::Create(bodytop, exit, cmp);
+ ReplaceInstWithInst(branch, cbr);
+
+ /*----------------------------------------------------------------------
+ * Add the phi node to the top of the body; its second incoming value
+ * (the incremented counter) is added once it exists, further down
+ *---------------------------------------------------------------------*/
+ PHINode *phi = PHINode::Create(Int32, 0, "", &bodytop->front());
+ phi->addIncoming(zero, entry);
+
+ /*----------------------------------------------------------------------
+ * Add the loop control to the bottom of the bodyend: increment, reload
+ * the upper bound, and branch back while the counter is below it
+ *---------------------------------------------------------------------*/
+ branch = bodyend->getTerminator();
+ Instruction *inc = BinaryOperator::Create(Instruction::Add,
+ phi, one, Twine(), branch);
+
+ Instruction *ld_upper_bnd2 = createLoadGlobal(4+dimIdx, M, branch);
+ Instruction *cmp2 = CmpInst::Create (Instruction::ICmp, CmpInst::ICMP_SLT,
+ inc, ld_upper_bnd2, "", branch);
+
+ Instruction *cbr2 = BranchInst::Create(bodytop, exit, cmp2);
+ ReplaceInstWithInst(branch, cbr2);
+
+ phi->addIncoming(inc, bodyend);
+ IVPhi[dimIdx] = phi;
+
+ // YUAN TODO: maybe handled better later
+ // Higher dimensions also get this phi for now; a later add_loop() call for
+ // that dimension overwrites its slot with the dimension's own phi.
+ if (dimIdx < 1) IVPhi[1] = phi;
+ if (dimIdx < 2) IVPhi[2] = phi;
+}
+
+char TIOpenclWorkGroupAggregation::ID = 0; // passed to FunctionPass(ID) by the constructor
+static RegisterPass<TIOpenclWorkGroupAggregation>
+ X("wga", "Work Group Aggregation", false, false); // registers the pass under the argument name "wga"
+
+}
diff --git a/src/core/dsp/wga.h b/src/core/dsp/wga.h
new file mode 100644
index 0000000..8728fea
--- /dev/null
+++ b/src/core/dsp/wga.h
@@ -0,0 +1,72 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef __TIOPENCLWORKGROUPAGGREGATIONPASS_H
+#define __TIOPENCLWORKGROUPAGGREGATIONPASS_H
+
+#include <string>
+#include <set>
+#include "boost/tuple/tuple.hpp"
+#include <llvm/Pass.h>
+#include <llvm/IR/Instruction.h>
+
+#define MAX_DIMENSIONS 3u /* size of the IVPhi[] table; unsigned so it compares cleanly with unsigned indices */
+
+namespace llvm
+{
+
+class TIOpenclWorkGroupAggregation : public FunctionPass // LLVM function pass: work-group aggregation for OpenCL kernels
+{
+ public:
+ static char ID; // pass identification, handed to the FunctionPass base
+
+ TIOpenclWorkGroupAggregation(bool pocl_mode = false); // pocl_mode selects which rewriting steps runOnFunction performs
+ virtual bool runOnFunction(Function &F); // runs the rewriting steps on F
+ virtual void getAnalysisUsage(AnalysisUsage &Info) const; // requires UnifyFunctionExitNodes
+
+ private:
+ Instruction* IVPhi[MAX_DIMENSIONS]; // per-dimension loop induction variable, filled by add_loop()
+ bool is_pocl_mode; // true: rewrite allocas, no loops added; false: add loops
+
+ private:
+ Instruction* createLoadGlobal(int32_t idx, Module* m, Instruction *before=0, // load of kernel_config_l2[idx]
+ const char *name=0);
+
+ BasicBlock* findExitBlock (Function &F); // block holding only the exit terminator
+ unsigned int findNeededLoopNest(Function &F); // number of loop levels needed
+ unsigned int findDim (class CallInst* call); // constant first argument of call, or 100
+ bool rewrite_ocl_funcs (Function &F); // replaces get_local_id / get_local_size calls
+ void add_loop (Function &F, int dimIdx); // wraps F's body in a loop for dimIdx
+ Instruction* get_IV(Function &F, CallInst *call); // value replacing a get_local_id call
+ bool rewrite_allocas(Function &F); // replaces allocas with _wg_alloca() calls
+};
+
+Pass *createTIOpenclWorkGroupAggregationPass(bool is_pocl_mode = false); // returns a newly allocated TIOpenclWorkGroupAggregation
+
+}
+
+#endif // __TIOPENCLWORKGROUPAGGREGATIONPASS_H
diff --git a/src/core/dsp/worker.cpp b/src/core/dsp/worker.cpp
new file mode 100644
index 0000000..79223f0
--- /dev/null
+++ b/src/core/dsp/worker.cpp
@@ -0,0 +1,519 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "device.h"
+#include "buffer.h"
+#include "kernel.h"
+#include "driver.h"
+
+#include "../commandqueue.h"
+#include "../events.h"
+#include "../memobject.h"
+#include "../kernel.h"
+
+#include <stdlib.h>
+#include <iostream>
+#include <string.h>
+
+#include "u_locks_pthread.h"
+
+using namespace Coal;
+
+#define ERR(status, msg) if (status) { printf("ERROR: %s\n", msg); exit(-1); } /* prints msg and exits with -1 when status is non-zero; expands to a bare if, so do not follow a use with 'else' */
+
+/******************************************************************************
+* handle_event_completion
+*
+* Reads one completion message from the device mailbox, looks up the pending
+* event for the reported kernel id, frees that kernel event's temporary
+* buffers and marks the event Complete.  The end timestamp is recorded first
+* when the event's queue has profiling enabled.  The process exits if the
+* kernel id has no pending event.
+******************************************************************************/
+void handle_event_completion(DSPDevice *device)
+{
+    int kernel_id = device->mail_from();
+
+    /*-------------------------------------------------------------------------
+    * A negative id is a false completion message (printf traffic, etc.)
+    *------------------------------------------------------------------------*/
+    if (kernel_id < 0) return;
+
+    Event* event;
+    if (!device->get_complete_pending(kernel_id, event))
+    {
+        std::cout << "Completion status received for kernel Id " << kernel_id <<
+                     " but no pending event found for that id" << std::endl;
+        exit(-1);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Release the temporary buffers held by the device side of the event
+    *------------------------------------------------------------------------*/
+    DSPKernelEvent *dsp_event =
+        (DSPKernelEvent *)((KernelEvent *) event)->deviceData();
+    dsp_event->free_tmp_bufs();
+
+    /*-------------------------------------------------------------------------
+    * Look up the properties of the queue the event belongs to, if any
+    *------------------------------------------------------------------------*/
+    CommandQueue *queue = 0;
+    event->info(CL_EVENT_COMMAND_QUEUE, sizeof(CommandQueue *), &queue, 0);
+
+    cl_command_queue_properties queue_props = 0;
+    if (queue)
+    {
+        queue->info(CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties),
+                    &queue_props, 0);
+    }
+
+    // an event may be released once it is Complete, so the timing update
+    // has to happen before the status change
+    if (queue_props & CL_QUEUE_PROFILING_ENABLE)
+    {
+        event->updateTiming(Event::End);
+    }
+    event->setStatus(Event::Complete);
+}
+
+
+/******************************************************************************
+* handle_event_dispatch
+******************************************************************************/
+bool handle_event_dispatch(DSPDevice *device)
+{
+ bool stop = false;
+ cl_int errcode;
+ Event * event;
+
+ event = device->getEvent(stop);
+
+ /*---------------------------------------------------------------------
+ * Ensure we have a good event and we don't have to stop
+ *--------------------------------------------------------------------*/
+ if (stop) return true;
+ if (!event) return false;
+
+ /*---------------------------------------------------------------------
+ * Get info about the event and its command queue
+ *--------------------------------------------------------------------*/
+ Event::Type t = event->type();
+ CommandQueue * queue = 0;
+ cl_command_queue_properties queue_props = 0;
+
+ errcode = CL_SUCCESS;
+
+ event->info(CL_EVENT_COMMAND_QUEUE, sizeof(CommandQueue *), &queue, 0);
+
+ if (queue)
+ queue->info(CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties),
+ &queue_props, 0);
+
+ if (queue_props & CL_QUEUE_PROFILING_ENABLE)
+ event->updateTiming(Event::Start);
+
+ /*---------------------------------------------------------------------
+ * Execute the action
+ *--------------------------------------------------------------------*/
+ switch (t)
+ {
+ case Event::ReadBuffer:
+ case Event::WriteBuffer:
+ {
+ ReadWriteBufferEvent *e = (ReadWriteBufferEvent *)event;
+
+ if (e->buffer()->flags() & CL_MEM_USE_HOST_PTR)
+ {
+ if (t == Event::ReadBuffer)
+ memcpy(e->ptr(), e->buffer()->host_ptr(), e->cb());
+ else memcpy(e->buffer()->host_ptr(), e->ptr(), e->cb());
+ break;
+ }
+
+ DSPBuffer *buf = (DSPBuffer *)e->buffer()->deviceBuffer(device);
+ DSPDevicePtr64 data = (DSPDevicePtr64)buf->data() + e->offset();
+
+ if (t == Event::ReadBuffer)
+ Driver::instance()->read(device->dspID(), data,
+ (uint8_t*)e->ptr(), e->cb());
+
+ else
+ Driver::instance()->write(device->dspID(), data,
+ (uint8_t*)e->ptr(), e->cb());
+
+ break;
+ }
+
+ case Event::CopyBuffer:
+ {
+#ifdef DSPC868X
+ std::cerr << "Event type not yet supported" << std::endl;
+#else
+ CopyBufferEvent *e = (CopyBufferEvent *)event;
+
+ DSPDevicePtr64 src_addr;
+ DSPDevicePtr64 dst_addr;
+
+ void *psrc;
+ void *pdst;
+
+ if (e->source()->flags() & CL_MEM_USE_HOST_PTR)
+ psrc = (char*)e->source()->host_ptr() + e->src_offset();
+ else
+ {
+ DSPBuffer *src = (DSPBuffer*)e->source()->deviceBuffer(device);
+ src_addr = (DSPDevicePtr64)src->data() + e->src_offset();
+ psrc = Driver::instance()->map(src_addr, e->cb(), true);
+ }
+
+ if (e->destination()->flags() & CL_MEM_USE_HOST_PTR)
+ pdst = (char *)e->destination()->host_ptr() + e->dst_offset();
+ else
+ {
+ DSPBuffer *dst = (DSPBuffer*)e->destination()->deviceBuffer(device);
+ dst_addr = (DSPDevicePtr64)dst->data() + e->dst_offset();
+ pdst = Driver::instance()->map(dst_addr, e->cb(), false);
+ }
+
+ memcpy(pdst, psrc, e->cb());
+
+ if (!(e->source()->flags() & CL_MEM_USE_HOST_PTR))
+ Driver::instance()->unmap(psrc, src_addr, e->cb(), false);
+
+ if (!(e->destination()->flags() & CL_MEM_USE_HOST_PTR))
+ Driver::instance()->unmap(pdst, dst_addr, e->cb(), true);
+#endif
+ break;
+ }
+
+ case Event::ReadBufferRect:
+ case Event::WriteBufferRect:
+ {
+ ReadWriteBufferRectEvent *e = (ReadWriteBufferRectEvent *)event;
+
+ // Calculate the start points for each block of memory referenced
+ DSPDevicePtr64 buf_start;
+ uint8_t * host_start;
+
+ if (e->buffer()->flags() & CL_MEM_USE_HOST_PTR)
+ buf_start = (DSPDevicePtr64)e->buffer()->host_ptr();
+ else
+ buf_start = ((DSPBuffer *)e->source()->deviceBuffer(device))
+ ->data();
+
+ buf_start += e->src_origin(2) * e->src_slice_pitch() +
+ e->src_origin(1) * e->src_row_pitch() +
+ e->src_origin(0);
+
+ host_start = (uint8_t *)e->ptr() +
+ e->dst_origin(2) * e->dst_slice_pitch() +
+ e->dst_origin(1) * e->dst_row_pitch() +
+ e->dst_origin(0);
+
+ // Map the device/host buffers to the appopriate src/dst operands
+ // based on the requested operation (read vs write)
+ DSPDevicePtr64 src_start, dst_start;
+
+ size_t src_row_pitch, dst_row_pitch;
+ size_t src_slice_pitch, dst_slice_pitch;
+
+ if (t == Event::ReadBufferRect)
+ {
+ src_start = buf_start;
+ src_row_pitch = e->src_row_pitch();
+ src_slice_pitch = e->src_slice_pitch();
+
+ dst_start = (DSPDevicePtr64) host_start;
+ dst_row_pitch = e->dst_row_pitch();
+ dst_slice_pitch = e->dst_slice_pitch();
+ }
+ else
+ {
+ src_start = (DSPDevicePtr64) host_start;
+ src_row_pitch = e->dst_row_pitch();
+ src_slice_pitch = e->dst_slice_pitch();
+
+ dst_start = buf_start;
+ dst_row_pitch = e->src_row_pitch();
+ dst_slice_pitch = e->src_slice_pitch();
+ }
+
+ // The dimensions of the region to be copied gives us our
+ // loop boundaries for copying
+ cl_ulong xdim = e->region(0);
+ cl_ulong ydim = e->region(1);
+ cl_ulong zdim = e->region(2);
+
+ // Set up the start point
+ DSPDevicePtr64 src_cur_slice = src_start;
+ DSPDevicePtr64 dst_cur_slice = dst_start;
+
+ // The outer loop handles each z-axis slice
+ // For 2-D copy, will only iterate once (zdim=1)
+ for(cl_uint z = 0; z < zdim; z++)
+ {
+ DSPDevicePtr64 src_cur_row = src_cur_slice;
+ DSPDevicePtr64 dst_cur_row = dst_cur_slice;
+
+ // The inner loop handles each row of the current slice
+ for(cl_uint y = 0; y < ydim; y++)
+ {
+ // Copy a row
+ if (e->buffer()->flags() & CL_MEM_USE_HOST_PTR)
+ memcpy((void *)dst_cur_row, (void *)src_cur_row, xdim);
+ else
+ {
+ if (t == Event::ReadBufferRect)
+ Driver::instance()->read(device->dspID(),
+ src_cur_row, (uint8_t *)dst_cur_row, xdim);
+ else
+ Driver::instance()->write(device->dspID(),
+ dst_cur_row, (uint8_t *)src_cur_row, xdim);
+ }
+
+ // Proceed to next row
+ src_cur_row += src_row_pitch;
+ dst_cur_row += dst_row_pitch;
+ }
+
+ // Proceed to next slice
+ src_cur_slice += src_slice_pitch;
+ dst_cur_slice += dst_slice_pitch;
+ }
+ break;
+ }
+
+ case Event::CopyBufferRect:
+ {
+#ifdef DSPC868X
+ std::cerr << "Event type not yet supported" << std::endl;
+#else
+ CopyBufferRectEvent *e = (CopyBufferRectEvent *)event;
+
+ // Calculate the offsets into each buffer
+ size_t src_offset, dst_offset;
+
+ src_offset = e->src_origin(2) * e->src_slice_pitch() +
+ e->src_origin(1) * e->src_row_pitch() +
+ e->src_origin(0);
+
+ dst_offset = e->dst_origin(2) * e->dst_slice_pitch() +
+ e->dst_origin(1) * e->dst_row_pitch() +
+ e->dst_origin(0);
+
+ // Set up start points for the copy. If it is a DSP buffer, we'll
+ // need to map the buffer before copying (done in copy loop below)
+ DSPDevicePtr64 src_start, dst_start;
+
+ if (e->source()->flags() & CL_MEM_USE_HOST_PTR)
+ src_start = (DSPDevicePtr64)e->source()->host_ptr() + src_offset;
+ else
+ {
+ DSPBuffer *src = (DSPBuffer*)e->source()->deviceBuffer(device);
+ src_start = src->data() + src_offset;
+ }
+
+ if (e->destination()->flags() & CL_MEM_USE_HOST_PTR)
+ dst_start = (DSPDevicePtr64)e->destination()->host_ptr() + dst_offset;
+ else
+ {
+ DSPBuffer *dst=(DSPBuffer*)e->destination()->deviceBuffer(device);
+ dst_start = dst->data() + dst_offset;
+ }
+
+ // The dimensions of the region to be copied
+ cl_ulong xdim = e->region(0);
+ cl_ulong ydim = e->region(1);
+ cl_ulong zdim = e->region(2);
+
+ // If we need to map memory we will currently map a slice
+ // at a time. So determine the size of a 2D slice
+ size_t src_slice_size = ydim * e->src_row_pitch()-e->src_origin(0);
+ size_t dst_slice_size = ydim * e->dst_row_pitch()-e->dst_origin(0);
+
+ // Set up the initial copy point
+ DSPDevicePtr64 src_cur_slice = src_start;
+ DSPDevicePtr64 dst_cur_slice = dst_start;
+
+ // The outer loop handles each z-axis slice
+ // For 2-D copy, will only iterate once (zdim=1)
+ for(cl_ulong z = 0; z < zdim; z++)
+ {
+ uint8_t *src_cur_row = (uint8_t *)src_cur_slice;
+ uint8_t *dst_cur_row = (uint8_t *)dst_cur_slice;
+ uint8_t *src_cur_mslice, *dst_cur_mslice;
+
+ // If necessary, memory map a slice of buffer
+ if (!(e->source()->flags() & CL_MEM_USE_HOST_PTR))
+ src_cur_row = src_cur_mslice = (uint8_t *)
+ Driver::instance()->map(src_cur_slice, src_slice_size,true);
+
+ if (!(e->destination()->flags() & CL_MEM_USE_HOST_PTR))
+ dst_cur_row = dst_cur_mslice = (uint8_t *)
+ Driver::instance()->map(dst_cur_slice, dst_slice_size,false);
+
+ // The inner loop handles each row of the current slice
+ for(cl_ulong y = 0; y < ydim; y++)
+ {
+ // Copy current row
+ memcpy(dst_cur_row, src_cur_row, xdim);
+
+ // Proceed to next row
+ src_cur_row += e->src_row_pitch();
+ dst_cur_row += e->dst_row_pitch();
+ }
+
+ // If necessary, unmap the current slice
+ if (!(e->source()->flags() & CL_MEM_USE_HOST_PTR))
+ Driver::instance()->unmap(src_cur_mslice, src_cur_slice,
+ src_slice_size, false);
+
+ if (!(e->destination()->flags() & CL_MEM_USE_HOST_PTR))
+ Driver::instance()->unmap(dst_cur_mslice, dst_cur_slice,
+ dst_slice_size, true);
+
+ // Proceed to next slice
+ src_cur_slice += e->src_slice_pitch();
+ dst_cur_slice += e->dst_slice_pitch();
+ }
+#endif
+ break;
+ }
+
+ case Event::ReadImage:
+ case Event::WriteImage:
+ case Event::CopyImage:
+ case Event::CopyBufferToImage:
+ case Event::CopyImageToBuffer:
+ case Event::MapImage:
+ {
+ std::cerr << "Images are not supported" << std::endl;
+ break;
+ }
+
+ case Event::MapBuffer:
+ {
+#ifdef DSPC868X
+ std::cerr << "Event type not yet supported" << std::endl;
+#endif
+ MapBufferEvent *e = (MapBufferEvent *)event;
+
+ /*-----------------------------------------------------------
+ * for USE_HOST_PTR, the buffer store is already on the host and
+ * map should not be needed.
+ -----------------------------------------------------------*/
+ if (e->buffer()->flags() & CL_MEM_USE_HOST_PTR) break;
+
+ clRetainEvent((cl_event) e);
+ if(! e->buffer()->addMapEvent(e))
+ ERR(1, "MapBuffer: Range conflicts with previous maps");
+ if ((e->flags() & CL_MAP_READ) != 0)
+ {
+ DSPBuffer *buf = (DSPBuffer *)e->buffer()->deviceBuffer(device);
+ DSPDevicePtr64 data = (DSPDevicePtr64)buf->data() + e->offset();
+ Driver::instance()->map(data, e->cb(), true);
+ }
+ break;
+ }
+ case Event::UnmapMemObject:
+ {
+#ifdef DSPC868X
+ std::cerr << "Event type not yet supported" << std::endl;
+#endif
+ UnmapBufferEvent *e = (UnmapBufferEvent *)event;
+
+ /*-----------------------------------------------------------
+ * for USE_HOST_PTR, the buffer store is already on the host and
+ * unmap should not be needed.
+ -----------------------------------------------------------*/
+ if (e->buffer()->flags() & CL_MEM_USE_HOST_PTR) break;
+
+ if (e->buffer()->type() != Coal::MemObject::Buffer &&
+ e->buffer()->type() != Coal::MemObject::SubBuffer)
+ ERR(1, "UnmapMemObject: MapImage/Unmap not support yet");
+ MapBufferEvent *mbe = (MapBufferEvent *)
+ e->buffer()->removeMapEvent(e->mapping());
+ if (mbe == NULL)
+ ERR(1, "UnmapMemObject: host_ptr not from previous maps");
+
+ if ((mbe->flags() & CL_MAP_WRITE) != 0)
+ {
+ DSPBuffer *buf = (DSPBuffer *)e->buffer()->deviceBuffer(device);
+ DSPDevicePtr64 buf_dsp_addr = (DSPDevicePtr64)buf->data();
+ Driver::instance()->unmap(e->mapping(), buf_dsp_addr,
+ mbe->cb(), true);
+ }
+ if (queue) queue->releaseEvent(mbe);
+ break;
+ }
+
+ case Event::NativeKernel:
+ {
+ std::cerr << "Native Kernels not supported on the DSP" << std::endl;
+ break;
+ }
+
+ case Event::NDRangeKernel:
+ case Event::TaskKernel:
+ {
+ KernelEvent *e = (KernelEvent *) event;
+ DSPKernelEvent *ke = (DSPKernelEvent *)e->deviceData();
+
+ errcode = ke->run(t);
+
+ /*-----------------------------------------------------------------
+ * Put the event on a pending completion list and its
+ * completion will be handled asynchronously.
+ *----------------------------------------------------------------*/
+ if (errcode == CL_SUCCESS)
+ {
+ device->push_complete_pending(ke->kernel_id(), e);
+ return false;
+ }
+ break;
+ }
+ default: break;
+ }
+
+ /*---------------------------------------------------------------------
+ * Cleanup
+ *--------------------------------------------------------------------*/
+
+ // an event may be released once it is Complete
+ if (queue_props & CL_QUEUE_PROFILING_ENABLE)
+ event->updateTiming(Event::End);
+ event->setStatus((errcode == CL_SUCCESS) ? Event::Complete :
+ (Event::Status)errcode);
+
+ return false;
+}
+
+/******************************************************************************
+* dsp_worker
+******************************************************************************/
+/**
+ * \brief Worker loop servicing one DSP device.
+ *
+ * Each iteration first retires kernels whose completion has been reported
+ * (when completions are pending and \c mail_query() says one is available),
+ * then dispatches the next available event unless a stop was requested.
+ * The loop only exits once a stop was requested (or a dispatch asked for it)
+ * and no completion is pending any more.
+ *
+ * \param data the \c DSPDevice to service, passed as an opaque pointer
+ * \return always a null pointer
+ */
+void *dsp_worker(void *data)
+{
+    DSPDevice *device = (DSPDevice *)data;
+
+    while (true)
+    {
+        if (device->any_complete_pending() && device->mail_query())
+            handle_event_completion(device);
+
+        bool stop = device->stop();
+
+        if (!stop && device->availableEvent())
+            stop |= handle_event_dispatch(device);
+
+        if (stop && !device->any_complete_pending()) break;
+    }
+
+    // A function returning void* must return a value; flowing off the end
+    // of a non-void function is undefined behaviour in C++.
+    return NULL;
+}
diff --git a/src/core/events.cpp b/src/core/events.cpp
new file mode 100644
index 0000000..629a0c9
--- /dev/null
+++ b/src/core/events.cpp
@@ -0,0 +1,1519 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file events.cpp
+ * \brief Events inheriting \c Coal::Event
+ */
+
+#include "events.h"
+#include "commandqueue.h"
+#include "memobject.h"
+#include "kernel.h"
+#include "deviceinterface.h"
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+using namespace Coal;
+
+/*
+ * Read/Write buffers
+ */
+
+/**
+ * \brief Common validation for every event that operates on a memory object.
+ *
+ * Checks, in order: the buffer is non-null, it belongs to the same context
+ * as the command queue, a sub-buffer offset is properly aligned for the
+ * queue's device, and the buffer can be allocated on that device.
+ *
+ * \param parent command queue the event is enqueued on
+ * \param buffer memory object the event works on
+ * \param num_events_in_wait_list number of entries in \p event_wait_list
+ * \param event_wait_list events that must complete first
+ * \param errcode_ret receives CL_SUCCESS or the first error encountered
+ */
+BufferEvent::BufferEvent(CommandQueue *parent,
+                         MemObject *buffer,
+                         cl_uint num_events_in_wait_list,
+                         const Event **event_wait_list,
+                         cl_int *errcode_ret)
+: Event(parent, Queued, num_events_in_wait_list, event_wait_list, errcode_ret),
+  p_buffer(buffer)
+{
+    // Retained before any early return: the destructor releases
+    // unconditionally.
+    clRetainMemObject((cl_mem) p_buffer);
+
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    if (buffer == 0)
+    {
+        *errcode_ret = CL_INVALID_MEM_OBJECT;
+        return;
+    }
+
+    // The buffer and the queue must live in the same context
+    Context *queue_ctx = 0;
+    *errcode_ret = parent->info(CL_QUEUE_CONTEXT, sizeof(Context *),
+                                &queue_ctx, 0);
+
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    if (queue_ctx != (Context *)buffer->parent())
+    {
+        *errcode_ret = CL_INVALID_CONTEXT;
+        return;
+    }
+
+    // Device-dependent checks
+    DeviceInterface *queue_dev = 0;
+    *errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *),
+                                &queue_dev, 0);
+
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    if (!isSubBufferAligned(buffer, queue_dev))
+        *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET;
+    else if (!buffer->allocate(queue_dev))
+        *errcode_ret = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+}
+
+/** \brief Drops the reference on the buffer taken by the constructor. */
+BufferEvent::~BufferEvent()
+{
+    cl_mem handle = (cl_mem) p_buffer;
+    clReleaseMemObject(handle);
+}
+
+/** \brief Memory object this event was created for. */
+MemObject *BufferEvent::buffer() const
+{
+    return this->p_buffer;
+}
+
+/**
+ * \brief Check a sub-buffer's offset against the device base alignment.
+ *
+ * Objects that are not sub-buffers always pass. For sub-buffers the value
+ * reported for \c CL_DEVICE_MEM_BASE_ADDR_ALIGN is turned into a mask
+ * (value - 1) and applied to the offset.
+ *
+ * NOTE(review): the OpenCL specification expresses
+ * CL_DEVICE_MEM_BASE_ADDR_ALIGN in bits, while the value is used here
+ * directly as a byte mask; confirm the unit returned by device->info().
+ *
+ * \param buffer memory object to check
+ * \param device device whose alignment requirement applies
+ * \return false if the device query fails or the offset is misaligned
+ */
+bool BufferEvent::isSubBufferAligned(const MemObject *buffer,
+                                     const DeviceInterface *device)
+{
+    // Only sub-buffers carry an offset that needs checking
+    if (buffer->type() != MemObject::SubBuffer)
+        return true;
+
+    cl_uint align;
+    const cl_int rs = device->info(CL_DEVICE_MEM_BASE_ADDR_ALIGN,
+                                   sizeof(uint), &align, 0);
+
+    if (rs != CL_SUCCESS)
+        return false;
+
+    // An alignment of 0 imposes no constraint
+    const size_t mask = (align != 0) ? (align - 1) : 0;
+
+    return (((SubBuffer *)buffer)->offset() & mask) == 0;
+}
+
+/**
+ * \brief Common part of buffer read and write events.
+ *
+ * \param parent command queue the event is enqueued on
+ * \param buffer buffer to read from or write to
+ * \param offset byte offset of the transfer inside \p buffer
+ * \param cb number of bytes to transfer
+ * \param ptr host memory involved in the transfer, must not be null
+ * \param num_events_in_wait_list number of entries in \p event_wait_list
+ * \param event_wait_list events that must complete first
+ * \param errcode_ret receives CL_INVALID_VALUE if \p ptr is null or the
+ *        range [offset, offset + cb) does not fit in the buffer
+ */
+ReadWriteBufferEvent::ReadWriteBufferEvent(CommandQueue *parent,
+                                           MemObject *buffer,
+                                           size_t offset,
+                                           size_t cb,
+                                           void *ptr,
+                                           cl_uint num_events_in_wait_list,
+                                           const Event **event_wait_list,
+                                           cl_int *errcode_ret)
+: BufferEvent(parent, buffer, num_events_in_wait_list, event_wait_list, errcode_ret),
+  p_offset(offset), p_cb(cb), p_ptr(ptr)
+{
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    if (!ptr)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Check for out-of-bounds accesses. Written without computing
+    // offset + cb, which could wrap around and slip past the check.
+    const size_t buf_size = buffer->size();
+
+    if (offset > buf_size || cb > buf_size - offset)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+}
+
+/** \brief Byte offset of the transfer inside the buffer. */
+size_t ReadWriteBufferEvent::offset() const
+{
+    return this->p_offset;
+}
+
+/** \brief Number of bytes to transfer. */
+size_t ReadWriteBufferEvent::cb() const
+{
+    return this->p_cb;
+}
+
+/** \brief Host pointer given at construction. */
+void *ReadWriteBufferEvent::ptr() const
+{
+    return this->p_ptr;
+}
+
+/**
+ * \brief Buffer read event; all validation is done by
+ *        \c ReadWriteBufferEvent.
+ */
+ReadBufferEvent::ReadBufferEvent(CommandQueue *parent,
+                                 MemObject *buffer,
+                                 size_t offset,
+                                 size_t cb,
+                                 void *ptr,
+                                 cl_uint num_events_in_wait_list,
+                                 const Event **event_wait_list,
+                                 cl_int *errcode_ret)
+: ReadWriteBufferEvent(parent, buffer, offset, cb, ptr,
+                       num_events_in_wait_list, event_wait_list,
+                       errcode_ret)
+{
+    // Nothing specific to reads
+}
+
+/** \brief Reports this event as \c Event::ReadBuffer. */
+Event::Type ReadBufferEvent::type() const
+{
+    return ReadBuffer;
+}
+
+/**
+ * \brief Buffer write event; all validation is done by
+ *        \c ReadWriteBufferEvent.
+ */
+WriteBufferEvent::WriteBufferEvent(CommandQueue *parent,
+                                   MemObject *buffer,
+                                   size_t offset,
+                                   size_t cb,
+                                   void *ptr,
+                                   cl_uint num_events_in_wait_list,
+                                   const Event **event_wait_list,
+                                   cl_int *errcode_ret)
+: ReadWriteBufferEvent(parent, buffer, offset, cb, ptr,
+                       num_events_in_wait_list, event_wait_list,
+                       errcode_ret)
+{
+    // Nothing specific to writes
+}
+
+/** \brief Reports this event as \c Event::WriteBuffer. */
+Event::Type WriteBufferEvent::type() const
+{
+    return WriteBuffer;
+}
+
+/**
+ * \brief Event mapping a range of a buffer into host memory.
+ *
+ * \param parent command queue the event is enqueued on
+ * \param buffer buffer to map
+ * \param offset byte offset of the mapped range
+ * \param cb size in bytes of the mapped range
+ * \param map_flags combination of CL_MAP_READ and CL_MAP_WRITE
+ * \param num_events_in_wait_list number of entries in \p event_wait_list
+ * \param event_wait_list events that must complete first
+ * \param errcode_ret receives CL_INVALID_VALUE for unknown flags or a
+ *        range that does not fit in the buffer
+ */
+MapBufferEvent::MapBufferEvent(CommandQueue *parent,
+                               MemObject *buffer,
+                               size_t offset,
+                               size_t cb,
+                               cl_map_flags map_flags,
+                               cl_uint num_events_in_wait_list,
+                               const Event **event_wait_list,
+                               cl_int *errcode_ret)
+: BufferEvent(parent, buffer, num_events_in_wait_list, event_wait_list, errcode_ret),
+  p_offset(offset), p_cb(cb), p_map_flags(map_flags)
+{
+    // ptr() must not return an indeterminate value before setPtr() is called
+    p_ptr = 0;
+
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // Check flags
+    if (map_flags & ~(CL_MAP_READ | CL_MAP_WRITE))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Check for out-of-bounds values without computing offset + cb,
+    // which could wrap around and slip past the check.
+    const size_t buf_size = buffer->size();
+
+    if (offset > buf_size || cb > buf_size - offset)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+}
+
+/** \brief Reports this event as \c Event::MapBuffer. */
+Event::Type MapBufferEvent::type() const
+{
+    return MapBuffer;
+}
+
+/** \brief Byte offset of the mapped range inside the buffer. */
+size_t MapBufferEvent::offset() const
+{
+    return this->p_offset;
+}
+
+/** \brief Size in bytes of the mapped range. */
+size_t MapBufferEvent::cb() const
+{
+    return this->p_cb;
+}
+
+/** \brief Map flags given at construction. */
+cl_map_flags MapBufferEvent::flags() const
+{
+    return this->p_map_flags;
+}
+
+/** \brief Pointer last stored with setPtr(). */
+void *MapBufferEvent::ptr() const
+{
+    return this->p_ptr;
+}
+
+/** \brief Store the pointer later returned by ptr(). */
+void MapBufferEvent::setPtr(void *ptr)
+{
+    this->p_ptr = ptr;
+}
+
+/**
+ * \brief Event mapping a region of an image into host memory.
+ *
+ * The x components of origin and region are converted to bytes by
+ * multiplying them by the image's pixel size.
+ *
+ * \param parent command queue the event is enqueued on
+ * \param image image to map
+ * \param map_flags combination of CL_MAP_READ and CL_MAP_WRITE
+ * \param origin first pixel of the region, or null for (0, 0, 0)
+ * \param region size of the region in pixels; no component may be zero
+ * \param num_events_in_wait_list number of entries in \p event_wait_list
+ * \param event_wait_list events that must complete first
+ * \param errcode_ret receives CL_INVALID_VALUE for bad flags, a null or
+ *        zero-sized region, or a region outside the image
+ */
+MapImageEvent::MapImageEvent(CommandQueue *parent,
+                             Image2D *image,
+                             cl_map_flags map_flags,
+                             const size_t origin[3],
+                             const size_t region[3],
+                             cl_uint num_events_in_wait_list,
+                             const Event **event_wait_list,
+                             cl_int *errcode_ret)
+: BufferEvent (parent, image, num_events_in_wait_list, event_wait_list, errcode_ret)
+{
+    // Give every member read by an accessor a defined value. The flags in
+    // particular were never stored, so flags() returned garbage.
+    p_map_flags = map_flags;
+    p_ptr = 0;
+    p_row_pitch = 0;
+    p_slice_pitch = 0;
+
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // Check flags
+    if (map_flags & ~(CL_MAP_READ | CL_MAP_WRITE))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // A region is mandatory
+    if (!region)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Copy the vectors
+    if (origin)
+        std::memcpy(&p_origin, origin, 3 * sizeof(size_t));
+    else
+        std::memset(&p_origin, 0, 3 * sizeof(size_t));
+
+    for (unsigned int i=0; i<3; ++i)
+    {
+        if (!region[i])
+        {
+            *errcode_ret = CL_INVALID_VALUE;
+            return;
+        }
+
+        p_region[i] = region[i];
+    }
+
+    // Multiply the elements (for images)
+    p_region[0] *= image->pixel_size();
+    p_origin[0] *= image->pixel_size();
+
+    // A 2D image has a single slice. The stored copies are used because
+    // the origin argument may be null.
+    if (image->type() == MemObject::Image2D &&
+        (p_origin[2] != 0 || p_region[2] != 1))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Check for out-of-bounds
+    if ((p_origin[0] + p_region[0]) > image->row_pitch() ||
+        (p_origin[1] + p_region[1]) * image->row_pitch() > image->slice_pitch() ||
+        (p_origin[2] + p_region[2]) * image->slice_pitch() > image->size())
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+}
+
+/** \brief Reports this event as \c Event::MapImage. */
+Event::Type MapImageEvent::type() const
+{
+    return MapImage;
+}
+
+
+/** \brief Value of the \c p_map_flags member. */
+cl_map_flags MapImageEvent::flags() const
+{
+    return this->p_map_flags;
+}
+
+/**
+ * \brief One component of the stored origin.
+ * \param index component to read; not range-checked
+ */
+size_t MapImageEvent::origin (unsigned int index) const
+{
+    return this->p_origin[index];
+}
+
+/**
+ * \brief One component of the stored region.
+ * \param index component to read; not range-checked
+ */
+size_t MapImageEvent::region (unsigned int index) const
+{
+    return this->p_region[index];
+}
+
+/** \brief Row pitch last stored with setRowPitch(). */
+size_t MapImageEvent::row_pitch() const
+{
+    return this->p_row_pitch;
+}
+
+/** \brief Slice pitch last stored with setSlicePitch(). */
+size_t MapImageEvent::slice_pitch() const
+{
+    return this->p_slice_pitch;
+}
+
+/** \brief Pointer last stored with setPtr(). */
+void *MapImageEvent::ptr() const
+{
+    return this->p_ptr;
+}
+
+/** \brief Store the row pitch later returned by row_pitch(). */
+void MapImageEvent::setRowPitch (size_t row_pitch)
+{
+    this->p_row_pitch = row_pitch;
+}
+
+/** \brief Store the slice pitch later returned by slice_pitch(). */
+void MapImageEvent::setSlicePitch (size_t slice_pitch)
+{
+    this->p_slice_pitch = slice_pitch;
+}
+
+/** \brief Store the pointer later returned by ptr(). */
+void MapImageEvent::setPtr (void *ptr)
+{
+    this->p_ptr = ptr;
+}
+
+/**
+ * \brief Event unmapping a previously mapped memory object.
+ *
+ * \param parent command queue the event is enqueued on
+ * \param buffer memory object being unmapped
+ * \param mapped_addr host address to unmap, must not be null
+ * \param num_events_in_wait_list number of entries in \p event_wait_list
+ * \param event_wait_list events that must complete first
+ * \param errcode_ret receives CL_INVALID_VALUE if \p mapped_addr is null
+ */
+UnmapBufferEvent::UnmapBufferEvent(CommandQueue *parent,
+                                   MemObject *buffer,
+                                   void *mapped_addr,
+                                   cl_uint num_events_in_wait_list,
+                                   const Event **event_wait_list,
+                                   cl_int *errcode_ret)
+: BufferEvent(parent, buffer, num_events_in_wait_list, event_wait_list, errcode_ret),
+  p_mapping(mapped_addr)
+{
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // TODO: Check that p_mapping is ok (will be done in the drivers)
+    if (mapped_addr == 0)
+        *errcode_ret = CL_INVALID_VALUE;
+}
+
+/** \brief Reports this event as \c Event::UnmapMemObject. */
+Event::Type UnmapBufferEvent::type() const
+{
+    return UnmapMemObject;
+}
+
+/** \brief Host address given at construction. */
+void *UnmapBufferEvent::mapping() const
+{
+    return this->p_mapping;
+}
+
+/**
+ * \brief Event copying a byte range from one buffer to another.
+ *
+ * The source is validated by \c BufferEvent; this constructor validates
+ * the destination, the two ranges and their overlap.
+ *
+ * \param parent command queue the event is enqueued on
+ * \param source buffer to copy from
+ * \param destination buffer to copy to
+ * \param src_offset byte offset in \p source
+ * \param dst_offset byte offset in \p destination
+ * \param cb number of bytes to copy
+ * \param num_events_in_wait_list number of entries in \p event_wait_list
+ * \param event_wait_list events that must complete first
+ * \param errcode_ret receives CL_INVALID_MEM_OBJECT, CL_INVALID_VALUE,
+ *        CL_MEM_COPY_OVERLAP, CL_MISALIGNED_SUB_BUFFER_OFFSET or
+ *        CL_MEM_OBJECT_ALLOCATION_FAILURE on error
+ */
+CopyBufferEvent::CopyBufferEvent(CommandQueue *parent,
+                                 MemObject *source,
+                                 MemObject *destination,
+                                 size_t src_offset,
+                                 size_t dst_offset,
+                                 size_t cb,
+                                 cl_uint num_events_in_wait_list,
+                                 const Event **event_wait_list,
+                                 cl_int *errcode_ret)
+: BufferEvent(parent, source, num_events_in_wait_list, event_wait_list,
+              errcode_ret), p_destination(destination), p_src_offset(src_offset),
+  p_dst_offset(dst_offset), p_cb(cb)
+{
+    // Retained before any early return: the destructor releases
+    // unconditionally.
+    clRetainMemObject((cl_mem) p_destination);
+
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    if (!destination)
+    {
+        *errcode_ret = CL_INVALID_MEM_OBJECT;
+        return;
+    }
+
+    // Check for out-of-bounds. Written without computing offset + cb,
+    // which could wrap around and slip past the check.
+    const size_t src_size = source->size();
+    const size_t dst_size = destination->size();
+
+    if (src_offset > src_size || cb > src_size - src_offset ||
+        dst_offset > dst_size || cb > dst_size - dst_offset)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Check for overlap: two ranges of cb bytes in the same buffer overlap
+    // when their starts are less than cb bytes apart. This includes equal
+    // offsets, and never triggers for cb == 0.
+    if (source == destination)
+    {
+        const size_t gap = (src_offset < dst_offset)
+                               ? dst_offset - src_offset
+                               : src_offset - dst_offset;
+
+        if (gap < cb)
+        {
+            *errcode_ret = CL_MEM_COPY_OVERLAP;
+            return;
+        }
+    }
+
+    // Check alignment of destination
+    DeviceInterface *device = 0;
+    *errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *),
+                                &device, 0);
+
+    if (*errcode_ret != CL_SUCCESS)
+        return;
+
+    if (!isSubBufferAligned(destination, device))
+    {
+        *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET;
+        return;
+    }
+
+    // Allocate the buffer for the device
+    if (!destination->allocate(device))
+    {
+        *errcode_ret = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+        return;
+    }
+}
+
+/** \brief Drops the reference on the destination taken by the constructor. */
+CopyBufferEvent::~CopyBufferEvent()
+{
+    cl_mem handle = (cl_mem) p_destination;
+    clReleaseMemObject(handle);
+}
+
+/** \brief Source buffer, i.e. the buffer held by the \c BufferEvent base. */
+MemObject *CopyBufferEvent::source() const
+{
+    MemObject *src = buffer();
+    return src;
+}
+
+/** \brief Destination buffer given at construction. */
+MemObject *CopyBufferEvent::destination() const
+{
+    return this->p_destination;
+}
+
+/** \brief Byte offset in the source buffer. */
+size_t CopyBufferEvent::src_offset() const
+{
+    return this->p_src_offset;
+}
+
+/** \brief Byte offset in the destination buffer. */
+size_t CopyBufferEvent::dst_offset() const
+{
+    return this->p_dst_offset;
+}
+
+/** \brief Number of bytes to copy. */
+size_t CopyBufferEvent::cb() const
+{
+    return this->p_cb;
+}
+
+/** \brief Reports this event as \c Event::CopyBuffer. */
+Event::Type CopyBufferEvent::type() const
+{
+    return CopyBuffer;
+}
+
+/*
+ * Native kernel
+ */
+/**
+ * \brief Event running a host function as a native kernel.
+ *
+ * The argument block is copied, and every location listed in
+ * \p args_mem_loc is patched, inside the copy, with the native global
+ * pointer of the corresponding memory object.
+ *
+ * \param parent command queue the event is enqueued on
+ * \param user_func host function to call, must not be null
+ * \param args argument block handed to \p user_func
+ * \param cb_args size in bytes of \p args
+ * \param num_mem_objects number of entries in \p mem_list and
+ *        \p args_mem_loc
+ * \param mem_list memory objects referenced from \p args
+ * \param args_mem_loc addresses, inside \p args, of the pointer-sized
+ *        slots to patch
+ * \param num_events_in_wait_list number of entries in \p event_wait_list
+ * \param event_wait_list events that must complete first
+ * \param errcode_ret receives CL_INVALID_VALUE, CL_INVALID_OPERATION,
+ *        CL_OUT_OF_HOST_MEMORY or CL_INVALID_MEM_OBJECT on error
+ */
+NativeKernelEvent::NativeKernelEvent(CommandQueue *parent,
+                                     void (*user_func)(void *),
+                                     void *args,
+                                     size_t cb_args,
+                                     cl_uint num_mem_objects,
+                                     const MemObject **mem_list,
+                                     const void **args_mem_loc,
+                                     cl_uint num_events_in_wait_list,
+                                     const Event **event_wait_list,
+                                     cl_int *errcode_ret)
+: Event (parent, Queued, num_events_in_wait_list, event_wait_list, errcode_ret),
+  p_user_func((void *)user_func), p_args(0)
+{
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // Parameters sanity
+    if (!user_func)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    if (!args && (cb_args || num_mem_objects))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    if (args && !cb_args)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    if (num_mem_objects && (!mem_list || !args_mem_loc))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    if (!num_mem_objects && (mem_list || args_mem_loc))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Check that the device can execute a native kernel
+    DeviceInterface *device;
+    cl_device_exec_capabilities caps;
+
+    *errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *),
+                                &device, 0);
+
+    if (*errcode_ret != CL_SUCCESS)
+        return;
+
+    *errcode_ret = device->info(CL_DEVICE_EXECUTION_CAPABILITIES,
+                                sizeof(cl_device_exec_capabilities), &caps, 0);
+
+    if (*errcode_ret != CL_SUCCESS)
+        return;
+
+    if ((caps & CL_EXEC_NATIVE_KERNEL) == 0)
+    {
+        *errcode_ret = CL_INVALID_OPERATION;
+        return;
+    }
+
+    // Copy the arguments in a new list (released by the destructor, also
+    // when one of the checks below fails)
+    if (cb_args)
+    {
+        p_args = std::malloc(cb_args);
+
+        if (!p_args)
+        {
+            *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+            return;
+        }
+
+        std::memcpy((void *)p_args, (void *)args, cb_args);
+
+        const char *args_begin = (const char *)args;
+        char *copy_begin = (char *)p_args;
+
+        // Replace memory objects with global pointers
+        for (cl_uint i=0; i<num_mem_objects; ++i)
+        {
+            const MemObject *buffer = mem_list[i];
+            const char *loc = (const char *)args_mem_loc[i];
+
+            if (!buffer)
+            {
+                *errcode_ret = CL_INVALID_MEM_OBJECT;
+                return;
+            }
+
+            // loc must designate a pointer-sized slot inside the caller's
+            // argument block, otherwise the patch would write outside of
+            // our copy.
+            if (!loc || cb_args < sizeof(void *) || loc < args_begin ||
+                (size_t)(loc - args_begin) > cb_args - sizeof(void *))
+            {
+                *errcode_ret = CL_INVALID_VALUE;
+                return;
+            }
+
+            // Relocation: the slot sits at the same offset in our copy.
+            // The offset is taken inside args, so pointers to two unrelated
+            // allocations are never subtracted.
+            const size_t slot = (size_t)(loc - args_begin);
+            void *global = buffer->deviceBuffer(device)->nativeGlobalPointer();
+
+            // memcpy: the slot is not guaranteed to be pointer-aligned
+            std::memcpy(copy_begin + slot, &global, sizeof(global));
+        }
+    }
+}
+
+/** \brief Releases the private copy of the argument block, if any. */
+NativeKernelEvent::~NativeKernelEvent()
+{
+    // free() accepts a null pointer, no guard needed
+    std::free((void *)p_args);
+}
+
+/** \brief Reports this event as \c Event::NativeKernel. */
+Event::Type NativeKernelEvent::type() const
+{
+    return NativeKernel;
+}
+
+/** \brief The user function, stored as an untyped pointer. */
+void *NativeKernelEvent::function() const
+{
+    return this->p_user_func;
+}
+
+/** \brief The private copy of the argument block, or null if none was made. */
+void *NativeKernelEvent::args() const
+{
+    return this->p_args;
+}
+
+/*
+ * Kernel event
+ */
+/**
+ * \brief Validate and record an N-dimensional kernel launch.
+ *
+ * Checks that the kernel was built for the queue's device, that kernel
+ * and queue share a context, that all arguments are set, and that the
+ * work dimensions and sizes are acceptable to the device. Offsets and
+ * sizes are then stored; dimensions beyond \p work_dim get offset 0 and
+ * size 1.
+ *
+ * \param parent command queue the event is enqueued on
+ * \param kernel kernel to run
+ * \param work_dim number of dimensions used
+ * \param global_work_offset per-dimension offsets, or null for all zeros
+ * \param global_work_size per-dimension global sizes, all non-zero
+ * \param local_work_size per-dimension work-group sizes, or null to let
+ *        the device kernel pick them
+ * \param num_events_in_wait_list number of entries in \p event_wait_list
+ * \param event_wait_list events that must complete first
+ * \param errcode_ret receives CL_SUCCESS or the first error encountered
+ */
+KernelEvent::KernelEvent(CommandQueue *parent,
+                         Kernel *kernel,
+                         cl_uint work_dim,
+                         const size_t *global_work_offset,
+                         const size_t *global_work_size,
+                         const size_t *local_work_size,
+                         cl_uint num_events_in_wait_list,
+                         const Event **event_wait_list,
+                         cl_int *errcode_ret)
+: Event(parent, Queued, num_events_in_wait_list, event_wait_list, errcode_ret),
+  p_work_dim(work_dim), p_kernel(kernel)
+{
+    // Retained before any early return: the destructor releases
+    // unconditionally.
+    clRetainKernel((cl_kernel) p_kernel);
+
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // Sanity checks
+    if (!kernel)
+    {
+        *errcode_ret = CL_INVALID_KERNEL;
+        return;
+    }
+
+    // Check that the kernel was built for parent's device.
+    DeviceInterface *device;
+    Context *k_ctx, *q_ctx;
+    size_t max_work_group_size;
+    cl_uint max_dims = 0;
+
+    *errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *),
+                                &device, 0);
+
+    if (*errcode_ret != CL_SUCCESS)
+        return;
+
+    *errcode_ret = parent->info(CL_QUEUE_CONTEXT, sizeof(Context *), &q_ctx, 0);
+    *errcode_ret |= kernel->info(CL_KERNEL_CONTEXT, sizeof(Context *), &k_ctx, 0);
+    *errcode_ret |= device->info(CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t),
+                                 &max_work_group_size, 0);
+    // max_dims is a cl_uint: advertise the real size of the destination
+    *errcode_ret |= device->info(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint),
+                                 &max_dims, 0);
+    *errcode_ret |= device->info(CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                                 max_dims * sizeof(size_t), p_max_work_item_sizes, 0);
+
+    if (*errcode_ret != CL_SUCCESS)
+        return;
+
+    p_dev_kernel = kernel->deviceDependentKernel(device);
+
+    if (!p_dev_kernel)
+    {
+        *errcode_ret = CL_INVALID_PROGRAM_EXECUTABLE;
+        return;
+    }
+
+    // Check that contexts match
+    if (k_ctx != q_ctx)
+    {
+        *errcode_ret = CL_INVALID_CONTEXT;
+        return;
+    }
+
+    // Check args
+    if (!kernel->argsSpecified())
+    {
+        *errcode_ret = CL_INVALID_KERNEL_ARGS;
+        return;
+    }
+
+    // Check dimension
+    if (work_dim == 0 || work_dim > max_dims)
+    {
+        *errcode_ret = CL_INVALID_WORK_DIMENSION;
+        return;
+    }
+
+    // Populate work_offset, work_size and local_work_size
+    size_t work_group_size = 1;
+    boost::tuple <uint,uint,uint> reqd_work_group_size(
+        kernel->reqdWorkGroupSize(kernel->deviceDependentModule(device)));
+
+    uint reqd_x = reqd_work_group_size.get<0>();
+    uint reqd_y = reqd_work_group_size.get<1>();
+    uint reqd_z = reqd_work_group_size.get<2>();
+    bool reqd_any = reqd_x > 0 || reqd_y > 0 || reqd_z > 0;
+
+    if (reqd_any)
+    {
+        // if __attribute__((reqd_work_group_size(X, Y, Z))) is set and local size not specified
+        if (!local_work_size)
+        {
+            *errcode_ret = CL_INVALID_WORK_GROUP_SIZE;
+            return;
+        }
+
+        // if __attribute__((reqd_work_group_size(X, Y, Z))) doesn't match
+        if ((                local_work_size[0] != reqd_x) ||
+            (work_dim > 1 && local_work_size[1] != reqd_y) ||
+            (work_dim > 2 && local_work_size[2] != reqd_z))
+        {
+            *errcode_ret = CL_INVALID_WORK_GROUP_SIZE;
+            return;
+        }
+    }
+
+    cl_uint i;
+    for (i=0; i<work_dim; ++i)
+    {
+        p_global_work_offset[i] = global_work_offset ? global_work_offset[i] : 0;
+
+        if (!global_work_size || !global_work_size[i])
+        {
+            // Must stop here: global_work_size is dereferenced just below
+            *errcode_ret = CL_INVALID_GLOBAL_WORK_SIZE;
+            return;
+        }
+        p_global_work_size[i] = global_work_size[i];
+
+        if (!local_work_size)
+        {
+            // Guess the best value according to the device
+            p_local_work_size[i] =
+                p_dev_kernel->guessWorkGroupSize(work_dim, i, global_work_size[i]);
+        }
+        else
+        {
+            // A zero local size can divide nothing (and would make the
+            // modulo below a division by zero)
+            if (local_work_size[i] == 0)
+            {
+                *errcode_ret = CL_INVALID_WORK_GROUP_SIZE;
+                return;
+            }
+
+            // Check divisibility
+            if ((global_work_size[i] % local_work_size[i]) != 0)
+            {
+                *errcode_ret = CL_INVALID_WORK_GROUP_SIZE;
+                return;
+            }
+
+            // Not too big ?
+            if (local_work_size[i] > p_max_work_item_sizes[i])
+            {
+                *errcode_ret = CL_INVALID_WORK_ITEM_SIZE;
+                return;
+            }
+
+            p_local_work_size[i] = local_work_size[i];
+            work_group_size *= local_work_size[i];
+        }
+    }
+    // initialize missing dimensions
+    for (; i < max_dims; i++)
+    {
+        p_global_work_offset[i] = 0;
+        p_global_work_size[i] = 1;
+        p_local_work_size[i] = 1;
+    }
+
+    // Check we don't ask too much to the device
+    if (work_group_size > max_work_group_size)
+    {
+        *errcode_ret = CL_INVALID_WORK_GROUP_SIZE;
+        return;
+    }
+
+    // Check arguments (buffer alignment, image size, ...)
+    for (unsigned int i=0; i<kernel->numArgs(); ++i)
+    {
+        const Kernel::Arg *a = kernel->arg(i);
+
+        if (a->kind() == Kernel::Arg::Buffer && a->file() != Kernel::Arg::Local)
+        {
+            const MemObject *buffer = *(const MemObject **)(a->value(0));
+
+            if (!BufferEvent::isSubBufferAligned(buffer, device))
+            {
+                *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET;
+                return;
+            }
+        }
+        else if (a->kind() == Kernel::Arg::Image2D)
+        {
+            const Image2D *image = *(const Image2D **)(a->value(0));
+            size_t maxWidth, maxHeight;
+
+            *errcode_ret = device->info(CL_DEVICE_IMAGE2D_MAX_WIDTH,
+                                        sizeof(size_t), &maxWidth, 0);
+            *errcode_ret |= device->info(CL_DEVICE_IMAGE2D_MAX_HEIGHT,
+                                         sizeof(size_t), &maxHeight, 0);
+
+            if (*errcode_ret != CL_SUCCESS)
+                return;
+
+            if (image->width() > maxWidth || image->height() > maxHeight)
+            {
+                *errcode_ret = CL_INVALID_IMAGE_SIZE;
+                return;
+            }
+        }
+        else if (a->kind() == Kernel::Arg::Image3D)
+        {
+            const Image3D *image = *(const Image3D **)a->value(0);
+            size_t maxWidth, maxHeight, maxDepth;
+
+            *errcode_ret = device->info(CL_DEVICE_IMAGE3D_MAX_WIDTH,
+                                        sizeof(size_t), &maxWidth, 0);
+            *errcode_ret |= device->info(CL_DEVICE_IMAGE3D_MAX_HEIGHT,
+                                         sizeof(size_t), &maxHeight, 0);
+            *errcode_ret |= device->info(CL_DEVICE_IMAGE3D_MAX_DEPTH,
+                                         sizeof(size_t), &maxDepth, 0);
+
+            if (*errcode_ret != CL_SUCCESS)
+                return;
+
+            if (image->width() > maxWidth || image->height() > maxHeight ||
+                image->depth() > maxDepth)
+            {
+                *errcode_ret = CL_INVALID_IMAGE_SIZE;
+                return;
+            }
+        }
+    }
+}
+
+/** \brief Drops the reference on the kernel taken by the constructor. */
+KernelEvent::~KernelEvent()
+{
+    cl_kernel handle = (cl_kernel) p_kernel;
+    clReleaseKernel(handle);
+}
+
+/** \brief Number of work dimensions given at construction. */
+cl_uint KernelEvent::work_dim() const
+{
+    return this->p_work_dim;
+}
+
+/**
+ * \brief Global work offset for one dimension.
+ * \param dim dimension to read; not range-checked
+ */
+size_t KernelEvent::global_work_offset(cl_uint dim) const
+{
+    return this->p_global_work_offset[dim];
+}
+
+/**
+ * \brief Global work size for one dimension.
+ * \param dim dimension to read; not range-checked
+ */
+size_t KernelEvent::global_work_size(cl_uint dim) const
+{
+    return this->p_global_work_size[dim];
+}
+
+/**
+ * \brief Work-group size for one dimension.
+ * \param dim dimension to read; not range-checked
+ */
+size_t KernelEvent::local_work_size(cl_uint dim) const
+{
+    return this->p_local_work_size[dim];
+}
+
+/** \brief Kernel given at construction. */
+Kernel *KernelEvent::kernel() const
+{
+    return this->p_kernel;
+}
+
+/** \brief Device-dependent kernel looked up by the constructor. */
+DeviceKernel *KernelEvent::deviceKernel() const
+{
+    return this->p_dev_kernel;
+}
+
+/** \brief Reports this event as \c Event::NDRangeKernel. */
+Event::Type KernelEvent::type() const
+{
+    return NDRangeKernel;
+}
+
+// Used as both the global and the local work size of a task
+static size_t one = 1;
+
+/**
+ * \brief Task event: a one-dimensional kernel launch with a global and a
+ *        local work size of 1 and no global offset.
+ */
+TaskEvent::TaskEvent(CommandQueue *parent,
+                     Kernel *kernel,
+                     cl_uint num_events_in_wait_list,
+                     const Event **event_wait_list,
+                     cl_int *errcode_ret)
+: KernelEvent(parent, kernel,
+              1,        // work_dim
+              0,        // global_work_offset
+              &one,     // global_work_size
+              &one,     // local_work_size
+              num_events_in_wait_list, event_wait_list, errcode_ret)
+{
+    // TODO: CL_INVALID_WORK_GROUP_SIZE if
+    // __attribute__((reqd_work_group_size(X, Y, Z))) != (1, 1, 1)
+}
+
+/** \brief Reports this event as \c Event::TaskKernel. */
+Event::Type TaskEvent::type() const
+{
+    return TaskKernel;
+}
+
+/*
+ * User event
+ */
+/**
+ * \brief User event: created without a command queue or wait list, in the
+ *        \c Submitted state.
+ * \param context context stored for later retrieval through context()
+ * \param errcode_ret forwarded to the \c Event constructor
+ */
+UserEvent::UserEvent(Context *context, cl_int *errcode_ret)
+: Event(0, Submitted, 0, 0, errcode_ret),
+  p_context(context)
+{
+}
+
+/** \brief Reports this event as \c Event::User. */
+Event::Type UserEvent::type() const
+{
+    return User;
+}
+
+/** \brief Context given at construction. */
+Context *UserEvent::context() const
+{
+    return this->p_context;
+}
+
+/*
+ * ReadWriteCopyBufferRectEvent
+ */
+/**
+ * \brief Common part of all rectangular buffer transfers.
+ *
+ * Stores origins and region, converts their x components to bytes, and
+ * computes the four pitches. A pitch passed as 0 defaults to the tightly
+ * packed value; a non-zero pitch smaller than that value is rejected.
+ *
+ * \param parent command queue the event is enqueued on
+ * \param source memory object validated by \c BufferEvent
+ * \param src_origin origin in the source, or null for (0, 0, 0)
+ * \param dst_origin origin in the destination, or null for (0, 0, 0)
+ * \param region size of the rectangle; no component may be zero
+ * \param src_row_pitch source row pitch in bytes, or 0
+ * \param src_slice_pitch source slice pitch in bytes, or 0
+ * \param dst_row_pitch destination row pitch in bytes, or 0
+ * \param dst_slice_pitch destination slice pitch in bytes, or 0
+ * \param bytes_per_element factor applied to the x components
+ * \param num_events_in_wait_list number of entries in \p event_wait_list
+ * \param event_wait_list events that must complete first
+ * \param errcode_ret receives CL_INVALID_VALUE for a null or zero-sized
+ *        region or a pitch that is too small
+ */
+ReadWriteCopyBufferRectEvent::ReadWriteCopyBufferRectEvent(CommandQueue *parent,
+                                                           MemObject *source,
+                                                           const size_t src_origin[3],
+                                                           const size_t dst_origin[3],
+                                                           const size_t region[3],
+                                                           size_t src_row_pitch,
+                                                           size_t src_slice_pitch,
+                                                           size_t dst_row_pitch,
+                                                           size_t dst_slice_pitch,
+                                                           unsigned int bytes_per_element,
+                                                           cl_uint num_events_in_wait_list,
+                                                           const Event **event_wait_list,
+                                                           cl_int *errcode_ret)
+: BufferEvent (parent, source, num_events_in_wait_list, event_wait_list,
+               errcode_ret)
+{
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // A region is mandatory; it is dereferenced below
+    if (!region)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Copy the vectors
+    if (src_origin)
+        std::memcpy(&p_src_origin, src_origin, 3 * sizeof(size_t));
+    else
+        std::memset(&p_src_origin, 0, 3 * sizeof(size_t));
+
+    if (dst_origin)
+        std::memcpy(&p_dst_origin, dst_origin, 3 * sizeof(size_t));
+    else
+        std::memset(&p_dst_origin, 0, 3 * sizeof(size_t));
+
+    for (unsigned int i=0; i<3; ++i)
+    {
+        if (!region[i])
+        {
+            *errcode_ret = CL_INVALID_VALUE;
+            return;
+        }
+
+        p_region[i] = region[i];
+    }
+
+    // Multiply the elements (for images)
+    p_region[0] *= bytes_per_element;
+    p_src_origin[0] *= bytes_per_element;
+    p_dst_origin[0] *= bytes_per_element;
+
+    // Compute the pitches: start from the tightly packed value and let
+    // the caller override it with anything at least as large.
+    p_src_row_pitch = p_region[0];
+
+    if (src_row_pitch)
+    {
+        if (src_row_pitch < p_src_row_pitch)
+        {
+            *errcode_ret = CL_INVALID_VALUE;
+            return;
+        }
+
+        p_src_row_pitch = src_row_pitch;
+    }
+
+    p_src_slice_pitch = p_region[1] * p_src_row_pitch;
+
+    if (src_slice_pitch)
+    {
+        if (src_slice_pitch < p_src_slice_pitch)
+        {
+            *errcode_ret = CL_INVALID_VALUE;
+            return;
+        }
+
+        p_src_slice_pitch = src_slice_pitch;
+    }
+
+    p_dst_row_pitch = p_region[0];
+
+    if (dst_row_pitch)
+    {
+        if (dst_row_pitch < p_dst_row_pitch)
+        {
+            *errcode_ret = CL_INVALID_VALUE;
+            return;
+        }
+
+        p_dst_row_pitch = dst_row_pitch;
+    }
+
+    p_dst_slice_pitch = p_region[1] * p_dst_row_pitch;
+
+    if (dst_slice_pitch)
+    {
+        if (dst_slice_pitch < p_dst_slice_pitch)
+        {
+            *errcode_ret = CL_INVALID_VALUE;
+            return;
+        }
+
+        p_dst_slice_pitch = dst_slice_pitch;
+    }
+}
+
+/**
+ * \brief One component of the stored source origin.
+ * \param index component to read; not range-checked
+ */
+size_t ReadWriteCopyBufferRectEvent::src_origin(unsigned int index) const
+{
+    return this->p_src_origin[index];
+}
+
+/**
+ * \brief One component of the stored destination origin.
+ * \param index component to read; not range-checked
+ */
+size_t ReadWriteCopyBufferRectEvent::dst_origin(unsigned int index) const
+{
+    return this->p_dst_origin[index];
+}
+
+/**
+ * \brief One component of the stored region.
+ * \param index component to read; not range-checked
+ */
+size_t ReadWriteCopyBufferRectEvent::region(unsigned int index) const
+{
+    return this->p_region[index];
+}
+
+/** \brief Source row pitch computed by the constructor. */
+size_t ReadWriteCopyBufferRectEvent::src_row_pitch() const
+{
+    return this->p_src_row_pitch;
+}
+
+/** \brief Source slice pitch computed by the constructor. */
+size_t ReadWriteCopyBufferRectEvent::src_slice_pitch() const
+{
+    return this->p_src_slice_pitch;
+}
+
+/** \brief Destination row pitch computed by the constructor. */
+size_t ReadWriteCopyBufferRectEvent::dst_row_pitch() const
+{
+    return this->p_dst_row_pitch;
+}
+
+/** \brief Destination slice pitch computed by the constructor. */
+size_t ReadWriteCopyBufferRectEvent::dst_slice_pitch() const
+{
+    return this->p_dst_slice_pitch;
+}
+
+/** \brief Source object, i.e. the buffer held by the \c BufferEvent base. */
+MemObject *ReadWriteCopyBufferRectEvent::source() const
+{
+    MemObject *src = buffer();
+    return src;
+}
+
+/**
+ * \brief Event copying a rectangle from one buffer to another.
+ *
+ * Origins, region and pitches are processed by
+ * \c ReadWriteCopyBufferRectEvent; this constructor validates the
+ * destination, the bounds of both rectangles and their overlap.
+ *
+ * \param parent command queue the event is enqueued on
+ * \param source buffer to copy from
+ * \param destination buffer to copy to
+ * \param src_origin origin in \p source, or null for (0, 0, 0)
+ * \param dst_origin origin in \p destination, or null for (0, 0, 0)
+ * \param region size of the rectangle
+ * \param src_row_pitch source row pitch in bytes, or 0
+ * \param src_slice_pitch source slice pitch in bytes, or 0
+ * \param dst_row_pitch destination row pitch in bytes, or 0
+ * \param dst_slice_pitch destination slice pitch in bytes, or 0
+ * \param bytes_per_element factor applied to the x components
+ * \param num_events_in_wait_list number of entries in \p event_wait_list
+ * \param event_wait_list events that must complete first
+ * \param errcode_ret receives CL_INVALID_MEM_OBJECT, CL_INVALID_VALUE,
+ *        CL_MEM_COPY_OVERLAP, CL_MISALIGNED_SUB_BUFFER_OFFSET or
+ *        CL_MEM_OBJECT_ALLOCATION_FAILURE on error
+ */
+CopyBufferRectEvent::CopyBufferRectEvent(CommandQueue *parent,
+                                         MemObject *source,
+                                         MemObject *destination,
+                                         const size_t src_origin[3],
+                                         const size_t dst_origin[3],
+                                         const size_t region[3],
+                                         size_t src_row_pitch,
+                                         size_t src_slice_pitch,
+                                         size_t dst_row_pitch,
+                                         size_t dst_slice_pitch,
+                                         unsigned int bytes_per_element,
+                                         cl_uint num_events_in_wait_list,
+                                         const Event **event_wait_list,
+                                         cl_int *errcode_ret)
+: ReadWriteCopyBufferRectEvent(parent, source, src_origin, dst_origin, region,
+                               src_row_pitch, src_slice_pitch, dst_row_pitch,
+                               dst_slice_pitch, bytes_per_element,
+                               num_events_in_wait_list, event_wait_list, errcode_ret),
+  p_destination(destination)
+{
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    if (!destination)
+    {
+        *errcode_ret = CL_INVALID_MEM_OBJECT;
+        return;
+    }
+
+    // Check for out-of-bounds
+    if ((p_src_origin[0] + p_region[0]) > p_src_row_pitch ||
+        (p_src_origin[1] + p_region[1]) * p_src_row_pitch > p_src_slice_pitch ||
+        (p_src_origin[2] + p_region[2]) * p_src_slice_pitch > source->size())
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    if ((p_dst_origin[0] + p_region[0]) > p_dst_row_pitch ||
+        (p_dst_origin[1] + p_region[1]) * p_dst_row_pitch > p_dst_slice_pitch ||
+        (p_dst_origin[2] + p_region[2]) * p_dst_slice_pitch > destination->size())
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Check for overlapping
+    if (source == destination)
+    {
+        // Two boxes intersect when their extents intersect in every
+        // dimension. Extents [a, a+n) and [b, b+n) intersect when
+        // a < b+n and b < a+n; this also covers a == b, which a test
+        // built from strict "a < b" / "b < a" comparisons alone misses.
+        // NOTE(review): comparing origins directly assumes source and
+        // destination use the same pitches - confirm for callers that
+        // pass different ones on the same buffer.
+        bool overlapping = true;
+
+        for (unsigned int i=0; i<3 && overlapping; ++i)
+        {
+            overlapping =
+                p_dst_origin[i] < p_src_origin[i] + p_region[i] &&
+                p_src_origin[i] < p_dst_origin[i] + p_region[i];
+        }
+
+        if (overlapping)
+        {
+            *errcode_ret = CL_MEM_COPY_OVERLAP;
+            return;
+        }
+    }
+
+    // Check alignment of destination (source already checked by BufferEvent)
+    DeviceInterface *device = 0;
+    *errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *),
+                                &device, 0);
+
+    if (*errcode_ret != CL_SUCCESS)
+        return;
+
+    if (!isSubBufferAligned(destination, device))
+    {
+        *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET;
+        return;
+    }
+
+    // Allocate the buffer for the device
+    if (!destination->allocate(device))
+    {
+        *errcode_ret = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+        return;
+    }
+}
+
+/** \brief Reports this event as \c Event::CopyBufferRect. */
+Event::Type CopyBufferRectEvent::type() const
+{
+    return CopyBufferRect;
+}
+
+/** \brief Destination buffer given at construction. */
+MemObject *CopyBufferRectEvent::destination() const
+{
+    return this->p_destination;
+}
+
+/**
+ * \brief Common part of rectangular buffer reads and writes.
+ *
+ * The buffer side is passed to the base class as "source" and the host
+ * side as "destination". Only the buffer-side rectangle is bounds-checked
+ * here.
+ *
+ * \param ptr host memory involved in the transfer, must not be null
+ * \param errcode_ret receives CL_INVALID_VALUE if \p ptr is null or the
+ *        rectangle does not fit in \p buffer
+ */
+ReadWriteBufferRectEvent::ReadWriteBufferRectEvent(CommandQueue *parent,
+                                                   MemObject *buffer,
+                                                   const size_t buffer_origin[3],
+                                                   const size_t host_origin[3],
+                                                   const size_t region[3],
+                                                   size_t buffer_row_pitch,
+                                                   size_t buffer_slice_pitch,
+                                                   size_t host_row_pitch,
+                                                   size_t host_slice_pitch,
+                                                   void *ptr,
+                                                   unsigned int bytes_per_element,
+                                                   cl_uint num_events_in_wait_list,
+                                                   const Event **event_wait_list,
+                                                   cl_int *errcode_ret)
+: ReadWriteCopyBufferRectEvent(parent, buffer, buffer_origin, host_origin, region,
+                               buffer_row_pitch, buffer_slice_pitch,
+                               host_row_pitch, host_slice_pitch, bytes_per_element,
+                               num_events_in_wait_list, event_wait_list, errcode_ret),
+  p_ptr(ptr)
+{
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    if (ptr == 0)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Check for out-of-bounds: a row must fit in the row pitch, the rows
+    // in the slice pitch, and the slices in the buffer.
+    bool out_of_bounds = (p_src_origin[0] + p_region[0]) > p_src_row_pitch;
+
+    if (!out_of_bounds)
+        out_of_bounds = (p_src_origin[1] + p_region[1]) * p_src_row_pitch >
+                        p_src_slice_pitch;
+
+    if (!out_of_bounds)
+        out_of_bounds = (p_src_origin[2] + p_region[2]) * p_src_slice_pitch >
+                        buffer->size();
+
+    if (out_of_bounds)
+        *errcode_ret = CL_INVALID_VALUE;
+}
+
+// Accessor for the host pointer that was given to the constructor.
+void *ReadWriteBufferRectEvent::ptr() const
+{
+    void *host_ptr = p_ptr;
+    return host_ptr;
+}
+
+// Rectangular read: every argument goes straight to ReadWriteBufferRectEvent,
+// with bytes_per_element fixed to 1. No validation is added here.
+ReadBufferRectEvent::ReadBufferRectEvent(
+        CommandQueue *parent,
+        MemObject *buffer,
+        const size_t buffer_origin[3],
+        const size_t host_origin[3],
+        const size_t region[3],
+        size_t buffer_row_pitch,
+        size_t buffer_slice_pitch,
+        size_t host_row_pitch,
+        size_t host_slice_pitch,
+        void *ptr,
+        cl_uint num_events_in_wait_list,
+        const Event **event_wait_list,
+        cl_int *errcode_ret)
+    : ReadWriteBufferRectEvent(parent, buffer, buffer_origin, host_origin,
+                               region, buffer_row_pitch, buffer_slice_pitch,
+                               host_row_pitch, host_slice_pitch, ptr,
+                               1 /* bytes_per_element */,
+                               num_events_in_wait_list, event_wait_list,
+                               errcode_ret)
+{
+}
+
+// Reports the kind of this event: always Event::ReadBufferRect.
+Event::Type ReadBufferRectEvent::type() const
+{
+    const Event::Type kind = Event::ReadBufferRect;
+    return kind;
+}
+
+// Rectangular write: every argument goes straight to ReadWriteBufferRectEvent,
+// with bytes_per_element fixed to 1. No validation is added here.
+WriteBufferRectEvent::WriteBufferRectEvent(
+        CommandQueue *parent,
+        MemObject *buffer,
+        const size_t buffer_origin[3],
+        const size_t host_origin[3],
+        const size_t region[3],
+        size_t buffer_row_pitch,
+        size_t buffer_slice_pitch,
+        size_t host_row_pitch,
+        size_t host_slice_pitch,
+        void *ptr,
+        cl_uint num_events_in_wait_list,
+        const Event **event_wait_list,
+        cl_int *errcode_ret)
+    : ReadWriteBufferRectEvent(parent, buffer, buffer_origin, host_origin,
+                               region, buffer_row_pitch, buffer_slice_pitch,
+                               host_row_pitch, host_slice_pitch, ptr,
+                               1 /* bytes_per_element */,
+                               num_events_in_wait_list, event_wait_list,
+                               errcode_ret)
+{
+}
+
+// Reports the kind of this event: always Event::WriteBufferRect.
+Event::Type WriteBufferRectEvent::type() const
+{
+    const Event::Type kind = Event::WriteBufferRect;
+    return kind;
+}
+
+// Common constructor of the image read and write events.
+//
+// The image is handled as a rectangular buffer: its row pitch, slice pitch
+// and pixel size are read from the image itself, the host origin is passed
+// as 0, and row_pitch / slice_pitch describe the host side. For an image
+// whose type() is MemObject::Image2D, origin[2] must be 0 and region[2] must
+// be 1, otherwise CL_INVALID_VALUE is written to errcode_ret.
+ReadWriteImageEvent::ReadWriteImageEvent(
+        CommandQueue *parent,
+        Image2D *image,
+        const size_t origin[3],
+        const size_t region[3],
+        size_t row_pitch,
+        size_t slice_pitch,
+        void *ptr,
+        cl_uint num_events_in_wait_list,
+        const Event **event_wait_list,
+        cl_int *errcode_ret)
+    : ReadWriteBufferRectEvent(parent, image, origin, 0, region,
+                               image->row_pitch(), image->slice_pitch(),
+                               row_pitch, slice_pitch, ptr,
+                               image->pixel_size(), num_events_in_wait_list,
+                               event_wait_list, errcode_ret)
+{
+    // Keep the error reported by the base constructor, if any.
+    if (*errcode_ret != CL_SUCCESS)
+        return;
+
+    // Only 2D images are restricted in the third dimension.
+    if (image->type() != MemObject::Image2D)
+        return;
+
+    if (origin[2] != 0 || region[2] != 1)
+        *errcode_ret = CL_INVALID_VALUE;
+}
+
+// Image read: a plain forward of every argument to ReadWriteImageEvent.
+ReadImageEvent::ReadImageEvent(
+        CommandQueue *parent,
+        Image2D *image,
+        const size_t origin[3],
+        const size_t region[3],
+        size_t row_pitch,
+        size_t slice_pitch,
+        void *ptr,
+        cl_uint num_events_in_wait_list,
+        const Event **event_wait_list,
+        cl_int *errcode_ret)
+    : ReadWriteImageEvent(parent, image, origin, region, row_pitch,
+                          slice_pitch, ptr, num_events_in_wait_list,
+                          event_wait_list, errcode_ret)
+{
+}
+
+// Reports the kind of this event: always Event::ReadImage.
+Event::Type ReadImageEvent::type() const
+{
+    const Event::Type kind = Event::ReadImage;
+    return kind;
+}
+
+// Image write: a plain forward of every argument to ReadWriteImageEvent.
+WriteImageEvent::WriteImageEvent(
+        CommandQueue *parent,
+        Image2D *image,
+        const size_t origin[3],
+        const size_t region[3],
+        size_t row_pitch,
+        size_t slice_pitch,
+        void *ptr,
+        cl_uint num_events_in_wait_list,
+        const Event **event_wait_list,
+        cl_int *errcode_ret)
+    : ReadWriteImageEvent(parent, image, origin, region, row_pitch,
+                          slice_pitch, ptr, num_events_in_wait_list,
+                          event_wait_list, errcode_ret)
+{
+}
+
+// Reports the kind of this event: always Event::WriteImage.
+Event::Type WriteImageEvent::type() const
+{
+    const Event::Type kind = Event::WriteImage;
+    return kind;
+}
+
+// Two image formats differ as soon as their channel data type or their
+// channel order differs.
+static bool operator!=(const cl_image_format &a, const cl_image_format &b)
+{
+    const bool identical =
+        a.image_channel_data_type == b.image_channel_data_type &&
+        a.image_channel_order == b.image_channel_order;
+
+    return !identical;
+}
+
+// Image-to-image copy, expressed as a rectangular buffer copy.
+//
+// Pitches are read from the two images and the pixel size from the source.
+// After the base-class checks, errcode_ret receives:
+//  - CL_INVALID_VALUE when an image whose type() is MemObject::Image2D is
+//    used with a third origin coordinate other than 0 or a region depth
+//    other than 1;
+//  - CL_IMAGE_FORMAT_MISMATCH when the two image formats differ.
+CopyImageEvent::CopyImageEvent(
+        CommandQueue *parent,
+        Image2D *source,
+        Image2D *destination,
+        const size_t src_origin[3],
+        const size_t dst_origin[3],
+        const size_t region[3],
+        cl_uint num_events_in_wait_list,
+        const Event **event_wait_list,
+        cl_int *errcode_ret)
+    : CopyBufferRectEvent(parent, source, destination, src_origin, dst_origin,
+                          region, source->row_pitch(), source->slice_pitch(),
+                          destination->row_pitch(),
+                          destination->slice_pitch(), source->pixel_size(),
+                          num_events_in_wait_list, event_wait_list,
+                          errcode_ret)
+{
+    // Keep the error reported by the base constructor, if any.
+    if (*errcode_ret != CL_SUCCESS)
+        return;
+
+    // Third-dimension constraints of 2D images, source first.
+    if ((source->type() == MemObject::Image2D &&
+         !(src_origin[2] == 0 && region[2] == 1)) ||
+        (destination->type() == MemObject::Image2D &&
+         !(dst_origin[2] == 0 && region[2] == 1)))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+    }
+    else if (source->format() != destination->format())
+    {
+        // Copies are only allowed between images of the same format.
+        *errcode_ret = CL_IMAGE_FORMAT_MISMATCH;
+    }
+}
+
+// Reports the kind of this event: always Event::CopyImage.
+Event::Type CopyImageEvent::type() const
+{
+    const Event::Type kind = Event::CopyImage;
+    return kind;
+}
+
+// Builds the event copying a region of an image into a buffer.
+//
+// The image side is described to CopyBufferRectEvent with the row pitch,
+// slice pitch and pixel size read from source; the destination origin and
+// both destination pitches are passed as 0, and dst_offset is kept in
+// p_offset. On top of the base-class checks, errcode_ret receives
+// CL_INVALID_VALUE when:
+//  - the copied bytes (region[0] * region[1] * region[2] pixels), placed at
+//    dst_offset, do not fit in destination->size();
+//  - source is a 2D image and src_origin[2] != 0 or region[2] != 1.
+CopyImageToBufferEvent::CopyImageToBufferEvent(CommandQueue *parent,
+                                               Image2D *source,
+                                               MemObject *destination,
+                                               const size_t src_origin[3],
+                                               const size_t region[3],
+                                               size_t dst_offset,
+                                               cl_uint num_events_in_wait_list,
+                                               const Event **event_wait_list,
+                                               cl_int *errcode_ret)
+: CopyBufferRectEvent(parent, source, destination, src_origin, 0, region,
+                      source->row_pitch(), source->slice_pitch(), 0, 0,
+                      source->pixel_size(), num_events_in_wait_list,
+                      event_wait_list, errcode_ret),
+  p_offset(dst_offset)
+{
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // Check for buffer overflow. No sum is formed on purpose: with a huge
+    // dst_offset, "dst_offset + dst_cb" wraps around size_t and an
+    // out-of-range copy would be accepted.
+    const size_t dst_cb = region[2] * region[1] * region[0] * source->pixel_size();
+    const size_t dst_size = destination->size();
+
+    if (dst_offset > dst_size || dst_cb > dst_size - dst_offset)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Check validity: a 2D image has a single slice
+    if (source->type() == MemObject::Image2D &&
+        (src_origin[2] != 0 || region[2] != 1))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+}
+
+// Accessor for the dst_offset value given to the constructor.
+size_t CopyImageToBufferEvent::offset() const
+{
+    const size_t dst_offset = p_offset;
+    return dst_offset;
+}
+
+// Reports the kind of this event: always Event::CopyImageToBuffer.
+Event::Type CopyImageToBufferEvent::type() const
+{
+    const Event::Type kind = Event::CopyImageToBuffer;
+    return kind;
+}
+
+// Builds the event copying the content of a buffer into a region of an image.
+//
+// The image side is described to CopyBufferRectEvent with the row pitch,
+// slice pitch and pixel size read from destination; the source origin and
+// both source pitches are passed as 0, and src_offset is kept in p_offset.
+// On top of the base-class checks, errcode_ret receives CL_INVALID_VALUE
+// when:
+//  - the copied bytes (region[0] * region[1] * region[2] pixels), read from
+//    src_offset, do not fit in source->size();
+//  - destination is a 2D image and dst_origin[2] != 0 or region[2] != 1.
+CopyBufferToImageEvent::CopyBufferToImageEvent(CommandQueue *parent,
+                                               MemObject *source,
+                                               Image2D *destination,
+                                               size_t src_offset,
+                                               const size_t dst_origin[3],
+                                               const size_t region[3],
+                                               cl_uint num_events_in_wait_list,
+                                               const Event **event_wait_list,
+                                               cl_int *errcode_ret)
+: CopyBufferRectEvent(parent, source, destination, 0, dst_origin, region, 0, 0,
+                      destination->row_pitch(), destination->slice_pitch(),
+                      destination->pixel_size(), num_events_in_wait_list,
+                      event_wait_list, errcode_ret),
+  p_offset(src_offset)
+{
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // Check for buffer overflow. No sum is formed on purpose: with a huge
+    // src_offset, "src_offset + src_cb" wraps around size_t and an
+    // out-of-range copy would be accepted.
+    const size_t src_cb = region[2] * region[1] * region[0] * destination->pixel_size();
+    const size_t src_size = source->size();
+
+    if (src_offset > src_size || src_cb > src_size - src_offset)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Check validity: a 2D image has a single slice
+    if (destination->type() == MemObject::Image2D &&
+        (dst_origin[2] != 0 || region[2] != 1))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+}
+
+// Accessor for the src_offset value given to the constructor.
+size_t CopyBufferToImageEvent::offset() const
+{
+    const size_t src_offset = p_offset;
+    return src_offset;
+}
+
+// Reports the kind of this event: always Event::CopyBufferToImage.
+Event::Type CopyBufferToImageEvent::type() const
+{
+    const Event::Type kind = Event::CopyBufferToImage;
+    return kind;
+}
+
+/*
+ * Barrier
+ */
+
+// A barrier takes no wait list: the base Event receives Queued, a count of
+// 0 events and a null list.
+BarrierEvent::BarrierEvent(CommandQueue *parent, cl_int *errcode_ret)
+    : Event(parent, Queued, 0, 0, errcode_ret)
+{
+}
+
+// Reports the kind of this event: always Event::Barrier.
+Event::Type BarrierEvent::type() const
+{
+    const Event::Type kind = Event::Barrier;
+    return kind;
+}
+
+/*
+ * WaitForEvents
+ */
+
+// Hands the wait list to the base Event, together with Queued; nothing else
+// is stored or checked here.
+WaitForEventsEvent::WaitForEventsEvent(
+        CommandQueue *parent,
+        cl_uint num_events_in_wait_list,
+        const Event **event_wait_list,
+        cl_int *errcode_ret)
+    : Event(parent, Queued, num_events_in_wait_list, event_wait_list,
+            errcode_ret)
+{
+}
+
+// Reports the kind of this event: always Event::WaitForEvents.
+Event::Type WaitForEventsEvent::type() const
+{
+    const Event::Type kind = Event::WaitForEvents;
+    return kind;
+}
+
+/*
+ * Marker
+ */
+// A marker is built exactly like a WaitForEventsEvent; only type() differs.
+MarkerEvent::MarkerEvent(
+        CommandQueue *parent,
+        cl_uint num_events_in_wait_list,
+        const Event **event_wait_list,
+        cl_int *errcode_ret)
+    : WaitForEventsEvent(parent, num_events_in_wait_list, event_wait_list,
+                         errcode_ret)
+{
+}
+
+// Reports the kind of this event: always Event::Marker.
+Event::Type MarkerEvent::type() const
+{
+    const Event::Type kind = Event::Marker;
+    return kind;
+}
diff --git a/src/core/events.h b/src/core/events.h
new file mode 100644
index 0000000..2311d92
--- /dev/null
+++ b/src/core/events.h
@@ -0,0 +1,718 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file events.h
+ * \brief All the event-related classes
+ */
+
+#ifndef __EVENTS_H__
+#define __EVENTS_H__
+
+#include "commandqueue.h"
+#include <core/config.h>
+
+#include <vector>
+
+namespace Coal
+{
+
+class MemObject;
+class Image2D;
+class Kernel;
+class DeviceKernel;
+class DeviceInterface;
+
+/**
+ * \brief Buffer-related event
+ *
+ * Base class of the read/write, map/unmap, copy, rect and image events
+ * declared in this file. The memory object given to the constructor is
+ * exposed through \c buffer().
+ */
+class BufferEvent : public Event
+{
+ public:
+ BufferEvent(CommandQueue *parent,
+ MemObject *buffer,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+ virtual ~BufferEvent();
+
+ MemObject *buffer() const; /*!< \brief Buffer on which to operate */
+
+ /**
+ * \brief Check that a buffer is correctly aligned for a device
+ *
+ * OpenCL supports sub-buffers of buffers (\c Coal::SubBuffer). They
+ * have to be aligned on a certain device-dependent boundary.
+ *
+ * This function checks that \p buffer is correctly aligned for
+ * \p device. If \p buffer is not a \c Coal::SubBuffer, this function
+ * returns true.
+ *
+ * \param buffer memory object to check
+ * \param device device for which the alignment is checked
+ * \return true if the buffer is aligned or not a \c Coal::SubBuffer
+ */
+ static bool isSubBufferAligned(const MemObject *buffer,
+ const DeviceInterface *device);
+
+ private:
+ MemObject *p_buffer;
+};
+
+/**
+ * \brief Reading or writing to a buffer
+ *
+ * Common base class of \c Coal::ReadBufferEvent and
+ * \c Coal::WriteBufferEvent. It exposes the offset, the byte count and the
+ * host pointer of the transfer.
+ */
+class ReadWriteBufferEvent : public BufferEvent
+{
+ public:
+ ReadWriteBufferEvent(CommandQueue *parent,
+ MemObject *buffer,
+ size_t offset,
+ size_t cb,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ size_t offset() const; /*!< \brief Offset in the buffer of the operation, in bytes */
+ size_t cb() const; /*!< \brief Number of bytes to read or write */
+ void *ptr() const; /*!< \brief Pointer in host memory involved in the transfer */
+
+ private:
+ size_t p_offset, p_cb;
+ void *p_ptr;
+};
+
+/**
+ * \brief Reading a buffer
+ *
+ * Takes the same constructor arguments as \c Coal::ReadWriteBufferEvent and
+ * only adds its own \c type().
+ */
+class ReadBufferEvent : public ReadWriteBufferEvent
+{
+ public:
+ ReadBufferEvent(CommandQueue *parent,
+ MemObject *buffer,
+ size_t offset,
+ size_t cb,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Report that the event is a \c Coal::Event::ReadBuffer one */
+};
+
+/**
+ * \brief Writing a buffer
+ *
+ * Takes the same constructor arguments as \c Coal::ReadWriteBufferEvent and
+ * only adds its own \c type().
+ */
+class WriteBufferEvent : public ReadWriteBufferEvent
+{
+ public:
+ WriteBufferEvent(CommandQueue *parent,
+ MemObject *buffer,
+ size_t offset,
+ size_t cb,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Report that the event is a \c Coal::Event::WriteBuffer one */
+};
+
+/**
+ * \brief Mapping a buffer
+ *
+ * The constructor receives no mapped address: the device supplies it
+ * afterwards through \c setPtr().
+ */
+class MapBufferEvent : public BufferEvent
+{
+ public:
+ MapBufferEvent(CommandQueue *parent,
+ MemObject *buffer,
+ size_t offset,
+ size_t cb,
+ cl_map_flags map_flags,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Report that the event is a \c Coal::Event::MapBuffer one */
+
+ size_t offset() const; /*!< \brief Offset in the buffer at which the mapping begins, in bytes */
+ size_t cb() const; /*!< \brief Number of bytes to map */
+ cl_map_flags flags() const; /*!< \brief Flags of the mapping */
+ void *ptr() const; /*!< \brief Pointer at which the data has been mapped */
+
+ /**
+ * \brief Set the memory location at which the data has been mapped
+ *
+ * This function is called by the device when it has successfully mapped
+ * the buffer. It must be called inside
+ * \c Coal::DeviceInterface::initEventDeviceData().
+ *
+ * \param ptr the address at which the buffer has been mapped
+ */
+ void setPtr(void *ptr);
+
+ private:
+ size_t p_offset, p_cb;
+ cl_map_flags p_map_flags;
+ void *p_ptr;
+};
+
+/**
+ * \brief Mapping an image
+ *
+ * The constructor receives neither the mapped address nor the pitches of
+ * the mapped data: the device supplies them afterwards through \c setPtr(),
+ * \c setRowPitch() and \c setSlicePitch().
+ */
+class MapImageEvent : public BufferEvent
+{
+ public:
+ MapImageEvent(CommandQueue *parent,
+ Image2D *image,
+ cl_map_flags map_flags,
+ const size_t origin[3],
+ const size_t region[3],
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Report that the event is a \c Coal::Event::MapImage one */
+
+ /**
+ * \brief Origin of the mapping, in pixels, for the given dimension
+ * \param index dimension for which the origin is retrieved
+ * \return origin of the mapping for the given dimension
+ */
+ size_t origin(unsigned int index) const;
+
+ /**
+ * \brief Region of the mapping, in pixels, for the given dimension
+ * \param index dimension for which the region is retrieved
+ * \return region of the mapping for the given dimension
+ */
+ size_t region(unsigned int index) const;
+ cl_map_flags flags() const; /*!< \brief Flags of the mapping */
+
+ void *ptr() const; /*!< \brief Pointer at which the data is mapped */
+ size_t row_pitch() const; /*!< \brief Row pitch of the mapped data */
+ size_t slice_pitch() const; /*!< \brief Slice pitch of the mapped data */
+
+ /**
+ * \brief Set the memory location at which the image is mapped
+ *
+ * This function must be called by
+ * \c Coal::DeviceInterface::initEventDeviceData(). Row and slice pitches
+ * must also be set by that function, by calling \c setRowPitch() and
+ * \c setSlicePitch().
+ *
+ * \param ptr pointer at which the data is available
+ */
+ void setPtr(void *ptr);
+ void setRowPitch(size_t row_pitch); /*!< \brief Set row pitch */
+ void setSlicePitch(size_t slice_pitch); /*!< \brief Set slice pitch */
+
+ private:
+ cl_map_flags p_map_flags;
+ size_t p_origin[3], p_region[3];
+ void *p_ptr;
+ size_t p_slice_pitch, p_row_pitch;
+};
+
+/**
+ * \brief Unmapping a memory object
+ *
+ * Carries the address to unmap, given to the constructor as \p mapped_addr
+ * and available through \c mapping().
+ */
+class UnmapBufferEvent : public BufferEvent
+{
+ public:
+ UnmapBufferEvent(CommandQueue *parent,
+ MemObject *buffer,
+ void *mapped_addr,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Report that the event is a \c Coal::Event::UnmapBuffer one */
+
+ void *mapping() const; /*!< \brief Mapped address to unmap */
+
+ private:
+ void *p_mapping;
+};
+
+/**
+ * \brief Copying between two buffers
+ *
+ * This is the linear (offset and byte count) copy; the rectangular variant
+ * is \c Coal::CopyBufferRectEvent. The source buffer is the one handled by
+ * the \c Coal::BufferEvent base class, the destination is kept by this class.
+ */
+class CopyBufferEvent : public BufferEvent
+{
+ public:
+ CopyBufferEvent(CommandQueue *parent,
+ MemObject *source,
+ MemObject *destination,
+ size_t src_offset,
+ size_t dst_offset,
+ size_t cb,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+ ~CopyBufferEvent();
+
+ Type type() const; /*!< \brief Report that the event is a \c Coal::Event::CopyBuffer one */
+
+ MemObject *source() const; /*!< \brief Source buffer, equivalent to \c Coal::BufferEvent::buffer() */
+ MemObject *destination() const; /*!< \brief Destination buffer */
+ size_t src_offset() const; /*!< \brief Offset in the source buffer, in bytes */
+ size_t dst_offset() const; /*!< \brief Offset in the destination buffer, in bytes */
+ size_t cb() const; /*!< \brief Number of bytes to copy */
+
+ private:
+ MemObject *p_destination;
+ size_t p_src_offset, p_dst_offset, p_cb;
+};
+
+/**
+ * \brief Events related to rectangular (or cubic) memory regions
+ *
+ * This event is the base for all the *BufferRect events, and the Image ones.
+ *
+ * The \c src_* and \c dst_* names are generic: a derived class decides what
+ * each side stands for. For example, \c Coal::ReadWriteBufferRectEvent passes
+ * its buffer arguments as the source side and its host arguments as the
+ * destination side.
+ */
+class ReadWriteCopyBufferRectEvent : public BufferEvent
+{
+ public:
+ ReadWriteCopyBufferRectEvent(CommandQueue *parent,
+ MemObject *source,
+ const size_t src_origin[3],
+ const size_t dst_origin[3],
+ const size_t region[3],
+ size_t src_row_pitch,
+ size_t src_slice_pitch,
+ size_t dst_row_pitch,
+ size_t dst_slice_pitch,
+ unsigned int bytes_per_element,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ size_t src_origin(unsigned int index) const; /*!< \brief Source origin for the \p index dimension */
+ size_t dst_origin(unsigned int index) const; /*!< \brief Destination origin for the \p index dimension */
+ size_t region(unsigned int index) const; /*!< \brief Region to copy for the \p index dimension */
+ size_t src_row_pitch() const; /*!< \brief Source row pitch */
+ size_t src_slice_pitch() const; /*!< \brief Source slice pitch */
+ size_t dst_row_pitch() const; /*!< \brief Destination row pitch */
+ size_t dst_slice_pitch() const; /*!< \brief Destination slice pitch */
+ MemObject *source() const; /*!< \brief Source of the copy, for readability. Calls \c Coal::BufferEvent::buffer(). */
+
+ protected: /* read directly by the bounds checks of the derived constructors */
+ size_t p_src_origin[3], p_dst_origin[3], p_region[3];
+ size_t p_src_row_pitch, p_src_slice_pitch;
+ size_t p_dst_row_pitch, p_dst_slice_pitch;
+};
+
+/**
+ * \brief Copying a rectangular region between two buffers
+ *
+ * Also the base class of \c Coal::CopyImageEvent,
+ * \c Coal::CopyImageToBufferEvent and \c Coal::CopyBufferToImageEvent, which
+ * is why \c type() is virtual here.
+ */
+class CopyBufferRectEvent : public ReadWriteCopyBufferRectEvent
+{
+ public:
+ CopyBufferRectEvent(CommandQueue *parent,
+ MemObject *source,
+ MemObject *destination,
+ const size_t src_origin[3],
+ const size_t dst_origin[3],
+ const size_t region[3],
+ size_t src_row_pitch,
+ size_t src_slice_pitch,
+ size_t dst_row_pitch,
+ size_t dst_slice_pitch,
+ unsigned int bytes_per_element,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ virtual Type type() const; /*!< \brief Report that the event is a \c Coal::Event::CopyBufferRect one */
+ MemObject *destination() const; /*!< \brief Destination buffer */
+
+ private:
+ MemObject *p_destination; /*!< \brief Value returned by \c destination() */
+};
+
+/**
+ * \brief Reading or writing a rectangular region of a buffer
+ *
+ * Common base class of \c Coal::ReadBufferRectEvent,
+ * \c Coal::WriteBufferRectEvent and \c Coal::ReadWriteImageEvent. The buffer
+ * arguments are given to \c Coal::ReadWriteCopyBufferRectEvent as the source
+ * side and the host arguments as the destination side.
+ */
+class ReadWriteBufferRectEvent : public ReadWriteCopyBufferRectEvent
+{
+ public:
+ ReadWriteBufferRectEvent(CommandQueue *parent,
+ MemObject *buffer,
+ const size_t buffer_origin[3],
+ const size_t host_origin[3],
+ const size_t region[3],
+ size_t buffer_row_pitch,
+ size_t buffer_slice_pitch,
+ size_t host_row_pitch,
+ size_t host_slice_pitch,
+ void *ptr,
+ unsigned int bytes_per_element,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ void *ptr() const; /*!< \brief Pointer in host memory involved in the transfer */
+
+ private:
+ void *p_ptr; /*!< \brief Host pointer given to the constructor, returned by \c ptr() */
+};
+
+/**
+ * \brief Reading a rectangular region of a buffer
+ *
+ * Forwards its arguments to \c Coal::ReadWriteBufferRectEvent with a
+ * \c bytes_per_element of 1.
+ */
+class ReadBufferRectEvent : public ReadWriteBufferRectEvent
+{
+ public:
+ ReadBufferRectEvent(CommandQueue *parent,
+ MemObject *buffer,
+ const size_t buffer_origin[3],
+ const size_t host_origin[3],
+ const size_t region[3],
+ size_t buffer_row_pitch,
+ size_t buffer_slice_pitch,
+ size_t host_row_pitch,
+ size_t host_slice_pitch,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Report that the event is a \c Coal::Event::ReadBufferRect one */
+};
+
+/**
+ * \brief Writing a rectangular region of a buffer
+ *
+ * Forwards its arguments to \c Coal::ReadWriteBufferRectEvent with a
+ * \c bytes_per_element of 1.
+ */
+class WriteBufferRectEvent : public ReadWriteBufferRectEvent
+{
+ public:
+ WriteBufferRectEvent(CommandQueue *parent,
+ MemObject *buffer,
+ const size_t buffer_origin[3],
+ const size_t host_origin[3],
+ const size_t region[3],
+ size_t buffer_row_pitch,
+ size_t buffer_slice_pitch,
+ size_t host_row_pitch,
+ size_t host_slice_pitch,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Report that the event is a \c Coal::Event::WriteBufferRect one */
+};
+
+/**
+ * \brief Reading or writing images
+ *
+ * This class only converts some of the arguments given to its constructor
+ * to those of \c Coal::ReadWriteBufferRectEvent. For example, the source row
+ * and slice pitches are read from the \c Coal::Image2D object, as is the
+ * pixel size used as \c bytes_per_element; \p row_pitch and \p slice_pitch
+ * describe the host side.
+ *
+ * For a 2D image, the constructor reports \c CL_INVALID_VALUE unless
+ * \c origin[2] is 0 and \c region[2] is 1.
+ */
+class ReadWriteImageEvent : public ReadWriteBufferRectEvent
+{
+ public:
+ ReadWriteImageEvent(CommandQueue *parent,
+ Image2D *image,
+ const size_t origin[3],
+ const size_t region[3],
+ size_t row_pitch,
+ size_t slice_pitch,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+};
+
+/**
+ * \brief Reading an image
+ *
+ * Takes the same constructor arguments as \c Coal::ReadWriteImageEvent and
+ * only adds its own \c type().
+ */
+class ReadImageEvent : public ReadWriteImageEvent
+{
+ public:
+ ReadImageEvent(CommandQueue *parent,
+ Image2D *image,
+ const size_t origin[3],
+ const size_t region[3],
+ size_t row_pitch,
+ size_t slice_pitch,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Report that the event is a \c Coal::Event::ReadImage one */
+};
+
+/**
+ * \brief Writing to an image
+ *
+ * Takes the same constructor arguments as \c Coal::ReadWriteImageEvent and
+ * only adds its own \c type().
+ */
+class WriteImageEvent : public ReadWriteImageEvent
+{
+ public:
+ WriteImageEvent(CommandQueue *parent,
+ Image2D *image,
+ const size_t origin[3],
+ const size_t region[3],
+ size_t row_pitch,
+ size_t slice_pitch,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Report that the event is a \c Coal::Event::WriteImage one */
+};
+
+/**
+ * \brief Copying between two images
+ *
+ * The pitches given to \c Coal::CopyBufferRectEvent are read from the two
+ * images. The constructor reports \c CL_IMAGE_FORMAT_MISMATCH when the
+ * formats of \p source and \p destination differ, and \c CL_INVALID_VALUE
+ * when a 2D image is used with a third origin coordinate other than 0 or a
+ * region depth other than 1.
+ */
+class CopyImageEvent : public CopyBufferRectEvent
+{
+ public:
+ CopyImageEvent(CommandQueue *parent,
+ Image2D *source,
+ Image2D *destination,
+ const size_t src_origin[3],
+ const size_t dst_origin[3],
+ const size_t region[3],
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Report that the event is a \c Coal::Event::CopyImage one */
+};
+
+/**
+ * \brief Copying an image to a buffer
+ *
+ * The constructor reports \c CL_INVALID_VALUE when the copied region, placed
+ * at \p dst_offset, does not fit in the destination buffer.
+ */
+class CopyImageToBufferEvent : public CopyBufferRectEvent
+{
+ public:
+ CopyImageToBufferEvent(CommandQueue *parent,
+ Image2D *source,
+ MemObject *destination,
+ const size_t src_origin[3],
+ const size_t region[3],
+ size_t dst_offset,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ size_t offset() const; /*!< \brief Offset in the destination buffer at which the image data is written */
+ Type type() const; /*!< \brief Report that the event is a \c Coal::Event::CopyImageToBuffer one */
+
+ private:
+ size_t p_offset; /*!< \brief \c dst_offset given to the constructor, returned by \c offset() */
+};
+
+/**
+ * \brief Copying a buffer to an image
+ *
+ * The constructor reports \c CL_INVALID_VALUE when the copied region, read
+ * from \p src_offset, does not fit in the source buffer.
+ */
+class CopyBufferToImageEvent : public CopyBufferRectEvent
+{
+ public:
+ CopyBufferToImageEvent(CommandQueue *parent,
+ MemObject *source,
+ Image2D *destination,
+ size_t src_offset,
+ const size_t dst_origin[3],
+ const size_t region[3],
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ size_t offset() const; /*!< \brief Offset in the source buffer at which the copy starts */
+ Type type() const; /*!< \brief Report that the event is a \c Coal::Event::CopyBufferToImage one */
+
+ private:
+ size_t p_offset; /*!< \brief \c src_offset given to the constructor, returned by \c offset() */
+};
+
+/**
+ * \brief Executing a native function as a kernel
+ *
+ * This event builds an argument list to give to the native function. It needs,
+ * for example, to replace every occurrence of a \c Coal::MemObject by a pointer
+ * to data the host CPU can actually access, using
+ * \c Coal::DeviceBuffer::nativeGlobalPointer().
+ */
+class NativeKernelEvent : public Event
+{
+ public:
+ NativeKernelEvent(CommandQueue *parent,
+ void (*user_func)(void *),
+ void *args,
+ size_t cb_args,
+ cl_uint num_mem_objects,
+ const MemObject **mem_list,
+ const void **args_mem_loc,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+ ~NativeKernelEvent();
+
+ Type type() const; /*!< \brief Report that the event is a \c Coal::Event::NativeKernel one */
+
+ void *function() const; /*!< \brief Host function to call, returned as an untyped pointer */
+ void *args() const; /*!< \brief Args to give to the host function */
+
+ private:
+ void *p_user_func;
+ void *p_args;
+};
+
+/**
+ * \brief Executing a compiled kernel
+ *
+ * Also the base class of \c Coal::TaskEvent, which is why \c type() is
+ * virtual here. The per-dimension accessors take a dimension index; the
+ * backing arrays hold \c MAX_WORK_DIMS entries.
+ */
+class KernelEvent : public Event
+{
+ public:
+ KernelEvent(CommandQueue *parent,
+ Kernel *kernel,
+ cl_uint work_dim,
+ const size_t *global_work_offset,
+ const size_t *global_work_size,
+ const size_t *local_work_size,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+ ~KernelEvent();
+
+ cl_uint work_dim() const; /*!< \brief Number of working dimensions */
+ size_t global_work_offset(cl_uint dim) const; /*!< \brief Global work offset for the \p dim dimension */
+ size_t global_work_size(cl_uint dim) const; /*!< \brief Global work size for the \p dim dimension */
+ size_t local_work_size(cl_uint dim) const; /*!< \brief Number of work-items per work-group for the \p dim dimension */
+ Kernel *kernel() const; /*!< \brief \c Coal::Kernel object to run */
+ DeviceKernel *deviceKernel() const; /*!< \brief \c Coal::DeviceKernel for the kernel and device of this event */
+
+ virtual Type type() const; /*!< \brief Report that the event is a \c Coal::Event::NDRangeKernel one */
+
+ private:
+ cl_uint p_work_dim;
+ size_t p_global_work_offset[MAX_WORK_DIMS],
+ p_global_work_size[MAX_WORK_DIMS],
+ p_local_work_size[MAX_WORK_DIMS],
+ p_max_work_item_sizes[MAX_WORK_DIMS];
+ Kernel *p_kernel;
+ DeviceKernel *p_dev_kernel;
+};
+
+/**
+ * \brief Executing a task kernel
+ *
+ * This event is simply a \c Coal::KernelEvent with:
+ *
+ * - \c work_dim() set to 1
+ * - \c global_work_offset() set to {0}
+ * - \c global_work_size() set to {1}
+ * - \c local_work_size() set to {1}
+ *
+ * It is in fact a \c Coal::KernelEvent containing one single work-item.
+ */
+class TaskEvent : public KernelEvent
+{
+ public:
+ TaskEvent(CommandQueue *parent,
+ Kernel *kernel,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Report that the event is a \c Coal::Event::TaskKernel one */
+};
+
+/**
+ * \brief User event
+ *
+ * This event is a bit special as it is created by a call to
+ * \c clCreateUserEvent() and doesn't belong to an event queue. Thus, a means
+ * had to be found to make everything work.
+ *
+ * The solution is the \c addDependentCommandQueue() function, called every time
+ * the user event is added to a command queue. When this event becomes completed,
+ * \c flushQueues() is called to allow all the \c Coal::CommandQueue objects
+ * containing this event to push more events on their device.
+ *
+ * This way, command queues are not blocked by user events.
+ *
+ * NOTE(review): neither \c addDependentCommandQueue() nor \c flushQueues() is
+ * declared in this class; presumably they are provided elsewhere (for
+ * example by \c Coal::Event) - confirm and update this description.
+ */
+class UserEvent : public Event
+{
+ public:
+ UserEvent(Context *context, cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Report that the event is a \c Coal::Event::User one */
+ Context *context() const; /*!< \brief Context of this event */
+
+ private:
+ Context *p_context;
+};
+
+/**
+ * \brief Barrier event
+ *
+ * Its constructor takes no wait list, only the parent queue and the error
+ * code output.
+ */
+class BarrierEvent : public Event
+{
+ public:
+ BarrierEvent(CommandQueue *parent,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Report that the event is a \c Coal::Event::Barrier one */
+};
+
+/**
+ * \brief Event waiting for others to complete before being completed
+ *
+ * Also the base class of \c Coal::MarkerEvent, which is why \c type() is
+ * virtual here.
+ */
+class WaitForEventsEvent : public Event
+{
+ public:
+ WaitForEventsEvent(CommandQueue *parent,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ virtual Type type() const; /*!< \brief Report that the event is a \c Coal::Event::WaitForEvents one */
+};
+
+/**
+ * \brief Marker event
+ *
+ * Constructed exactly like a \c Coal::WaitForEventsEvent; it only reports a
+ * different \c type().
+ */
+class MarkerEvent : public WaitForEventsEvent
+{
+ public:
+ MarkerEvent(CommandQueue *parent,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Report that the event is a \c Coal::Event::Marker one */
+};
+
+}
+
+#endif
diff --git a/src/core/icd.cpp b/src/core/icd.cpp
new file mode 100644
index 0000000..2c62035
--- /dev/null
+++ b/src/core/icd.cpp
@@ -0,0 +1,145 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "CL/cl.h"
+#include "platform.h"
+#include "icd.h"
+
+/*-----------------------------------------------------------------------------
+* Table of OpenCL entry points, stored as untyped pointers. icd.h declares it
+* with the KHRicdVendorDispatch type and the Dispatch class stores its address.
+* A slot holding 0 has no function supplied here; the trailing comment names
+* the API that slot stands for.
+* NOTE(review): the slot order presumably has to match the dispatch layout the
+* Khronos ICD loader expects -- confirm before inserting or reordering entries.
+*----------------------------------------------------------------------------*/
+void * dispatch_table[] =
+{
+ (void*) clGetPlatformIDs,
+ (void*) clGetPlatformInfo,
+ (void*) clGetDeviceIDs,
+ (void*) clGetDeviceInfo,
+ (void*) clCreateContext,
+ (void*) clCreateContextFromType,
+ (void*) clRetainContext,
+ (void*) clReleaseContext,
+ (void*) clGetContextInfo,
+ (void*) clCreateCommandQueue,
+ (void*) clRetainCommandQueue,
+ (void*) clReleaseCommandQueue,
+ (void*) clGetCommandQueueInfo,
+ (void*) 0, //clSetCommandQueueProperty,
+ (void*) clCreateBuffer,
+ (void*) clCreateImage2D,
+ (void*) clCreateImage3D,
+ (void*) clRetainMemObject,
+ (void*) clReleaseMemObject,
+ (void*) clGetSupportedImageFormats,
+ (void*) clGetMemObjectInfo,
+ (void*) clGetImageInfo,
+ (void*) clCreateSampler,
+ (void*) clRetainSampler,
+ (void*) clReleaseSampler,
+ (void*) clGetSamplerInfo,
+ (void*) clCreateProgramWithSource,
+ (void*) clCreateProgramWithBinary,
+ (void*) clRetainProgram,
+ (void*) clReleaseProgram,
+ (void*) clBuildProgram,
+ (void*) clUnloadCompiler,
+ (void*) clGetProgramInfo,
+ (void*) clGetProgramBuildInfo,
+ (void*) clCreateKernel,
+ (void*) clCreateKernelsInProgram,
+ (void*) clRetainKernel,
+ (void*) clReleaseKernel,
+ (void*) clSetKernelArg,
+ (void*) clGetKernelInfo,
+ (void*) clGetKernelWorkGroupInfo,
+ (void*) clWaitForEvents,
+ (void*) clGetEventInfo,
+ (void*) clRetainEvent,
+ (void*) clReleaseEvent,
+ (void*) clGetEventProfilingInfo,
+ (void*) clFlush,
+ (void*) clFinish,
+ (void*) clEnqueueReadBuffer,
+ (void*) clEnqueueWriteBuffer,
+ (void*) clEnqueueCopyBuffer,
+ (void*) clEnqueueReadImage,
+ (void*) clEnqueueWriteImage,
+ (void*) clEnqueueCopyImage,
+ (void*) clEnqueueCopyImageToBuffer,
+ (void*) clEnqueueCopyBufferToImage,
+ (void*) clEnqueueMapBuffer,
+ (void*) clEnqueueMapImage,
+ (void*) clEnqueueUnmapMemObject,
+ (void*) clEnqueueNDRangeKernel,
+ (void*) clEnqueueTask,
+ (void*) clEnqueueNativeKernel,
+ (void*) clEnqueueMarker,
+ (void*) clEnqueueWaitForEvents,
+ (void*) clEnqueueBarrier,
+ (void*) clGetExtensionFunctionAddress,
+ (void*) 0, //clCreateFromGLBuffer,
+ (void*) 0, //clCreateFromGLTexture2D,
+ (void*) 0, //clCreateFromGLTexture3D,
+ (void*) 0, //clCreateFromGLRenderbuffer,
+ (void*) 0, //clGetGLObjectInfo,
+ (void*) 0, //clGetGLTextureInfo,
+ (void*) 0, //clEnqueueAcquireGLObjects,
+ (void*) 0, //clEnqueueReleaseGLObjects,
+ (void*) 0, //clGetGLContextInfoKHR,
+ (void*) 0, //clGetDeviceIDsFromD3D10KHR,
+ (void*) 0, //clCreateFromD3D10BufferKHR,
+ (void*) 0, //clCreateFromD3D10Texture2DKHR,
+ (void*) 0, //clCreateFromD3D10Texture3DKHR,
+ (void*) 0, //clEnqueueAcquireD3D10ObjectsKHR,
+ (void*) 0, //clEnqueueReleaseD3D10ObjectsKHR,
+ (void*) clSetEventCallback,
+ (void*) clCreateSubBuffer,
+ (void*) clSetMemObjectDestructorCallback,
+ (void*) clCreateUserEvent,
+ (void*) clSetUserEventStatus,
+ (void*) clEnqueueReadBufferRect,
+ (void*) clEnqueueWriteBufferRect,
+ (void*) clEnqueueCopyBufferRect,
+ (void*) 0, //clCreateSubDevicesEXT,
+ (void*) 0, //clRetainDeviceEXT,
+ (void*) 0, //clReleaseDeviceEXT
+};
+
+
+/*-----------------------------------------------------------------------------
+* clIcdGetPlatformIDsKHR: report the single platform of this implementation.
+*
+* num_entries   - capacity of the platforms array; must be non-zero when
+*                 platforms is supplied
+* platforms     - optional output; receives the address of the_platform
+* num_platforms - optional output; receives 1
+*
+* Returns CL_INVALID_VALUE when both outputs are missing, or when platforms is
+* supplied with num_entries == 0; CL_SUCCESS otherwise.
+*----------------------------------------------------------------------------*/
+cl_int CL_API_CALL
+clIcdGetPlatformIDsKHR(cl_uint num_entries,
+                       cl_platform_id * platforms,
+                       cl_uint * num_platforms)
+{
+    /* The caller has to ask for at least one of the two results */
+    if (!num_platforms && !platforms) return CL_INVALID_VALUE;
+
+    /* The count is written before the array arguments are validated */
+    if (num_platforms) *num_platforms = 1;
+
+    if (platforms)
+    {
+        if (num_entries == 0) return CL_INVALID_VALUE;
+
+        /* Only one "default" platform */
+        *platforms = &the_platform;
+    }
+
+    return CL_SUCCESS;
+}
diff --git a/src/core/icd.h b/src/core/icd.h
new file mode 100644
index 0000000..591aed6
--- /dev/null
+++ b/src/core/icd.h
@@ -0,0 +1,44 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef _ICD_H
+#define _ICD_H
+#include "CL/cl.h"
+
+typedef void *(KHRicdVendorDispatch)[];
+extern KHRicdVendorDispatch dispatch_table;
+
+/*
+ * Holds the address of the global dispatch_table. The pointer is private and
+ * this class declares no accessor for it.
+ * NOTE(review): presumably meant as a base class so that API objects begin with
+ * the dispatch pointer the ICD loader looks for -- confirm against object.h.
+ */
+class Dispatch
+{
+ public:
+ Dispatch() : dispatch(&dispatch_table) {} /* always points at the one global table */
+ private:
+ KHRicdVendorDispatch *dispatch;
+};
+
+#endif // _ICD_H
+
diff --git a/src/core/kernel.cpp b/src/core/kernel.cpp
new file mode 100644
index 0000000..4c53576
--- /dev/null
+++ b/src/core/kernel.cpp
@@ -0,0 +1,637 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file core/kernel.cpp
+ * \brief Kernel
+ */
+
+#include "kernel.h"
+#include "propertylist.h"
+#include "program.h"
+#include "memobject.h"
+#include "sampler.h"
+#include "deviceinterface.h"
+
+#include <string>
+#include <iostream>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <boost/tuple/tuple.hpp>
+
+#include <llvm/Support/Casting.h>
+#include <llvm/IR/Module.h>
+#include <llvm/IR/Type.h>
+#include <llvm/IR/DerivedTypes.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/Metadata.h>
+#include <llvm/IR/DataLayout.h>
+
+
+using namespace Coal;
+/**
+ * Builds a kernel attached to \p program. The kernel starts with an empty
+ * name, no local-memory arguments and a zero work-item alloca size; the
+ * fallback entry \c null_dep is zeroed so lookups for unknown devices return
+ * null pointers.
+ */
+Kernel::Kernel(Program *program)
+: Object(Object::T_Kernel, program), p_has_locals(false), wi_alloca_size(0)
+{
+    // TODO: Say a kernel is attached to the program (that becomes unalterable)
+
+    p_name = "";
+
+    // null_dep is what deviceDependent() hands back when nothing matches
+    null_dep.module   = 0;
+    null_dep.function = 0;
+    null_dep.kernel   = 0;
+    null_dep.device   = 0;
+}
+
+/**
+ * Releases what the kernel owns: the device-specific kernel object of every
+ * registered device, and the argument descriptors.
+ */
+Kernel::~Kernel()
+{
+    while (p_device_dependent.size())
+    {
+        DeviceDependent &dep = p_device_dependent.back();
+
+        delete dep.kernel;
+
+        p_device_dependent.pop_back();
+    }
+
+    // The Arg objects are created with new in addFunction() and no other code
+    // in this file frees them, so they have to be released here.
+    for (size_t i = 0; i < p_args.size(); ++i)
+        delete p_args[i];
+
+    p_args.clear();
+}
+
+/**
+ * Read-only lookup of the per-device data of this kernel.
+ *
+ * A null \p device selects the only registered entry when exactly one exists.
+ * When nothing matches, the zeroed \c null_dep entry is returned.
+ */
+const Kernel::DeviceDependent &Kernel::deviceDependent(DeviceInterface *device) const
+{
+    const size_t count = p_device_dependent.size();
+
+    // "Default device" request with a single candidate: no search needed
+    if (!device && count == 1)
+        return p_device_dependent[0];
+
+    for (size_t n = 0; n < count; ++n)
+    {
+        if (p_device_dependent[n].device == device)
+            return p_device_dependent[n];
+    }
+
+    return null_dep;
+}
+
+/**
+ * Mutable lookup of the per-device data of this kernel.
+ *
+ * A null \p device selects the only registered entry when exactly one exists.
+ * When nothing matches, the zeroed \c null_dep entry is returned.
+ */
+Kernel::DeviceDependent &Kernel::deviceDependent(DeviceInterface *device)
+{
+    const size_t count = p_device_dependent.size();
+
+    // "Default device" request with a single candidate: no search needed
+    if (!device && count == 1)
+        return p_device_dependent[0];
+
+    for (size_t n = 0; n < count; ++n)
+    {
+        if (p_device_dependent[n].device == device)
+            return p_device_dependent[n];
+    }
+
+    return null_dep;
+}
+
+/******************************************************************************
+* cl_int Kernel::addFunction
+*
+* Registers the LLVM function that implements this kernel for one device.
+*
+* The first call builds the argument list (p_args) from the function
+* signature. Later calls, for other devices, only verify that the new function
+* has the same number of arguments with the same vector dimension, address
+* space and kind.
+*
+* device   - device the function was built for; also creates the DeviceKernel
+* function - LLVM function of the kernel
+* module   - LLVM module containing the function
+*
+* Returns CL_INVALID_KERNEL_DEFINITION when an argument type is not recognized
+* or the signature differs from the one already registered, CL_SUCCESS
+* otherwise.
+******************************************************************************/
+cl_int Kernel::addFunction(DeviceInterface *device, llvm::Function *function,
+                           llvm::Module *module)
+{
+    llvm::DataLayout TD(module);
+
+#if 0 // Uncomment to see the Function IR being generated:
+    function->dump();
+#endif
+
+    p_name = function->getName().str();
+
+    // Get wi_alloca_size, to be used for computing wg_alloca_size
+    std::string fattrs = function->getAttributes().getAsString(
+                                          llvm::AttributeSet::FunctionIndex);
+    std::size_t found = fattrs.find("_wi_alloca_size=");
+    if (found != std::string::npos)
+        wi_alloca_size = atoi(fattrs.c_str() + found + 16); // 16 == strlen("_wi_alloca_size=")
+
+    /*-------------------------------------------------------------------------
+    * Add a device dependent
+    *------------------------------------------------------------------------*/
+    DeviceDependent dep;
+
+    dep.device   = device;
+    dep.function = function;
+    dep.module   = module;
+
+    /*-------------------------------------------------------------------------
+    * Build the arg list of the kernel (or verify it if a previous function
+    * was already registered)
+    *------------------------------------------------------------------------*/
+    llvm::FunctionType *f = function->getFunctionType();
+    bool append = (p_args.size() == 0);
+
+    if (!append && p_args.size() != f->getNumParams())
+        return CL_INVALID_KERNEL_DEFINITION;
+
+    int i = 0;
+    for (llvm::Function::arg_iterator I = function->arg_begin(),
+         E = function->arg_end(); I != E; ++I, i++)
+    {
+        llvm::Type *param_type = f->getParamType(i);
+        llvm::Argument *arg = I;
+        Arg::Kind kind = Arg::Invalid;
+        Arg::File file = Arg::Private;
+        unsigned short vec_dim = 1;
+
+        llvm::Type *arg_type = arg->getType();
+        const unsigned arg_store_size = TD.getTypeStoreSize(arg_type);
+
+        // LLVM IR writes parameters passed by value as pointers:
+        if (llvm::isa<llvm::PointerType>(arg_type) && arg->hasByValAttr()) {
+            // isa<> already succeeded, so the checked cast<> is the right tool
+            arg_type = llvm::cast<llvm::PointerType>(arg_type)->getElementType();
+        }
+
+        llvm::Type *itype = TD.getSmallestLegalIntType(module->getContext(), arg_store_size * 8);
+        llvm::Type *target_type = (itype != NULL && arg_type->isIntegerTy()) ? itype : arg_type;
+
+        unsigned target_size = TD.getTypeStoreSize(target_type);
+        unsigned target_align = TD.getABITypeAlignment(target_type);
+
+#if 0 // Uncomment to see arg info
+        arg_type->dump(); std::cout << " Size: " << target_size << " Align: " << target_align << std::endl ;
+#endif
+
+        if (arg_type->isPointerTy())
+        {
+            // It's a pointer, dereference it
+            llvm::PointerType *p_type = llvm::cast<llvm::PointerType>(arg_type);
+
+            file = (Arg::File)p_type->getAddressSpace();
+            arg_type = p_type->getElementType();
+
+            // If it's a __local argument, we'll have to allocate memory at run time
+            if (file == Arg::Local)
+                p_has_locals = true;
+
+            kind = Arg::Buffer;
+
+            // If it's a struct, get its name
+            if (arg_type->isStructTy())
+            {
+                llvm::StructType *struct_type =
+                    llvm::cast<llvm::StructType>(arg_type);
+                std::string struct_name = struct_type->getName().str();
+
+                if (struct_name.compare(0, 14, "struct.image2d") == 0)
+                {
+                    kind = Arg::Image2D;
+                    file = Arg::Global;
+                }
+                else if (struct_name.compare(0, 14, "struct.image3d") == 0)
+                {
+                    kind = Arg::Image3D;
+                    file = Arg::Global;
+                }
+            }
+        }
+        else
+        {
+            if (arg_type->isVectorTy())
+            {
+                // It's a vector, we need its element's type
+                llvm::VectorType *v_type = llvm::cast<llvm::VectorType>(arg_type);
+
+                vec_dim = v_type->getNumElements();
+                arg_type = v_type->getElementType();
+            }
+
+            // Get type kind
+            if (arg_type->isFloatTy())
+            {
+                kind = Arg::Float;
+            }
+            else if (arg_type->isDoubleTy())
+            {
+                kind = Arg::Double;
+            }
+            else if (arg_type->isIntegerTy())
+            {
+                llvm::IntegerType *i_type = llvm::cast<llvm::IntegerType>(arg_type);
+
+                if (i_type->getBitWidth() == 8)
+                {
+                    kind = Arg::Int8;
+                }
+                else if (i_type->getBitWidth() == 16)
+                {
+                    kind = Arg::Int16;
+                }
+                else if (i_type->getBitWidth() == 32)
+                {
+                    // NOTE: May also be a sampler, check done in setArg
+                    kind = Arg::Int32;
+                }
+                else if (i_type->getBitWidth() == 64)
+                {
+                    kind = Arg::Int64;
+                }
+            }
+        }
+
+        // Check if we recognized the type
+        if (kind == Arg::Invalid)
+            return CL_INVALID_KERNEL_DEFINITION;
+
+        if (append)
+        {
+            // First function registered: this descriptor defines the signature
+            p_args.push_back(new Arg(vec_dim, file, kind, target_align));
+        }
+        else
+        {
+            // A function is already registered: compare the descriptors
+            // themselves (not their addresses) for signature compliance. The
+            // candidate lives on the stack so nothing leaks on either path.
+            Arg candidate(vec_dim, file, kind, target_align);
+
+            if (candidate != *p_args[i])
+                return CL_INVALID_KERNEL_DEFINITION;
+        }
+    }
+
+    dep.kernel = device->createDeviceKernel(this, dep.function);
+    p_device_dependent.push_back(dep);
+
+    return CL_SUCCESS;
+}
+
+/**
+ * LLVM function registered for \p device, or null when the lookup falls back
+ * to the zeroed \c null_dep entry.
+ */
+llvm::Function *Kernel::function(DeviceInterface *device) const
+{
+    return deviceDependent(device).function;
+}
+
+/******************************************************************************
+* cl_int Kernel::setArg
+*
+* Stores the value of one kernel argument.
+*
+* index - position of the argument, 0 .. numArgs()-1
+* size  - number of bytes pointed to by value; for __local arguments, the
+*         number of bytes to allocate at kernel run time
+* value - pointer to the argument data; must be null for __local arguments and
+*         may be null for buffer/image arguments (stored as a null cl_mem)
+*
+* Returns CL_INVALID_ARG_INDEX, CL_INVALID_ARG_SIZE or CL_INVALID_ARG_VALUE on
+* error, CL_SUCCESS otherwise.
+******************************************************************************/
+cl_int Kernel::setArg(cl_uint index, size_t size, const void *value)
+{
+    // index == p_args.size() is already one past the last argument
+    if (index >= p_args.size())
+        return CL_INVALID_ARG_INDEX;
+
+    Arg *arg = p_args[index];
+
+    /*-------------------------------------------------------------------------
+    * Special case for __local pointers
+    *------------------------------------------------------------------------*/
+    if (arg->file() == Arg::Local)
+    {
+        if (size == 0)  return CL_INVALID_ARG_SIZE;
+        if (value != 0) return CL_INVALID_ARG_VALUE;
+
+        arg->setAllocAtKernelRuntime(size);
+        return CL_SUCCESS;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Check that size corresponds to the arg type
+    *------------------------------------------------------------------------*/
+    size_t arg_size = arg->valueSize() * arg->vecDim();
+
+    /*-------------------------------------------------------------------------
+    * Special case for samplers (pointers in C++, uint32 in OpenCL).
+    * value is only inspected when it is non-null; a null value is handled by
+    * the generic null check further down.
+    *------------------------------------------------------------------------*/
+    if (value != 0 && size == sizeof(cl_sampler) && arg_size == 4)
+    {
+        Object *candidate = *(Object **)value;
+
+        if (candidate && candidate->isA(T_Sampler))
+        {
+            unsigned int bitfield = ((Sampler *)candidate)->bitfield();
+
+            arg->refineKind(Arg::Sampler);
+            arg->alloc();
+
+            // Copy the bitfield only: 'size' is the size of the cl_sampler
+            // handle, which can be larger than the 4-byte bitfield.
+            arg->loadData(&bitfield, sizeof(bitfield));
+
+            return CL_SUCCESS;
+        }
+    }
+
+    // LLVM IR redefines function parameter types to fit the smallest integer type width for the ABI
+    // eg: <2xi8> (2 bytes) may actually be pushed as an i32 (4 bytes!), but this knowledge is
+    // not known to shamrock. But, we do know the parameter type alignment in addFunction().
+    // So allow sizes less than or equal to the target alignment to succeed the size test:
+    // NOTE(review): a size above arg_size but within the alignment passes this
+    // test yet exceeds the buffer loadData() asserts against -- confirm intent.
+    if ((size != arg_size) && (size > arg->targetAlignment())) return CL_INVALID_ARG_SIZE;
+
+    /*-------------------------------------------------------------------------
+    * Check for null values
+    *------------------------------------------------------------------------*/
+    cl_mem null_mem = 0;
+
+    if (!value)
+    {
+        switch (arg->kind())
+        {
+            /*-------------------------------------------------------------
+            * Special case buffers : value can be 0 (or point to 0)
+            *------------------------------------------------------------*/
+            case Arg::Buffer:
+            case Arg::Image2D:
+            case Arg::Image3D:
+                value = &null_mem;
+                break;              // do not fall into the error case below
+
+            default:
+                return CL_INVALID_ARG_VALUE;
+        }
+    }
+
+    /*-------------------------------------------------------------------------
+    * Copy just the data actually passed. Expect LLVM to do the signext/zeroext.
+    *------------------------------------------------------------------------*/
+    arg->alloc();
+    arg->loadData(value, size);
+
+    return CL_SUCCESS;
+}
+
+/** Number of argument descriptors held in \c p_args. */
+unsigned int Kernel::numArgs() const
+{
+ return p_args.size();
+}
+
+/**
+ * Argument descriptor at position \p index.
+ * Uses the container's checked \c at() accessor rather than \c operator[]
+ * (presumably a \c std::vector, whose \c at() throws \c std::out_of_range for
+ * an invalid index -- TODO confirm the declaration of \c p_args in kernel.h).
+ */
+const Kernel::Arg *Kernel::arg(unsigned int index) const
+{
+ return p_args.at(index);
+}
+
+/**
+ * Tells whether every argument has been given a value.
+ * \return true when all descriptors report \c defined(); also true when the
+ *         kernel has no arguments
+ */
+bool Kernel::argsSpecified() const
+{
+    bool all_defined = true;
+
+    // Stop at the first argument that has not been set yet
+    for (size_t n = 0; all_defined && n < p_args.size(); ++n)
+        all_defined = p_args[n]->defined();
+
+    return all_defined;
+}
+
+/**
+ * True once \c addFunction() has seen a pointer argument in the
+ * \c Arg::Local address space; false until then.
+ */
+bool Kernel::hasLocals() const
+{
+ return p_has_locals;
+}
+
+/**
+ * Device-specific kernel object created for \p device, or null when the
+ * lookup falls back to the zeroed \c null_dep entry.
+ */
+DeviceKernel *Kernel::deviceDependentKernel(DeviceInterface *device) const
+{
+    return deviceDependent(device).kernel;
+}
+
+/**
+ * LLVM module registered for \p device, or null when the lookup falls back to
+ * the zeroed \c null_dep entry.
+ */
+llvm::Module *Kernel::deviceDependentModule(DeviceInterface *device) const
+{
+    return deviceDependent(device).module;
+}
+
+/**
+ * Answers a kernel information query.
+ *
+ * \param param_name which property is requested
+ * \param param_value_size size in bytes of the buffer at \p param_value
+ * \param param_value destination buffer, may be null
+ * \param param_value_size_ret if non-null, receives the size of the answer
+ * \return \c CL_INVALID_VALUE for an unknown \p param_name or when
+ * \p param_value is given but too small, \c CL_SUCCESS otherwise
+ */
+cl_int Kernel::info(cl_kernel_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const
+{
+ void *value = 0;
+ size_t value_length = 0;
+
+ // Scratch storage for the answer. MEM_ASSIGN / SIMPLE_ASSIGN are defined
+ // outside this file (presumably propertylist.h); they presumably fill
+ // 'value' and 'value_length' using these members -- TODO confirm.
+ union {
+ cl_uint cl_uint_var;
+ cl_program cl_program_var;
+ cl_context cl_context_var;
+ };
+
+ switch (param_name)
+ {
+ case CL_KERNEL_FUNCTION_NAME:
+ // size() + 1 so the terminating NUL is part of the answer
+ MEM_ASSIGN(p_name.size() + 1, p_name.c_str());
+ break;
+
+ case CL_KERNEL_NUM_ARGS:
+ SIMPLE_ASSIGN(cl_uint, p_args.size());
+ break;
+
+ case CL_KERNEL_REFERENCE_COUNT:
+ SIMPLE_ASSIGN(cl_uint, references());
+ break;
+
+ case CL_KERNEL_CONTEXT:
+ // parent() is the program; the program's parent is reported as the context
+ SIMPLE_ASSIGN(cl_context, parent()->parent());
+ break;
+
+ case CL_KERNEL_PROGRAM:
+ SIMPLE_ASSIGN(cl_program, parent());
+ break;
+
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ // The size is only checked when the caller supplied a destination buffer
+ if (param_value && param_value_size < value_length)
+ return CL_INVALID_VALUE;
+
+ if (param_value_size_ret)
+ *param_value_size_ret = value_length;
+
+ if (param_value)
+ std::memcpy(param_value, value, value_length);
+
+ return CL_SUCCESS;
+}
+
+/**
+ * Looks up the \c reqd_work_group_size of this kernel in the
+ * \c opencl.kernels named metadata of \p module.
+ *
+ * \param module LLVM module to search
+ * \return the (x, y, z) sizes found in the metadata, or (0, 0, 0) when the
+ *         module has no kernel metadata, this kernel is not listed, or it
+ *         carries no \c reqd_work_group_size entry
+ */
+boost::tuple<uint,uint,uint> Kernel::reqdWorkGroupSize(llvm::Module *module) const
+{
+    llvm::NamedMDNode *kernels = module->getNamedMetadata("opencl.kernels");
+
+    boost::tuple<uint,uint,uint> zeros(0,0,0);
+
+    if (!kernels) return zeros;
+
+    for (unsigned int i=0; i<kernels->getNumOperands(); ++i)
+    {
+        llvm::MDNode *node = kernels->getOperand(i);
+
+        /*---------------------------------------------------------------------
+        * Each node has only one operand : a llvm::Function
+        *--------------------------------------------------------------------*/
+        llvm::Value *value = node->getOperand(0);
+
+        /*---------------------------------------------------------------------
+        * Bug somewhere, don't crash
+        *--------------------------------------------------------------------*/
+        if (!llvm::isa<llvm::Function>(value)) continue;
+
+        llvm::Function *f = llvm::cast<llvm::Function>(value);
+        if(f->getName().str() != p_name) continue;
+
+        if (node->getNumOperands() <= 1) return zeros;
+
+        llvm::MDNode *meta = llvm::cast<llvm::MDNode>(node->getOperand(1));
+        if (meta->getNumOperands() == 4 &&
+            meta->getOperand(0)->getName().str() == std::string("reqd_work_group_size"))
+        {
+            uint x = llvm::cast<llvm::ConstantInt> (meta->getOperand(1))->getValue().getLimitedValue();
+            uint y = llvm::cast<llvm::ConstantInt> (meta->getOperand(2))->getValue().getLimitedValue();
+            uint z = llvm::cast<llvm::ConstantInt> (meta->getOperand(3))->getValue().getLimitedValue();
+
+            return boost::tuple<uint,uint,uint> (x,y,z);
+        }
+        return zeros;
+    }
+
+    /*-------------------------------------------------------------------------
+    * This kernel is not listed in the metadata. Without this return, control
+    * would run off the end of a non-void function (undefined behavior).
+    *------------------------------------------------------------------------*/
+    return zeros;
+}
+
+
+/**
+ * Answers a per-device work-group information query.
+ *
+ * \param device device the query is about; may be null when the kernel is
+ *        registered for exactly one device
+ * \param param_name which property is requested
+ * \param param_value_size size in bytes of the buffer at \p param_value
+ * \param param_value destination buffer, may be null
+ * \param param_value_size_ret if non-null, receives the size of the answer
+ * \return \c CL_INVALID_DEVICE when \p device is null but several devices are
+ *         registered, or when the kernel has no data for \p device;
+ *         \c CL_INVALID_VALUE for an unknown \p param_name or a too small
+ *         buffer; \c CL_SUCCESS otherwise
+ */
+cl_int Kernel::workGroupInfo(DeviceInterface *device,
+                             cl_kernel_work_group_info param_name,
+                             size_t param_value_size,
+                             void *param_value,
+                             size_t *param_value_size_ret) const
+{
+    void *value = 0;
+    size_t value_length = 0;
+
+    // Scratch storage for the answer, used by the SIMPLE_ASSIGN macro
+    union {
+        size_t size_t_var;
+        size_t three_size_t[3];
+        cl_ulong cl_ulong_var;
+    };
+
+    // BUG? Shouldn't we check if the kernel is associated with
+    // the default device ?
+    if (!device && p_device_dependent.size() > 1)
+        return CL_INVALID_DEVICE;
+
+    const DeviceDependent &dep = deviceDependent(device);
+
+    // An unknown device yields the zeroed null_dep entry; using its null
+    // kernel/module pointers below would crash, so report the error instead.
+    if (!dep.kernel || !dep.module)
+        return CL_INVALID_DEVICE;
+
+    switch (param_name)
+    {
+        case CL_KERNEL_WORK_GROUP_SIZE:
+            SIMPLE_ASSIGN(size_t, dep.kernel->workGroupSize());
+            break;
+
+        case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
+            {
+            boost::tuple<uint,uint,uint> res(reqdWorkGroupSize(dep.module));
+            three_size_t[0] = res.get<0>();
+            three_size_t[1] = res.get<1>();
+            three_size_t[2] = res.get<2>();
+            value = &three_size_t;
+            value_length = sizeof(three_size_t);
+            }
+            break;
+
+        case CL_KERNEL_LOCAL_MEM_SIZE:
+            SIMPLE_ASSIGN(cl_ulong, dep.kernel->localMemSize());
+            break;
+
+        case CL_KERNEL_PRIVATE_MEM_SIZE:
+            SIMPLE_ASSIGN(cl_ulong, dep.kernel->privateMemSize());
+            break;
+
+        case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
+            SIMPLE_ASSIGN(size_t, dep.kernel->preferredWorkGroupSizeMultiple());
+            break;
+
+        default:
+            return CL_INVALID_VALUE;
+    }
+
+    // The size is only checked when the caller supplied a destination buffer
+    if (param_value && param_value_size < value_length)
+        return CL_INVALID_VALUE;
+
+    if (param_value_size_ret)
+        *param_value_size_ret = value_length;
+
+    if (param_value)
+        std::memcpy(param_value, value, value_length);
+
+    return CL_SUCCESS;
+}
+
+/*
+ * Kernel::Arg
+ */
+/*
+ * Builds an argument descriptor with no data buffer, not yet defined and with
+ * no run-time allocation requested. Initializers are listed in the order the
+ * members are declared in kernel.h, which is the order they actually run in.
+ */
+Kernel::Arg::Arg(unsigned short vec_dim, File file, Kind kind, size_t targ_align)
+    : p_vec_dim(vec_dim),
+      p_file(file),
+      p_kind(kind),
+      p_data(0),
+      p_defined(false),
+      p_runtime_alloc(0),
+      p_targ_align(targ_align)
+{ }
+
+/* Releases the value buffer obtained in alloc(), if any. */
+Kernel::Arg::~Arg()
+{
+    // free() accepts a null pointer, so no guard is needed
+    std::free(p_data);
+}
+
+/*
+ * Allocates the zero-filled value buffer (vector dimension x element size) on
+ * first use; later calls keep the existing buffer.
+ */
+void Kernel::Arg::alloc()
+{
+    if (p_data)
+        return;
+
+    p_data = std::calloc(p_vec_dim, valueSize());
+}
+
+/*
+ * Copies 'size' bytes from 'data' into the value buffer and marks the argument
+ * as defined. The buffer must already exist (see alloc()) and 'size' must not
+ * exceed vector dimension x element size.
+ */
+void Kernel::Arg::loadData(const void *data, size_t size)
+{
+ // Capacity check only: assert() compiles to nothing when NDEBUG is defined
+ assert ( size <= p_vec_dim * valueSize());
+ std::memcpy(p_data, data, size);
+ p_defined = true;
+}
+
+/*
+ * Records how many bytes have to be allocated for this argument at kernel run
+ * time and marks the argument as defined.
+ */
+void Kernel::Arg::setAllocAtKernelRuntime(size_t size)
+{
+    p_defined       = true;
+    p_runtime_alloc = size;
+}
+
+/* Overwrites the kind of this argument; setArg() uses it to turn an Int32 into a Sampler. */
+void Kernel::Arg::refineKind (Kernel::Arg::Kind kind)
+{
+ p_kind = kind;
+}
+
+/*
+ * Two descriptors differ when their vector dimension, address space or kind
+ * differ. Data, alignment and definition state are not compared.
+ */
+bool Kernel::Arg::operator!=(const Arg &b)
+{
+    return (p_vec_dim != b.p_vec_dim) ||
+           (p_file    != b.p_file)    ||
+           (p_kind    != b.p_kind);
+}
+
+/*
+ * Size in bytes of one element of this argument, derived from its kind.
+ * The result is not multiplied by the vector dimension. Invalid, and any value
+ * outside the enumeration, yields 0.
+ */
+size_t Kernel::Arg::valueSize() const
+{
+    size_t bytes = 0;
+
+    switch (p_kind)
+    {
+        case Int8:    bytes = 1;                break;
+        case Int16:   bytes = 2;                break;
+
+        case Int32:
+        case Sampler: bytes = 4;                break;
+
+        case Int64:   bytes = 8;                break;
+        case Float:   bytes = sizeof(cl_float); break;
+        case Double:  bytes = sizeof(double);   break;
+
+        // Memory objects are passed as their handle
+        case Buffer:
+        case Image2D:
+        case Image3D: bytes = sizeof(cl_mem);   break;
+
+        case Invalid:
+        default:      bytes = 0;                break;
+    }
+
+    return bytes;
+}
+
+/* Plain read accessors: each one returns the corresponding private member unchanged. */
+unsigned short Kernel::Arg::vecDim() const { return p_vec_dim; }
+Kernel::Arg::File Kernel::Arg::file() const { return p_file; }
+Kernel::Arg::Kind Kernel::Arg::kind() const { return p_kind; }
+size_t Kernel::Arg::targetAlignment() const { return p_targ_align; }
+bool Kernel::Arg::defined() const { return p_defined; }
+const void * Kernel::Arg::data() const { return p_data; }
+size_t Kernel::Arg::allocAtKernelRuntime() const {return p_runtime_alloc;}
+
+/*
+ * Address of the vector element number 'index' inside the value buffer, i.e.
+ * the buffer start advanced by index x element size. No bounds check is done.
+ */
+const void *Kernel::Arg::value(unsigned short index) const
+{
+    const unsigned int byte_offset = index * valueSize();
+
+    return static_cast<const char *>(p_data) + byte_offset;
+}
+
diff --git a/src/core/kernel.h b/src/core/kernel.h
new file mode 100644
index 0000000..80672ea
--- /dev/null
+++ b/src/core/kernel.h
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file core/kernel.h
+ * \brief Kernel
+ */
+
+#ifndef __KERNEL_H__
+#define __KERNEL_H__
+
+#include "object.h"
+
+#include <CL/cl.h>
+
+#include <vector>
+#include <string>
+#include <boost/tuple/tuple.hpp>
+
+namespace llvm
+{
+ class Function;
+ class Module;
+}
+
+namespace Coal
+{
+
+class Program;
+class DeviceInterface;
+class DeviceKernel;
+
+/**
+ * \brief Kernel
+ *
+ * A kernel represents a LLVM function that can be run on a device. As
+ * \c Coal::Kernel objects are device-independent, they in fact represent only
+ * the name of a kernel and the arguments the application wants to pass to it,
+ * but it also contains a list of LLVM functions for each device for which its
+ * parent \c Coal::Program has been built
+ */
+class Kernel : public Object
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param program Parent \c Coal::Program
+ */
+ Kernel(Program *program);
+ ~Kernel();
+
+ /**
+ * \brief Kernel argument
+ *
+ * This class holds OpenCL-related information about the arguments of
+ * a kernel. It is also used to check that a kernel takes the same
+ * arguments on every device on which it has been built.
+ */
+ class Arg
+ {
+ public:
+ /**
+ * \brief Memory address space qualifier
+ */
+ enum File
+ {
+ Private = 0, /*!< \brief __private */
+#if 1
+ Global = 1, /*!< \brief __global */
+ Constant = 2, /*!< \brief __constant */
+ Local = 3 /*!< \brief __local */
+#else
+ /* using clang defaults */
+ Global = 0xFFFF00, /*!< \brief __global */
+ Local = 0xFFFF01, /*!< \brief __local */
+ Constant = 0xFFFF02 /*!< \brief __constant */
+#endif
+ };
+
+ /**
+ * \brief Kind of argument (its datatype)
+ */
+ enum Kind
+ {
+ Invalid, /*!< \brief Invalid argument */
+ Int8, /*!< \brief \c uchar or \c char, \c i8 in LLVM */
+ Int16, /*!< \brief \c ushort or \c short, \c i16 in LLVM */
+ Int32, /*!< \brief \c uint or \c int, \c i32 in LLVM */
+ Int64, /*!< \brief \c ulong or \c long, \c i64 in LLVM */
+ Float, /*!< \brief \c float, \c float in LLVM */
+ Double, /*!< \brief \c double, \c double in LLVM */
+ Buffer, /*!< \brief \c Coal::Buffer or \c Coal::SubBuffer, <tt>type*</tt> in LLVM */
+ Image2D, /*!< \brief \c Coal::Image2D, <tt>\%struct.image2d*</tt> in LLVM */
+ Image3D, /*!< \brief \c Coal::Image3D, <tt>\%struct.image3d*</tt> in LLVM */
+ Sampler /*!< \brief \c Coal::Sampler::bitfield(), \c i32 in LLVM, see \c Coal::Kernel::setArg() */
+ };
+
+ /**
+ * \brief Constructor
+ * \param vec_dim vector dimension of the argument, 1 if not a vector
+ * \param file \c File of the argument
+ * \param kind \c Kind of the argument
+ * \param targ_align alignment of the argument type, in bytes (ABI specific)
+ */
+ Arg(unsigned short vec_dim, File file, Kind kind, size_t targ_align);
+ ~Arg();
+
+ /**
+ * \brief Allocate the argument
+ *
+ * This function must be called before \c loadData(). It
+ * allocates a buffer in which the argument value can be stored.
+ *
+ * \sa valueSize()
+ */
+ void alloc();
+
+ /**
+ * \brief Load a value into the argument
+ * \note \c alloc() must have been called before this function.
+ * \sa valueSize()
+ */
+ void loadData(const void *data, size_t size);
+
+ /**
+ * \brief Set the number of bytes that must be allocated at run-time
+ *
+ * \c __local arguments don't take a value given by the host
+ * application, but take pointers allocated on the device
+ * for each work-group.
+ *
+ * This function allows to set the size of the device-allocated
+ * memory buffer used by this argument.
+ *
+ * \param size size in byte of the buffer the device has to
+ * allocate for each work-group of this kernel
+ */
+ void setAllocAtKernelRuntime(size_t size);
+
+ /**
+ * \brief Changes the \c Kind of this argument
+ * \param kind new \c Kind
+ */
+ void refineKind(Kind kind);
+
+ /**
+ * \brief Compares this argument with another
+ *
+ * They are different if their \c vec_dim, \c file or \c kind are
+ * not the same.
+ *
+ * \param b other argument to compare
+ * \return true if this argument doesn't match \p b
+ */
+ bool operator !=(const Arg &b);
+
+ /**
+ * \brief Size of a field of this arg
+ *
+ * This function returns the size of this argument based on its
+ * \c Kind
+ *
+ * \note This size is not multiplied by \c vecDim(), you must do
+ * this by yourself to find the total space taken by this
+ * arg.
+ * \return the size of this argument, in bytes, without any padding
+ */
+ size_t valueSize() const;
+ unsigned short vecDim() const; /*!< \brief Vector dimension */
+ File file() const; /*!< \brief File */
+ Kind kind() const; /*!< \brief Kind */
+ bool defined() const; /*!< \brief Has the value of this argument already been loaded by the host application ? */
+ size_t targetAlignment() const; /*!< \brief Get alignment (bytes) of arg type */
+ size_t allocAtKernelRuntime() const; /*!< \brief Size of the \c __local buffer to allocate at kernel runtime */
+ const void *value(unsigned short index) const; /*!< \brief Pointer to the value of this argument, for the \p index vector element */
+ const void *data() const; /*!< \brief Pointer to the data of this arg, equivalent to <tt>value(0)</tt> */
+
+ private:
+ unsigned short p_vec_dim;
+ File p_file;
+ Kind p_kind;
+ void *p_data;
+ bool p_defined;
+ size_t p_runtime_alloc;
+ size_t p_targ_align;
+ };
+
+ /**
+ * \brief Add a \c llvm::Function to this kernel
+ *
+ * This function adds a \c llvm::Function to this kernel for the
+ * specified \p device. It also has the responsibility to find the
+ * \c Arg::Kind of each of the function's arguments.
+ *
+ * LLVM provides a \c llvm::Type for each argument:
+ *
+ * - If it is a pointer, the kind of the argument is \c Arg::Buffer and
+ * its field is a simple cast from a LLVM \c addrspace to \c Arg::File.
+ * - If it is a pointer to a struct whose name is either
+ * <tt>\%struct.image2d</tt> or <tt>\%struct.image3d</tt>, kind is set
+ * to \c Arg::Image2D or \c Arg::Image3D, respectively.
+ * - If it is a vector, \c vec_dim is set to the vector size, and the
+ * rest of the computations are done on the element type
+ * - Then we translate the LLVM type to an \c Arg::Kind. For instance,
+ * \c i32 becomes \c Arg::Int32
+ *
+ * Samplers aren't detected at this stage because they are plain \c i32
+ * types on the LLVM side. They are detected in \c setArg() when the
+ * value being set to the argument appears to be a \c Coal::Sampler.
+ *
+ * \param device device for which the function is added
+ * \param function function to add
+ * \param module LLVM module of this function
+ */
+ cl_int addFunction(DeviceInterface *device, llvm::Function *function,
+ llvm::Module *module);
+
+ /**
+ * \brief Get the LLVM function for a specified \p device
+ * \param device the device for which a LLVM function is needed
+ * \return the LLVM function for the given \p device
+ */
+ llvm::Function *function(DeviceInterface *device) const;
+
+ /**
+ * \brief Set the value of an argument
+ *
+ * See the constructor's documentation for a note on the
+ * \c Coal::Sampler objects
+ *
+ * \param index index of the argument
+ * \param size size of the value being stored in the argument, must match
+ * <tt>Arg::valueSize() * Arg::vecDim()</tt>
+ * \param value pointer to the data that will be copied in the argument
+ * \return \c CL_SUCCESS if success, an error code otherwise
+ */
+ cl_int setArg(cl_uint index, size_t size, const void *value);
+
+ unsigned int numArgs() const; /*!< \brief Number of arguments of this kernel */
+ const Arg *arg(unsigned int index) const; /*!< \brief \c Arg at the given \p index */
+
+ /*! \brief \c Coal::DeviceKernel for the specified \p device */
+ DeviceKernel *deviceDependentKernel(DeviceInterface *device) const;
+ llvm::Module *deviceDependentModule(DeviceInterface *device) const;
+
+ bool argsSpecified() const; /*!< \brief true if all the arguments have been set through \c setArg() */
+ bool hasLocals() const; /*!< \brief true if one or more argument is in file \c Arg::Local */
+
+ /**
+ * \brief Get information about this kernel
+ * \copydetails Coal::DeviceInterface::info
+ */
+ cl_int info(cl_kernel_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+
+ /**
+ * \brief Get performance hints and device-specific data about this kernel
+ * \copydetails Coal::DeviceInterface::info
+ * \param device \c Coal::DeviceInterface on which the kernel will be run
+ */
+ cl_int workGroupInfo(DeviceInterface *device,
+ cl_kernel_work_group_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ boost::tuple<uint,uint,uint> reqdWorkGroupSize(llvm::Module *module) const;
+
+ /*! \brief Current value of the \c wi_alloca_size member (const so it can be read through a const \c Kernel) */
+ int get_wi_alloca_size() const { return wi_alloca_size; }
+
+ std::string p_name;
+ private:
+ bool p_has_locals;
+ int wi_alloca_size;
+
+ /**
+  * \brief Per-device record kept by the kernel
+  *
+  * Groups a device pointer with the kernel, LLVM function and LLVM
+  * module pointers stored alongside it.
+  */
+ struct DeviceDependent
+ {
+     DeviceInterface *device;    /*!< \brief Device this record is keyed on */
+     DeviceKernel    *kernel;    /*!< \brief \c DeviceKernel stored for \c device */
+     llvm::Function  *function;  /*!< \brief LLVM function stored for \c device */
+     llvm::Module    *module;    /*!< \brief LLVM module stored for \c device */
+ };
+
+ std::vector<DeviceDependent> p_device_dependent;
+ std::vector<Arg *> p_args;
+ DeviceDependent null_dep;
+
+ const DeviceDependent &deviceDependent(DeviceInterface *device) const;
+ DeviceDependent &deviceDependent(DeviceInterface *device);
+
+};
+
+}
+
+/* Definition of the _cl_kernel struct: it adds nothing to Coal::Kernel. */
+struct _cl_kernel : public Coal::Kernel
+{
+};
+
+#endif
diff --git a/src/core/memobject.cpp b/src/core/memobject.cpp
new file mode 100644
index 0000000..5501ac1
--- /dev/null
+++ b/src/core/memobject.cpp
@@ -0,0 +1,960 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file memobject.cpp
+ * \brief Memory objects
+ */
+
+#include "CL/cl_ext.h"
+#include "memobject.h"
+#include "context.h"
+#include "deviceinterface.h"
+#include "propertylist.h"
+#include "events.h"
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+using namespace Coal;
+
+/*
+ * MemObject
+ */
+
+/**
+ * \brief Store the creation parameters and validate them
+ *
+ * No device resource is created here; that is done by \c init().
+ * On a validation failure \p errcode_ret is set and the constructor returns
+ * early. It is left untouched when every check passes.
+ *
+ * \param ctx parent context, forwarded to \c Object
+ * \param flags memory flags requested by the caller
+ * \param host_ptr host pointer associated with \p flags
+ * \param errcode_ret receives \c CL_INVALID_VALUE for an invalid flag
+ *        combination or \c CL_INVALID_HOST_PTR when \p host_ptr and the
+ *        host-pointer flags disagree
+ */
+MemObject::MemObject(Context *ctx, cl_mem_flags flags, void *host_ptr,
+                     cl_int *errcode_ret)
+: Object(Object::T_MemObject, ctx), p_flags(flags), p_num_devices(0),
+  p_devices_to_allocate(0), p_host_ptr(host_ptr), p_devicebuffers(0),
+  p_dtor_callback_stack()
+{
+    // Every flag bit this implementation accepts
+    const cl_mem_flags all_flags = CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY |
+                                   CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR |
+                                   CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR
+                                   | CL_MEM_USE_MSMC_TI;
+
+    const bool read_write = (flags & CL_MEM_READ_WRITE) != 0;
+    const bool read_only  = (flags & CL_MEM_READ_ONLY) != 0;
+    const bool write_only = (flags & CL_MEM_WRITE_ONLY) != 0;
+    const bool use_host   = (flags & CL_MEM_USE_HOST_PTR) != 0;
+    const bool alloc_host = (flags & CL_MEM_ALLOC_HOST_PTR) != 0;
+    const bool copy_host  = (flags & CL_MEM_COPY_HOST_PTR) != 0;
+
+    // The access flags are mutually exclusive, unknown bits are refused and
+    // USE_HOST_PTR cannot be combined with ALLOC_HOST_PTR or COPY_HOST_PTR.
+    if ((read_write && read_only) ||
+        (read_write && write_only) ||
+        (read_only && write_only) ||
+        (flags & ~all_flags) != 0 ||
+        (alloc_host && use_host) ||
+        (copy_host && use_host))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // host_ptr must be given exactly when USE_HOST_PTR or COPY_HOST_PTR is set
+    const bool needs_host_ptr = use_host || copy_host;
+
+    if (needs_host_ptr != (host_ptr != 0))
+    {
+        *errcode_ret = CL_INVALID_HOST_PTR;
+        return;
+    }
+}
+
+/**
+ * \brief Run the registered destructor callbacks, then free the device buffers
+ */
+MemObject::~MemObject()
+{
+    // Pop callbacks until the stack reports empty; call each one that was
+    // actually popped with this object and its user data.
+    dtor_callback_t entry;
+
+    while (!p_dtor_callback_stack.empty())
+    {
+        if (!p_dtor_callback_stack.pop(entry))
+            continue;
+
+        entry.first((cl_mem)this, entry.second);
+    }
+
+    if (!p_devicebuffers)
+        return;
+
+    // Delete each per-device buffer, then the table that held them
+    unsigned int index = 0;
+
+    while (index < p_num_devices)
+    {
+        delete p_devicebuffers[index];
+        ++index;
+    }
+
+    std::free((void *)p_devicebuffers);
+}
+
+/**
+ * \brief Create one \c DeviceBuffer per device of the parent context
+ *
+ * Queries the context for its device list, allocates the table of device
+ * buffers and asks each device to create its buffer. With a single device
+ * the buffer is allocated immediately; with several devices and
+ * \c CL_MEM_COPY_HOST_PTR the host data is first copied to a private block
+ * because allocation on the devices is deferred.
+ *
+ * \return \c CL_SUCCESS, \c CL_OUT_OF_HOST_MEMORY,
+ *         \c CL_MEM_OBJECT_ALLOCATION_FAILURE, an error reported by
+ *         \c Context::info(), or the last device error when every device
+ *         rejected the buffer
+ */
+cl_int MemObject::init()
+{
+    Context *context = (Context *)parent();
+    DeviceInterface **devices = 0;
+    cl_int rs;
+
+    // Get the device list of the context
+    rs = context->info(CL_CONTEXT_NUM_DEVICES,
+                       sizeof(unsigned int),
+                       &p_num_devices, 0);
+
+    if (rs != CL_SUCCESS)
+        return rs;
+
+    p_devices_to_allocate = p_num_devices;
+    devices = (DeviceInterface **)std::malloc(p_num_devices *
+                                              sizeof(DeviceInterface *));
+
+    if (!devices)
+        return CL_OUT_OF_HOST_MEMORY;
+
+    rs = context->info(CL_CONTEXT_DEVICES,
+                       p_num_devices * sizeof(DeviceInterface *),
+                       devices, 0);
+
+    if (rs != CL_SUCCESS)
+    {
+        std::free((void *)devices);
+        return rs;
+    }
+
+    // Allocate a table of DeviceBuffers
+    p_devicebuffers = (DeviceBuffer **)std::malloc(p_num_devices *
+                                                   sizeof(DeviceBuffer *));
+
+    if (!p_devicebuffers)
+    {
+        std::free((void *)devices);
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+
+    // Clear the table right away: the destructor deletes every entry, so an
+    // early return below must not leave indeterminate pointers in it.
+    for (unsigned int i=0; i<p_num_devices; ++i)
+        p_devicebuffers[i] = 0;
+
+    // If we have more than one device, the allocation on the devices is
+    // deferred to first use, so host_ptr can become invalid. So, copy it in
+    // a RAM location and keep it. Also, set a flag telling CPU devices that
+    // they don't need to reallocate and re-copy host_ptr.
+    // SubBuffer should simply reuse Buffer data.
+    if (p_num_devices > 1 && (p_flags & CL_MEM_COPY_HOST_PTR)
+        && type() != SubBuffer)
+    {
+        void *tmp_hostptr = std::malloc(size());
+
+        if (!tmp_hostptr)
+        {
+            std::free((void *)devices);
+            return CL_OUT_OF_HOST_MEMORY;
+        }
+
+        std::memcpy(tmp_hostptr, p_host_ptr, size());
+
+        p_host_ptr = tmp_hostptr;
+        // Now, the client application can safely std::free() its host_ptr
+    }
+
+    // Create a DeviceBuffer for each device
+    unsigned int failed_devices = 0;
+
+    for (unsigned int i=0; i<p_num_devices; ++i)
+    {
+        DeviceInterface *device = devices[i];
+
+        rs = CL_SUCCESS;
+        p_devicebuffers[i] = device->createDeviceBuffer(this, &rs);
+
+        if (rs != CL_SUCCESS)
+        {
+            // Keep a null entry for a device that rejected the buffer
+            p_devicebuffers[i] = 0;
+            failed_devices++;
+        }
+    }
+
+    // The device list is not needed past this point
+    std::free((void *)devices);
+    devices = 0;
+
+    if (failed_devices == p_num_devices)
+    {
+        // Each device found a reason to reject the buffer, so it's invalid
+        return rs;
+    }
+
+    // If we have only one device, already allocate the buffer
+    if (p_num_devices == 1)
+    {
+        if (!p_devicebuffers[0]->allocate())
+            return CL_MEM_OBJECT_ALLOCATION_FAILURE;
+    }
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Make sure this object is allocated on \p device
+ * \param device device on which the buffer is needed
+ * \return true if the device buffer was already allocated or has just been
+ *         allocated, false if allocation failed or no device buffer exists
+ *         for \p device
+ */
+bool MemObject::allocate(DeviceInterface *device)
+{
+    DeviceBuffer *buffer = deviceBuffer(device);
+
+    // deviceBuffer() returns 0 when nothing matches this device
+    if (!buffer)
+        return false;
+
+    if (buffer->allocated())
+        return true;
+
+    return buffer->allocate();
+}
+
+/*! \brief Memory flags currently stored in \c p_flags */
+cl_mem_flags MemObject::flags() const
+{
+    return p_flags;
+}
+
+/**
+ * \brief Host pointer of this memory object
+ *
+ * For a sub-buffer this is the parent's host pointer advanced by the
+ * sub-buffer offset, or 0 when the parent has no host pointer.
+ */
+void *MemObject::host_ptr() const
+{
+    // Every type but SubBuffer keeps its own pointer
+    if (type() != SubBuffer)
+        return p_host_ptr;
+
+    const class SubBuffer *self = (const class SubBuffer *)this;
+    char *base = (char *)self->parent()->host_ptr();
+
+    return base ? (void *)(base + self->offset()) : 0;
+}
+
+/**
+ * \brief Find the \c DeviceBuffer created for \p device
+ * \param device device whose buffer is wanted
+ * \return the matching device buffer, or 0 if there is none
+ */
+DeviceBuffer *MemObject::deviceBuffer(DeviceInterface *device) const
+{
+    if (!p_devicebuffers)
+        return 0;
+
+    for (unsigned int i=0; i<p_num_devices; ++i)
+    {
+        DeviceBuffer *candidate = p_devicebuffers[i];
+
+        // init() stores 0 for a device that rejected the buffer
+        if (candidate && candidate->device() == device)
+            return candidate;
+    }
+
+    return 0;
+}
+
+/**
+ * \brief Record that one more device has allocated its buffer
+ *
+ * When the count of devices still to allocate reaches 0, the private copy
+ * of the host data (made for multi-device \c CL_MEM_COPY_HOST_PTR objects)
+ * is released.
+ *
+ * \param buffer unused
+ */
+void MemObject::deviceAllocated(DeviceBuffer *buffer)
+{
+    (void) buffer;
+
+    --p_devices_to_allocate;
+
+    // Some devices still have to allocate: keep the copied host_ptr
+    if (p_devices_to_allocate != 0)
+        return;
+
+    const bool holds_copy = p_num_devices > 1 &&
+                            (p_flags & CL_MEM_COPY_HOST_PTR);
+
+    if (!holds_copy)
+        return;
+
+    std::free(p_host_ptr);
+    p_host_ptr = 0;
+}
+
+/**
+ * \brief Register a callback that the destructor will invoke
+ * \param pfn_notify function to call
+ * \param user_data pointer handed back to \p pfn_notify
+ */
+void MemObject::setDestructorCallback(void (CL_CALLBACK *pfn_notify)
+                                          (cl_mem memobj, void *user_data),
+                                      void *user_data)
+{
+    // Pair the function with its user data and push it on the callback stack
+    dtor_callback_t entry(pfn_notify, user_data);
+
+    p_dtor_callback_stack.push(entry);
+}
+
+// HACK for the union: SIMPLE_ASSIGN() is used below with a type name that
+// matches a union member called <type>_var, so "void *" needs a
+// single-token alias.
+typedef void * void_p;
+
+/**
+ * \brief Get information about this memory object
+ *
+ * \param param_name property being queried
+ * \param param_value_size size in bytes of the \p param_value buffer
+ * \param param_value destination buffer, may be 0 to only query the size
+ * \param param_value_size_ret if not 0, receives the size of the value
+ * \return \c CL_SUCCESS, or \c CL_INVALID_VALUE if \p param_name is unknown
+ *         or \p param_value is given but too small
+ */
+cl_int MemObject::info(cl_mem_info param_name,
+                       size_t param_value_size,
+                       void *param_value,
+                       size_t *param_value_size_ret) const
+{
+    void *value = 0;
+    size_t value_length = 0;
+    class SubBuffer *subbuf = (class SubBuffer *)this;
+
+    // Scratch storage for the value; one member per type that can be returned
+    union {
+        cl_mem_object_type cl_mem_object_type_var;
+        cl_mem_flags cl_mem_flags_var;
+        size_t size_t_var;
+        void_p void_p_var;
+        cl_uint cl_uint_var;
+        cl_context cl_context_var;
+        cl_mem cl_mem_var;
+    };
+
+    switch (param_name)
+    {
+        case CL_MEM_TYPE:
+            switch (type())
+            {
+                case Buffer:
+                case SubBuffer:
+                    cl_mem_object_type_var = CL_MEM_OBJECT_BUFFER;
+                    break;
+
+                case Image2D:
+                    cl_mem_object_type_var = CL_MEM_OBJECT_IMAGE2D;
+                    break;
+
+                case Image3D:
+                    cl_mem_object_type_var = CL_MEM_OBJECT_IMAGE3D;
+                    break;
+            }
+            value = (void *)&cl_mem_object_type_var;
+            value_length = sizeof(cl_mem_object_type);
+            break;
+
+        case CL_MEM_FLAGS:
+            SIMPLE_ASSIGN(cl_mem_flags, p_flags);
+            break;
+
+        case CL_MEM_SIZE:
+            SIMPLE_ASSIGN(size_t, size());
+            break;
+
+        case CL_MEM_HOST_PTR:
+            SIMPLE_ASSIGN(void_p, host_ptr());
+            break;
+
+        case CL_MEM_MAP_COUNT:
+            SIMPLE_ASSIGN(cl_uint, 0); // TODO
+            break;
+
+        case CL_MEM_REFERENCE_COUNT:
+            SIMPLE_ASSIGN(cl_uint, references());
+            break;
+
+        case CL_MEM_CONTEXT:
+            SIMPLE_ASSIGN(cl_context, parent());
+            break;
+
+        case CL_MEM_ASSOCIATED_MEMOBJECT:
+            // Only a sub-buffer has an associated memory object: its parent
+            if (type() != SubBuffer)
+                SIMPLE_ASSIGN(cl_mem, 0)
+            else
+                SIMPLE_ASSIGN(cl_mem, subbuf->parent());
+            break;
+
+        case CL_MEM_OFFSET:
+            // The offset is a size_t quantity, not a cl_mem handle
+            if (type() != SubBuffer)
+                SIMPLE_ASSIGN(size_t, 0)
+            else
+                SIMPLE_ASSIGN(size_t, subbuf->offset());
+            break;
+
+        default:
+            return CL_INVALID_VALUE;
+    }
+
+    if (param_value && param_value_size < value_length)
+        return CL_INVALID_VALUE;
+
+    if (param_value_size_ret)
+        *param_value_size_ret = value_length;
+
+    if (param_value)
+        std::memcpy(param_value, value, value_length);
+
+    return CL_SUCCESS;
+}
+
+/*
+ * Buffer
+ */
+
+/**
+ * \brief Check the requested size and default the access flags
+ *
+ * A size of 0, or a size above the build-time limit, yields
+ * \c CL_INVALID_BUFFER_SIZE in \p errcode_ret.
+ */
+Buffer::Buffer(Context *ctx, size_t size, void *host_ptr, cl_mem_flags flags,
+               cl_int *errcode_ret)
+: MemObject(ctx, flags, host_ptr, errcode_ret), p_size(size)
+{
+    // Upper bound: 512 MiB when __arm__ is defined, 1 GiB otherwise
+#if defined(__arm__)
+    const size_t max_size = 512*1024*1024;
+#else
+    const size_t max_size = 1*1024*1024*1024;
+#endif
+
+    if (size == 0 || size > max_size)
+    {
+        *errcode_ret = CL_INVALID_BUFFER_SIZE;
+        return;
+    }
+
+    // CL_MEM_READ_WRITE is the default when neither READ_ONLY nor WRITE_ONLY
+    // was requested
+    const cl_mem_flags restricted = CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY;
+
+    if ((flags & restricted) == 0)
+        p_flags |= CL_MEM_READ_WRITE;
+}
+
+/*! \brief Size given at construction, as stored in \c p_size */
+size_t Buffer::size() const
+{
+    return p_size;
+}
+
+/*! \brief Always \c MemObject::Buffer */
+MemObject::Type Buffer::type() const
+{
+    return MemObject::Buffer;
+}
+
+/*----------------------------------------------------------------------------
+ * mapped_event: MapBufferEvent when the Map is on a Buffer
+ * RETURN: true if the event was recorded, false if its range overlaps an
+ *         already mapped range and either of the two mappings is WRITE
+ * Traverse the currently mapped event list, check overlapping and insert the
+ * event so that the list stays in increasing order of offset. Offsets of
+ * events on a SubBuffer are made relative to the parent by adding the
+ * SubBuffer offset.
+ * TODO: do we need to lock the list for operation???
+ *---------------------------------------------------------------------------*/
+bool Buffer::addMapEvent(BufferEvent *mapped_event)
+{
+    MapBufferEvent *mbe = (MapBufferEvent *) mapped_event;
+    size_t mbe_offset = mbe->offset();
+    if (mbe->buffer()->type() == SubBuffer)
+        mbe_offset += ((class SubBuffer *) mbe->buffer())->offset();
+
+    std::list<BufferEvent *>::iterator it, it_insert = p_mapped_events.end();
+    for (it = p_mapped_events.begin(); it != p_mapped_events.end(); ++it)
+    {
+        MapBufferEvent *e = (MapBufferEvent *) (*it);
+        size_t e_offset = e->offset();
+        if (e->buffer()->type() == SubBuffer)
+            e_offset += ((class SubBuffer *) e->buffer())->offset();
+
+        // Insert before the FIRST entry with a larger offset. Later entries
+        // are larger too and must not move the insertion point.
+        if (it_insert == p_mapped_events.end() && mbe_offset < e_offset)
+            it_insert = it;
+
+        // Overlapping ranges are only a conflict if one mapping writes
+        if (   mbe_offset <= e_offset + e->cb() - 1
+            && e_offset <= mbe_offset + mbe->cb() - 1)
+            if ((mbe->flags() & CL_MAP_WRITE) ||
+                (e->flags() & CL_MAP_WRITE))
+                return false;
+    }
+
+    p_mapped_events.insert(it_insert, mapped_event);
+    return true;
+}
+
+/*----------------------------------------------------------------------------
+ * mapped_ptr: pointer obtained from an earlier MapBuffer/MapImage event
+ * RETURN: the first event in the mapped list whose ptr() equals mapped_ptr,
+ *         after removing it from the list; NULL if no event matches
+ * TODO: do we need to lock the list for operation???
+ *---------------------------------------------------------------------------*/
+BufferEvent* Buffer::removeMapEvent(void *mapped_ptr)
+{
+    std::list<BufferEvent *>::iterator pos = p_mapped_events.begin();
+
+    while (pos != p_mapped_events.end())
+    {
+        MapBufferEvent *candidate = (MapBufferEvent *) (*pos);
+
+        if (candidate->ptr() == mapped_ptr)
+        {
+            p_mapped_events.erase(pos);
+            return candidate;
+        }
+
+        ++pos;
+    }
+
+    return NULL;
+}
+
+/*
+ * SubBuffer
+ */
+
+/**
+ * \brief Validate the region and flags of a sub-buffer of \p parent
+ *
+ * A reference is taken on \p parent before any check is made. On failure
+ * \p errcode_ret receives \c CL_INVALID_BUFFER_SIZE (empty region or region
+ * not inside the parent) or \c CL_INVALID_VALUE (host-pointer flags given,
+ * or access flags incompatible with the parent's).
+ *
+ * \param parent buffer this sub-buffer is a window on
+ * \param offset start of the region inside \p parent, in bytes
+ * \param size size of the region, in bytes
+ * \param flags memory flags requested for the sub-buffer
+ * \param errcode_ret receives the error code on failure
+ */
+SubBuffer::SubBuffer(class Buffer *parent, size_t offset, size_t size,
+                     cl_mem_flags flags, cl_int *errcode_ret)
+: MemObject((Context *)parent->parent(), flags, 0, errcode_ret), p_offset(offset),
+  p_size(size), p_parent(parent)
+{
+    clRetainMemObject((cl_mem) p_parent);
+
+    if (size == 0)
+    {
+        *errcode_ret = CL_INVALID_BUFFER_SIZE;
+        return;
+    }
+
+    // Written without "offset + size" so that a huge offset or size cannot
+    // wrap around and slip past the bounds check.
+    const size_t parent_size = parent->size();
+
+    if (offset > parent_size || size > parent_size - offset)
+    {
+        *errcode_ret = CL_INVALID_BUFFER_SIZE;
+        return;
+    }
+
+    // Check the compatibility of flags and parent->flags()
+    const cl_mem_flags wrong_flags =
+        CL_MEM_ALLOC_HOST_PTR |
+        CL_MEM_USE_HOST_PTR |
+        CL_MEM_COPY_HOST_PTR;
+
+    if (flags & wrong_flags)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    if ((parent->flags() & CL_MEM_WRITE_ONLY) &&
+        (flags & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY)))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    if ((parent->flags() & CL_MEM_READ_ONLY) &&
+        (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY)))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // OpenCL 1.2: SubBuffer should inherit some of parent Buffer flags
+    cl_mem_flags parent_rw_flags = parent->flags()
+              & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY);
+    cl_mem_flags my_rw_flags = p_flags
+              & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY);
+    // parent be READ_WRITE, subBuffer be READ_ONLY/WRITE_ONLY (Spec allows)
+    if (! my_rw_flags) p_flags |= parent_rw_flags;
+    cl_mem_flags parent_hostptr_flags = parent->flags()
+       & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR);
+    if (parent_hostptr_flags) p_flags |= parent_hostptr_flags;
+}
+
+/*! \brief Release the reference held on the parent buffer */
+SubBuffer::~SubBuffer()
+{
+    clReleaseMemObject((cl_mem) p_parent);
+}
+
+/*! \brief Size of the sub-buffer region, as stored in \c p_size */
+size_t SubBuffer::size() const
+{
+    return p_size;
+}
+
+/*! \brief Always \c MemObject::SubBuffer */
+MemObject::Type SubBuffer::type() const
+{
+    return MemObject::SubBuffer;
+}
+
+/**
+ * \brief Allocate on \p device by delegating to the parent buffer
+ * \return whatever the parent's \c allocate() returns
+ */
+bool SubBuffer::allocate(DeviceInterface *device)
+{
+    // A sub-buffer has no storage of its own: it uses the parent's data
+    return p_parent->allocate(device);
+}
+
+/*! \brief Offset of this sub-buffer inside its parent, as stored in \c p_offset */
+size_t SubBuffer::offset() const
+{
+    return p_offset;
+}
+
+/*! \brief Parent buffer given at construction */
+Buffer *SubBuffer::parent() const
+{
+    return p_parent;
+}
+
+/*! \brief Forward the map event to the parent buffer and return its result */
+bool SubBuffer::addMapEvent(BufferEvent *mapped_event)
+{
+    return p_parent->addMapEvent(mapped_event);
+}
+
+/*! \brief Forward the removal to the parent buffer and return its result */
+BufferEvent* SubBuffer::removeMapEvent(void *mapped_ptr)
+{
+    return p_parent->removeMapEvent(mapped_ptr);
+}
+
+/*
+ * Image2D
+ */
+
+/**
+ * \brief Validate the dimensions, format and row pitch of a 2D image
+ *
+ * \param ctx parent context
+ * \param width width of the image, must not be 0
+ * \param height height of the image, must not be 0
+ * \param row_pitch bytes per row; 0 selects <tt>width * pixel_size</tt>,
+ *        a non-zero value needs \p host_ptr, must be at least that default
+ *        and a multiple of the pixel size
+ * \param format image format, must not be null
+ * \param host_ptr host pointer
+ * \param flags memory flags
+ * \param errcode_ret receives \c CL_INVALID_IMAGE_SIZE or
+ *        \c CL_INVALID_IMAGE_FORMAT_DESCRIPTOR when a check fails
+ */
+Image2D::Image2D(Context *ctx, size_t width, size_t height, size_t row_pitch,
+                 const cl_image_format *format, void *host_ptr,
+                 cl_mem_flags flags, cl_int *errcode_ret)
+: MemObject(ctx, flags, host_ptr, errcode_ret),
+  p_width(width), p_height(height), p_row_pitch(row_pitch)
+{
+    if (!width || !height)
+    {
+        *errcode_ret = CL_INVALID_IMAGE_SIZE;
+        return;
+    }
+
+    if (!format)
+    {
+        *errcode_ret = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+        return;
+    }
+
+    p_format = *format;
+
+    // Check format descriptor: the packed data types are only valid with
+    // the CL_RGB or CL_RGBx channel orders. Both comparisons must fail for
+    // the order to be wrong, hence "&&".
+    switch (p_format.image_channel_data_type)
+    {
+        case CL_UNORM_INT_101010:
+        case CL_UNORM_SHORT_555:
+        case CL_UNORM_SHORT_565:
+            if (p_format.image_channel_order != CL_RGB &&
+                p_format.image_channel_order != CL_RGBx)
+            {
+                *errcode_ret = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+                return;
+            }
+            break;
+
+        default:
+            break;
+    }
+
+    // Some channel orders restrict the data types they can be used with
+    switch (p_format.image_channel_order)
+    {
+        case CL_LUMINANCE:
+        case CL_INTENSITY:
+            switch (p_format.image_channel_data_type)
+            {
+                case CL_UNORM_INT8:
+                case CL_UNORM_INT16:
+                case CL_SNORM_INT8:
+                case CL_SNORM_INT16:
+                case CL_HALF_FLOAT:
+                case CL_FLOAT:
+                    break;
+                default:
+                    *errcode_ret = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+                    return;
+            }
+            break;
+
+        case CL_RGB:
+        case CL_RGBx:
+            switch (p_format.image_channel_data_type)
+            {
+                case CL_UNORM_SHORT_555:
+                case CL_UNORM_SHORT_565:
+                case CL_UNORM_INT_101010:
+                    break;
+                default:
+                    *errcode_ret = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+                    return;
+            }
+            break;
+
+        case CL_ARGB:
+        case CL_BGRA:
+            switch (p_format.image_channel_data_type)
+            {
+                case CL_UNORM_INT8:
+                case CL_SNORM_INT8:
+                case CL_SIGNED_INT8:
+                case CL_UNSIGNED_INT8:
+                    break;
+                default:
+                    *errcode_ret = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+                    return;
+            }
+            break;
+
+        default:
+            break;
+    }
+
+    // Row pitch: default to tightly packed rows
+    const size_t px_size = pixel_size(p_format);
+
+    p_row_pitch = width * px_size;
+
+    if (row_pitch)
+    {
+        if (!host_ptr)
+        {
+            // row_pitch must be 0 if host_ptr is null
+            *errcode_ret = CL_INVALID_IMAGE_SIZE;
+            return;
+        }
+        if (row_pitch < p_row_pitch)
+        {
+            *errcode_ret = CL_INVALID_IMAGE_SIZE;
+            return;
+        }
+        if (px_size == 0)
+        {
+            // pixel_size() is 0 for a format it does not know; refuse it
+            // here rather than divide by zero below
+            *errcode_ret = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+            return;
+        }
+        if (row_pitch % px_size != 0)
+        {
+            *errcode_ret = CL_INVALID_IMAGE_SIZE;
+            return;
+        }
+
+        p_row_pitch = row_pitch;
+    }
+}
+
+/*! \brief Size in bytes: one row pitch for each row of the image */
+size_t Image2D::size() const
+{
+    return row_pitch() * height();
+}
+
+/*! \brief Always \c MemObject::Image2D */
+MemObject::Type Image2D::type() const
+{
+    return MemObject::Image2D;
+}
+
+/*! \brief Width given at construction, as stored in \c p_width */
+size_t Image2D::width() const
+{
+    return p_width;
+}
+
+/*! \brief Height given at construction, as stored in \c p_height */
+size_t Image2D::height() const
+{
+    return p_height;
+}
+
+/*! \brief Number of bytes per row, as stored in \c p_row_pitch */
+size_t Image2D::row_pitch() const
+{
+    return p_row_pitch;
+}
+
+/*! \brief Slice pitch of a 2D image: the image is its only slice, so this is \c size() */
+size_t Image2D::slice_pitch() const
+{
+    return size();
+}
+
+/*! \brief Reference to the image format stored in \c p_format */
+const cl_image_format &Image2D::format() const
+{
+    return p_format;
+}
+
+/**
+ * \brief Get information about this image
+ *
+ * Slice pitch and depth are taken from the \c Image3D part of the object
+ * when \c type() is \c Image3D and reported as 0 otherwise.
+ *
+ * \param param_name property being queried
+ * \param param_value_size size in bytes of the \p param_value buffer
+ * \param param_value destination buffer, may be 0 to only query the size
+ * \param param_value_size_ret if not 0, receives the size of the value
+ * \return \c CL_SUCCESS, or \c CL_INVALID_VALUE if \p param_name is unknown
+ *         or \p param_value is given but too small
+ */
+cl_int Image2D::imageInfo(cl_image_info param_name,
+                          size_t param_value_size,
+                          void *param_value,
+                          size_t *param_value_size_ret) const
+{
+    void *value = 0;
+    size_t value_length = 0;
+    class Image3D *image3D = (class Image3D *)this;
+
+    // Scratch storage for the value being returned
+    union {
+        cl_image_format cl_image_format_var;
+        size_t size_t_var;
+    };
+
+    switch (param_name)
+    {
+        case CL_IMAGE_FORMAT:
+            SIMPLE_ASSIGN(cl_image_format, format());
+            break;
+
+        case CL_IMAGE_ELEMENT_SIZE:
+            SIMPLE_ASSIGN(size_t, element_size(p_format));
+            break;
+
+        case CL_IMAGE_ROW_PITCH:
+            // TODO: What was given when the image was created or width*size ?
+            SIMPLE_ASSIGN(size_t, row_pitch());
+            break;
+
+        case CL_IMAGE_SLICE_PITCH:
+        {
+            size_t pitch = 0;
+
+            if (type() == Image3D)
+                pitch = image3D->slice_pitch();
+
+            SIMPLE_ASSIGN(size_t, pitch);
+            break;
+        }
+
+        case CL_IMAGE_WIDTH:
+            SIMPLE_ASSIGN(size_t, width());
+            break;
+
+        case CL_IMAGE_HEIGHT:
+            SIMPLE_ASSIGN(size_t, height());
+            break;
+
+        case CL_IMAGE_DEPTH:
+        {
+            size_t planes = 0;
+
+            if (type() == Image3D)
+                planes = image3D->depth();
+
+            SIMPLE_ASSIGN(size_t, planes);
+            break;
+        }
+
+        default:
+            return CL_INVALID_VALUE;
+    }
+
+    const bool wants_value = (param_value != 0);
+
+    // A destination buffer was given but cannot hold the value
+    if (wants_value && param_value_size < value_length)
+        return CL_INVALID_VALUE;
+
+    if (param_value_size_ret)
+        *param_value_size_ret = value_length;
+
+    if (wants_value)
+        std::memcpy(param_value, value, value_length);
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Size in bytes of one element of the given \p format
+ * \return the size selected by the channel data type, 0 if it is not listed
+ */
+size_t Image2D::element_size(const cl_image_format &format)
+{
+    size_t bytes = 0;
+
+    switch (format.image_channel_data_type)
+    {
+        // 8-bit channel types
+        case CL_SNORM_INT8:
+        case CL_UNORM_INT8:
+        case CL_SIGNED_INT8:
+        case CL_UNSIGNED_INT8:
+            bytes = 1;
+            break;
+
+        // 16-bit channel types, half floats and the packed 16-bit types
+        case CL_SNORM_INT16:
+        case CL_UNORM_INT16:
+        case CL_SIGNED_INT16:
+        case CL_UNSIGNED_INT16:
+        case CL_HALF_FLOAT:
+        case CL_UNORM_SHORT_565:
+        case CL_UNORM_SHORT_555:
+            bytes = 2;
+            break;
+
+        // 32-bit channel types and the packed 32-bit type
+        case CL_SIGNED_INT32:
+        case CL_UNSIGNED_INT32:
+        case CL_UNORM_INT_101010:
+            bytes = 4;
+            break;
+
+        case CL_FLOAT:
+            bytes = sizeof(float);
+            break;
+
+        default:
+            break;
+    }
+
+    return bytes;
+}
+
+/**
+ * \brief Number of channels counted for the channel order of \p format
+ * \return 1, 2 or 4 depending on the channel order, 0 if it is not listed
+ */
+unsigned int Image2D::channels(const cl_image_format &format)
+{
+    unsigned int count = 0;
+
+    switch (format.image_channel_order)
+    {
+        case CL_R:
+        case CL_Rx:
+        case CL_A:
+        case CL_INTENSITY:
+        case CL_LUMINANCE:
+            count = 1;
+            break;
+
+        case CL_RG:
+        case CL_RGx:
+        case CL_RA:
+            count = 2;
+            break;
+
+        case CL_RGBA:
+        case CL_ARGB:
+        case CL_BGRA:
+            count = 4;
+            break;
+
+        case CL_RGBx:
+        case CL_RGB:
+            // Only special data types allowed (565, 555, etc)
+            count = 1;
+            break;
+
+        default:
+            break;
+    }
+
+    return count;
+}
+
+/**
+ * \brief Size in bytes of one pixel of the given \p format
+ *
+ * The packed data types have a fixed pixel size; for every other data type
+ * it is the channel count times the element size.
+ */
+size_t Image2D::pixel_size(const cl_image_format &format)
+{
+    // 16-bit packed pixels
+    if (format.image_channel_data_type == CL_UNORM_SHORT_565 ||
+        format.image_channel_data_type == CL_UNORM_SHORT_555)
+        return 2;
+
+    // 32-bit packed pixels
+    if (format.image_channel_data_type == CL_UNORM_INT_101010)
+        return 4;
+
+    return channels(format) * element_size(format);
+}
+
+/*! \brief Element size of this image's own format */
+size_t Image2D::element_size() const
+{
+    return element_size(p_format);
+}
+
+/*! \brief Pixel size of this image's own format */
+size_t Image2D::pixel_size() const
+{
+    return pixel_size(p_format);
+}
+
+/*! \brief Channel count of this image's own format */
+unsigned int Image2D::channels() const
+{
+    return channels(p_format);
+}
+
+/*
+ * Image3D
+ */
+
+/**
+ * \brief Validate the depth and slice pitch of a 3D image
+ *
+ * The 2D part (dimensions, format, row pitch) is checked by the \c Image2D
+ * constructor.
+ *
+ * \param depth depth of the image, must be greater than 1
+ * \param slice_pitch bytes per slice; 0 selects <tt>height * row_pitch()</tt>,
+ *        a non-zero value needs \p host_ptr, must be at least that default
+ *        and a multiple of the row pitch
+ * \param errcode_ret receives \c CL_INVALID_IMAGE_SIZE when a check fails
+ */
+Image3D::Image3D(Context *ctx, size_t width, size_t height, size_t depth,
+                 size_t row_pitch, size_t slice_pitch,
+                 const cl_image_format *format, void *host_ptr,
+                 cl_mem_flags flags, cl_int *errcode_ret)
+: Image2D(ctx, width, height, row_pitch, format, host_ptr, flags, errcode_ret),
+  p_depth(depth)
+{
+    if (depth <= 1)
+    {
+        *errcode_ret = CL_INVALID_IMAGE_SIZE;
+        return;
+    }
+
+    // Slice pitch: default to tightly packed slices. this-> is needed
+    // because the row_pitch parameter hides the accessor.
+    const size_t effective_row_pitch = this->row_pitch();
+
+    p_slice_pitch = height * effective_row_pitch;
+
+    if (slice_pitch)
+    {
+        if (!host_ptr)
+        {
+            // slice_pitch must be 0 if host_ptr is null
+            *errcode_ret = CL_INVALID_IMAGE_SIZE;
+            return;
+        }
+        if (slice_pitch < p_slice_pitch)
+        {
+            *errcode_ret = CL_INVALID_IMAGE_SIZE;
+            return;
+        }
+        // The row pitch can be 0 here (for instance when the Image2D
+        // constructor bailed out early); never use it as a divisor then.
+        if (effective_row_pitch == 0 ||
+            slice_pitch % effective_row_pitch != 0)
+        {
+            *errcode_ret = CL_INVALID_IMAGE_SIZE;
+            return;
+        }
+
+        p_slice_pitch = slice_pitch;
+    }
+}
+
+/*! \brief Size in bytes: one slice pitch for each slice of the image */
+size_t Image3D::size() const
+{
+    return slice_pitch() * depth();
+}
+
+/*! \brief Always \c MemObject::Image3D */
+MemObject::Type Image3D::type() const
+{
+    return MemObject::Image3D;
+}
+
+/*! \brief Depth given at construction, as stored in \c p_depth */
+size_t Image3D::depth() const
+{
+    return p_depth;
+}
+
+/*! \brief Number of bytes per slice, as stored in \c p_slice_pitch */
+size_t Image3D::slice_pitch() const
+{
+    return p_slice_pitch;
+}
diff --git a/src/core/memobject.h b/src/core/memobject.h
new file mode 100644
index 0000000..82cbfab
--- /dev/null
+++ b/src/core/memobject.h
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file memobject.h
+ * \brief Memory objects
+ */
+
+#ifndef __MEMOBJECT_H__
+#define __MEMOBJECT_H__
+
+#include "object.h"
+#include "dsp/u_concurrent_stack.h"
+
+#include <CL/cl.h>
+
+namespace Coal
+{
+
+class DeviceBuffer;
+class Context;
+class DeviceInterface;
+class BufferEvent;
+
+/**
+ * \brief Base class for all the memory objects
+ */
+class MemObject : public Object
+{
+    public:
+        /*! \brief Kinds of memory object, as reported by \c type() */
+        enum Type
+        {
+            Buffer,
+            SubBuffer,
+            Image2D,
+            Image3D
+        };
+
+        /**
+         * \brief Constructor
+         *
+         * Only stores its arguments and validates them; the device-side
+         * work is done by \c init().
+         *
+         * \param ctx parent \c Coal::Context
+         * \param flags memory object flags
+         * \param host_ptr host pointer used by some flags (see the OpenCL spec)
+         * \param errcode_ret receives an error code when validation fails
+         * \sa init
+         */
+        MemObject(Context *ctx, cl_mem_flags flags, void *host_ptr,
+                  cl_int *errcode_ret);
+        virtual ~MemObject();
+
+        /**
+         * \brief Initialize the memory object
+         *
+         * Creates one \c Coal::DeviceBuffer per device of the context by
+         * calling \c Coal::DeviceInterface::createDeviceBuffer(). With a
+         * single device its buffer is allocated right away; with several
+         * devices the allocation is deferred.
+         *
+         * \return \c CL_SUCCESS if success, an error code otherwise
+         */
+        virtual cl_int init();
+
+        /*! \brief Allocate this memory object on the given \p device */
+        virtual bool allocate(DeviceInterface *device);
+
+        /*! \brief Device-independent size of the memory object */
+        virtual size_t size() const = 0;
+
+        /*! \brief Type of the memory object */
+        virtual Type type() const = 0;
+
+        /*! \brief Flags */
+        cl_mem_flags flags() const;
+
+        /*! \brief Host pointer */
+        void *host_ptr() const;
+
+        /*! \brief \c Coal::DeviceBuffer for the given \p device */
+        DeviceBuffer *deviceBuffer(DeviceInterface *device) const;
+
+        /*! \brief Notify this object that a \c Coal::DeviceBuffer has been allocated */
+        void deviceAllocated(DeviceBuffer *buffer);
+
+        /**
+         * \brief Set a destructor callback for this memory object
+         *
+         * The callback is invoked from the destructor, so the memory object
+         * is already being torn down when it runs; its \c memobj parameter
+         * is only good for a pointer comparison.
+         *
+         * \param pfn_notify function to call when the memory object is deleted
+         * \param user_data user data to pass to this function
+         */
+        void setDestructorCallback(void (CL_CALLBACK *pfn_notify)(cl_mem memobj,
+                                                                  void *user_data),
+                                   void *user_data);
+
+        /**
+         * \brief Get information about this memory object
+         * \copydetails Coal::DeviceInterface::info
+         */
+        cl_int info(cl_mem_info param_name,
+                    size_t param_value_size,
+                    void *param_value,
+                    size_t *param_value_size_ret) const;
+
+        /*! \brief Default implementation: records nothing and returns false */
+        virtual bool addMapEvent(BufferEvent *mapped_event) { return false; }
+
+        /*! \brief Default implementation: removes nothing and returns NULL */
+        virtual BufferEvent* removeMapEvent(void *mapped_ptr) { return NULL; }
+
+    protected:
+        cl_mem_flags p_flags;                       /*!< \brief Memory flags */
+        std::list<BufferEvent *> p_mapped_events;   /*!< \brief Map events currently recorded */
+
+    private:
+        unsigned int p_num_devices;           /*!< \brief Number of devices in the context */
+        unsigned int p_devices_to_allocate;   /*!< \brief Devices that have not allocated yet */
+        void *p_host_ptr;                     /*!< \brief Host pointer */
+        DeviceBuffer **p_devicebuffers;       /*!< \brief One entry per device */
+
+        /*! \brief A destructor callback paired with its user data */
+        typedef std::pair<void (CL_CALLBACK *)(cl_mem memobj, void *user_data), void*> dtor_callback_t;
+        concurrent_stack<dtor_callback_t> p_dtor_callback_stack;
+};
+
+/**
+ * \brief Simple buffer object
+ */
+class Buffer : public MemObject
+{
+    public:
+        /**
+         * \brief Constructor
+         * \param ctx parent \c Coal::Context
+         * \param size size of the buffer, in bytes
+         * \param host_ptr host pointer
+         * \param flags memory flags
+         * \param errcode_ret return code
+         */
+        Buffer(Context *ctx, size_t size, void *host_ptr, cl_mem_flags flags,
+               cl_int *errcode_ret);
+
+        /*! \brief Size of the buffer, in bytes */
+        size_t size() const;
+
+        /*! \brief Return that we are a \c Coal::MemObject::Buffer */
+        Type type() const;
+
+        /*! \brief Record a map event on this buffer */
+        bool addMapEvent(BufferEvent *mapped_event);
+
+        /*! \brief Remove and return the map event whose pointer is \p mapped_ptr */
+        BufferEvent* removeMapEvent(void *mapped_ptr);
+
+    private:
+        size_t p_size;      /*!< \brief Size in bytes */
+};
+
+/**
+ * \brief Sub-buffer
+ */
+class SubBuffer : public MemObject
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param parent parent \c Coal::Buffer
+ * \param offset offset in \p parent of the start of this sub-buffer
+ * \param size size of the sub-buffer
+ * \param flags memory flags (must be compatible with the \p parent's ones)
+ * \param errcode_ret return code
+ *
+ * \note The elaborated spelling <tt>class Buffer</tt> is used throughout
+ * this class; presumably it disambiguates the class from the
+ * \c Coal::MemObject::Buffer type enumerator - confirm before
+ * simplifying it.
+ */
+ SubBuffer(class Buffer *parent, size_t offset, size_t size,
+ cl_mem_flags flags, cl_int *errcode_ret);
+ ~SubBuffer();
+
+ size_t size() const; /*!< \brief Size */
+ Type type() const; /*!< \brief Return that we are a \c Coal::MemObject::SubBuffer */
+ bool allocate(DeviceInterface *device); /*!< \brief Allocate the \b parent \c Coal::Buffer */
+
+ size_t offset() const; /*!< \brief Offset in bytes */
+ class Buffer *parent() const; /*!< \brief Parent \c Coal::Buffer */
+
+ /*
+ * Map-event bookkeeping; same signatures as the virtual
+ * addMapEvent()/removeMapEvent() hooks of the base class (whose default
+ * bodies return false and NULL). Implemented outside this header.
+ */
+ bool addMapEvent(BufferEvent *mapped_event); /*!< \brief Record \p mapped_event for this sub-buffer */
+ BufferEvent* removeMapEvent(void *mapped_ptr); /*!< \brief Look up and remove the event associated with \p mapped_ptr */
+ private:
+ size_t p_offset, p_size; /* presumably the values behind offset() and size() - confirm in the .cpp */
+ class Buffer *p_parent; /* presumably the value behind parent() - confirm in the .cpp */
+};
+
+/**
+ * \brief 2D image
+ */
+class Image2D : public MemObject
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param ctx parent \c Coal::Context
+ * \param width width of the image
+ * \param height height of the image
+ * \param row_pitch number of bytes in a row of pixels. If 0, defaults to <tt>width * pixel_size()</tt>
+ * \param format image format
+ * \param host_ptr host pointer
+ * \param flags memory flags
+ * \param errcode_ret return code
+ */
+ Image2D(Context *ctx, size_t width, size_t height, size_t row_pitch,
+ const cl_image_format *format, void *host_ptr,
+ cl_mem_flags flags, cl_int *errcode_ret);
+
+ /* size(), type() and slice_pitch() are virtual: Coal::Image3D derives
+ * from this class and declares functions with the same signatures. */
+ virtual size_t size() const; /*!< \brief Size in bytes */
+ virtual Type type() const; /*!< \brief Return that we are a \c Coal::MemObject::Image2D */
+
+ size_t width() const; /*!< \brief Width */
+ size_t height() const; /*!< \brief Height */
+ size_t row_pitch() const; /*!< \brief Size in bytes of a row of pixels */
+ virtual size_t slice_pitch() const; /*!< \brief Size in bytes of the image */
+ const cl_image_format &format() const; /*!< \brief Image format descriptor */
+
+ /**
+ * \brief Information about this image object
+ *
+ * This function is also usable for \c Coal::Image3D objects as it does
+ * casting when necessary in order to give information when needed.
+ *
+ * \copydetails Coal::DeviceInterface::info
+ */
+ cl_int imageInfo(cl_image_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ /* Static helpers work on an arbitrary format descriptor; the
+ * parameterless overloads below report the same quantities for this
+ * image. */
+ static size_t element_size(const cl_image_format &format); /*!< \brief Size in bytes of each channel of \p format */
+ static unsigned int channels(const cl_image_format &format);/*!< \brief Number of channels of \p format */
+ static size_t pixel_size(const cl_image_format &format); /*!< \brief Size in bytes of a pixel in \p format */
+ size_t pixel_size() const; /*!< \brief Pixel size of this image */
+ size_t element_size() const; /*!< \brief Channel size of this image */
+ unsigned int channels() const; /*!< \brief Number of channels of this image */
+
+ private:
+ size_t p_width, p_height, p_row_pitch; /* presumably the values behind width(), height() and row_pitch() - confirm in the .cpp */
+ cl_image_format p_format; /* presumably the descriptor returned by format() - confirm in the .cpp */
+};
+
+/**
+ * \brief 3D image
+ */
+class Image3D : public Image2D
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param ctx parent \c Coal::Context
+ * \param width width of the image
+ * \param height height of the image
+ * \param depth depth of the image
+ * \param row_pitch number of bytes in a row of pixels. If 0, defaults to <tt>width * pixel_size()</tt>
+ * \param slice_pitch number of bytes in a 2D slice. If 0, defaults to <tt>height * row_pitch()</tt>
+ * \param format image format
+ * \param host_ptr host pointer
+ * \param flags memory flags
+ * \param errcode_ret return code
+ */
+ Image3D(Context *ctx, size_t width, size_t height, size_t depth,
+ size_t row_pitch, size_t slice_pitch,
+ const cl_image_format *format, void *host_ptr,
+ cl_mem_flags flags, cl_int *errcode_ret);
+
+ /* size(), type() and slice_pitch() have the same signatures as the
+ * functions declared virtual in Coal::Image2D. */
+ size_t size() const; /*!< \brief Size in bytes of this image */
+ Type type() const; /*!< \brief Return that we are a \c Coal::MemObject::Image3D */
+
+ size_t depth() const; /*!< \brief Depth of the image */
+ size_t slice_pitch() const; /*!< \brief Size in bytes of a 2D slice */
+
+ private:
+ size_t p_depth, p_slice_pitch; /* presumably the values behind depth() and slice_pitch() - confirm in the .cpp */
+};
+
+}
+
+/*
+ * Global-namespace struct that derives from Coal::MemObject and adds no
+ * members of its own. Presumably this is the type the OpenCL headers'
+ * cl_mem handle points to - confirm against CL/cl.h.
+ */
+struct _cl_mem : public Coal::MemObject
+{};
+
+#endif
diff --git a/src/core/object.cpp b/src/core/object.cpp
new file mode 100644
index 0000000..be44279
--- /dev/null
+++ b/src/core/object.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file object.cpp
+ * \brief Reference-counted object tree
+ */
+
+#include "object.h"
+
+using namespace Coal;
+
+/**
+ * \brief Access the list of every \c Coal::Object currently alive
+ *
+ * The list is a function-local static, so it is constructed the first time
+ * this accessor is called.
+ *
+ * \return reference to the single shared list
+ */
+static std::list<Object *>& getKnownObjects()
+{
+    static std::list<Object *> registry;
+
+    return registry;
+}
+
+
+/*
+ * Starts with one reference, takes a reference on the parent (if any) and
+ * registers the new object at the front of the known-objects list.
+ */
+Object::Object(Type type, Object *parent)
+: p_references(1), p_parent(parent), p_type(type), p_release_parent(true)
+{
+    if (parent != 0)
+    {
+        parent->reference();
+    }
+
+    // Remember our position in the list so the destructor can erase it
+    // without searching.
+    std::list<Object *> &registry = getKnownObjects();
+    registry.push_front(this);
+    p_it = registry.begin();
+}
+
+/*
+ * Drops the reference held on the parent, deleting the parent when that was
+ * its last reference and parent release is enabled, then unregisters this
+ * object from the known-objects list.
+ */
+Object::~Object()
+{
+    if (p_parent)
+    {
+        // The parent is always dereferenced; it is only deleted when
+        // setReleaseParent(false) has not been requested.
+        const bool was_last_reference = p_parent->dereference();
+
+        if (was_last_reference && p_release_parent)
+            delete p_parent;
+    }
+
+    getKnownObjects().erase(p_it);
+}
+
+/* Adds one to the reference counter. */
+void Object::reference()
+{
+    ++p_references;
+}
+
+/*
+ * Removes one from the reference counter and reports whether it reached
+ * zero. The object itself is not deleted here.
+ */
+bool Object::dereference()
+{
+    return --p_references == 0;
+}
+
+/*
+ * Chooses whether the destructor may delete the parent once the parent's
+ * reference count reaches zero.
+ */
+void Object::setReleaseParent (bool release)
+{
+    this->p_release_parent = release;
+}
+
+/* Current value of the reference counter. */
+unsigned int Object::references() const
+{
+    return this->p_references;
+}
+
+/* Parent passed to the constructor (may be null). */
+Object *Object::parent() const
+{
+    return this->p_parent;
+}
+
+/* Type tag passed to the constructor. */
+Object::Type Object::type() const
+{
+    return this->p_type;
+}
+
+/*
+ * Checks that this pointer designates a registered, live object whose type
+ * tag equals \p type. The pointer is compared against every entry of the
+ * known-objects list before any member is read.
+ *
+ * NOTE(review): the null test on `this` is what callers rely on, but calling
+ * a member function through a null pointer is undefined behaviour in C++ and
+ * optimising compilers may drop the test.
+ */
+bool Object::isA(Object::Type type) const
+{
+    if (this == 0)
+        return false;
+
+    const std::list<Object *> &registry = getKnownObjects();
+    std::list<Object *>::const_iterator cur = registry.begin(),
+                                        last = registry.end();
+
+    for (; cur != last; ++cur)
+    {
+        if (*cur != this)
+            continue;
+
+        // Found in the registry, so the members may now be read.
+        return p_type == type;
+    }
+
+    // Not registered: garbage or an already destroyed object.
+    return false;
+}
diff --git a/src/core/object.h b/src/core/object.h
new file mode 100644
index 0000000..d83e326
--- /dev/null
+++ b/src/core/object.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file object.h
+ * \brief Object tree
+ */
+
+#ifndef __REFCOUNTED_H__
+#define __REFCOUNTED_H__
+
+#include <list>
+
+namespace Coal
+{
+
+/**
+ * \brief Base class of all the Clover objects
+ *
+ * This class implements functions needed by all the Clover objects, like
+ * reference counting, the object tree (parents/children), etc.
+ *
+ * It also uses a special list of known objects, used to check that a pointer
+ * passed by the user to an OpenCL function actually is an object of the correct
+ * type. See \c isA().
+ */
+class Object
+{
+ public:
+ /**
+ * \brief Type of object the inherited class actually is
+ */
+ enum Type
+ {
+ T_Device, /*!< \brief \c Coal::DeviceInterface */
+ T_CommandQueue, /*!< \brief \c Coal::CommandQueue */
+ T_Event, /*!< \brief \c Coal::Event */
+ T_Context, /*!< \brief \c Coal::Context */
+ T_Kernel, /*!< \brief \c Coal::Kernel */
+ T_MemObject, /*!< \brief \c Coal::MemObject */
+ T_Program, /*!< \brief \c Coal::Program */
+ T_Sampler /*!< \brief \c Coal::Sampler */
+ };
+
+ /**
+ * \brief Constructor
+ *
+ * The new object starts with a reference count of 1, takes a reference
+ * on \p parent when it is not null, and is added to the list of known
+ * objects.
+ *
+ * \param type type of the child class calling this constructor
+ * \param parent parent object
+ */
+ Object(Type type, Object *parent = 0);
+
+ /**
+ * \brief Destructor
+ *
+ * Dereferences the parent (deleting it when that was its last
+ * reference and \c setReleaseParent(false) was not requested) and
+ * removes this object from the list of known objects.
+ */
+ virtual ~Object();
+
+ /**
+ * \brief Increments the reference counter
+ */
+ void reference();
+
+ /**
+ * \brief Decrements the reference counter
+ * \note This only updates the counter; deleting the object is left to
+ * the caller.
+ * \return true if the reference counter has reached 0
+ */
+ bool dereference();
+
+ /**
+ * \brief Reference counter
+ * \return the number of references of this class currently in use
+ */
+ unsigned int references() const;
+
+ /**
+ * \brief Set if the parent object has to be deleted if its reference count reaches 0
+ *
+ * The destructor of \c Coal::Object dereferences its parent object.
+ * This is done in order to correctly free objects when no object has
+ * a reference to it anymore.
+ *
+ * Some objects such as \c Coal::CommandQueue need to do some operations
+ * before being deleted. This function tells \c Coal::Object to
+ * dereference its parent object, but not to call \b delete on it.
+ *
+ * \param release true to have \b delete called on the parent object
+ * when its reference count reaches 0, false to keep it
+ */
+ void setReleaseParent(bool release);
+
+ Object *parent() const; /*!< \brief Parent object */
+ Type type() const; /*!< \brief Type */
+
+ /**
+ * \brief Returns whether this object is an instance of \p type
+ * \note This function begins with a NULL-check on the \c this pointer,
+ * so it's safe to use even when \c this is not guaranteed not to
+ * be NULL.
+ * \note The pointer is also looked up in the list of known objects
+ * before its type is read, so a pointer that is not a live
+ * object yields false.
+ * \param type type this object must have for the check to pass
+ * \return true if this object exists and has the correct type
+ */
+ bool isA(Type type) const;
+
+ private:
+ unsigned int p_references; /* reference counter, initialised to 1 by the constructor */
+ Object *p_parent; /* parent given to the constructor, may be null */
+ Type p_type; /* type tag given to the constructor */
+ std::list<Object *>::iterator p_it; /* position of this object in the list of known objects */
+ bool p_release_parent; /* true (the default) when the destructor may delete the parent */
+};
+
+}
+
+#endif
diff --git a/src/core/platform.cpp b/src/core/platform.cpp
new file mode 100644
index 0000000..1af6153
--- /dev/null
+++ b/src/core/platform.cpp
@@ -0,0 +1,227 @@
+/******************************************************************************
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <list>
+#include <iostream>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+#include "platform.h"
+#include "propertylist.h"
+#include "object.h"
+#include "cpu/device.h"
+#ifndef SHAMROCK_BUILD
+#include "dsp/device.h"
+#include "dsp/driver.h"
+#endif
+
+/*-----------------------------------------------------------------------------
+* For the lock file
+*----------------------------------------------------------------------------*/
+#include <sys/file.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+
+using namespace Coal;
+
+/******************************************************************************
+* begin_file_lock_crit_section
+******************************************************************************/
+/**
+ * \brief Open (creating it if needed) the lock file and take an exclusive lock
+ *
+ * A non-blocking attempt is made first; when the lock is held by somebody
+ * else a message is printed and the call blocks until the lock is granted.
+ * Any other failure prints a message and terminates the process with
+ * exit(-1).
+ *
+ * \param fname path of the lock file
+ * \return file descriptor holding the lock; the caller unlocks and closes it
+ */
+static int begin_file_lock_crit_section(char* fname)
+{
+    /*---------------------------------------------------------------------
+    * Create a lock, so only 1 OpenCL program can progress at a time.
+    * I'm not sure about the appropriateness of putting this in the ctor.
+    * We may look at delayed ctor of platform with this in it.
+    *--------------------------------------------------------------------*/
+    // O_RDONLY is spelled out: open() requires an access mode, and flock()
+    // works on a read-only descriptor.
+    int lock_fd = open(fname, O_RDONLY|O_CREAT,
+                       S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH);
+
+    std::string str_fname(fname);
+
+    if (lock_fd < 0)
+    {
+        std::cout << "Can not open lock file " << str_fname << ", Aborting !" << std::endl;
+        exit(-1);
+    }
+
+    // Fast path: nobody else holds the lock.
+    if (flock(lock_fd, LOCK_EX|LOCK_NB) != -1)
+        return lock_fd;
+
+    if (errno != EWOULDBLOCK)
+    {
+        std::cout << "Error Locking file " << str_fname << ", Aborting !" << std::endl;
+        exit(-1);
+    }
+
+    std::cout << "Waiting on lock " << str_fname << " ..." << std::endl;
+
+    // A blocking flock() can be interrupted by a signal handler (EINTR);
+    // that is not a locking error, so simply wait again.
+    int res;
+    do
+    {
+        res = flock(lock_fd, LOCK_EX);
+    } while (res == -1 && errno == EINTR);
+
+    if (res == -1)
+    {
+        std::cout << "Error Locking file " << str_fname << ", Aborting !" << std::endl;
+        exit(-1);
+    }
+
+    std::cout << "Acquired lock " << str_fname << ", Proceeding!" << std::endl;
+
+    return lock_fd;
+}
+
+namespace Coal
+{
+    /*
+     * Takes the inter-process lock, then builds the device list: one CPU
+     * device, followed (non-Shamrock builds only) by one DSP device per DSP
+     * reported by the driver.
+     */
+    Platform::Platform() : dispatch(&dispatch_table)
+    {
+        // A writable array is needed because the helper takes a char*.
+        char lock_path[] = "/var/lock/opencl";
+        p_lock_fd = begin_file_lock_crit_section(lock_path);
+
+        Coal::CPUDevice *cpu = new Coal::CPUDevice;
+        p_devices.push_back((_cl_device_id*)cpu);
+
+        // Driver class only exists for the DSPDevice, so need this guard:
+#ifndef SHAMROCK_BUILD
+        for (int dsp = 0; dsp < Driver::instance()->num_dsps(); ++dsp)
+        {
+            p_devices.push_back((_cl_device_id*)new Coal::DSPDevice(dsp));
+        }
+#endif
+    }
+
+    /*
+     * Destroys every device created by the constructor, then releases the
+     * inter-process lock.
+     *
+     * The devices are torn down while the lock is still held: the lock
+     * exists so that only one OpenCL program uses the platform at a time,
+     * and releasing it first would let another process start while this
+     * one is still shutting its devices down.
+     */
+    Platform::~Platform()
+    {
+        for (size_t i = 0; i < p_devices.size(); ++i)
+            delete p_devices[i];
+
+        // Do not keep dangling pointers around.
+        p_devices.clear();
+
+        flock(p_lock_fd, LOCK_UN);
+        close(p_lock_fd);
+    }
+
+    /*
+     * Counts the devices whose CL_DEVICE_TYPE has a bit in common with
+     * \p device_type and, when \p devices is not null, stores at most
+     * \p num_entries of them in it.
+     *
+     * CL_DEVICE_TYPE_DEFAULT is replaced by CL_DEVICE_TYPE_CPU in Shamrock
+     * builds and by CL_DEVICE_TYPE_ACCELERATOR otherwise.
+     *
+     * Returns the total number of matching devices, which can be larger
+     * than the number of entries written.
+     */
+    cl_uint Platform::getDevices(cl_device_type device_type,
+                                 cl_uint num_entries, cl_device_id * devices)
+    {
+        cl_uint matched = 0;
+
+        if (device_type == CL_DEVICE_TYPE_DEFAULT)
+        {
+#ifdef SHAMROCK_BUILD
+            device_type = CL_DEVICE_TYPE_CPU;
+#else
+            device_type = CL_DEVICE_TYPE_ACCELERATOR;
+#endif
+        }
+
+        for (size_t idx = 0; idx < p_devices.size(); ++idx)
+        {
+            cl_device_type type;
+            p_devices[idx]->info(CL_DEVICE_TYPE, sizeof(cl_device_type), &type,0);
+
+            if (!(type & device_type))
+                continue;
+
+            // Only write while there is room; always count.
+            if (devices && matched < num_entries)
+                devices[matched] = p_devices[idx];
+
+            ++matched;
+        }
+
+        return matched;
+    }
+
+    /*
+     * Answers a platform information query.
+     *
+     * param_name            one of the CL_PLATFORM_* selectors handled below
+     * param_value_size      size in bytes of the buffer at param_value
+     * param_value           destination buffer, may be null
+     * param_value_size_ret  receives the size of the answer, may be null
+     *
+     * Returns CL_INVALID_VALUE for an unknown selector or when a buffer is
+     * supplied that is too small for the answer, CL_SUCCESS otherwise.
+     *
+     * The parameter type is cl_platform_info, matching the declaration in
+     * platform.h.
+     *
+     * STRING_ASSIGN comes from propertylist.h; it is presumably what fills
+     * in the `value` and `value_length` locals - keep those two names.
+     */
+    cl_int Platform::info(cl_platform_info param_name,
+                          size_t param_value_size,
+                          void *param_value,
+                          size_t *param_value_size_ret) const
+    {
+        void *value = 0;
+        size_t value_length = 0;
+
+        switch (param_name)
+        {
+            case CL_PLATFORM_PROFILE:
+                STRING_ASSIGN("FULL_PROFILE");
+                break;
+
+            case CL_PLATFORM_VERSION:
+#ifdef SHAMROCK_BUILD
+                STRING_ASSIGN("OpenCL 1.1 Shamrock ");
+#else
+                STRING_ASSIGN("OpenCL 1.1 TI ");
+#endif
+                break;
+
+            case CL_PLATFORM_NAME:
+#ifdef SHAMROCK_BUILD
+                STRING_ASSIGN("Shamrock OpenCL for Arm");
+#else
+#if defined(__arm__)
+                STRING_ASSIGN("TI OpenCL for Arm + Dsp");
+#else
+                STRING_ASSIGN("TI OpenCL for Advantech DSPC868x");
+#endif
+#endif
+                break;
+
+            case CL_PLATFORM_VENDOR:
+#ifdef SHAMROCK_BUILD
+                STRING_ASSIGN("Open Source Software");
+#else
+                STRING_ASSIGN("Texas Instruments, Inc.");
+#endif
+                break;
+
+            case CL_PLATFORM_EXTENSIONS:
+                // TODO add cl_khr_icd when it works
+#ifdef SHAMROCK_BUILD
+                STRING_ASSIGN("cl_khr_byte_addressable_store cl_khr_fp64");
+#else
+                STRING_ASSIGN("cl_khr_byte_addressable_store cl_khr_fp64 cl_ti_msmc_buffers");
+#endif
+                break;
+
+            case CL_PLATFORM_ICD_SUFFIX_KHR:
+                // Shamrock builds assign nothing here, so the answer is
+                // empty (length 0) and `value` stays null.
+#ifndef SHAMROCK_BUILD
+                STRING_ASSIGN("TI");
+#endif
+                break;
+
+            default:
+                return CL_INVALID_VALUE;
+        }
+
+        if (param_value && param_value_size < value_length)
+            return CL_INVALID_VALUE;
+
+        if (param_value_size_ret)
+            *param_value_size_ret = value_length;
+
+        // Skip the copy for an empty answer: `value` may be null in that
+        // case and memcpy must not be given a null source pointer.
+        if (param_value && value_length)
+            std::memcpy(param_value, value, value_length);
+
+        return CL_SUCCESS;
+    }
+};
+
+// Definition of the global platform object that platform.h declares extern.
+// As a namespace-scope object its constructor (which takes the lock file and
+// creates the devices) runs during static initialisation.
+_cl_platform_id the_platform;
diff --git a/src/core/platform.h b/src/core/platform.h
new file mode 100644
index 0000000..809d12c
--- /dev/null
+++ b/src/core/platform.h
@@ -0,0 +1,65 @@
+/******************************************************************************
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef __PLATFORM_H__
+#define __PLATFORM_H__
+
+#include <CL/cl.h>
+#include <vector>
+#include <cstring>
+#include "icd.h"
+
+namespace Coal
+{
+
+/*
+ * The OpenCL platform: owns the list of devices and the inter-process lock
+ * file descriptor.
+ */
+class Platform
+{
+ public:
+ /* Takes the lock file and creates the devices (see platform.cpp). */
+ Platform();
+ /* Deletes the devices and releases the lock file (see platform.cpp). */
+ ~Platform();
+
+ /*
+ * Counts the devices matching device_type and, when devices is not
+ * null, stores up to num_entries of them in it. Returns the number of
+ * matching devices.
+ */
+ cl_uint getDevices(cl_device_type device_type,
+ cl_uint num_entries, cl_device_id * devices);
+
+ /*
+ * Platform information query (CL_PLATFORM_* selectors). Returns
+ * CL_SUCCESS, or CL_INVALID_VALUE for an unknown selector or a
+ * destination buffer that is too small.
+ */
+ cl_int info(cl_platform_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ private:
+ /* Initialised to &dispatch_table by the constructor.
+ * NOTE(review): presumably has to remain the first data member so the
+ * Khronos ICD loader can find it - confirm before reordering. */
+ KHRicdVendorDispatch *dispatch;
+ std::vector <cl_device_id> p_devices; /* devices created by the constructor, deleted by the destructor */
+ int p_lock_fd; /* descriptor of the lock file, held for the lifetime of the object */
+};
+
+}
+
+/*
+ * Global-namespace struct that derives from Coal::Platform and adds no
+ * members of its own. Presumably this is the type the OpenCL headers'
+ * cl_platform_id handle points to - confirm against CL/cl.h.
+ */
+struct _cl_platform_id : public Coal::Platform
+{};
+
+/* The single platform object; defined in platform.cpp. */
+extern _cl_platform_id the_platform;
+#endif
diff --git a/src/core/program.cpp b/src/core/program.cpp
new file mode 100644
index 0000000..5f6e99f
--- /dev/null
+++ b/src/core/program.cpp
@@ -0,0 +1,846 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file core/program.cpp
+ * \brief Program
+ */
+
+#include "program.h"
+#include "context.h"
+#include "compiler.h"
+#include "kernel.h"
+#include "propertylist.h"
+#include "deviceinterface.h"
+
+#include <string>
+#include <cstring>
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+#include <set>
+#include <algorithm>
+
+#include <llvm/ADT/StringRef.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Support/Casting.h>
+#include <llvm/Support/ErrorOr.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+#include <llvm/Transforms/IPO.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Linker/Linker.h>
+#include <llvm/PassManager.h>
+#include <llvm/IR/Metadata.h>
+#include <llvm/IR/Function.h>
+#include <llvm/Analysis/Passes.h>
+#include <llvm/Transforms/IPO.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/InstIterator.h>
+
+#include <runtime/stdlib.c.bc.embed.h>
+
+
+/*-----------------------------------------------------------------------------
+* Temporary, for source file caching; remove from product releases
+*----------------------------------------------------------------------------*/
+//#include "dsp/source_cache.h"
+//source_cache * source_cache::pInstance = 0;
+
+using namespace Coal;
+using namespace llvm;
+
+/*
+ * Creates an empty program attached to \p ctx and prepares the "null"
+ * device-dependent record that deviceDependent() returns when no record
+ * matches the requested device.
+ */
+Program::Program(Context *ctx)
+: Object(Object::T_Program, ctx), p_type(Invalid), p_state(Empty)
+{
+    // Every field of the sentinel is given a defined value so that callers
+    // receiving it never read an indeterminate member.
+    p_null_device_dependent.compiler = 0;
+    p_null_device_dependent.device = 0;
+    p_null_device_dependent.linked_module = 0;
+    p_null_device_dependent.program = 0;
+    p_null_device_dependent.is_native_binary = false;
+}
+
+/* Frees everything owned by the per-device records. */
+Program::~Program()
+{
+    this->resetDeviceDependent();
+}
+
+/*
+ * Empties the list of per-device records, deleting the compiler, the device
+ * program and the linked module of each one. Records are removed from the
+ * back of the list.
+ */
+void Program::resetDeviceDependent()
+{
+    while (!p_device_dependent.empty())
+    {
+        DeviceDependent &last = p_device_dependent.back();
+
+        delete last.compiler;
+        delete last.program;
+        delete last.linked_module;
+
+        p_device_dependent.pop_back();
+    }
+}
+
+/*
+ * Sizes the list of per-device records to \p num_devices and fills one
+ * record per entry of \p devices: the device itself, a device program
+ * created by that device, a fresh Compiler, no linked module and the
+ * "native binary" flag cleared.
+ */
+void Program::setDevices(cl_uint num_devices, DeviceInterface * const*devices)
+{
+    p_device_dependent.resize(num_devices);
+
+    cl_uint idx = 0;
+    while (idx < num_devices)
+    {
+        DeviceDependent &record = p_device_dependent[idx];
+
+        record.device = devices[idx];
+        record.program = record.device->createDeviceProgram(this);
+        record.is_native_binary = false;
+        record.linked_module = 0;
+        record.compiler = new Compiler(record.device);
+
+        ++idx;
+    }
+}
+
+/*
+ * Finds the per-device record of \p device. A null \p device selects the
+ * only record when the program has exactly one. When nothing matches, the
+ * null sentinel record is returned.
+ */
+Program::DeviceDependent &Program::deviceDependent(DeviceInterface *device)
+{
+    const size_t count = p_device_dependent.size();
+    const bool pick_single = (!device && count == 1);
+
+    for (size_t idx = 0; idx < count; ++idx)
+    {
+        DeviceDependent &candidate = p_device_dependent[idx];
+
+        if (candidate.device == device || pick_single)
+            return candidate;
+    }
+
+    return p_null_device_dependent;
+}
+
+/*
+ * Read-only variant of the lookup: finds the per-device record of
+ * \p device. A null \p device selects the only record when the program has
+ * exactly one. When nothing matches, the null sentinel record is returned.
+ */
+const Program::DeviceDependent &Program::deviceDependent(DeviceInterface *device) const
+{
+    const size_t count = p_device_dependent.size();
+    const bool pick_single = (!device && count == 1);
+
+    for (size_t idx = 0; idx < count; ++idx)
+    {
+        const DeviceDependent &candidate = p_device_dependent[idx];
+
+        if (candidate.device == device || pick_single)
+            return candidate;
+    }
+
+    return p_null_device_dependent;
+}
+
+/*
+ * Device program stored in the record of \p device; this is the sentinel's
+ * null pointer when no record matches.
+ */
+DeviceProgram *Program::deviceDependentProgram(DeviceInterface *device) const
+{
+    return deviceDependent(device).program;
+}
+
+/*
+ * Options held by the compiler stored in the record of \p device.
+ *
+ * When no record matches, the lookup yields the null sentinel whose compiler
+ * pointer is 0; an empty string is returned in that case instead of
+ * dereferencing the null pointer.
+ */
+std::string Program::deviceDependentCompilerOptions(DeviceInterface *device) const
+{
+    const DeviceDependent &dep = deviceDependent(device);
+
+    if (!dep.compiler)
+        return std::string();
+
+    return dep.compiler->options();
+}
+
+/*
+ * Collects the functions listed in the "opencl.kernels" named metadata of
+ * the linked module of \p dep.
+ *
+ * Returns an empty vector when the record has no linked module (setDevices()
+ * and the null sentinel both leave it at 0) or when the module carries no
+ * such metadata. Malformed entries - null nodes, nodes without operands, or
+ * operands that are not functions - are skipped.
+ */
+std::vector<llvm::Function *> Program::kernelFunctions(DeviceDependent &dep)
+{
+    std::vector<llvm::Function *> rs;
+
+    // Nothing has been linked for this device yet.
+    if (!dep.linked_module) return rs;
+
+    llvm::NamedMDNode *kernels =
+        dep.linked_module->getNamedMetadata("opencl.kernels");
+
+    if (!kernels) return rs;
+
+    for (unsigned int i=0; i<kernels->getNumOperands(); ++i)
+    {
+        llvm::MDNode *node = kernels->getOperand(i);
+
+        /*---------------------------------------------------------------------
+        * Each node is expected to have one operand : a llvm::Function.
+        * Guard the access so a malformed module cannot crash us.
+        *--------------------------------------------------------------------*/
+        if (!node || node->getNumOperands() == 0) continue;
+
+        llvm::Value *value = node->getOperand(0);
+
+        /*---------------------------------------------------------------------
+        * dyn_cast_or_null tolerates a null operand, unlike isa<>/cast<>
+        *--------------------------------------------------------------------*/
+        llvm::Function *f = llvm::dyn_cast_or_null<llvm::Function>(value);
+        if (!f) continue;
+
+        rs.push_back(f);
+    }
+
+    return rs;
+}
+
+/******************************************************************************
+* Kernel *Program::createKernel(const std::string &name, cl_int *errcode_ret)
+*
+* Returns the kernel called `name`:
+*   - an already created kernel found in kernelList, or
+*   - a previously released kernel, moved back from kernelReleasedList to
+*     kernelList, or
+*   - a newly created kernel to which the function called `name` of every
+*     device has been added.
+*
+* errcode_ret must not be null. On failure (the name is missing for a device,
+* or addFunction() reports an error) the partially built kernel is still
+* returned, is NOT registered in kernelList, and has to be deleted by the
+* caller.
+*
+* A new kernel is registered exactly once, after all devices succeeded.
+* Registering inside the device loop would insert the same pointer once per
+* device and would leave a stale pointer in kernelList when a later device
+* fails and the caller deletes the kernel.
+******************************************************************************/
+Kernel *Program::createKernel(const std::string &name, cl_int *errcode_ret)
+{
+    Kernel *rs = NULL;
+
+    for (size_t i=0; i < kernelList.size(); i++)
+    {
+        if (kernelList[i]->p_name.compare(name) == 0)
+        {
+            *errcode_ret = CL_SUCCESS;
+            return kernelList[i];
+        }
+    }
+
+    /* Now check the previously released list */
+    for (size_t i=0; i < kernelReleasedList.size(); i++)
+    {
+        if (kernelReleasedList[i]->p_name.compare(name) == 0)
+        {
+            *errcode_ret = CL_SUCCESS;
+            rs = kernelReleasedList[i];
+            kernelReleasedList.erase(kernelReleasedList.begin() + i);
+            kernelList.push_back(rs);
+
+            return rs;
+        }
+    }
+
+    rs = new Kernel(this);
+
+    /*-------------------------------------------------------------------------
+    * Add a function definition for each device
+    *------------------------------------------------------------------------*/
+    for (size_t i=0; i < p_device_dependent.size(); ++i)
+    {
+        DeviceDependent &dep = p_device_dependent[i];
+        const std::vector<llvm::Function *> &kernels = kernelFunctions(dep);
+
+        /*---------------------------------------------------------------------
+        * Find the one with the good name
+        *--------------------------------------------------------------------*/
+        llvm::Function *match = NULL;
+
+        for (size_t j=0; j < kernels.size(); ++j)
+        {
+            if (kernels[j]->getName().str().compare(name) == 0)
+            {
+                match = kernels[j];
+                break;
+            }
+        }
+
+        /*---------------------------------------------------------------------
+        * Kernel unavailable for this device
+        *--------------------------------------------------------------------*/
+        if (!match)
+        {
+            *errcode_ret = CL_INVALID_KERNEL_NAME;
+            return rs;
+        }
+
+        *errcode_ret = rs->addFunction(dep.device, match, dep.linked_module);
+        if (*errcode_ret != CL_SUCCESS) return rs;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Every device accepted the kernel: register it once. A program without
+    * any device never registered anything, and still does not.
+    *------------------------------------------------------------------------*/
+    if (!p_device_dependent.empty())
+        kernelList.push_back(rs);
+
+    return rs;
+}
+
+/******************************************************************************
+* Kernel *Program::createKernelsAndReturnKernel(name, errcode_ret)
+*
+* Returns the kernel called `name`. When it is neither in kernelList nor in
+* kernelReleasedList, a kernel is created for every kernel function of the
+* first device and the one called `name` is returned.
+*
+* errcode_ret must not be null. It is set to
+*   - CL_SUCCESS when the kernel is returned,
+*   - CL_INVALID_PROGRAM_EXECUTABLE when the program has no device,
+*   - the error reported by createKernel() for another kernel, or
+*     CL_INVALID_KERNEL_NAME, when `name` was not found.
+* NULL is returned in every failing case.
+******************************************************************************/
+Kernel * Program::createKernelsAndReturnKernel(const std::string &name, cl_int *errcode_ret)
+{
+    Kernel *rs = NULL;
+
+    /*-------------------------------------------------------------------------
+    * We should never go here. Report an error anyway, so the caller does not
+    * receive NULL together with an untouched error code.
+    *------------------------------------------------------------------------*/
+    if (p_device_dependent.size() == 0)
+    {
+        *errcode_ret = CL_INVALID_PROGRAM_EXECUTABLE;
+        return rs;
+    }
+
+    for (size_t i=0; i < kernelList.size(); i++)
+    {
+        if (kernelList[i]->p_name.compare(name) == 0)
+        {
+            *errcode_ret = CL_SUCCESS;
+            return kernelList[i];
+        }
+    }
+
+    /* Now check the previously released list */
+    for (size_t i=0; i < kernelReleasedList.size(); i++)
+    {
+        if (kernelReleasedList[i]->p_name.compare(name) == 0)
+        {
+            *errcode_ret = CL_SUCCESS;
+            rs = kernelReleasedList[i];
+            kernelReleasedList.erase(kernelReleasedList.begin() + i);
+            kernelList.push_back(rs);
+
+            return rs;
+        }
+    }
+
+    /*-------------------------------------------------------------------------
+    * Take the list of kernels for the first device dependent
+    *------------------------------------------------------------------------*/
+    DeviceDependent &dep = p_device_dependent[0];
+    const std::vector<llvm::Function *> &kernels = kernelFunctions(dep);
+
+    /*-------------------------------------------------------------------------
+    * Create the kernel for each function name
+    * It returns an error if the signature is not the same for every device
+    * or if the kernel isn't found on all the devices.
+    *------------------------------------------------------------------------*/
+    *errcode_ret = CL_SUCCESS;
+
+    for (size_t i=0; i < kernels.size(); ++i)
+    {
+        cl_int result = CL_SUCCESS;
+        Kernel *kernel = createKernel(kernels[i]->getName().str(), &result);
+
+        if (result != CL_SUCCESS)
+        {
+            *errcode_ret = result;
+
+            /*-----------------------------------------------------------------
+            * Make sure no list keeps a pointer to the kernel we are about to
+            * destroy, then skip it: it must not be touched after the delete.
+            *----------------------------------------------------------------*/
+            kernelList.erase(std::remove(kernelList.begin(), kernelList.end(),
+                                         kernel),
+                             kernelList.end());
+            delete kernel;
+            continue;
+        }
+
+        if (kernel->p_name.compare(name) == 0)
+        {
+            rs = kernel;
+            *errcode_ret = result;
+        }
+    }
+
+    if (!rs && (*errcode_ret == CL_SUCCESS))
+        *errcode_ret = CL_INVALID_KERNEL_NAME;
+
+    return rs;
+}
+
+/**
+ * Create (or return the already created) kernels of the program.
+ *
+ * \param errcode_ret set to the error code of a failed createKernel() call;
+ *        it is not written on the success paths of this function.
+ * \return a copy of kernelList, or an empty vector when the program has no
+ *         device-dependent data
+ */
+std::vector<Kernel *> Program::createKernels(cl_int *errcode_ret)
+{
+    std::vector<Kernel *> rs;
+
+    /*-------------------------------------------------------------------------
+    * We should never go here
+    *------------------------------------------------------------------------*/
+    if (p_device_dependent.size() == 0) return rs;
+
+    /*
+     * Resurrect any released kernels back to the kernel list. This handles the
+     * case where clCreateKernelsInProgram() is asking only for a count of kernels in
+     * the currently built program. In that case, kernelList.size() must be the actual
+     * number of kernels compiled into the program (even if they were previously released).
+     *
+     * All entries are moved in one step: erasing element i while also
+     * incrementing i would skip every other released kernel.
+     */
+    kernelList.insert(kernelList.end(),
+                      kernelReleasedList.begin(), kernelReleasedList.end());
+    kernelReleasedList.clear();
+
+    if (kernelList.size()) return kernelList;
+
+    /*-------------------------------------------------------------------------
+    * Take the list of kernels for the first device dependent
+    *------------------------------------------------------------------------*/
+    DeviceDependent &dep = p_device_dependent[0];
+    const std::vector<llvm::Function *> &kernels = kernelFunctions(dep);
+
+    /*-------------------------------------------------------------------------
+    * Create the kernel for each function name
+    * It returns an error if the signature is not the same for every device
+    * or if the kernel isn't found on all the devices.
+    *------------------------------------------------------------------------*/
+    for (size_t i=0; i < kernels.size(); ++i)
+    {
+        cl_int result = CL_SUCCESS;
+        Kernel *kernel = createKernel(kernels[i]->getName().str(), &result);
+
+        if (result == CL_SUCCESS)
+        {
+            /* NOTE(review): if createKernel() already appends the new kernel
+             * to kernelList on success, this adds it a second time - confirm. */
+            kernelList.push_back(kernel);
+        }
+        else
+        {
+            *errcode_ret = result;
+            delete kernel;
+        }
+    }
+
+    return kernelList;
+}
+
+/*
+ * Concatenate the 'count' source fragments into p_source and mark the
+ * program as a loaded, source-based program.
+ * Returns CL_INVALID_VALUE as soon as a NULL fragment is met, CL_SUCCESS
+ * otherwise.
+ */
+cl_int Program::loadSources(cl_uint count, const char **strings,
+                            const size_t *lengths)
+{
+    // Start from an empty source
+    p_source.clear();
+
+    for (cl_uint idx = 0; idx != count; ++idx)
+    {
+        const char *text = strings[idx];
+
+        if (text == NULL)
+            return CL_INVALID_VALUE;
+
+        // An absent or zero length means "null-terminated fragment"
+        size_t size = (lengths != NULL && lengths[idx] != 0)
+                          ? lengths[idx]
+                          : std::strlen(text);
+
+        // Drop trailing \0's: they can show up when the client application
+        // passes lengths that include the terminator
+        for (; size != 0 && text[size - 1] == '\0'; --size)
+        {
+        }
+
+        p_source.append(text, size);
+    }
+
+    /*-------------------------------------------------------------------------
+    * temporary for source file cacheing, remove from product releases
+    *------------------------------------------------------------------------*/
+    //source_cache::instance()->remember(p_source);
+
+    p_type = Source;
+    p_state = Loaded;
+
+    return CL_SUCCESS;
+}
+
+/*
+ * Store one binary per device and parse its LLVM bitcode into a module.
+ * Stops at the first binary that cannot be parsed (CL_INVALID_BINARY) or
+ * whose memory buffer cannot be created (CL_OUT_OF_HOST_MEMORY).
+ */
+cl_int Program::loadBinaries(const unsigned char **data, const size_t *lengths,
+                             cl_int *binary_status, cl_uint num_devices,
+                             DeviceInterface * const*device_list)
+{
+    // Register the devices before filling their per-device data
+    setDevices(num_devices, device_list);
+
+    for (cl_uint dev = 0; dev < num_devices; ++dev)
+    {
+        DeviceDependent &entry = deviceDependent(device_list[dev]);
+
+        entry.unlinked_binary = std::string((const char *)data[dev],
+                                            lengths[dev]);
+        entry.is_native_binary = true;
+
+        /*--------------------------------------------------------------------
+        * The blob is either native code with LLVM bitcode embedded, or plain
+        * LLVM bitcode; in the latter case the whole blob is the bitcode
+        *--------------------------------------------------------------------*/
+        std::string bitcode;
+        const bool is_mixed = entry.program->ExtractMixedBinary(
+                                  &entry.unlinked_binary, &bitcode, NULL);
+        if (!is_mixed)
+        {
+            bitcode = entry.unlinked_binary;
+            entry.is_native_binary = false;
+        }
+
+        const llvm::StringRef bc_data(bitcode);
+        const llvm::StringRef bc_name("<binary>");
+
+        llvm::MemoryBuffer *membuf =
+            llvm::MemoryBuffer::getMemBuffer(bc_data, bc_name, false);
+
+        if (membuf == NULL)
+            return CL_OUT_OF_HOST_MEMORY;
+
+        // Turn the bitcode into a module
+        ErrorOr<Module *> parsed = parseBitcodeFile(membuf,
+                                                    llvm::getGlobalContext());
+        if (!parsed)
+        {
+            entry.linked_module = NULL;
+            if (binary_status) binary_status[dev] = CL_INVALID_VALUE;
+            return CL_INVALID_BINARY;
+        }
+
+        entry.linked_module = parsed.get();
+
+        if (binary_status) binary_status[dev] = CL_SUCCESS;
+    }
+
+    p_type = Binary;
+    p_state = Loaded;
+
+    return CL_SUCCESS;
+}
+
+/**
+ * Build the program for its devices.
+ *
+ * For each device: compile the source (source programs only), link the
+ * embedded stdlib bitcode when the device program asks for it, run the LLVM
+ * optimization passes (non-native binaries only), then hand the module to
+ * the device program. \p pfn_notify, when given, is called once before the
+ * function returns, on failure as well as on success.
+ *
+ * \param options compiler options, may be NULL
+ * \param pfn_notify optional completion callback
+ * \param user_data opaque pointer passed to \p pfn_notify
+ * \param num_devices number of entries in \p device_list
+ * \param device_list devices to build for
+ * \return \c CL_SUCCESS, \c CL_INVALID_BUILD_OPTIONS,
+ *         \c CL_BUILD_PROGRAM_FAILURE or \c CL_OUT_OF_HOST_MEMORY
+ */
+cl_int Program::build(const char *options,
+                      void (CL_CALLBACK *pfn_notify)(cl_program program,
+                                                     void *user_data),
+                      void *user_data, cl_uint num_devices,
+                      DeviceInterface * const*device_list)
+{
+    // If we've already built this program and are re-building
+    // (for example, with different user options) then clear out the
+    // device dependent information in preparation for building again.
+    if (p_state == Built) resetDeviceDependent();
+
+    // Every early return below leaves the program in the Failed state
+    p_state = Failed;
+
+    // Set device infos
+    if (!p_device_dependent.size())
+    {
+        setDevices(num_devices, device_list);
+    }
+
+    // ASW TODO - optimize to compile for each device type only once.
+    // NOTE(review): the loop is bounded by p_device_dependent.size() but
+    // indexes device_list; this assumes device_list holds at least that many
+    // entries - confirm against the callers.
+    for (cl_uint i=0; i<p_device_dependent.size(); ++i)
+    {
+        DeviceDependent &dep = deviceDependent(device_list[i]);
+
+        // Do we need to compile the source for each device ?
+        if (p_type == Source)
+        {
+            // Load source
+            const llvm::StringRef s_data(p_source);
+            const llvm::StringRef s_name("<source>");
+
+            llvm::MemoryBuffer *buffer = llvm::MemoryBuffer::getMemBuffer(
+                                                    s_data, s_name);
+
+            // Compile (a non-zero result is a failure)
+            int compile_result = dep.compiler->compile(
+                                    options ? options : std::string(), buffer);
+            if (compile_result)
+            {
+                if (pfn_notify)
+                    pfn_notify((cl_program)this, user_data);
+                if (compile_result == CL_INVALID_BUILD_OPTIONS)
+                    return CL_INVALID_BUILD_OPTIONS;
+                else
+                    return CL_BUILD_PROGRAM_FAILURE;
+            }
+
+            // Get module and its bitcode
+            dep.linked_module = dep.compiler->module();
+
+            llvm::raw_string_ostream ostream(dep.unlinked_binary);
+            llvm::WriteBitcodeToFile(dep.linked_module, ostream);
+            ostream.flush();
+        }
+
+        // Link p_linked_module with the stdlib if the device needs that
+        if (! dep.is_native_binary && dep.program->linkStdLib())
+        {
+            // Load the stdlib bitcode
+            const llvm::StringRef s_data(embed_stdlib_c_bc,
+                                         sizeof(embed_stdlib_c_bc) - 1);
+            const llvm::StringRef s_name("stdlib.bc");
+            std::string errMsg;
+
+            llvm::MemoryBuffer *buffer = llvm::MemoryBuffer::getMemBuffer(
+                                                    s_data, s_name, false);
+
+            if (!buffer)
+                return CL_OUT_OF_HOST_MEMORY;
+
+            ErrorOr<Module *> ModuleOrErr =
+                parseBitcodeFile(buffer, llvm::getGlobalContext());
+            Module *stdlib = NULL;
+            if (ModuleOrErr) {
+                stdlib = ModuleOrErr.get();
+            }
+            else {
+                std::error_code EC = ModuleOrErr.getError();
+                errMsg = EC.message();
+            }
+
+            // Link
+            if (!stdlib ||
+                llvm::Linker::LinkModules(dep.linked_module, stdlib,
+                                          llvm::Linker::DestroySource, &errMsg))
+            {
+                dep.compiler->appendLog("link error: ");
+                dep.compiler->appendLog(errMsg);
+                dep.compiler->appendLog("\n");
+
+                // DEBUG
+                std::cout << dep.compiler->log() << std::endl;
+
+                if (pfn_notify)
+                    pfn_notify((cl_program)this, user_data);
+
+                return CL_BUILD_PROGRAM_FAILURE;
+            }
+        }
+
+        if (! dep.is_native_binary)
+        {
+            // Get list of kernels to strip other unused functions
+            std::vector<const char *> api;
+            std::vector<std::string> api_s; // Owns the strings api points into
+            const std::vector<llvm::Function *> &kernels = kernelFunctions(dep);
+
+            // Collect every name first: api must point into the strings owned
+            // by api_s (not into a loop-local temporary), and api_s must no
+            // longer grow once pointers into it have been taken.
+            for (size_t j=0; j<kernels.size(); ++j)
+                api_s.push_back(kernels[j]->getName().str());
+
+            for (size_t j=0; j<api_s.size(); ++j)
+                api.push_back(api_s[j].c_str());
+
+            // determine if module has barrier() function calls
+            bool hasBarrier = false;
+            llvm::CallInst* call;
+            for (llvm::Module::iterator F = dep.linked_module->begin(),
+                 EF = dep.linked_module->end(); !hasBarrier && F != EF; ++F)
+                for (llvm::inst_iterator I = inst_begin(*F),
+                     E = inst_end(*F); I != E; ++I)
+                {
+                    if (!(call = llvm::dyn_cast<llvm::CallInst>(&*I))) continue;
+                    if (!call->getCalledFunction()) continue;
+                    std::string name(call->getCalledFunction()->getName());
+                    if (name == "barrier")
+                    {
+                        hasBarrier = true;
+                        break;
+                    }
+                }
+
+            // Optimize code
+            llvm::PassManager *manager = new llvm::PassManager();
+
+            // Common passes (primary goal : remove unused stdlib functions)
+            manager->add(llvm::createTypeBasedAliasAnalysisPass());
+            manager->add(llvm::createBasicAliasAnalysisPass());
+            manager->add(llvm::createInternalizePass(api));
+            manager->add(llvm::createIPSCCPPass());
+            manager->add(llvm::createGlobalOptimizerPass());
+            manager->add(llvm::createConstantMergePass());
+            manager->add(llvm::createAlwaysInlinerPass());
+
+            dep.program->createOptimizationPasses(manager,
+                                                  dep.compiler->optimize(), hasBarrier);
+
+            manager->add(llvm::createGlobalDCEPass());
+
+            manager->run(*dep.linked_module);
+            delete manager;
+        }
+
+        // Now that the LLVM module is built, build the device-specific
+        // representation
+        if (!dep.program->build(dep.linked_module, &dep.unlinked_binary))
+        {
+            if (pfn_notify)
+                pfn_notify((cl_program)this, user_data);
+
+            return CL_BUILD_PROGRAM_FAILURE;
+        }
+    }
+
+    // TODO: Asynchronous compile
+    if (pfn_notify)
+        pfn_notify((cl_program)this, user_data);
+
+    p_state = Built;
+
+    return CL_SUCCESS;
+}
+
+/* Accessor: kind of content loaded into the program (see Program::Type). */
+Program::Type Program::type() const
+{
+    return this->p_type;
+}
+
+/* Accessor: current load/build state of the program (see Program::State). */
+Program::State Program::state() const
+{
+    return this->p_state;
+}
+
+/**
+ * Answer a program info query.
+ *
+ * \param param_name information requested
+ * \param param_value_size size in bytes of the buffer behind \p param_value
+ * \param param_value destination buffer, may be NULL to only query the size.
+ *        For \c CL_PROGRAM_BINARIES it is an array of caller-allocated
+ *        buffers, one per device; NULL entries are skipped.
+ * \param param_value_size_ret if not NULL, receives the size of the answer
+ * \return \c CL_SUCCESS, or \c CL_INVALID_VALUE when \p param_name is
+ *         unknown or \p param_value is too small
+ */
+cl_int Program::info(cl_program_info param_name,
+                     size_t param_value_size,
+                     void *param_value,
+                     size_t *param_value_size_ret) const
+{
+    void *value = 0;
+    size_t value_length = 0;
+    llvm::SmallVector<size_t, 4> binary_sizes;
+    llvm::SmallVector<DeviceInterface *, 4> devices;
+
+    union {
+        cl_uint cl_uint_var;
+        cl_context cl_context_var;
+    };
+
+    switch (param_name)
+    {
+        case CL_PROGRAM_REFERENCE_COUNT:
+            SIMPLE_ASSIGN(cl_uint, references());
+            break;
+
+        case CL_PROGRAM_NUM_DEVICES:
+            // Use devices associated with any built kernels, otherwise use
+            // the devices associated with the program context
+            if (p_device_dependent.size() != 0)
+            { SIMPLE_ASSIGN(cl_uint, p_device_dependent.size()); }
+            else
+                return ((Context *)parent())->info(CL_CONTEXT_NUM_DEVICES,
+                           param_value_size, param_value, param_value_size_ret);
+            break;
+
+        case CL_PROGRAM_DEVICES:
+            // Use devices associated with any built kernels, otherwise use
+            // the devices associated with the program context
+            if (p_device_dependent.size() != 0)
+            {
+                for (size_t i=0; i<p_device_dependent.size(); ++i)
+                {
+                    const DeviceDependent &dep = p_device_dependent[i];
+
+                    devices.push_back(dep.device);
+                }
+
+                value = devices.data();
+                value_length = devices.size() * sizeof(DeviceInterface *);
+            }
+            else
+                return ((Context *)parent())->info(CL_CONTEXT_DEVICES,
+                           param_value_size, param_value, param_value_size_ret);
+            break;
+
+        case CL_PROGRAM_CONTEXT:
+            SIMPLE_ASSIGN(cl_context, parent());
+            break;
+
+        case CL_PROGRAM_SOURCE:
+            // The size includes the terminating NUL
+            MEM_ASSIGN(p_source.size() + 1, p_source.c_str());
+            break;
+
+        case CL_PROGRAM_BINARY_SIZES:
+            for (size_t i=0; i<p_device_dependent.size(); ++i)
+            {
+                const DeviceDependent &dep = p_device_dependent[i];
+
+                binary_sizes.push_back(dep.unlinked_binary.size());
+            }
+
+            value = binary_sizes.data();
+            value_length = binary_sizes.size() * sizeof(size_t);
+            break;
+
+        case CL_PROGRAM_BINARIES:
+        {
+            // Special case : param_value points to an array of p_num_devices
+            // application-allocated unsigned char* pointers. Check it's good
+            // and std::memcpy the data
+
+            unsigned char **binaries = (unsigned char **)param_value;
+            value_length = p_device_dependent.size() * sizeof(unsigned char *);
+
+            // Same rule as for every other query: a destination that is too
+            // small is an error, not a silent no-op
+            if (param_value && param_value_size < value_length)
+                return CL_INVALID_VALUE;
+
+            if (param_value)
+                for (size_t i=0; i<p_device_dependent.size(); ++i)
+                {
+                    const DeviceDependent &dep = p_device_dependent[i];
+                    unsigned char *dest = binaries[i];
+
+                    // A NULL slot means the caller does not want this binary
+                    if (!dest)
+                        continue;
+
+                    std::memcpy(dest, dep.unlinked_binary.data(),
+                                dep.unlinked_binary.size());
+                }
+
+            if (param_value_size_ret)
+                *param_value_size_ret = value_length;
+
+            return CL_SUCCESS;
+        }
+
+        default:
+            return CL_INVALID_VALUE;
+    }
+
+    if (param_value && param_value_size < value_length)
+        return CL_INVALID_VALUE;
+
+    if (param_value_size_ret)
+        *param_value_size_ret = value_length;
+
+    if (param_value)
+        std::memcpy(param_value, value, value_length);
+
+    return CL_SUCCESS;
+}
+
+/*
+ * Answer a per-device build query: build status (derived from p_state),
+ * build options or build log (both taken from the device's compiler).
+ * Returns CL_INVALID_VALUE for an unknown query or a too-small buffer.
+ */
+cl_int Program::buildInfo(DeviceInterface *device,
+                          cl_program_build_info param_name,
+                          size_t param_value_size,
+                          void *param_value,
+                          size_t *param_value_size_ret) const
+{
+    const void *value = 0;
+    size_t value_length = 0;
+    const DeviceDependent &dep = deviceDependent(device);
+
+    union {
+        cl_build_status cl_build_status_var;
+    };
+
+    if (param_name == CL_PROGRAM_BUILD_STATUS)
+    {
+        // TODO: CL_BUILD_IN_PROGRESS
+        if (p_state == Empty || p_state == Loaded)
+        {
+            SIMPLE_ASSIGN(cl_build_status, CL_BUILD_NONE);
+        }
+        else if (p_state == Built)
+        {
+            SIMPLE_ASSIGN(cl_build_status, CL_BUILD_SUCCESS);
+        }
+        else if (p_state == Failed)
+        {
+            SIMPLE_ASSIGN(cl_build_status, CL_BUILD_ERROR);
+        }
+    }
+    else if (param_name == CL_PROGRAM_BUILD_OPTIONS)
+    {
+        // The size includes the terminating NUL
+        value = dep.compiler->options().c_str();
+        value_length = dep.compiler->options().size() + 1;
+    }
+    else if (param_name == CL_PROGRAM_BUILD_LOG)
+    {
+        // The size includes the terminating NUL
+        value = dep.compiler->log().c_str();
+        value_length = dep.compiler->log().size() + 1;
+    }
+    else
+    {
+        return CL_INVALID_VALUE;
+    }
+
+    // A destination buffer, when given, must be able to hold the answer
+    if (param_value != NULL && param_value_size < value_length)
+        return CL_INVALID_VALUE;
+
+    if (param_value_size_ret != NULL)
+        *param_value_size_ret = value_length;
+
+    if (param_value != NULL)
+        std::memcpy(param_value, value, value_length);
+
+    return CL_SUCCESS;
+}
diff --git a/src/core/program.h b/src/core/program.h
new file mode 100644
index 0000000..a06b452
--- /dev/null
+++ b/src/core/program.h
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file core/program.h
+ * \brief Program
+ */
+
+#ifndef __PROGRAM_H__
+#define __PROGRAM_H__
+
+#include "object.h"
+
+#include <CL/cl.h>
+#include <string>
+#include <vector>
+
+namespace llvm
+{
+ class MemoryBuffer;
+ class Module;
+ class Function;
+}
+
+namespace Coal
+{
+
+class Context;
+class Compiler;
+class DeviceInterface;
+class DeviceProgram;
+class Kernel;
+
+/**
+ * \brief Program object
+ *
+ * This class compiles and links a source or binaries into LLVM modules for each
+ * \c Coal::DeviceInterface for which the program is built.
+ *
+ * It then contains functions to get the list of kernels available in the
+ * program, using \c Coal::Kernel objects.
+ */
+class Program : public Object
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param ctx parent \c Coal::Context
+ */
+ Program(Context *ctx);
+ ~Program();
+
+ /**
+ * \brief Program type
+ */
+ enum Type
+ {
+ Invalid, /*!< Invalid or unknown, type of a program not already loaded */
+ Source, /*!< Program made of sources that must be compiled and linked */
+ Binary /*!< Program made of pre-built binaries that only need to be (transformed)/linked */
+ };
+
+ /**
+ * \brief Program state
+ */
+ enum State
+ {
+ Empty, /*!< Just created */
+ Loaded, /*!< Source or binary loaded */
+ Built, /*!< Built */
+ Failed, /*!< Build failed */
+ };
+
+ /**
+ * \brief Load sources into the program
+ *
+ * This function loads the source-code given in \p strings into the
+ * program and sets its type to \c Source.
+ *
+ * \param count number of strings in \p strings
+ * \param strings array of pointers to strings, either null-terminated
+ * or of length given in \p lengths
+ * \param lengths lengths of the strings. If a field is 0, the
+ * corresponding string is null-terminated. If \p lengths is
+ * 0, all the strings are null-terminated
+ * \return \c CL_SUCCESS if success, an error code otherwise
+ */
+ cl_int loadSources(cl_uint count, const char **strings,
+ const size_t *lengths);
+
+ /**
+ * \brief Load binaries into the program
+ *
+ * This function allows client application to load a source, retrieve
+ * binaries using \c info() (\c CL_PROGRAM_BINARIES), and then re-create
+ * the same program (after a restart for example) by giving it a
+ * precompiled binary.
+ *
+ * This function loads the binaries for each device and parses them into
+ * LLVM modules, then sets the program type to \c Binary. Whether a
+ * device's binary is native code or plain LLVM bitcode is recorded per
+ * device in \c DeviceDependent::is_native_binary.
+ *
+ * \param data array of pointers to binaries, one for each device
+ * \param lengths lengths of the binaries pointed to by \p data
+ * \param binary_status array that will be filled by this function with
+ * the status of each loaded binary (\c CL_SUCCESS if success)
+ * \param num_devices number of devices for which a binary is loaded
+ * \param device_list list of devices for which the binaries are loaded
+ * \return \c CL_SUCCESS if success, an error code otherwise
+ */
+ cl_int loadBinaries(const unsigned char **data, const size_t *lengths,
+ cl_int *binary_status, cl_uint num_devices,
+ DeviceInterface * const*device_list);
+
+ /**
+ * \brief Build the program
+ *
+ * This function compiles the sources, if any, and then links the
+ * resulting binaries if the devices for which they are compiled ask
+ * \c Coal::Program to do so, using \c Coal::DeviceProgram::linkStdLib().
+ *
+ * \param options options to pass to the compiler, see the OpenCL
+ * specification.
+ * \param pfn_notify callback function called at the end of the build
+ * \param user_data user data given to \p pfn_notify
+ * \param num_devices number of devices for which binaries are being
+ * built. If it's a source-based program, this can be 0.
+ * \param device_list list of devices for which the program will be built.
+ * \return \c CL_SUCCESS if success, an error code otherwise
+ */
+ cl_int build(const char *options,
+ void (CL_CALLBACK *pfn_notify)(cl_program program,
+ void *user_data),
+ void *user_data, cl_uint num_devices,
+ DeviceInterface * const*device_list);
+
+ Type type() const; /*!< \brief Type of the program */
+ State state() const; /*!< \brief State of the program */
+
+ /**
+ * \brief Create a kernel given a \p name
+ * \param name name of the kernel to be created
+ * \param errcode_ret return code (\c CL_SUCCESS if success)
+ * \return a \c Coal::Kernel object corresponding to the given \p name
+ */
+ Kernel *createKernel(const std::string &name, cl_int *errcode_ret);
+
+ /**
+ * \brief Create the kernels of the program and return the one named \p name
+ *
+ * A kernel already present in \c kernelList or \c kernelReleasedList is
+ * returned without creating anything.
+ *
+ * \param name name of the kernel to be returned
+ * \param errcode_ret return code (\c CL_SUCCESS if success)
+ * \return a \c Coal::Kernel object corresponding to the given \p name,
+ * 0 if there is none
+ */
+ Kernel *createKernelsAndReturnKernel(const std::string &name, cl_int *errcode_ret);
+
+ /**
+ * \brief Create all the kernels of the program
+ * \param errcode_ret set to an error code if a kernel cannot be created
+ * \return the list of \c Coal::Kernel objects of this program
+ */
+ std::vector<Kernel *> createKernels(cl_int *errcode_ret);
+
+ /**
+ * \brief Device-specific program
+ * \param device device for which the device-specific program is needed
+ * \return the device-specific program requested, 0 if not found
+ */
+ DeviceProgram *deviceDependentProgram(DeviceInterface *device) const;
+ std::string deviceDependentCompilerOptions(DeviceInterface *device) const; /*!< NOTE(review): presumably the compiler options used for \p device - confirm against the implementation */
+
+ /**
+ * \brief Get information about this program
+ * \copydetails Coal::DeviceInterface::info
+ */
+ cl_int info(cl_program_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ /**
+ * \brief Get build info about this program (build status, options, log)
+ * \copydetails Coal::DeviceInterface::info
+ * \param device \c Coal::DeviceInterface for which info is needed
+ */
+ cl_int buildInfo(DeviceInterface *device,
+ cl_program_build_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ std::string source() { return p_source; } /*!< \brief Copy of the source text stored by loadSources() */
+
+ std::vector<Kernel *> kernelList; /*!< Kernels created for this program */
+ std::vector<Kernel *> kernelReleasedList; /*!< Kernels set aside for reuse; the createKernels*() functions move them back into kernelList */
+
+ private:
+ Type p_type; /*!< Set to \c Source by loadSources(), to \c Binary by loadBinaries() */
+ State p_state; /*!< \c Loaded after a load, \c Built or \c Failed after build() */
+ std::string p_source; /*!< Concatenated source fragments */
+
+ struct DeviceDependent
+ {
+ DeviceInterface * device; /*!< Device this entry belongs to */
+ DeviceProgram * program; /*!< Device-specific program object */
+ std::string unlinked_binary; /*!< Binary given to loadBinaries(), or bitcode written by build() */
+ bool is_native_binary; // llvm kernel bitcode vs final native binary
+ llvm::Module * linked_module; /*!< Module parsed from the binary or produced by the compiler */
+ Compiler * compiler; /*!< Compiler used by build(); also provides the build options and log */
+ };
+
+ std::vector<DeviceDependent> p_device_dependent; /*!< One entry per device */
+ DeviceDependent p_null_device_dependent; /*!< NOTE(review): presumably what deviceDependent() returns for an unknown device - confirm */
+
+ void setDevices(cl_uint num_devices, DeviceInterface * const*devices);
+ void resetDeviceDependent(); /*!< Called by build() before re-building an already built program */
+ DeviceDependent &deviceDependent(DeviceInterface *device);
+ const DeviceDependent &deviceDependent(DeviceInterface *device) const;
+ std::vector<llvm::Function *> kernelFunctions(DeviceDependent &dep); /*!< Kernel functions of the module of \p dep */
+};
+
+}
+
+/** \brief Type behind the \c cl_program API handle; adds nothing to \c Coal::Program. */
+struct _cl_program : public Coal::Program {};
+
+#endif
diff --git a/src/core/propertylist.h b/src/core/propertylist.h
new file mode 100644
index 0000000..8d32397
--- /dev/null
+++ b/src/core/propertylist.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file propertylist.h
+ * \brief Helper macros for \c info() functions
+ *
+ * The OpenCL API is full of functions like \c clGetXXXInfo(). They all take
+ * the same arguments and are handled the same way. This file contains macros
+ * easing the implementation of these info functions.
+ *
+ * An info function written with these macros looks like this:
+ *
+ * \code
+ * cl_int Foo::info(cl_foo_info param_name,
+ * size_t param_value_size,
+ * void *param_value,
+ * size_t *param_value_size_ret) const
+ * {
+ * void *value = 0;
+ * size_t value_length = 0;
+ *
+ * union {
+ * cl_uint cl_uint_var;
+ * cl_context cl_context_var;
+ * };
+ *
+ * switch (param_name)
+ * {
+ * case CL_UINT_PARAM:
+ * SIMPLE_ASSIGN(cl_uint, the_value);
+ * break;
+ * case CL_CONTEXT_PARAM:
+ * SIMPLE_ASSIGN(cl_context, a_call());
+ * break;
+ * case CL_STRING_PARAM:
+ * STRING_ASSIGN("This is a string");
+ * break;
+ * case CL_BINARY_PARAM:
+ * MEM_ASSIGN(sizeof(something), something);
+ * break;
+ * default:
+ * return CL_INVALID_VALUE;
+ * }
+ *
+ * if (param_value && param_value_size < value_length)
+ * return CL_INVALID_VALUE;
+ *
+ * if (param_value_size_ret)
+ * *param_value_size_ret = value_length;
+ *
+ * if (param_value)
+ * std::memcpy(param_value, value, value_length);
+ *
+ * return CL_SUCCESS;
+ * }
+ * \endcode
+ */
+
+#ifndef __PROPERTYLIST_H__
+#define __PROPERTYLIST_H__
+
+/**
+ * \brief Assign a value of a given type to the return value
+ *
+ * Expects \c value, \c value_length and a variable named
+ * <tt>type##_var</tt> (usually a member of an anonymous union) to be in
+ * scope.
+ *
+ * \note The expansion already ends with a semicolon, and existing callers
+ *       rely on it. Inside an unbraced \c if / \c else, wrap the call in
+ *       braces.
+ * \param type type of the argument
+ * \param _value value to assign; it is evaluated once and cast to \p type
+ */
+#define SIMPLE_ASSIGN(type, _value) do {    \
+    value_length = sizeof(type);            \
+    type##_var = (type)(_value);            \
+    value = & type##_var;                   \
+} while (0);
+
+/**
+ * \brief Assign a string to the return value
+ *
+ * The string initializes a function-local <tt>static const char[]</tt>, so
+ * \c value stays valid after the macro's block ends and \c value_length
+ * (the \c sizeof of that array) counts the terminating NUL.
+ *
+ * \note The expansion already ends with a semicolon.
+ * \param string the string to assign, as a constant (it must be usable as
+ * an array initializer, i.e. a string literal)
+ */
+#define STRING_ASSIGN(string) do { \
+ static const char str[] = string; \
+ value_length = sizeof(str); \
+ value = (void *)str; \
+} while (0);
+
+/**
+ * \brief Assign a memory buffer to the return value
+ *
+ * Only the pointer is stored; nothing is copied by the macro itself.
+ *
+ * \note the buffer must remain valid after the end of the \c info() call
+ * \note The expansion already ends with a semicolon, and existing callers
+ *       rely on it.
+ * \param size size of the buffer, in bytes
+ * \param buf buffer (of type <tt>void *</tt> for instance)
+ */
+#define MEM_ASSIGN(size, buf) do {  \
+    value_length = (size);          \
+    value = (void *)(buf);          \
+} while (0);
+
+#endif
diff --git a/src/core/sampler.cpp b/src/core/sampler.cpp
new file mode 100644
index 0000000..71fca86
--- /dev/null
+++ b/src/core/sampler.cpp
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file core/sampler.cpp
+ * \brief Sampler
+ */
+
+#include "sampler.h"
+#include "context.h"
+#include "deviceinterface.h"
+#include "propertylist.h"
+
+#include <cstring>
+#include <cstdlib>
+
+using namespace Coal;
+
+/*
+ * Build a sampler from the host-side OpenCL parameters by translating them
+ * into the CLK_* bits of p_bitfield. *errcode_ret receives CL_INVALID_VALUE
+ * for an unknown addressing or filter mode, otherwise the result of
+ * checkImageAvailability().
+ */
+Sampler::Sampler(Context *ctx,
+                 cl_bool normalized_coords,
+                 cl_addressing_mode addressing_mode,
+                 cl_filter_mode filter_mode,
+                 cl_int *errcode_ret)
+: Object(Object::T_Sampler, ctx), p_bitfield(0)
+{
+    // Coordinate normalization
+    p_bitfield |= normalized_coords ? CLK_NORMALIZED_COORDS_TRUE
+                                    : CLK_NORMALIZED_COORDS_FALSE;
+
+    // Addressing mode
+    if (addressing_mode == CL_ADDRESS_NONE)
+        p_bitfield |= CLK_ADDRESS_NONE;
+    else if (addressing_mode == CL_ADDRESS_MIRRORED_REPEAT)
+        p_bitfield |= CLK_ADDRESS_MIRRORED_REPEAT;
+    else if (addressing_mode == CL_ADDRESS_REPEAT)
+        p_bitfield |= CLK_ADDRESS_REPEAT;
+    else if (addressing_mode == CL_ADDRESS_CLAMP_TO_EDGE)
+        p_bitfield |= CLK_ADDRESS_CLAMP_TO_EDGE;
+    else if (addressing_mode == CL_ADDRESS_CLAMP)
+        p_bitfield |= CLK_ADDRESS_CLAMP;
+    else
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Filter mode
+    if (filter_mode == CL_FILTER_NEAREST)
+        p_bitfield |= CLK_FILTER_NEAREST;
+    else if (filter_mode == CL_FILTER_LINEAR)
+        p_bitfield |= CLK_FILTER_LINEAR;
+    else
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Samplers only make sense if every device of the context has images
+    *errcode_ret = checkImageAvailability();
+}
+
+/*
+ * Build a sampler directly from an already encoded CLK_* bitfield.
+ * The image-support check is still run, but its result is not reported.
+ */
+Sampler::Sampler(Context *ctx, unsigned int bitfield)
+: Object(Object::T_Sampler, ctx), p_bitfield(bitfield)
+{
+    (void)checkImageAvailability();
+}
+
+/*
+ * Check that every device of the parent context reports image support.
+ * Returns CL_SUCCESS when they all do, CL_INVALID_OPERATION when one does
+ * not, CL_OUT_OF_HOST_MEMORY if the device array cannot be allocated, or
+ * the error of the first failing info() query.
+ */
+cl_int Sampler::checkImageAvailability() const
+{
+    Context *context = (Context *)parent();
+    cl_uint device_count;
+
+    cl_int status = context->info(CL_CONTEXT_NUM_DEVICES,
+                                  sizeof(unsigned int),
+                                  &device_count, 0);
+    if (status != CL_SUCCESS)
+        return status;
+
+    DeviceInterface **device_array = (DeviceInterface **)
+        std::malloc(device_count * sizeof(DeviceInterface *));
+    if (!device_array)
+        return CL_OUT_OF_HOST_MEMORY;
+
+    status = context->info(CL_CONTEXT_DEVICES,
+                           device_count * sizeof(DeviceInterface *),
+                           device_array, 0);
+
+    // The loop stops at the first query failure or device without images;
+    // status then carries the value to return
+    for (unsigned int d = 0; status == CL_SUCCESS && d < device_count; ++d)
+    {
+        cl_bool has_images;
+
+        status = device_array[d]->info(CL_DEVICE_IMAGE_SUPPORT,
+                                       sizeof(cl_bool), &has_images, 0);
+
+        if (status == CL_SUCCESS && !has_images)
+            status = CL_INVALID_OPERATION;
+    }
+
+    // Single release point for the device array
+    std::free((void *)device_array);
+
+    return status;
+}
+
+/* Accessor: the sampler encoded as CLK_* bits. */
+unsigned int Sampler::bitfield() const
+{
+    return this->p_bitfield;
+}
+
+/**
+ * Answer a sampler info query by decoding p_bitfield back into the
+ * host-side OpenCL values.
+ *
+ * \param param_name information requested
+ * \param param_value_size size in bytes of the buffer behind \p param_value
+ * \param param_value destination buffer, may be NULL to only query the size
+ * \param param_value_size_ret if not NULL, receives the size of the answer
+ * \return \c CL_SUCCESS, or \c CL_INVALID_VALUE when \p param_name is
+ *         unknown or \p param_value is too small
+ */
+cl_int Sampler::info(cl_sampler_info param_name,
+                     size_t param_value_size,
+                     void *param_value,
+                     size_t *param_value_size_ret) const
+{
+    void *value = 0;
+    size_t value_length = 0;
+
+    union {
+        cl_uint cl_uint_var;
+        cl_context cl_context_var;
+        cl_bool cl_bool_var;
+        cl_addressing_mode cl_addressing_mode_var;
+        cl_filter_mode cl_filter_mode_var;
+    };
+
+    switch (param_name)
+    {
+        case CL_SAMPLER_REFERENCE_COUNT:
+            SIMPLE_ASSIGN(cl_uint, references());
+            break;
+
+        case CL_SAMPLER_CONTEXT:
+            SIMPLE_ASSIGN(cl_context, parent());
+            break;
+
+        case CL_SAMPLER_NORMALIZED_COORDS:
+            // Braces keep the if/else valid whatever the macro expands to
+            if (p_bitfield & CLK_NORMALIZED_COORDS_MASK)
+            {
+                SIMPLE_ASSIGN(cl_bool, true);
+            }
+            else
+            {
+                SIMPLE_ASSIGN(cl_bool, false);
+            }
+            break;
+
+        case CL_SAMPLER_ADDRESSING_MODE:
+            switch (p_bitfield & CLK_ADDRESS_MODE_MASK)
+            {
+                case CLK_ADDRESS_CLAMP:
+                    SIMPLE_ASSIGN(cl_addressing_mode, CL_ADDRESS_CLAMP);
+                    break;
+                case CLK_ADDRESS_CLAMP_TO_EDGE:
+                    SIMPLE_ASSIGN(cl_addressing_mode, CL_ADDRESS_CLAMP_TO_EDGE);
+                    break;
+                case CLK_ADDRESS_MIRRORED_REPEAT:
+                    SIMPLE_ASSIGN(cl_addressing_mode, CL_ADDRESS_MIRRORED_REPEAT);
+                    break;
+                case CLK_ADDRESS_REPEAT:
+                    SIMPLE_ASSIGN(cl_addressing_mode, CL_ADDRESS_REPEAT);
+                    break;
+                case CLK_ADDRESS_NONE:
+                    SIMPLE_ASSIGN(cl_addressing_mode, CL_ADDRESS_NONE);
+                    break;
+            }
+            break;
+
+        case CL_SAMPLER_FILTER_MODE:
+            switch (p_bitfield & CLK_FILTER_MASK)
+            {
+                case CLK_FILTER_LINEAR:
+                    SIMPLE_ASSIGN(cl_filter_mode, CL_FILTER_LINEAR);
+                    break;
+                case CLK_FILTER_NEAREST:
+                    SIMPLE_ASSIGN(cl_filter_mode, CL_FILTER_NEAREST);
+                    break;
+            }
+            // Without this break the query fell through to 'default' and
+            // always failed with CL_INVALID_VALUE
+            break;
+
+        default:
+            return CL_INVALID_VALUE;
+    }
+
+    if (param_value && param_value_size < value_length)
+        return CL_INVALID_VALUE;
+
+    if (param_value_size_ret)
+        *param_value_size_ret = value_length;
+
+    if (param_value)
+        std::memcpy(param_value, value, value_length);
+
+    return CL_SUCCESS;
+}
diff --git a/src/core/sampler.h b/src/core/sampler.h
new file mode 100644
index 0000000..1ff1f1f
--- /dev/null
+++ b/src/core/sampler.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file sampler.h
+ * \brief Sampler object
+ */
+
+#ifndef __SAMPLER_H__
+#define __SAMPLER_H__
+
+#include <CL/cl.h>
+#include "object.h"
+
+// WARNING: Keep in sync with stdlib.h
+// NOTE(review): "stdlib.h" presumably means the kernel-side OpenCL header of
+// this project, not the C library header - confirm.
+//
+// Kernel-side sampler bitfield layout, as implied by the masks below:
+//   bits 0-3  : normalized-coordinates flag
+//   bits 4-7  : addressing mode (an enumerated value, not independent flags;
+//               Sampler::info compares the masked value for equality)
+//   bits 8-11 : filter mode
+
+#define CLK_NORMALIZED_COORDS_FALSE 0x00000000
+#define CLK_NORMALIZED_COORDS_TRUE 0x00000001
+#define CLK_ADDRESS_NONE 0x00000000
+#define CLK_ADDRESS_MIRRORED_REPEAT 0x00000010
+#define CLK_ADDRESS_REPEAT 0x00000020
+#define CLK_ADDRESS_CLAMP_TO_EDGE 0x00000030
+#define CLK_ADDRESS_CLAMP 0x00000040
+#define CLK_FILTER_NEAREST 0x00000000
+#define CLK_FILTER_LINEAR 0x00000100
+
+// Masks selecting each field of the bitfield described above.
+#define CLK_NORMALIZED_COORDS_MASK 0x0000000f
+#define CLK_ADDRESS_MODE_MASK 0x000000f0
+#define CLK_FILTER_MASK 0x00000f00
+
+namespace Coal
+{
+
+class Context;
+
+/**
+ * \brief Sampler
+ *
+ * This object doesn't do anything interesting, it only converts a set of
+ * host OpenCL constants to constants that will be used by the kernels and
+ * the image reading and writing built-in functions.
+ */
+class Sampler : public Object
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param ctx parent \c Coal::Context
+ * \param normalized_coords true if the coords given to the built-in
+ * image functions are normalized, false otherwise
+ * \param addressing_mode addressing mode used to read images
+ * \param filter_mode filter mode used to read images
+ * \param errcode_ret return code (\c CL_SUCCESS if all is good)
+ */
+ Sampler(Context *ctx,
+ cl_bool normalized_coords,
+ cl_addressing_mode addressing_mode,
+ cl_filter_mode filter_mode,
+ cl_int *errcode_ret);
+
+ /**
+ * \brief Simpler constructor
+ * \param ctx parent \c Coal::Context
+ * \param bitfield bitfield already calculated
+ */
+ Sampler(Context *ctx,
+ unsigned int bitfield);
+
+ unsigned int bitfield() const; /*!< \brief Bitfield value usable by the kernels */
+
+ /**
+ * \brief Get information about the sampler
+ * \copydetails Coal::DeviceInterface::info
+ */
+ cl_int info(cl_sampler_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ private:
+ // Sampler state packed with the CLK_* constants and decoded with the
+ // CLK_*_MASK masks defined above; returned as-is by bitfield().
+ unsigned int p_bitfield;
+
+ // NOTE(review): definition not visible here; presumably reports whether
+ // the context's devices support images - confirm in sampler.cpp.
+ cl_int checkImageAvailability() const;
+};
+
+}
+
+// Gives the _cl_sampler struct tag a definition by deriving it, with no extra
+// members, from Coal::Sampler.
+// NOTE(review): presumably this is the type the public cl_sampler handle from
+// CL/cl.h points to - confirm.
+struct _cl_sampler : public Coal::Sampler
+{};
+
+#endif
diff --git a/src/core/util.cpp b/src/core/util.cpp
new file mode 100644
index 0000000..afeb564
--- /dev/null
+++ b/src/core/util.cpp
@@ -0,0 +1,68 @@
+/******************************************************************************
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+/**
+ * \file core/util.cpp
+ * \brief misc utils
+ */
+
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "util.h"
+
+/******************************************************************************
+* Parse the first line of a file and read the first unsigned decimal integer
+* found after a given string.
+*
+* fname       : path of the file to read; only its first line is examined
+* sname       : string to search for in that line
+* default_val : value returned when nothing can be parsed
+*
+* Returns the first run of decimal digits that appears after the first
+* occurrence of sname (any non-digit characters in between are skipped).
+* Returns default_val if either argument is NULL, the file cannot be opened or
+* read, sname does not occur, no digit follows it, or the number does not fit
+* in 32 bits.
+******************************************************************************/
+uint32_t parse_file_line_value(const char *fname, const char *sname,
+                               uint32_t default_val)
+{
+    uint32_t  val  = default_val;
+    FILE     *fp   = NULL;
+    char     *line = NULL;
+    char     *str  = NULL;
+    size_t    len  = 0;
+
+    if (fname == NULL || sname == NULL) return val;
+    if ((fp = fopen(fname, "r")) == NULL) return val;
+
+    if (getline(&line, &len, fp) != -1)
+    {
+        if ((str = strstr(line, sname)) != NULL)
+        {
+            str += strlen(sname);
+
+            // <ctype.h> functions need an unsigned char value; a plain char
+            // may be negative, which is undefined behaviour for isdigit().
+            while (*str != '\0' && !isdigit((unsigned char)*str)) str++;
+
+            if (*str != '\0')
+            {
+                // strtoul covers the whole uint32_t range, which atoi()
+                // (returning int) cannot represent.
+                unsigned long parsed = strtoul(str, NULL, 10);
+                if (parsed <= 0xFFFFFFFFUL) val = (uint32_t)parsed;
+            }
+        }
+    }
+
+    // getline() may allocate even when it fails; free(NULL) is a no-op.
+    fclose(fp);
+    free(line);
+    return val;
+}
+
diff --git a/src/core/util.h b/src/core/util.h
new file mode 100644
index 0000000..f2c1609
--- /dev/null
+++ b/src/core/util.h
@@ -0,0 +1,41 @@
+/******************************************************************************
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+/**
+ * \file core/util.h
+ * \brief misc utils
+ */
+
+#ifndef _UTIL_H
+#define _UTIL_H
+
+#include <stdint.h>  // uint32_t, so that this header is self-contained
+
+// Parse the first line of file 'fname' and return the first decimal integer
+// found after the first occurrence of 'sname' in that line; returns
+// 'default_val' when the file cannot be read or no such number is found.
+uint32_t parse_file_line_value(const char *fname, const char *sname,
+                               uint32_t default_val);
+
+#endif // _UTIL_H
+
diff --git a/src/llvmopencl/AllocasToEntry.cc b/src/llvmopencl/AllocasToEntry.cc
new file mode 100644
index 0000000..79bbe63
--- /dev/null
+++ b/src/llvmopencl/AllocasToEntry.cc
@@ -0,0 +1,74 @@
+// Implementation of AllocasToEntry, an LLVM pass to move allocas to the function
+// entry node.
+//
+// Copyright (c) 2013 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#include <sstream>
+#include <iostream>
+
+#ifdef LLVM_3_2
+# include <llvm/Instructions.h>
+#else
+# include <llvm/IR/Instructions.h>
+#endif
+
+#include "AllocasToEntry.h"
+
+namespace pocl {
+
+using namespace llvm;
+
+// Static registration object: makes the pass known to LLVM under the
+// command-line name "allocastoentry".
+namespace {
+ static
+ RegisterPass<pocl::AllocasToEntry> X("allocastoentry",
+ "Move allocas to the function entry node.");
+}
+
+// Pass identification object; its address is handed to the FunctionPass
+// constructor below.
+char AllocasToEntry::ID = 0;
+
+
+// Constructs the pass; no state of its own is initialised.
+AllocasToEntry::AllocasToEntry() : FunctionPass(ID)
+{
+}
+
+// Moves every alloca with a constant array size that lives outside the first
+// basic block of F to the first insertion point of that first block.
+// Returns true if at least one instruction was moved.
+bool
+AllocasToEntry::runOnFunction(Function &F)
+{
+ // This solves problem with dynamic stack objects that are
+ // not supported by some targets (TCE).
+ Function::iterator I = F.begin();
+ // Take the insertion point from the first block, then step I past it so
+ // the loop below only visits the remaining blocks.
+ // NOTE(review): dereferences F.begin() unconditionally, so this assumes F
+ // has at least one basic block - confirm the caller never passes a
+ // body-less function.
+ Instruction *firstInsertionPt = (I++)->getFirstInsertionPt();
+
+ bool changed = false;
+ for (Function::iterator E = F.end(); I != E; ++I) {
+ for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
+ // Advance BI before the instruction is possibly moved to another
+ // block, so the iteration continues within the current block.
+ AllocaInst *allocaInst = dyn_cast<AllocaInst>(BI++);
+ // Only allocas whose element count is a compile-time constant are moved.
+ if (allocaInst && isa<ConstantInt>(allocaInst->getArraySize())) {
+ allocaInst->moveBefore(firstInsertionPt);
+ changed = true;
+ }
+ }
+ }
+ return changed;
+}
+
+}
diff --git a/src/llvmopencl/AllocasToEntry.h b/src/llvmopencl/AllocasToEntry.h
new file mode 100644
index 0000000..a92fa14
--- /dev/null
+++ b/src/llvmopencl/AllocasToEntry.h
@@ -0,0 +1,49 @@
+// Header for AllocasToEntry, an LLVM pass to move allocas to the function
+// entry node.
+//
+// Copyright (c) 2013 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_ALLOCAS_TO_ENTRY_H
+#define _POCL_ALLOCAS_TO_ENTRY_H
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Function.h"
+#else
+#include "llvm/IR/Function.h"
+#endif
+
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+
+namespace pocl {
+ // Function pass that moves allocas to the function entry node.
+ class AllocasToEntry : public llvm::FunctionPass {
+ public:
+ // Pass identification object (defined in AllocasToEntry.cc).
+ static char ID;
+
+ AllocasToEntry();
+ virtual ~AllocasToEntry() {};
+
+ // Runs the transformation on F; returns true if F was modified.
+ virtual bool runOnFunction(llvm::Function &F);
+ };
+}
+
+#endif
diff --git a/src/llvmopencl/Barrier.h b/src/llvmopencl/Barrier.h
new file mode 100644
index 0000000..e1b612f
--- /dev/null
+++ b/src/llvmopencl/Barrier.h
@@ -0,0 +1,121 @@
+// Class for barrier instructions, modelled as a CallInstr.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+// Include guard: this header defines a class and a macro, so including it
+// twice in one translation unit would otherwise be a redefinition error.
+#ifndef POCL_BARRIER_H
+#define POCL_BARRIER_H
+
+#include <cstdio>
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#endif
+
+#include "llvm/Support/Casting.h"
+
+// Name of the function whose calls are treated as barriers.
+#define BARRIER_FUNCTION_NAME "barrier"
+
+namespace pocl {
+
+  /**
+   * View of a call instruction that calls the function named
+   * BARRIER_FUNCTION_NAME. The class adds no data members; it is used with
+   * llvm::isa / llvm::cast / llvm::dyn_cast through the classof() overloads.
+   */
+  class Barrier : public llvm::CallInst {
+
+  public:
+    /**
+     * Appends every use of the barrier function in module M to B.
+     *
+     * Does nothing if M has no function named BARRIER_FUNCTION_NAME.
+     * Each use is converted with llvm::cast, so every use is expected to be
+     * a barrier call.
+     */
+    static void GetBarriers(llvm::SmallVectorImpl<Barrier *> &B,
+                            llvm::Module &M) {
+      llvm::Function *F = M.getFunction(BARRIER_FUNCTION_NAME);
+      if (F != NULL) {
+        for (llvm::Function::use_iterator i = F->use_begin(), e = F->use_end();
+             i != e; ++i)
+          B.push_back(llvm::cast<Barrier>(*i));
+      }
+    }
+    /**
+     * Creates a new barrier before the given instruction.
+     *
+     * If there was already a barrier there, returns the old one.
+     */
+    static Barrier *Create(llvm::Instruction *InsertBefore) {
+      llvm::Module *M = InsertBefore->getParent()->getParent()->getParent();
+
+      // Reuse a barrier that directly precedes the insertion point. The
+      // first test avoids looking before the first instruction of the block.
+      if (InsertBefore != &InsertBefore->getParent()->front() &&
+          llvm::isa<Barrier>(InsertBefore->getPrevNode()))
+        return llvm::cast<Barrier>(InsertBefore->getPrevNode());
+
+      // Get or declare "void barrier(i32)" and call it with a constant 0.
+      llvm::Type *Int32Type = llvm::Type::getInt32Ty(M->getContext());
+      llvm::Function *F = llvm::cast<llvm::Function>
+        (M->getOrInsertFunction(BARRIER_FUNCTION_NAME,
+                                llvm::Type::getVoidTy(M->getContext()),
+                                Int32Type,
+                                NULL));
+      llvm::SmallVector<llvm::Value *, 4> argsarray;
+      argsarray.push_back(llvm::ConstantInt::get(Int32Type, 0));
+      llvm::ArrayRef<llvm::Value *> args(argsarray);
+      return llvm::cast<pocl::Barrier>
+        (llvm::CallInst::Create(F, args, "", InsertBefore));
+    }
+
+    // Type-query hooks used by llvm::isa / cast / dyn_cast.
+    static bool classof(const Barrier *) { return true; };
+    // A call is a barrier when its directly called function has the
+    // barrier name.
+    static bool classof(const llvm::CallInst *C) {
+      return C->getCalledFunction() != NULL &&
+        C->getCalledFunction()->getName() == BARRIER_FUNCTION_NAME;
+    }
+    static bool classof(const Instruction *I) {
+      return (llvm::isa<llvm::CallInst>(I) &&
+              classof(llvm::cast<llvm::CallInst>(I)));
+    }
+    static bool classof(const User *U) {
+      return (llvm::isa<Instruction>(U) &&
+              classof(llvm::cast<llvm::Instruction>(U)));
+    }
+
+
+    // Returns true if bb consists of exactly two instructions and ends with
+    // a barrier (see endsWithBarrier).
+    static bool hasOnlyBarrier(const llvm::BasicBlock *bb)
+    {
+      return endsWithBarrier(bb) && bb->size() == 2;
+    }
+
+    // Returns true if any instruction in bb is a barrier.
+    static bool hasBarrier(const llvm::BasicBlock *bb)
+    {
+      for (llvm::BasicBlock::const_iterator i = bb->begin(), e = bb->end();
+           i != e; ++i)
+        {
+          if (llvm::isa<Barrier>(i)) return true;
+        }
+      return false;
+    }
+
+    // returns true in case the given basic block ends with a barrier,
+    // that is, contains only a branch instruction after a barrier call
+    static bool endsWithBarrier(const llvm::BasicBlock *bb)
+    {
+      const llvm::TerminatorInst *t = bb->getTerminator();
+      if (t == NULL) return false;
+      return bb->size() > 1 && t->getPrevNode() != NULL &&
+        llvm::isa<Barrier>(t->getPrevNode());
+    }
+  };
+
+}
+
+#endif // POCL_BARRIER_H
+
diff --git a/src/llvmopencl/BarrierBlock.cc b/src/llvmopencl/BarrierBlock.cc
new file mode 100644
index 0000000..d254fa6
--- /dev/null
+++ b/src/llvmopencl/BarrierBlock.cc
@@ -0,0 +1,73 @@
+// Class for a basic block that just contains a barrier.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "BarrierBlock.h"
+#include "Barrier.h"
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Instructions.h"
+#else
+#include "llvm/IR/Instructions.h"
+#endif
+#include <cassert>
+
+using namespace llvm;
+using namespace pocl;
+
+// Debug-only consistency check, defined below.
+static bool
+verify(const BasicBlock *B);
+
+// Type-query hook: a basic block counts as a BarrierBlock when it has exactly
+// two instructions and the first one is a barrier.
+bool
+BarrierBlock::classof(const BasicBlock *B)
+{
+  const bool twoInstructions = (B->size() == 2);
+  if (!twoInstructions || !isa<Barrier> (&B->front()))
+    return false;
+
+  // Additional structural checks, active only in assert-enabled builds.
+  assert(verify(B));
+  return true;
+}
+
+// Asserts the structural invariants expected of a barrier block and returns
+// true, so that it can itself be wrapped in assert() by the caller.
+static bool
+verify(const BasicBlock *B)
+{
+ assert((B->size() == 2) && "Barriers blocks should have no functionality!");
+ // const Instruction *barrier = B->getFirstNonPHI();
+ // assert(isa<Barrier>(barrier) && "Barriers blocks should have no functionality!");
+ // assert(B->getTerminator()->getPrevNode() == barrier &&
+ // "Barriers blocks should have no functionality!");
+ // NOTE(review): the comment on the next line talks about allowing several
+ // predecessors, but the check is enabled and requires a single predecessor
+ // (or B being the first block of its function) - confirm the intent.
+#if 1 // We want to allow barriers with more than one predecessors (?)
+ // (for loop header barriers).
+ assert(((B->getSinglePredecessor() != NULL) ||
+ (B == &(B->getParent()->front()))) &&
+ "Barrier blocks should have exactly one predecessor (except entry barrier)!");
+#endif
+#if 0 // We want to allow barriers with more than one successor (for latch barriers).
+ assert((B->getTerminator()->getNumSuccessors() <= 1) &&
+ "Barrier blocks should have one successor, or zero for exit barriers!");
+#endif
+ // The first instruction of the block must be the barrier call.
+ assert(isa<Barrier>(B->front()));
+
+ return true;
+}
+
diff --git a/src/llvmopencl/BarrierBlock.h b/src/llvmopencl/BarrierBlock.h
new file mode 100644
index 0000000..6246751
--- /dev/null
+++ b/src/llvmopencl/BarrierBlock.h
@@ -0,0 +1,44 @@
+// Class for a basic block that just contains a barrier.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/BasicBlock.h"
+#else
+#include "llvm/IR/BasicBlock.h"
+#endif
+
+#ifndef _POCL_BARRIER_BLOCK_H
+#define _POCL_BARRIER_BLOCK_H
+
+namespace pocl {
+
+ // View of a basic block that just contains a barrier. Adds no members of
+ // its own; intended for llvm::isa / cast / dyn_cast via classof().
+ class BarrierBlock : public llvm::BasicBlock {
+
+ public:
+ static bool classof(const BarrierBlock *) { return true; };
+ // Defined in BarrierBlock.cc: true for a two-instruction block whose
+ // first instruction is a barrier.
+ static bool classof(const llvm::BasicBlock *B);
+ };
+
+}
+
+#endif
diff --git a/src/llvmopencl/BarrierTailReplication.cc b/src/llvmopencl/BarrierTailReplication.cc
new file mode 100644
index 0000000..12bac74
--- /dev/null
+++ b/src/llvmopencl/BarrierTailReplication.cc
@@ -0,0 +1,421 @@
+// LLVM function pass to replicate barrier tails (successors to barriers).
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos and
+// 2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#include "BarrierTailReplication.h"
+#include "Barrier.h"
+#include "Workgroup.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#else
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#endif
+
+#include <iostream>
+#include <algorithm>
+
+using namespace llvm;
+using namespace pocl;
+
+//#define DEBUG_BARRIER_REPL
+
+static bool block_has_barrier(const BasicBlock *bb);
+
+// Static registration object: makes the pass known to LLVM under the
+// command-line name "barriertails".
+namespace {
+ static
+ RegisterPass<BarrierTailReplication> X("barriertails",
+ "Barrier tail replication pass");
+}
+
+// Pass identification object; its address is what the FunctionPass base
+// class receives in the constructor.
+char BarrierTailReplication::ID = 0;
+
+// Declares the analyses this pass depends on: the dominator tree and loop
+// info are both required, and both are declared as preserved.
+void
+BarrierTailReplication::getAnalysisUsage(AnalysisUsage &AU) const
+{
+ AU.addRequired<DominatorTree>();
+ AU.addPreserved<DominatorTree>();
+ AU.addRequired<LoopInfo>();
+ AU.addPreserved<LoopInfo>();
+}
+
+// Pass entry point. Functions that Workgroup::isKernelToProcess() rejects are
+// left untouched; otherwise barrier tails are replicated and stale PHI
+// operands are removed afterwards. Returns true if F was modified.
+bool
+BarrierTailReplication::runOnFunction(Function &F)
+{
+  if (!Workgroup::isKernelToProcess(F))
+    return false;
+
+#ifdef DEBUG_BARRIER_REPL
+  std::cerr << "### BTR on " << F.getName().str() << std::endl;
+#endif
+
+  DT = &getAnalysis<DominatorTree>();
+  LI = &getAnalysis<LoopInfo>();
+
+  bool modified = ProcessFunction(F);
+
+  DT->verifyAnalysis();
+  LI->verifyAnalysis();
+
+  /* The created tails might contain PHI nodes with operands
+     referring to the non-predecessor (split point) BB.
+     These must be cleaned to avoid breakage later on.
+  */
+  Function::iterator blockIt = F.begin();
+  Function::iterator blocksEnd = F.end();
+  while (blockIt != blocksEnd)
+    {
+      llvm::BasicBlock *block = blockIt;
+      if (CleanupPHIs(block))
+        modified = true;
+      ++blockIt;
+    }
+
+  return modified;
+}
+
+// Starts the depth-first barrier search at the entry block of F with an
+// empty visited set. Returns true if the function was changed.
+bool
+BarrierTailReplication::ProcessFunction(Function &F)
+{
+  BasicBlockSet visited;
+  BasicBlock *entry = &F.getEntryBlock();
+
+  return FindBarriersDFS(entry, visited);
+}
+
+
+// Recursively (depth-first) look for barriers in all possible
+// execution paths starting on entry, replicating the barrier
+// successors to ensure there is a separate function exit BB
+// for each combination of traversed barriers. The set
+// processed_bbs stores the basic blocks that have already been
+// visited. Returns true if the function was changed.
+bool
+BarrierTailReplication::FindBarriersDFS(BasicBlock *bb,
+                                        BasicBlockSet &processed_bbs)
+{
+  // insert() reports whether bb was newly added. An already visited
+  // block is skipped (to avoid infinite recursion in case of
+  // unbarriered loops).
+  if (!processed_bbs.insert(bb).second)
+    return false;
+
+  bool changed = false;
+
+  if (block_has_barrier(bb)) {
+#ifdef DEBUG_BARRIER_REPL
+    std::cerr << "### block " << bb->getName().str() << " has barrier, RJS" << std::endl;
+#endif
+    BasicBlockSet processed_bbs_rjs;
+    changed = ReplicateJoinedSubgraphs(bb, bb, processed_bbs_rjs);
+  }
+
+  // Fetched only now: the replication above may have redirected successors.
+  TerminatorInst *t = bb->getTerminator();
+
+  // Find barriers in the successors (depth first).
+  const unsigned successorCount = t->getNumSuccessors();
+  for (unsigned idx = 0; idx != successorCount; ++idx) {
+    if (FindBarriersDFS(t->getSuccessor(idx), processed_bbs))
+      changed = true;
+  }
+
+  return changed;
+}
+
+
+// Only replicate those parts of the subgraph that are not
+// dominated by a (barrier) basic block, to avoid excessive
+// (and confusing) code replication.
+//
+// Walks the successors of subgraph_entry: successors dominated by
+// 'dominator' are traversed recursively, the others are replaced by a
+// replicated copy of their subgraph. processed_bbs collects the blocks
+// already handled. Returns true if the function was changed.
+bool
+BarrierTailReplication::ReplicateJoinedSubgraphs(BasicBlock *dominator,
+ BasicBlock *subgraph_entry,
+ BasicBlockSet &processed_bbs)
+{
+ bool changed = false;
+
+ assert(DT->dominates(dominator, subgraph_entry));
+
+ Function *f = dominator->getParent();
+
+ TerminatorInst *t = subgraph_entry->getTerminator();
+ for (int i = 0, e = t->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *b = t->getSuccessor(i);
+#ifdef DEBUG_BARRIER_REPL
+ std::cerr << "### traversing from " << subgraph_entry->getName().str()
+ << " to " << b->getName().str() << std::endl;
+#endif
+
+ // Check if we already handled this BB and all its branches.
+ if (processed_bbs.count(b) != 0)
+ {
+#ifdef DEBUG_BARRIER_REPL
+ std::cerr << "### already processed " << std::endl;
+#endif
+ continue;
+ }
+
+ // An edge to a block that dominates its source is treated as a backedge.
+ const bool isBackedge = DT->dominates(b, subgraph_entry);
+ if (isBackedge) {
+ // This is a loop backedge. Do not find subgraphs across
+ // those.
+#ifdef DEBUG_BARRIER_REPL
+ std::cerr << "### a loop backedge, skipping" << std::endl;
+#endif
+ continue;
+ }
+ if (DT->dominates(dominator, b))
+ {
+#ifdef DEBUG_BARRIER_REPL
+ std::cerr << "### " << dominator->getName().str() << " dominates "
+ << b->getName().str() << std::endl;
+#endif
+ changed |= ReplicateJoinedSubgraphs(dominator, b, processed_bbs);
+ }
+ else
+ {
+#ifdef DEBUG_BARRIER_REPL
+ std::cerr << "### " << dominator->getName().str() << " does not dominate "
+ << b->getName().str() << " replicating " << std::endl;
+#endif
+ // b is also reachable without passing through 'dominator': give this
+ // path its own copy of the subgraph and branch to the copy instead.
+ BasicBlock *replicated_subgraph_entry =
+ ReplicateSubgraph(b, f);
+ t->setSuccessor(i, replicated_subgraph_entry);
+ changed = true;
+ }
+
+ // 'changed' accumulates over the loop, so once any successor has caused
+ // a change the analyses are recomputed on every following iteration too.
+ if (changed)
+ {
+ // We have modified the function. Possibly created new loops.
+ // Update analysis passes.
+ DT->runOnFunction(*f);
+ #ifdef LLVM_3_1
+ LI->getBase().Calculate(DT->getBase());
+ #else
+ LI->runOnFunction(*f);
+ #endif
+ }
+ }
+ processed_bbs.insert(subgraph_entry);
+ return changed;
+}
+
+// Removes PHI incoming entries whose incoming block does not branch to BB
+// (anymore). A PHI node that loses all its incoming entries is deleted.
+// Returns true if any PHI node was changed.
+bool
+BarrierTailReplication::CleanupPHIs(llvm::BasicBlock *BB)
+{
+
+  bool changed = false;
+#ifdef DEBUG_BARRIER_REPL
+  std::cerr << "### CleanupPHIs for BB:" << std::endl;
+  BB->dump();
+#endif
+
+  for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; )
+    {
+      // PHI nodes are grouped at the top of the block; stop at the first
+      // non-PHI instruction.
+      PHINode *PN = dyn_cast<PHINode>(BI);
+      if (PN == NULL) break;
+
+      bool PHIRemoved = false;
+      // No increment in the loop header: the index only advances when the
+      // current entry is kept. Previously "i = 0; continue;" was followed by
+      // the header's ++i, so after a removal the rescan started at index 1
+      // and the entry that had moved into index 0 was never checked.
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; )
+        {
+          bool isSuccessor = false;
+          // find if the predecessor branches to this one (anymore)
+          for (unsigned s = 0,
+                 se = PN->getIncomingBlock(i)->getTerminator()->getNumSuccessors();
+               s < se; ++s) {
+            if (PN->getIncomingBlock(i)->getTerminator()->getSuccessor(s) == BB)
+              {
+                isSuccessor = true;
+                break;
+              }
+          }
+          if (isSuccessor)
+            {
+              ++i;
+              continue;
+            }
+
+#ifdef DEBUG_BARRIER_REPL
+          std::cerr << "removing incoming value " << i << " from PHINode:" << std::endl;
+          PN->dump();
+#endif
+          // The 'true' argument makes LLVM delete the PHI node itself once
+          // its last incoming value is removed.
+          PN->removeIncomingValue(i, true);
+#ifdef DEBUG_BARRIER_REPL
+          std::cerr << "now:" << std::endl;
+          PN->dump();
+#endif
+          changed = true;
+          e--;
+          if (e == 0)
+            {
+              PHIRemoved = true;
+              break;
+            }
+          // Entries have moved; rescan this PHI node from the beginning.
+          i = 0;
+        }
+      // A deleted PHI invalidates BI, so restart from the top of the block.
+      if (PHIRemoved)
+        BI = BB->begin();
+      else
+        BI++;
+    }
+  return changed;
+}
+
+// Clones the subgraph reachable from 'entry' (as collected by FindSubgraph)
+// into function f and returns the clone of 'entry'.
+BasicBlock *
+BarrierTailReplication::ReplicateSubgraph(BasicBlock *entry,
+                                          Function *f)
+{
+  // Collect the blocks that have to be copied.
+  BasicBlockVector originals;
+  FindSubgraph(originals, entry);
+
+  // Copy them, recording original -> clone in the value map, then rewrite
+  // the references inside the clones so control flow is maintained.
+  BasicBlockVector clones;
+  ValueToValueMapTy originalToClone;
+  ReplicateBasicBlocks(clones, originalToClone, originals, f);
+  UpdateReferences(clones, originalToClone);
+
+  // The clone of the entry block is the entry of the new subgraph.
+  return cast<BasicBlock>(originalToClone[entry]);
+}
+
+
+// Appends to 'subgraph' every block reachable from 'entry' without following
+// backedges, in depth-first pre-order. Blocks already in 'subgraph' are not
+// added again.
+void
+BarrierTailReplication::FindSubgraph(BasicBlockVector &subgraph,
+                                     BasicBlock *entry)
+{
+  // The subgraph can have internal branches (join points)
+  // avoid replicating these parts multiple times within the
+  // same tail.
+  if (std::find(subgraph.begin(), subgraph.end(), entry) != subgraph.end())
+    return;
+
+  subgraph.push_back(entry);
+
+  // (An unused LoopInfo lookup for 'entry' was removed from here.)
+  const TerminatorInst *t = entry->getTerminator();
+  for (unsigned i = 0, e = t->getNumSuccessors(); i != e; ++i) {
+    BasicBlock *successor = t->getSuccessor(i);
+    // An edge to a block that dominates its source is treated as a backedge
+    // and is not followed.
+    const bool isBackedge = DT->dominates(successor, entry);
+    if (isBackedge) continue;
+    FindSubgraph(subgraph, successor);
+  }
+}
+
+
+// Creates, in function f, a copy (named "<original>.btr") of every block in
+// 'graph'. The copies are appended to new_graph, and reference_map records
+// original -> copy for both blocks and instructions. Operands of the cloned
+// instructions are not rewritten here. PHI nodes in successors outside
+// 'graph' receive an extra incoming entry for each copied block.
+void
+BarrierTailReplication::ReplicateBasicBlocks(BasicBlockVector &new_graph,
+ ValueToValueMapTy &reference_map,
+ BasicBlockVector &graph,
+ Function *f)
+{
+#ifdef DEBUG_BARRIER_REPL
+ std::cerr << "### ReplicateBasicBlocks: " << std::endl;
+#endif
+ for (BasicBlockVector::const_iterator i = graph.begin(),
+ e = graph.end();
+ i != e; ++i) {
+ BasicBlock *b = *i;
+ BasicBlock *new_b = BasicBlock::Create(b->getContext(),
+ b->getName() + ".btr",
+ f);
+ reference_map.insert(std::make_pair(b, new_b));
+ new_graph.push_back(new_b);
+
+#ifdef DEBUG_BARRIER_REPL
+ std::cerr << "Replicated BB: " << new_b->getName().str() << std::endl;
+#endif
+
+ // Clone the instructions one by one. Note that the names 'i' and 'e' are
+ // re-declared (shadowed) in the nested scopes below.
+ for (BasicBlock::iterator i2 = b->begin(), e2 = b->end();
+ i2 != e2; ++i2) {
+ Instruction *i = i2->clone();
+ reference_map.insert(std::make_pair(i2, i));
+ new_b->getInstList().push_back(i);
+ }
+
+ // Add predicates to PHINodes of basic blocks the replicated
+ // block jumps to (backedges).
+ // The cloned terminator still names the original successor blocks at
+ // this point.
+ TerminatorInst *t = new_b->getTerminator();
+ for (unsigned i = 0, e = t->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *successor = t->getSuccessor(i);
+ if (std::count(graph.begin(), graph.end(), successor) == 0) {
+ // Successor is not in the graph, possible backedge.
+ for (BasicBlock::iterator i = successor->begin(), e = successor->end();
+ i != e; ++i) {
+ PHINode *phi = dyn_cast<PHINode>(i);
+ if (phi == NULL)
+ break; // All PHINodes already checked.
+
+ // Get value for original incoming edge and add new predicate.
+ Value *v = phi->getIncomingValueForBlock(b);
+ Value *new_v = reference_map[v];
+ if (new_v == NULL) {
+ /* This case can happen at least when replicating a latch
+ block in a b-loop. The value produced might be from a common
+ path before the replicated part. Then just use the original value.*/
+ new_v = v;
+#if 0
+ std::cerr << "### could not find a replacement block for phi node ("
+ << b->getName().str() << ")" << std::endl;
+ phi->dump();
+ v->dump();
+ f->viewCFG();
+ assert (0);
+#endif
+ }
+ phi->addIncoming(new_v, new_b);
+ }
+ }
+ }
+ }
+
+#ifdef DEBUG_BARRIER_REPL
+ std::cerr << std::endl;
+#endif
+}
+
+
+// Rewrites the operands of every instruction in the blocks of 'graph'
+// according to reference_map. Values without a map entry are left unchanged
+// (RF_IgnoreMissingEntries).
+void
+BarrierTailReplication::UpdateReferences(const BasicBlockVector &graph,
+                                         ValueToValueMapTy &reference_map)
+{
+  BasicBlockVector::const_iterator blockIt = graph.begin();
+  const BasicBlockVector::const_iterator blocksEnd = graph.end();
+  for (; blockIt != blocksEnd; ++blockIt) {
+    BasicBlock *block = *blockIt;
+
+    BasicBlock::iterator instIt = block->begin();
+    BasicBlock::iterator instEnd = block->end();
+    while (instIt != instEnd) {
+      Instruction *inst = instIt;
+      RemapInstruction(inst, reference_map,
+                       RF_IgnoreMissingEntries | RF_NoModuleLevelChanges);
+      ++instIt;
+    }
+  }
+}
+
+
+// Returns true if any instruction in bb is a barrier call. Delegates to
+// Barrier::hasBarrier (Barrier.h), which performs the same scan.
+static bool
+block_has_barrier(const BasicBlock *bb)
+{
+  return Barrier::hasBarrier(bb);
+}
diff --git a/src/llvmopencl/BarrierTailReplication.h b/src/llvmopencl/BarrierTailReplication.h
new file mode 100644
index 0000000..7e3beb0
--- /dev/null
+++ b/src/llvmopencl/BarrierTailReplication.h
@@ -0,0 +1,85 @@
+// Header for BarrierTailReplication.cc function pass.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef POCL_BARRIER_TAIL_REPLICATION
+#define POCL_BARRIER_TAIL_REPLICATION
+
+ #include "config.h"
+ #if (defined LLVM_3_1 or defined LLVM_3_2)
+ #include "llvm/Function.h"
+ #else
+ #include "llvm/IR/Function.h"
+ #endif
+
+ #include "llvm/Analysis/Dominators.h"
+ #include "llvm/Analysis/LoopInfo.h"
+ #include "llvm/Pass.h"
+ #include "llvm/Transforms/Utils/Cloning.h"
+ #include <map>
+ #include <set>
+ // BasicBlockVector below is a std::vector; include it explicitly so this
+ // header does not depend on a transitive include.
+ #include <vector>
+
+ namespace pocl {
+   class Workgroup;
+
+   // Function pass declared for BarrierTailReplication.cc.
+   class BarrierTailReplication : public llvm::FunctionPass {
+
+   public:
+     // Pass identifier handed to the FunctionPass constructor.
+     static char ID;
+
+     BarrierTailReplication(): FunctionPass(ID) {}
+
+     virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+     virtual bool runOnFunction(llvm::Function &F);
+
+   private:
+     typedef std::set<llvm::BasicBlock *> BasicBlockSet;
+     typedef std::vector<llvm::BasicBlock *> BasicBlockVector;
+     typedef std::map<llvm::Value *, llvm::Value *> ValueValueMap;
+
+     // Analyses used by the pass; they are assigned in the .cc file.
+     llvm::DominatorTree *DT;
+     llvm::LoopInfo *LI;
+
+     bool ProcessFunction(llvm::Function &F);
+     bool FindBarriersDFS(llvm::BasicBlock *bb,
+                          BasicBlockSet &processed_bbs);
+     bool ReplicateJoinedSubgraphs(llvm::BasicBlock *dominator,
+                                   llvm::BasicBlock *subgraph_entry,
+                                   BasicBlockSet &processed_bbs);
+
+     llvm::BasicBlock* ReplicateSubgraph(llvm::BasicBlock *entry,
+                                         llvm::Function *f);
+     void FindSubgraph(BasicBlockVector &subgraph,
+                       llvm::BasicBlock *entry);
+     void ReplicateBasicBlocks(BasicBlockVector &new_graph,
+                               llvm::ValueToValueMapTy &reference_map,
+                               BasicBlockVector &graph,
+                               llvm::Function *f);
+     // Remaps the instructions of the blocks in GRAPH through REFERENCE_MAP.
+     void UpdateReferences(const BasicBlockVector &graph,
+                           llvm::ValueToValueMapTy &reference_map);
+
+     bool CleanupPHIs(llvm::BasicBlock *BB);
+
+     friend class pocl::Workgroup;
+   };
+ }
+
+#endif
diff --git a/src/llvmopencl/BreakConstantGEPs.cpp b/src/llvmopencl/BreakConstantGEPs.cpp
new file mode 100644
index 0000000..a12aaaa
--- /dev/null
+++ b/src/llvmopencl/BreakConstantGEPs.cpp
@@ -0,0 +1,326 @@
+//===- BreakConstantGEPs.cpp - Change constant GEPs into GEP instructions - --//
+//
+// pocl note: This pass is taken from The SAFECode project with trivial modifications.
+// Automatic locals might cause constant GEPs which cause problems during
+// converting the locals to kernel function arguments for thread safety.
+//
+// The SAFECode Compiler
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass changes all GEP constant expressions into GEP instructions. This
+// permits the rest of SAFECode to put run-time checks on them if necessary.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "break-constgeps"
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Constants.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instruction.h"
+#include "llvm/Instructions.h"
+#include "llvm/LLVMContext.h"
+#else
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#endif
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/InstIterator.h"
+
+#include "BreakConstantGEPs.h"
+#include "Workgroup.h"
+
+#include <iostream>
+#include <map>
+#include <utility>
+
+ // Identifier variable for the pass. It is passed to the FunctionPass
+ // constructor in BreakConstantGEPs.h; LLVM identifies a pass by the address
+ // of this variable, so the initial value carries no meaning.
+ char BreakConstantGEPs::ID = 0;
+
+ // Statistics
+ // GEPChanges is incremented by convertGEP(), TotalChanges by
+ // convertExpression().
+ STATISTIC (GEPChanges, "Number of Converted GEP Constant Expressions");
+ STATISTIC (TotalChanges, "Number of Converted Constant Expressions");
+
+ // Register the pass under the name "break-constgeps".
+ static RegisterPass<BreakConstantGEPs> P ("break-constgeps",
+ "Remove GEP Constant Expressions");
+
+//
+// Function: hasConstantGEP()
+//
+// Description:
+// This function determines whether the given value is a constant expression
+// that has a constant GEP expression embedded within it.
+//
+// Inputs:
+// V - The value to check.
+//
+// Return value:
+// NULL - This value is not a constant expression with a constant expression
+// GEP within it.
+// ~NULL - A pointer to the value casted into a ConstantExpr is returned.
+//
+static ConstantExpr *
+hasConstantGEP (Value * V) {
+ if (ConstantExpr * CE = dyn_cast<ConstantExpr>(V)) {
+ if (CE->getOpcode() == Instruction::GetElementPtr ||
+ CE->getOpcode() == Instruction::BitCast)
+ {
+ return CE;
+ } else {
+ for (unsigned index = 0; index < CE->getNumOperands(); ++index) {
+ if (hasConstantGEP (CE->getOperand(index)))
+ return CE;
+ }
+ }
+ }
+
+ return 0;
+}
+
+//
+// Function: convertGEP()
+//
+// Description:
+// Convert a GEP constant expression into a GEP instruction.
+//
+// Inputs:
+// CE - The GEP constant expression.
+// InsertPt - The instruction before which to insert the new GEP instruction.
+//
+// Return value:
+// A pointer to the new GEP instruction is returned.
+//
+static Instruction *
+convertGEP (ConstantExpr * CE, Instruction * InsertPt) {
+ //
+ // Create iterators to the indices of the constant expression.
+ //
+ std::vector<Value *> Indices;
+ for (unsigned index = 1; index < CE->getNumOperands(); ++index) {
+ Indices.push_back (CE->getOperand (index));
+ }
+
+ //
+ // Update the statistics.
+ //
+ ++GEPChanges;
+
+ //
+ // Make the new GEP instruction.
+ //
+ return (GetElementPtrInst::Create (CE->getOperand(0),
+ Indices,
+ CE->getName(),
+ InsertPt));
+}
+
+//
+// Function: convertExpression()
+//
+// Description:
+// Convert a constant expression into an instruction. This routine does *not*
+// perform any recursion, so the resulting instruction may have constant
+// expression operands.
+//
+static Instruction *
+convertExpression (ConstantExpr * CE, Instruction * InsertPt) {
+ //
+ // Convert this constant expression into a regular instruction.
+ //
+ Instruction * NewInst = 0;
+ switch (CE->getOpcode()) {
+ case Instruction::GetElementPtr: {
+ NewInst = convertGEP (CE, InsertPt);
+ break;
+ }
+
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ Instruction::BinaryOps Op = (Instruction::BinaryOps)(CE->getOpcode());
+ NewInst = BinaryOperator::Create (Op,
+ CE->getOperand(0),
+ CE->getOperand(1),
+ CE->getName(),
+ InsertPt);
+ break;
+ }
+
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast: {
+ Instruction::CastOps Op = (Instruction::CastOps)(CE->getOpcode());
+ NewInst = CastInst::Create (Op,
+ CE->getOperand(0),
+ CE->getType(),
+ CE->getName(),
+ InsertPt);
+ break;
+ }
+
+ case Instruction:: FCmp:
+ case Instruction:: ICmp: {
+ Instruction::OtherOps Op = (Instruction::OtherOps)(CE->getOpcode());
+ NewInst = CmpInst::Create (Op,
+ CE->getPredicate(),
+ CE->getOperand(0),
+ CE->getOperand(1),
+ CE->getName(),
+ InsertPt);
+ break;
+ }
+
+ case Instruction:: Select:
+ NewInst = SelectInst::Create (CE->getOperand(0),
+ CE->getOperand(1),
+ CE->getOperand(2),
+ CE->getName(),
+ InsertPt);
+ break;
+
+ case Instruction:: ExtractElement:
+ case Instruction:: InsertElement:
+ case Instruction:: ShuffleVector:
+ case Instruction:: InsertValue:
+ default:
+ assert (0 && "Unhandled constant expression!\n");
+ break;
+ }
+
+ //
+ // Update the statistics.
+ //
+ ++TotalChanges;
+
+ return NewInst;
+}
+
+//
+// Method: runOnFunction()
+//
+// Description:
+// Entry point for this LLVM pass.
+//
+// Return value:
+// true - The function was modified.
+// false - The function was not modified.
+//
+ // Replaces every GEP/BitCast constant expression used by an instruction of
+ // F with equivalent instructions. Functions for which
+ // pocl::Workgroup::isKernelToProcess() is false are left untouched.
+ // Returns true when at least one instruction needed rewriting.
+ bool
+ BreakConstantGEPs::runOnFunction (Function & F) {
+
+   if (!pocl::Workgroup::isKernelToProcess(F)) return false;
+
+   // Worklist of instructions whose operands must be checked for constant
+   // GEP expressions.
+   std::vector<Instruction *> Worklist;
+
+   //
+   // Initialize the worklist by finding all instructions that have one or more
+   // operands containing a constant GEP expression.
+   //
+   for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE; ++BB) {
+     for (BasicBlock::iterator i = BB->begin(), IE = BB->end(); i != IE; ++i) {
+       Instruction * I = i;
+       for (unsigned index = 0; index < I->getNumOperands(); ++index) {
+         if (hasConstantGEP (I->getOperand(index))) {
+           // One entry per instruction is enough: processing an instruction
+           // below visits all of its operands, so additional entries for the
+           // same instruction would only be popped and found already clean.
+           Worklist.push_back (I);
+           break;
+         }
+       }
+     }
+   }
+
+   //
+   // Determine whether we will modify anything.
+   //
+   const bool modified = !Worklist.empty();
+
+   //
+   // While the worklist is not empty, take an item from it, convert the
+   // operands into instructions if necessary, and determine if the newly
+   // added instructions need to be processed as well.
+   //
+   while (!Worklist.empty()) {
+     Instruction * I = Worklist.back();
+     Worklist.pop_back();
+
+     //
+     // Scan through the operands of this instruction and convert each into an
+     // instruction.  Note that this works a little differently for phi
+     // instructions because the new instruction must be added to the
+     // appropriate predecessor block.
+     //
+     if (PHINode * PHI = dyn_cast<PHINode>(I)) {
+       for (unsigned index = 0; index < PHI->getNumIncomingValues(); ++index) {
+         //
+         // For PHI Nodes, if an operand is a constant expression with a GEP, we
+         // want to insert the new instructions in the predecessor basic block.
+         //
+         // Note: It seems that it's possible for a phi to have the same
+         // incoming basic block listed multiple times; this seems okay as long
+         // as the same value is listed for the incoming block.
+         //
+         Instruction * InsertPt = PHI->getIncomingBlock(index)->getTerminator();
+         if (ConstantExpr * CE = hasConstantGEP (PHI->getIncomingValue(index))) {
+           Instruction * NewInst = convertExpression (CE, InsertPt);
+           // Every later entry for the same incoming block receives the same
+           // replacement value.
+           for (unsigned i2 = index; i2 < PHI->getNumIncomingValues(); ++i2) {
+             if ((PHI->getIncomingBlock (i2)) == PHI->getIncomingBlock (index))
+               PHI->setIncomingValue (i2, NewInst);
+           }
+           // The new instruction may itself have constant-expression operands.
+           Worklist.push_back (NewInst);
+         }
+       }
+     } else {
+       for (unsigned index = 0; index < I->getNumOperands(); ++index) {
+         //
+         // For other instructions, we want to insert instructions replacing
+         // constant expressions immediately before the instruction using the
+         // constant expression.
+         //
+         if (ConstantExpr * CE = hasConstantGEP (I->getOperand(index))) {
+           Instruction * NewInst = convertExpression (CE, I);
+           I->replaceUsesOfWith (CE, NewInst);
+           Worklist.push_back (NewInst);
+         }
+       }
+     }
+   }
+
+   return modified;
+ }
+
+
diff --git a/src/llvmopencl/BreakConstantGEPs.h b/src/llvmopencl/BreakConstantGEPs.h
new file mode 100644
index 0000000..4cd86b2
--- /dev/null
+++ b/src/llvmopencl/BreakConstantGEPs.h
@@ -0,0 +1,57 @@
+//===- BreakConstantGEPs.h - Change constant GEPs into GEP instructions --- --//
+//
+// pocl note: This pass is taken from The SAFECode project with trivial modifications.
+// Automatic locals might cause constant GEPs which cause problems during
+// converting the locals to kernel function arguments for thread safety.
+//
+// The SAFECode Compiler
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass changes all GEP constant expressions into GEP instructions. This
+// permits the rest of SAFECode to put run-time checks on them if necessary.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BREAKCONSTANTGEPS_H
+#define BREAKCONSTANTGEPS_H
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Module.h"
+#endif
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+//
+// Pass: BreakConstantGEPs
+//
+// Description:
+// This pass modifies a function so that it uses GEP instructions instead of
+// GEP constant expressions.
+//
+ struct BreakConstantGEPs : public FunctionPass {
+   // Pass identifier, handed to the FunctionPass constructor below.
+   static char ID;
+
+   BreakConstantGEPs() : FunctionPass(ID) {}
+
+   // Name of the pass.
+   const char *getPassName() const {
+     return "Remove Constant GEP Expressions";
+   }
+
+   // Defined in BreakConstantGEPs.cpp.
+   virtual bool runOnFunction (Function & F);
+
+   // Only instructions are inserted; the control-flow graph of the function
+   // is left as it was, so it is reported as preserved.
+   virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+     AU.setPreservesCFG();
+   }
+ };
+
+#endif
diff --git a/src/llvmopencl/CanonicalizeBarriers.cc b/src/llvmopencl/CanonicalizeBarriers.cc
new file mode 100644
index 0000000..409e264
--- /dev/null
+++ b/src/llvmopencl/CanonicalizeBarriers.cc
@@ -0,0 +1,214 @@
+// LLVM function pass to canonicalize barriers.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+// 2012 Pekka Jääskeläinen / Tampere University of Technology
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#include "CanonicalizeBarriers.h"
+#include "BarrierBlock.h"
+#include "Barrier.h"
+#include "Workgroup.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <iostream>
+
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#endif
+
+using namespace llvm;
+using namespace pocl;
+
+ namespace {
+ // Registers the pass under the name "barriers".
+ static
+ RegisterPass<CanonicalizeBarriers> X("barriers",
+ "Barrier canonicalization pass");
+ }
+
+ // Pass identifier handed to the FunctionPass constructor in
+ // CanonicalizeBarriers.h.
+ char CanonicalizeBarriers::ID = 0;
+
+ // The body is empty: no analysis is requested and none is marked as
+ // preserved. runOnFunction() only picks up DominatorTree and LoopInfo via
+ // getAnalysisIfAvailable().
+ void
+ CanonicalizeBarriers::getAnalysisUsage(AnalysisUsage &AU) const
+ {
+ }
+
+ // Makes sure the kernel starts and ends with a barrier block, then
+ // canonicalizes all barriers through ProcessFunction().
+ //
+ // Functions for which Workgroup::isKernelToProcess() is false are skipped.
+ // Returns true when the function was modified, either by the entry/exit
+ // barriers added here or by ProcessFunction().
+ bool
+ CanonicalizeBarriers::runOnFunction(Function &F)
+ {
+   if (!Workgroup::isKernelToProcess(F))
+     return false;
+
+   // Tracks every modification made in this function, not only those made
+   // by ProcessFunction(); the pass manager relies on the return value.
+   bool changed = false;
+
+   BasicBlock *entry = &F.getEntryBlock();
+   if (!isa<BarrierBlock>(entry)) {
+     // Split in front of the first instruction so that the original entry
+     // block is left holding only the barrier and the branch.
+     BasicBlock *effective_entry = SplitBlock(entry,
+                                              &(entry->front()),
+                                              this);
+     effective_entry->takeName(entry);
+     entry->setName("entry.barrier");
+     Barrier::Create(entry->getTerminator());
+     changed = true;
+   }
+
+   for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
+     BasicBlock *b = i;
+     TerminatorInst *t = b->getTerminator();
+     if ((t->getNumSuccessors() == 0) && (!isa<BarrierBlock>(b))) {
+       /* In case the bb is already terminated with a barrier,
+          split before the barrier so we don't create an empty
+          parallel region.
+
+          This is because the assumptions of the other passes in the
+          compilation that are
+          a) exit node is a barrier block
+          b) there are no empty parallel regions (which would be formed
+          between the explicit barrier and the added one). */
+       BasicBlock *exit;
+       if (Barrier::endsWithBarrier(b))
+         exit = SplitBlock(b, t->getPrevNode(), this);
+       else
+         exit = SplitBlock(b, t, this);
+       exit->setName("exit.barrier");
+       Barrier::Create(t);
+       changed = true;
+     }
+   }
+
+   DT = getAnalysisIfAvailable<DominatorTree>();
+   LI = getAnalysisIfAvailable<LoopInfo>();
+
+   // Always run ProcessFunction(); only its result is merged into 'changed'.
+   if (ProcessFunction(F))
+     changed = true;
+
+   if (DT)
+     DT->verifyAnalysis();
+   if (LI)
+     LI->verifyAnalysis();
+
+   return changed;
+ }
+
+
+// Canonicalize barriers: ensure all barriers are in a separate BB
+// containing only the barrier and the terminator, with just one
+// predecessor and one successor. This allows us to use
+// those BBs as markers only, they will not be replicated.
+ // Returns true if any block was split or any empty region was removed.
+ bool
+ CanonicalizeBarriers::ProcessFunction(Function &F)
+ {
+ bool changed = false;
+
+ // All barrier instructions of F, collected before any block is split.
+ InstructionSet Barriers;
+
+ for (Function::iterator i = F.begin(), e = F.end();
+ i != e; ++i)
+ {
+ BasicBlock *b = i;
+ for (BasicBlock::iterator i = b->begin(), e = b->end();
+ i != e; ++i)
+ {
+ if (isa<Barrier>(i))
+ {
+ Barriers.insert(i);
+ }
+ }
+ }
+
+ // Finally add all the split points, now that we are done with the
+ // iterators.
+ for (InstructionSet::iterator i = Barriers.begin(), e = Barriers.end();
+ i != e; ++i) {
+ BasicBlock *b = (*i)->getParent();
+
+ // Split post barrier first cause it does not make the barrier
+ // to belong to another basic block.
+ TerminatorInst *t = b->getTerminator();
+ // if ((t->getNumSuccessors() > 1) ||
+ // (t->getPrevNode() != *i)) {
+ // Change: barriers with several successors are all right
+ // they just start several parallel regions. Simplifies
+ // loop handling.
+
+ // True when the barrier is not the instruction right before the terminator.
+ const bool HAS_NON_BRANCH_INSTRUCTIONS_AFTER_BARRIER =
+ t->getPrevNode() != *i;
+
+ if (HAS_NON_BRANCH_INSTRUCTIONS_AFTER_BARRIER) {
+ BasicBlock *new_b = SplitBlock(b, (*i)->getNextNode(), this);
+ new_b->setName(b->getName() + ".postbarrier");
+ changed = true;
+ }
+
+ BasicBlock *predecessor = b->getSinglePredecessor();
+ if (predecessor != NULL) {
+ TerminatorInst *pt = predecessor->getTerminator();
+ if ((pt->getNumSuccessors() == 1) &&
+ (&b->front() == (*i))) {
+ // Barrier is at the beginning of the BB,
+ // which has a single predecessor with just
+ // one successor (the barrier itself), thus
+ // no need to split before barrier.
+ continue;
+ }
+ }
+ // A barrier that is the first instruction of the entry block needs no
+ // split before it either.
+ if ((b == &(b->getParent()->getEntryBlock())) &&
+ (&b->front() == (*i)))
+ continue;
+
+ // If no instructions before barrier, do not split
+ // (allow multiple predecessors, eases loop handling).
+ // if (&b->front() == (*i))
+ // continue;
+ // Split before the barrier: the new block (holding the barrier) takes
+ // over the original name, the leading part is renamed "<name>.prebarrier".
+ BasicBlock *new_b = SplitBlock(b, *i, this);
+ new_b->takeName(b);
+ b->setName(new_b->getName() + ".prebarrier");
+ changed = true;
+ }
+
+ /* Prune empty regions. That is, if there are two successive
+ barriers, remove the second one (the successor block). */
+ bool emptyRegionDeleted = false;
+ do {
+ emptyRegionDeleted = false;
+ for (Function::iterator i = F.begin(), e = F.end();
+ i != e; ++i)
+ {
+ BasicBlock *b = i;
+ llvm::TerminatorInst *t = b->getTerminator();
+ if (!Barrier::endsWithBarrier(b) || t->getNumSuccessors() != 1) continue;
+
+ BasicBlock *successor = t->getSuccessor(0);
+
+ if (Barrier::hasOnlyBarrier(successor) &&
+ successor->getSinglePredecessor() == b &&
+ successor->getTerminator()->getNumSuccessors() == 1)
+ {
+ // Bypass the barrier-only successor, redirect its remaining uses to b
+ // and delete it.
+ b->getTerminator()->setSuccessor(0, successor->getTerminator()->getSuccessor(0));
+ successor->replaceAllUsesWith(b);
+ successor->eraseFromParent();
+ emptyRegionDeleted = true;
+ changed = true;
+ // A block was erased: leave the scan and start over from F.begin().
+ break;
+ }
+ }
+ } while (emptyRegionDeleted);
+
+
+ return changed;
+ }
diff --git a/src/llvmopencl/CanonicalizeBarriers.h b/src/llvmopencl/CanonicalizeBarriers.h
new file mode 100644
index 0000000..047db1d
--- /dev/null
+++ b/src/llvmopencl/CanonicalizeBarriers.h
@@ -0,0 +1,56 @@
+// Header for CanonicalizeBarriers.cc function pass.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+ // Include guard: without it a second inclusion in the same translation unit
+ // would redefine the class.
+ #ifndef POCL_CANONICALIZE_BARRIERS_H
+ #define POCL_CANONICALIZE_BARRIERS_H
+
+ #include "config.h"
+ #if (defined LLVM_3_1 or defined LLVM_3_2)
+ #include "llvm/Function.h"
+ #else
+ #include "llvm/IR/Function.h"
+ #endif
+ // llvm::DominatorTree is used below; include its header explicitly, as
+ // BarrierTailReplication.h does.
+ #include "llvm/Analysis/Dominators.h"
+ #include "llvm/Analysis/LoopInfo.h"
+ #include "llvm/Pass.h"
+ #include <set>
+
+ namespace pocl {
+   class Workgroup;
+
+   // Function pass implemented in CanonicalizeBarriers.cc.
+   class CanonicalizeBarriers : public llvm::FunctionPass {
+
+   public:
+     // Pass identifier handed to the FunctionPass constructor.
+     static char ID;
+
+     CanonicalizeBarriers() : FunctionPass(ID) {}
+
+     virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+     virtual bool runOnFunction(llvm::Function &F);
+
+   private:
+     typedef std::set<llvm::Instruction *> InstructionSet;
+
+     // Assigned in runOnFunction() from getAnalysisIfAvailable(); either
+     // may be NULL.
+     llvm::LoopInfo *LI;
+     llvm::DominatorTree *DT;
+
+     // Does the per-barrier block splitting; returns true on modification.
+     bool ProcessFunction(llvm::Function &F);
+
+     friend class pocl::Workgroup;
+   };
+ }
+
+ #endif
diff --git a/src/llvmopencl/Flatten.cc b/src/llvmopencl/Flatten.cc
new file mode 100644
index 0000000..2e01f2a
--- /dev/null
+++ b/src/llvmopencl/Flatten.cc
@@ -0,0 +1,158 @@
+// LLVM module pass to inline required functions (those accessing
+// per-workgroup variables) into the kernel.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "Flatten.h"
+using namespace pocl;
+
+ // Command-line option defined in another translation unit. When it is the
+ // empty string, runOnModule() falls back to Workgroup::isKernelToProcess().
+ extern cl::opt<std::string> KernelName;
+
+ // Pass identifier and registration under the name "flatten".
+ char Flatten::ID = 0;
+ static RegisterPass<Flatten> X("flatten", "Kernel function flattening pass");
+
+ // NULL-terminated list of global variable names. It is only read by the
+ // runOnModule() variant that is compiled when INLINE_ALL_NON_KERNEL is not
+ // defined.
+ static const char *workgroup_variables[] = {
+ "_local_id_x", "_local_id_y", "_local_id_z",
+ "_local_size_x", "_local_size_y", "_local_size_z",
+ "_work_dim",
+ "_num_groups_x", "_num_groups_y", "_num_groups_z",
+ "_group_id_x", "_group_id_y", "_group_id_z",
+ "_global_offset_x", "_global_offset_y", "_global_offset_z",
+ NULL};
+
+ // Uncomment to print the inlining decision for each function to std::cerr.
+ //#define DEBUG_FLATTEN
+
+ // Selects the runOnModule() variant that marks every defined non-kernel
+ // function AlwaysInline.
+ #define INLINE_ALL_NON_KERNEL
+
+#ifdef INLINE_ALL_NON_KERNEL
+
+ // Visits every function of M that has a body. The kernel (the function
+ // named by KernelName, or, when KernelName is empty, any function accepted
+ // by Workgroup::isKernelToProcess) loses AlwaysInline, gains NoInline and
+ // gets external linkage. Every other defined function loses NoInline, gains
+ // AlwaysInline and gets internal linkage.
+ // Returns true if at least one defined function was visited.
+ bool
+ Flatten::runOnModule(Module &M)
+ {
+   bool changed = false;
+
+   llvm::Module::iterator func_it = M.begin();
+   llvm::Module::iterator func_end = M.end();
+   for (; func_it != func_end; ++func_it) {
+     llvm::Function *func = func_it;
+
+     // Declarations have no body to inline.
+     if (func->isDeclaration())
+       continue;
+
+     const bool is_kernel =
+       KernelName == func->getName() ||
+       (KernelName == "" && pocl::Workgroup::isKernelToProcess(*func));
+
+     if (!is_kernel) {
+ #ifdef LLVM_3_1
+       func->removeFnAttr(Attribute::NoInline);
+       func->addFnAttr(Attribute::AlwaysInline);
+ #elif defined LLVM_3_2
+       AttrBuilder b;
+       func->removeFnAttr(Attributes::get(M.getContext(), b.addAttribute(Attributes::NoInline)));
+       func->addFnAttr(Attributes::AlwaysInline);
+ #else
+       AttributeSet attrs;
+       func->removeAttributes(
+         AttributeSet::FunctionIndex,
+         attrs.addAttribute(M.getContext(), AttributeSet::FunctionIndex, Attribute::NoInline));
+       func->addFnAttr(Attribute::AlwaysInline);
+ #endif
+
+       func->setLinkage(llvm::GlobalValue::InternalLinkage);
+ #ifdef DEBUG_FLATTEN
+       std::cerr << "### AlwaysInline for " << func->getName().str() << std::endl;
+ #endif
+     } else {
+ #ifdef LLVM_3_1
+       func->removeFnAttr(Attribute::AlwaysInline);
+       func->addFnAttr(Attribute::NoInline);
+ #elif defined LLVM_3_2
+       AttrBuilder b;
+       func->removeFnAttr(Attributes::get(M.getContext(), b.addAttribute(Attributes::AlwaysInline)));
+       func->addFnAttr(Attributes::NoInline);
+ #else
+       AttributeSet attrs;
+       func->removeAttributes(
+         AttributeSet::FunctionIndex,
+         attrs.addAttribute(M.getContext(), AttributeSet::FunctionIndex, Attribute::AlwaysInline));
+
+       func->addFnAttr(Attribute::NoInline);
+ #endif
+
+       func->setLinkage(llvm::GlobalValue::ExternalLinkage);
+ #ifdef DEBUG_FLATTEN
+       std::cerr << "### NoInline for " << func->getName().str() << std::endl;
+ #endif
+     }
+
+     // Both branches modify the function.
+     changed = true;
+   }
+
+   return changed;
+ }
+
+#else
+
+ // Alternative implementation, compiled only when INLINE_ALL_NON_KERNEL is
+ // not defined. Starting from the global variables named in
+ // workgroup_variables, it follows uses transitively: every function that
+ // contains an instruction using such a variable, or using an already
+ // collected function, is collected too. All collected functions lose
+ // NoInline and gain AlwaysInline. Always returns true.
+ bool
+ Flatten::runOnModule(Module &M)
+ {
+   SmallPtrSet<Function *, 8> functions_to_inline;
+   SmallVector<Value *, 8> pending;
+
+   // Seed the work list with the per-workgroup globals present in M.
+   for (const char **name = workgroup_variables; *name != NULL; ++name) {
+     GlobalVariable *gv = M.getGlobalVariable(*name);
+     if (gv != NULL)
+       pending.push_back(gv);
+   }
+
+   while (!pending.empty()) {
+     Value *current = pending.back();
+     pending.pop_back();
+
+     for (Value::use_iterator use_it = current->use_begin(),
+            use_end = current->use_end();
+          use_it != use_end; ++use_it) {
+       Instruction *user = dyn_cast<Instruction>(*use_it);
+       if (user == NULL)
+         continue;
+
+       Function *f = user->getParent()->getParent();
+       assert((f != NULL) &&
+              "Per-workgroup global variable used on function with no parent!");
+
+       // Already collected: skipping it prevents infinite looping on
+       // recursive functions (though OpenCL does not allow this?)
+       if (functions_to_inline.count(f))
+         continue;
+
+       functions_to_inline.insert(f);
+       pending.push_back(f);
+     }
+   }
+
+   SmallPtrSet<Function *, 8>::iterator fn_it = functions_to_inline.begin();
+   SmallPtrSet<Function *, 8>::iterator fn_end = functions_to_inline.end();
+   for (; fn_it != fn_end; ++fn_it) {
+     (*fn_it)->removeFnAttr(Attribute::NoInline);
+     (*fn_it)->addFnAttr(Attribute::AlwaysInline);
+   }
+
+   return true;
+ }
+
+#endif
+
+
diff --git a/src/llvmopencl/Flatten.h b/src/llvmopencl/Flatten.h
new file mode 100644
index 0000000..df3a174
--- /dev/null
+++ b/src/llvmopencl/Flatten.h
@@ -0,0 +1,51 @@
+// LLVM module pass to inline required functions (those accessing
+// per-workgroup variables) into the kernel.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+ // Include guard: without it a second inclusion in the same translation unit
+ // would redefine the class.
+ #ifndef POCL_FLATTEN_H
+ #define POCL_FLATTEN_H
+
+ #include "config.h"
+ #include <iostream>
+ #include <string>
+ #include "Workgroup.h"
+ #include "llvm/Support/CommandLine.h"
+ #include "llvm/ADT/SmallPtrSet.h"
+ #include "llvm/Pass.h"
+ #if (defined LLVM_3_1 or defined LLVM_3_2)
+ #include "llvm/Module.h"
+ #else
+ #include "llvm/IR/Module.h"
+ #endif
+
+ using namespace llvm;
+
+ namespace pocl {
+   // Module pass implemented in Flatten.cc.
+   class Flatten : public ModulePass {
+
+   public:
+     // Pass identifier handed to the ModulePass constructor.
+     static char ID;
+     Flatten() : ModulePass(ID) {}
+
+     virtual bool runOnModule(Module &M);
+   };
+
+ }
+
+ #endif
+
diff --git a/src/llvmopencl/GenerateHeader.cc b/src/llvmopencl/GenerateHeader.cc
new file mode 100644
index 0000000..55a5bbe
--- /dev/null
+++ b/src/llvmopencl/GenerateHeader.cc
@@ -0,0 +1,336 @@
+// LLVM module pass to get information from kernel functions.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#include "pocl.h"
+#include "Workgroup.h"
+#include "llvm/Pass.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#ifdef LLVM_3_1
+#include "llvm/Target/TargetData.h"
+#elif defined LLVM_3_2
+#include "llvm/DataLayout.h"
+#else
+#include "llvm/IR/DataLayout.h"
+#endif
+
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Argument.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#endif
+
+#include "LLVMUtils.h"
+
+using namespace std;
+using namespace llvm;
+using namespace pocl;
+
+
+// Command line option "header": name of the file that GenerateHeader
+// writes the kernel description macros to.
+cl::opt<string>
+Header("header",
+ cl::desc("Output header file with kernel description macros"),
+ cl::value_desc("header"));
+
+namespace {
+ // Module pass that writes "#define _<kernel>_..." description macros for
+ // every kernel of the module to the file named by the "header" option.
+ class GenerateHeader : public ModulePass {
+
+ public:
+ // Pass identifier; its address is handed to the ModulePass constructor.
+ static char ID;
+ GenerateHeader() : ModulePass(ID) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+ virtual bool runOnModule(Module &M);
+
+ private:
+ // Emits the _NUM_ARGS and _ARG_IS_{POINTER,LOCAL,IMAGE,SAMPLER} macros.
+ void ProcessPointers(Function *F,
+ raw_fd_ostream &out);
+ // Emits the _REQD_WG_SIZE macro.
+ void ProcessReqdWGSize(Function *F,
+ raw_fd_ostream &out);
+ // Emits the _NUM_LOCALS and _LOCAL_SIZE macros; returns F itself or a
+ // new kernel taking the automatic locals as extra arguments.
+ Function *ProcessAutomaticLocals(Function *F,
+ raw_fd_ostream &out);
+ };
+}
+
+// Definition of the pass identifier and registration of the pass under the
+// command line name "generate-header".
+char GenerateHeader::ID = 0;
+static RegisterPass<GenerateHeader> X("generate-header",
+ "Kernel information header creation pass");
+
+// Declares the DataLayout analysis as required; ProcessAutomaticLocals()
+// fetches it with getAnalysis<DataLayout>() to compute type sizes.
+void
+GenerateHeader::getAnalysisUsage(AnalysisUsage &AU) const
+{
+ AU.addRequired<DataLayout>();
+}
+
+// Appends the description macros of every kernel selected by
+// Workgroup::isKernelToProcess() to the file named by the "header" option
+// and replaces each kernel that has automatic locals with the new function
+// returned by ProcessAutomaticLocals().
+//
+// Returns true when at least one kernel was replaced.
+bool
+GenerateHeader::runOnModule(Module &M)
+{
+  bool changed = false;
+
+  // store the new and old kernel pairs in order to regenerate
+  // all the metadata that used to point to the unmodified
+  // kernels
+  FunctionMapping kernels;
+
+  string ErrorInfo;
+  raw_fd_ostream out(Header.c_str(), ErrorInfo, raw_fd_ostream::F_Append);
+  // The stream constructor reports a failed open only through ErrorInfo.
+  // Stop here instead of writing the macros to a stream that has no file.
+  if (!ErrorInfo.empty()) {
+    string message = "Cannot open kernel header file '";
+    message += Header.c_str();
+    message += "': ";
+    message += ErrorInfo;
+    report_fatal_error(message);
+  }
+
+  for (Module::iterator mi = M.begin(), me = M.end(); mi != me; ++mi) {
+    if (!Workgroup::isKernelToProcess(*mi))
+      continue;
+
+    Function *F = mi;
+
+    ProcessPointers(F, out);
+    ProcessReqdWGSize(F, out);
+
+    // new_kernel == F means the kernel had no automatic locals.
+    Function *new_kernel = ProcessAutomaticLocals(F, out);
+    if (new_kernel != F)
+      changed = true;
+    kernels[F] = new_kernel;
+  }
+
+  if (changed)
+    {
+      regenerate_kernel_metadata(M, kernels);
+
+      /* Delete the old kernels. */
+      for (FunctionMapping::const_iterator i = kernels.begin(),
+             e = kernels.end(); i != e; ++i)
+        {
+          Function *old_kernel = (*i).first;
+          Function *new_kernel = (*i).second;
+          if (old_kernel == new_kernel) continue;
+          old_kernel->eraseFromParent();
+        }
+    }
+  return changed;
+}
+
+#include <iostream>
+
+// Emits the "_<kernel>_REQD_WG_SIZE {x, y, z}" macro for F. The sizes are
+// read from the entry of the "opencl.kernel_wg_size_info" named metadata
+// whose first operand is F and stay {0, 0, 0} when there is no such entry.
+void
+GenerateHeader::ProcessReqdWGSize(Function *F,
+                                  raw_fd_ostream &out)
+{
+  unsigned wg_size[3] = {0, 0, 0};
+
+  llvm::NamedMDNode *wg_md =
+    F->getParent()->getNamedMetadata("opencl.kernel_wg_size_info");
+  const unsigned num_entries = wg_md ? wg_md->getNumOperands() : 0;
+
+  for (unsigned n = 0; n != num_entries; ++n) {
+    llvm::MDNode *entry = wg_md->getOperand(n);
+    if (entry->getOperand(0) != F)
+      continue;
+    // Operands 1..3 hold the X, Y and Z sizes. The scan is not stopped
+    // here, so a later entry for the same kernel overrides this one.
+    for (unsigned dim = 0; dim != 3; ++dim)
+      wg_size[dim] =
+        (llvm::cast<ConstantInt>(entry->getOperand(dim + 1)))->getLimitedValue();
+  }
+
+  out << "#define _" << F->getName() << "_REQD_WG_SIZE {"
+      << wg_size[0] << ", "
+      << wg_size[1] << ", "
+      << wg_size[2] << "}\n";
+}
+
+
+// Writes one "#define _<kernel><suffix> {f0, f1, ...}" line holding the
+// given per-argument flags as 0/1 ("{}" for a kernel without arguments).
+static void
+emit_arg_flags(raw_fd_ostream &out, Function *F, const char *suffix,
+               const SmallVectorImpl<bool> &flags)
+{
+  out << "#define _" << F->getName() << suffix << " {";
+  for (unsigned i = 0, e = flags.size(); i != e; ++i) {
+    if (i != 0)
+      out << ", ";
+    out << (flags[i] ? 1 : 0);
+  }
+  out << "}\n";
+}
+
+// Emits the argument description macros of kernel F:
+//   _<kernel>_NUM_ARGS        number of parameters
+//   _<kernel>_ARG_IS_POINTER  pointer argument that is not passed byval
+//   _<kernel>_ARG_IS_LOCAL    such a pointer whose address space is neither
+//                             POCL_ADDRESS_SPACE_GLOBAL nor
+//                             POCL_ADDRESS_SPACE_CONSTANT
+//   _<kernel>_ARG_IS_IMAGE    pointer to "struct.image2d_t_"
+//   _<kernel>_ARG_IS_SAMPLER  pointer to "struct.sampler_t_"
+// Image and sampler arguments are reported as neither pointer nor local.
+void
+GenerateHeader::ProcessPointers(Function *F,
+                                raw_fd_ostream &out)
+{
+  int num_args = F->getFunctionType()->getNumParams();
+
+  out << "#define _" << F->getName() << "_NUM_ARGS " << num_args << '\n';
+
+  // Sized containers instead of variable-length arrays, which are not
+  // standard C++ and would have zero length for a kernel with no arguments.
+  SmallVector<bool, 8> is_pointer(num_args, false);
+  SmallVector<bool, 8> is_local(num_args, false);
+  SmallVector<bool, 8> is_image(num_args, false);
+  SmallVector<bool, 8> is_sampler(num_args, false);
+
+  int i = 0;
+  for (Function::const_arg_iterator ii = F->arg_begin(),
+         ee = F->arg_end();
+       ii != ee; ++ii, ++i) {
+    Type *t = ii->getType();
+
+    const PointerType *p = dyn_cast<PointerType>(t);
+    if (p && !ii->hasByValAttr()) {
+      is_pointer[i] = true;
+      is_local[i] = !(p->getAddressSpace() == POCL_ADDRESS_SPACE_GLOBAL ||
+                      p->getAddressSpace() == POCL_ADDRESS_SPACE_CONSTANT);
+    }
+
+    if (!t->isPointerTy())
+      continue;
+
+    // Literal struct types carry no name, and asking for it is invalid,
+    // so only identified structs are compared against the builtin names.
+    StructType *st = dyn_cast<StructType>(t->getPointerElementType());
+    if (st == NULL || st->isLiteral())
+      continue;
+
+    string name = st->getName().str();
+    if (name == "struct.image2d_t_") { // TODO image3d?
+      is_image[i] = true;
+      is_pointer[i] = false;
+      is_local[i] = false;
+    }
+    if (name == "struct.sampler_t_") {
+      is_sampler[i] = true;
+      is_pointer[i] = false;
+      is_local[i] = false;
+    }
+  }
+
+  emit_arg_flags(out, F, "_ARG_IS_POINTER", is_pointer);
+  emit_arg_flags(out, F, "_ARG_IS_LOCAL", is_local);
+  emit_arg_flags(out, F, "_ARG_IS_IMAGE", is_image);
+  emit_arg_flags(out, F, "_ARG_IS_SAMPLER", is_sampler);
+}
+
+
+// Emits the _<kernel>_NUM_LOCALS and _<kernel>_LOCAL_SIZE macros for F and,
+// when F has automatic locals, creates a replacement kernel that receives
+// them as extra trailing arguments.
+//
+// A module global counts as an automatic local of F when its name starts
+// with "<kernel name>.".
+//
+// Returns F itself when no such global exists, otherwise the new function.
+// The old function is not erased here; runOnModule() does that.
+Function *
+GenerateHeader::ProcessAutomaticLocals(Function *F,
+ raw_fd_ostream &out)
+{
+ Module *M = F->getParent();
+ DataLayout &TD = getAnalysis<DataLayout>();
+
+ SmallVector<GlobalVariable *, 8> locals;
+
+ // Parameter types of the new kernel: the original ones first ...
+ SmallVector<Type *, 8> parameters;
+ for (Function::const_arg_iterator i = F->arg_begin(),
+ e = F->arg_end();
+ i != e; ++i)
+ parameters.push_back(i->getType());
+
+ // ... followed by one parameter per automatic local.
+ for (Module::global_iterator i = M->global_begin(),
+ e = M->global_end();
+ i != e; ++i) {
+ std::string funcName = "";
+ funcName = F->getName().str();
+ if (i->getName().startswith(funcName + ".")) {
+ // Additional checks might be needed here. For now
+ // we assume any global starting with kernel name
+ // is declaring a local variable.
+ locals.push_back(i);
+ // Add the parameters to the end of the function parameter list.
+ parameters.push_back(i->getType());
+ }
+ }
+
+ // The sizes are the allocation sizes of the initializer types.
+ // NOTE(review): getInitializer() is called without checking that the
+ // global has an initializer - confirm that this always holds here.
+ out << "#define _" << F->getName() << "_NUM_LOCALS "<< locals.size() << "\n";
+ out << "#define _" << F->getName() << "_LOCAL_SIZE {";
+ if (!locals.empty()) {
+ out << TD.getTypeAllocSize(locals[0]->getInitializer()->getType());
+ for (unsigned i = 1; i < locals.size(); ++i)
+ out << ", " << TD.getTypeAllocSize(locals[i]->getInitializer()->getType());
+ }
+ out << "}\n";
+
+ if (locals.empty()) {
+ // This kernel fingerprint has not changed.
+ return F;
+ }
+
+ // Create the new function.
+ FunctionType *ft = FunctionType::get(F->getReturnType(),
+ parameters,
+ F->isVarArg());
+ Function *new_kernel = Function::Create(ft,
+ F->getLinkage(),
+ "",
+ M);
+ // The new kernel is created unnamed and then takes over the name of F.
+ new_kernel->takeName(F);
+
+ // Map every original argument to the argument at the same position of
+ // the new kernel.
+ ValueToValueMapTy vv;
+ Function::arg_iterator j = new_kernel->arg_begin();
+ for (Function::const_arg_iterator i = F->arg_begin(),
+ e = F->arg_end();
+ i != e; ++i) {
+ j->setName(i->getName());
+ vv[i] = j;
+ ++j;
+ }
+
+ // The remaining arguments are the ones appended for the locals, in the
+ // order the locals were collected: map each global to its argument.
+ for (int i = 0; j != new_kernel->arg_end(); ++i, ++j) {
+ j->setName("_local" + Twine(i));
+ vv[locals[i]] = j;
+ }
+
+ // Clone the body of F into the new kernel using the mapping built above.
+ SmallVector<ReturnInst *, 1> ri;
+ CloneFunctionInto(new_kernel, F, vv, false, ri);
+
+ return new_kernel;
+}
+
diff --git a/src/llvmopencl/ImplicitLoopBarriers.cc b/src/llvmopencl/ImplicitLoopBarriers.cc
new file mode 100644
index 0000000..66dcdb3
--- /dev/null
+++ b/src/llvmopencl/ImplicitLoopBarriers.cc
@@ -0,0 +1,178 @@
+// LLVM function pass that adds implicit barriers to loops if it sees
+// beneficial.
+//
+// Copyright (c) 2012-2014 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#include "ImplicitLoopBarriers.h"
+#include "Barrier.h"
+#include "Workgroup.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#endif
+
+#include "VariableUniformityAnalysis.h"
+
+#include <iostream>
+
+//#define DEBUG_ILOOP_BARRIERS
+
+using namespace llvm;
+using namespace pocl;
+
+// Registration of the pass under the command line name
+// "implicit-loop-barriers".
+namespace {
+ static
+ RegisterPass<ImplicitLoopBarriers> X("implicit-loop-barriers",
+ "Adds implicit barriers to loops");
+}
+
+// Definition of the pass identifier.
+char ImplicitLoopBarriers::ID = 0;
+
+// Declares DominatorTree and VariableUniformityAnalysis as both required
+// and preserved by this pass.
+void
+ImplicitLoopBarriers::getAnalysisUsage(AnalysisUsage &AU) const
+{
+ AU.addRequired<DominatorTree>();
+ AU.addPreserved<DominatorTree>();
+ AU.addRequired<VariableUniformityAnalysis>();
+ AU.addPreserved<VariableUniformityAnalysis>();
+}
+
+// Loop pass entry point. Loops of functions rejected by
+// Workgroup::isKernelToProcess() are left alone; all others are handed to
+// ProcessLoop(), whose result is returned.
+bool
+ImplicitLoopBarriers::runOnLoop(Loop *L, LPPassManager &LPM)
+{
+  Function *parent = L->getHeader()->getParent();
+  return Workgroup::isKernelToProcess(*parent) && ProcessLoop(L, LPM);
+}
+
+
+/**
+ * Tries to add an implicit barrier to a loop that has none yet.
+ *
+ * A loop that already contains a Barrier instruction in any of its blocks
+ * is left untouched (returns false). Otherwise the decision is delegated
+ * to AddInnerLoopBarrier() and its result is returned.
+ *
+ * Note: it's not safe to add a barrier in case the loop is not executed
+ * by all work items. Therefore this is not enabled by default.
+ */
+bool
+ImplicitLoopBarriers::ProcessLoop(Loop *L, LPPassManager &LPM)
+{
+  for (Loop::block_iterator bb = L->block_begin(), bb_end = L->block_end();
+       bb != bb_end; ++bb) {
+    for (BasicBlock::iterator inst = (*bb)->begin(), inst_end = (*bb)->end();
+         inst != inst_end; ++inst) {
+      // An explicit barrier is enough to rule the loop out.
+      if (isa<Barrier>(inst))
+        return false;
+    }
+  }
+
+  return AddInnerLoopBarrier(L, LPM);
+}
+
+/**
+ * Adds a barrier to an innermost loop to force its treatment
+ * similarly to a loop with work-group barriers.
+ *
+ * The barrier is created with Barrier::Create() at the terminator of the
+ * loop's single exiting block.
+ * NOTE(review): the exact insertion point depends on Barrier::Create(),
+ * which is defined elsewhere - confirm.
+ *
+ * This allows parallelizing work-items across the work-group per kernel
+ * for-loop iteration, potentially leading to easier horizontal vectorization.
+ * The idea is similar to loop switching where the work-item loop is
+ * switched with the kernel for-loop.
+ *
+ * We need to make sure it is legal to add the barrier, though. The
+ * OpenCL barrier semantics require either all or none of the WIs to
+ * reach the barrier at each iteration. This is satisfied at least when
+ *
+ * a) loop exit condition does not depend on the WI and
+ * b) all or none of the WIs always enter the loop
+ *
+ * Returns true when a barrier was added, false when the loop has
+ * sub-loops, has no single exiting block, or fails one of the uniformity
+ * checks below.
+ */
+bool
+ImplicitLoopBarriers::AddInnerLoopBarrier(llvm::Loop *L, llvm::LPPassManager &LPM) {
+
+ /* Only add barriers to the innermost loops. */
+
+ if (L->getSubLoops().size() > 0)
+ return false;
+
+#ifdef DEBUG_ILOOP_BARRIERS
+ std::cerr << "### trying to add a loop barrier to force horizontal parallelization"
+ << std::endl;
+#endif
+
+ BasicBlock *brexit = L->getExitingBlock();
+ if (brexit == NULL) return false; /* Multiple exit points */
+
+ llvm::BasicBlock *loopEntry = L->getHeader();
+ if (loopEntry == NULL) return false; /* Multiple entry blocks? */
+
+ llvm::Function *f = brexit->getParent();
+
+ VariableUniformityAnalysis &VUA =
+ getAnalysis<VariableUniformityAnalysis>();
+
+ /* Check if the whole loop construct is executed by all or none of the
+ work-items. */
+ if (!VUA.isUniform(f, loopEntry)) {
+#ifdef DEBUG_ILOOP_BARRIERS
+ std::cerr << "### the loop is not uniform because loop entry '"
+ << loopEntry->getName().str() << "' is not uniform" << std::endl;
+
+#endif
+ return false;
+ }
+
+ /* Check the branch condition predicate. If it is uniform, we know the loop
+ is executed the same number of times for all WIs. */
+ llvm::BranchInst *br = dyn_cast<llvm::BranchInst>(brexit->getTerminator());
+ if (br && br->isConditional() &&
+ VUA.isUniform(f, br->getCondition())) {
+
+ Barrier::Create(brexit->getTerminator());
+#ifdef DEBUG_ILOOP_BARRIERS
+ std::cerr << "### added an inner-loop barrier to the loop" << std::endl << std::endl;
+#endif
+ return true;
+ } else {
+#ifdef DEBUG_ILOOP_BARRIERS
+ if (br && br->isConditional() && !VUA.isUniform(f, br->getCondition())) {
+ std::cerr << "### loop condition not uniform" << std::endl;
+ br->getCondition()->dump();
+ }
+#endif
+
+ }
+
+#ifdef DEBUG_ILOOP_BARRIERS
+ std::cerr << "### cannot add an inner-loop barrier to the loop" << std::endl << std::endl;
+#endif
+
+ return false;
+}
diff --git a/src/llvmopencl/ImplicitLoopBarriers.h b/src/llvmopencl/ImplicitLoopBarriers.h
new file mode 100644
index 0000000..e31a134
--- /dev/null
+++ b/src/llvmopencl/ImplicitLoopBarriers.h
@@ -0,0 +1,44 @@
+// Header for ImplicitLoopBarriers loop pass.
+//
+// Copyright (c) 2012-2014 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef POCL_IMPLICIT_LOOP_BARRIERS_H
+#define POCL_IMPLICIT_LOOP_BARRIERS_H
+
+#include "llvm/Analysis/LoopPass.h"
+#include <set>
+
+namespace pocl {
+  // Loop pass that adds an implicit barrier to loops that do not contain
+  // one yet (see ImplicitLoopBarriers.cc for the legality conditions).
+  class ImplicitLoopBarriers : public llvm::LoopPass {
+
+  public:
+    // Pass identifier; its address is handed to the LoopPass constructor.
+    static char ID;
+
+    ImplicitLoopBarriers() : LoopPass(ID) {}
+
+    virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+    virtual bool runOnLoop(llvm::Loop *L, llvm::LPPassManager &LPM);
+
+  private:
+    llvm::DominatorTree *DT;
+
+    // Skips loops that already contain a barrier, otherwise forwards to
+    // AddInnerLoopBarrier().
+    bool ProcessLoop(llvm::Loop *L, llvm::LPPassManager &LPM);
+    // Adds the barrier when the loop passes the legality checks.
+    bool AddInnerLoopBarrier(llvm::Loop *L, llvm::LPPassManager &LPM);
+
+  };
+}
+
+#endif
diff --git a/src/llvmopencl/IsolateRegions.cc b/src/llvmopencl/IsolateRegions.cc
new file mode 100644
index 0000000..b370aa4
--- /dev/null
+++ b/src/llvmopencl/IsolateRegions.cc
@@ -0,0 +1,175 @@
+// Implementation of the IsolateRegions RegionPass.
+//
+// Copyright (c) 2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "IsolateRegions.h"
+#include "Barrier.h"
+#include "Workgroup.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "config.h"
+
+#include <iostream>
+
+//#define DEBUG_ISOLATE_REGIONS
+using namespace llvm;
+using namespace pocl;
+
+// Registration of the pass under the command line name "isolate-regions".
+namespace {
+ static
+ RegisterPass<IsolateRegions> X("isolate-regions",
+ "Single-Entry Single-Exit region isolation pass.");
+}
+
+// Definition of the pass identifier.
+char IsolateRegions::ID = 0;
+
+// Intentionally empty: the pass declares no required or preserved analyses.
+void
+IsolateRegions::getAnalysisUsage(AnalysisUsage &AU) const
+{
+}
+
+/* Ensure Single-Entry Single-Exit Regions are isolated from the
+ exit node so they won't get split illegally with tail replication.
+
+ This might happen in case an if .. else .. structure is just
+ before an exit from kernel. Both branches are split even though
+ we would like to replicate the structure as a whole to retain
+ semantics. This adds dummy basic blocks to all Regions just for
+ clarity. Cleanup with -simplifycfg.
+
+ TODO: Also add a dummy BB in case the Region starts with a
+ barrier. Such a Region might not get optimally replicated and
+ can lead to problematic cases. E.g.:
+
+ digraph G {
+ BAR1 -> A;
+ A -> X;
+ BAR1 -> X;
+ X -> BAR2;
+ }
+
+ (draw with "dot -Tpng -o graph.png" + copy paste the above)
+
+ Here you have a structure which should be replicated fully but
+ it won't as the Region starts with a barrier at a split point
+ BB, thus it tries to replicate both of the branches which lead
+ to interesting errors and is not supported. Another option would
+ be to tail replicate both of the branches, but currently tail
+ replication is done only starting from the exit nodes.
+
+ IsolateRegions "normalizes" the graph to:
+
+ digraph G {
+ BAR1 -> r_entry;
+ r_entry -> A;
+ A -> X;
+ r_entry -> X;
+ X -> BAR2;
+ }
+
+
+*/
+// Isolates region R with dummy blocks:
+//  - a dummy block is added before the exit block when the exit block has
+//    a barrier or its terminator has no successors (function exit);
+//  - a dummy block is added after the entry block when the entry block has
+//    a barrier or is the entry block of the function.
+// Regions without an exit block are left untouched.
+// Returns true when at least one dummy block was added.
+bool
+IsolateRegions::runOnRegion(Region *R, llvm::RGPassManager&)
+{
+ llvm::BasicBlock *exit = R->getExit();
+ if (exit == NULL) return false;
+
+#ifdef DEBUG_ISOLATE_REGIONS
+ std::cerr << "### processing region:" << std::endl;
+ R->dump();
+ std::cerr << "### exit block:" << std::endl;
+ exit->dump();
+#endif
+ bool isFunctionExit = exit->getTerminator()->getNumSuccessors() == 0;
+
+ bool changed = false;
+
+ if (Barrier::hasBarrier(exit) || isFunctionExit)
+ {
+ addDummyBefore(R, exit);
+ changed = true;
+ }
+
+ // The entry is queried only after the exit side has been handled.
+ llvm::BasicBlock *entry = R->getEntry();
+ if (entry == NULL) return changed;
+
+ bool isFunctionEntry = &entry->getParent()->getEntryBlock() == entry;
+
+ if (Barrier::hasBarrier(entry) || isFunctionEntry)
+ {
+ addDummyAfter(R, entry);
+ changed = true;
+ }
+
+ return changed;
+}
+
+
+/**
+ * Adds a dummy node after the given basic block.
+ *
+ * The block is split at its terminator with SplitBlock(); the block
+ * returned by the split is named "<bb name>.r_entry" and installed as the
+ * new entry of region R.
+ */
+void
+IsolateRegions::addDummyAfter(llvm::Region *R, llvm::BasicBlock *bb)
+{
+  // The former list of in-region successors was collected but never used,
+  // so it has been dropped; the split does not depend on it.
+  llvm::BasicBlock *newEntry = SplitBlock(bb, bb->getTerminator(), this);
+  newEntry->setName(bb->getName() + ".r_entry");
+  R->replaceEntry(newEntry);
+}
+
+/**
+ * Adds a dummy node before the given basic block.
+ *
+ * The edges going in to the original BB are moved to go
+ * in to the dummy BB in case the source BB is inside the
+ * same region.
+ */
+// Collects the predecessors of bb that belong to region R, splits exactly
+// those edges off into a new block with SplitBlockPredecessors() (suffix
+// ".r_exit") and installs that block as the new exit of R.
+void
+IsolateRegions::addDummyBefore(llvm::Region *R, llvm::BasicBlock *bb)
+{
+ std::vector< llvm::BasicBlock* > regionPreds;
+
+ for (pred_iterator i = pred_begin(bb), e = pred_end(bb);
+ i != e; ++i) {
+ llvm::BasicBlock* pred = *i;
+ if (R->contains(pred))
+ regionPreds.push_back(pred);
+ }
+ // NOTE(review): regionPreds may be empty here; the LLVM_3_0 branch then
+ // evaluates &regionPreds[0] on an empty vector - confirm this cannot occur.
+#ifdef LLVM_3_0
+ llvm::BasicBlock* newExit =
+ SplitBlockPredecessors
+ (bb, &regionPreds[0], regionPreds.size(), ".r_exit", this);
+#else
+ llvm::BasicBlock* newExit =
+ SplitBlockPredecessors(bb, regionPreds, ".r_exit", this);
+#endif
+ R->replaceExit(newExit);
+}
diff --git a/src/llvmopencl/IsolateRegions.h b/src/llvmopencl/IsolateRegions.h
new file mode 100644
index 0000000..62f6a29
--- /dev/null
+++ b/src/llvmopencl/IsolateRegions.h
@@ -0,0 +1,44 @@
+// Header for IsolateRegions RegionPass.
+//
+// Copyright (c) 2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef POCL_ISOLATE_REGIONS_H
+#define POCL_ISOLATE_REGIONS_H
+
+#include "llvm/Analysis/RegionPass.h"
+
+namespace pocl {
+
+ // Region pass that isolates Single-Entry Single-Exit regions by adding
+ // dummy basic blocks around their entry and exit blocks.
+ class IsolateRegions : public llvm::RegionPass {
+ public:
+ // Pass identifier; its address is handed to the RegionPass constructor.
+ static char ID;
+
+ IsolateRegions() : RegionPass(ID) {}
+
+ virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+ virtual bool runOnRegion(llvm::Region *R, llvm::RGPassManager&);
+ // Adds a dummy block after bb and makes it the entry of R.
+ void addDummyAfter(llvm::Region *R, llvm::BasicBlock *bb);
+ // Adds a dummy block before bb and makes it the exit of R.
+ void addDummyBefore(llvm::Region *R, llvm::BasicBlock *bb);
+
+ };
+}
+
+#endif
diff --git a/src/llvmopencl/Kernel.cc b/src/llvmopencl/Kernel.cc
new file mode 100644
index 0000000..03e08b8
--- /dev/null
+++ b/src/llvmopencl/Kernel.cc
@@ -0,0 +1,297 @@
+// Class for kernels, llvm::Functions that represent OpenCL C kernels.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos and
+// 2012 Pekka Jääskeläinen / TUT
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "Kernel.h"
+#include "Barrier.h"
+#include <iostream>
+
+#include "config.h"
+#ifdef LLVM_3_1
+#include "llvm/Support/IRBuilder.h"
+#elif defined LLVM_3_2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif
+
+//#define DEBUG_PR_CREATION
+
+using namespace llvm;
+using namespace pocl;
+
+static void add_predecessors(SmallVectorImpl<BasicBlock *> &v,
+ BasicBlock *b);
+static bool verify_no_barriers(const BasicBlock *B);
+
+// Appends to B every block of the kernel whose terminator has no
+// successors. Each of them is cast to BarrierBlock.
+void
+Kernel::getExitBlocks(SmallVectorImpl<BarrierBlock *> &B)
+{
+  for (iterator bb = begin(), bb_end = end(); bb != bb_end; ++bb) {
+    if (bb->getTerminator()->getNumSuccessors() != 0)
+      continue;
+    // All exits must be barrier blocks.
+    B.push_back(cast<BarrierBlock>(bb));
+  }
+}
+
+// Builds the parallel region that ends at barrier block B.
+//
+// Starting from the predecessors of B, the CFG is walked backwards with a
+// worklist. Every non-barrier block reached joins the region; the walk does
+// not continue past barrier blocks, and the first barrier block reached is
+// remembered as the region's entry barrier.
+//
+// Returns NULL when no block was collected, otherwise the region created
+// from the collected blocks, the entry block (the first successor of the
+// entry barrier that is in the region) and the exit block (the single
+// predecessor of B).
+ParallelRegion *
+Kernel::createParallelRegionBefore(BarrierBlock *B)
+{
+ SmallVector<BasicBlock *, 4> pending_blocks;
+ SmallPtrSet<BasicBlock *, 8> blocks_in_region;
+ BarrierBlock *region_entry_barrier = NULL;
+ llvm::BasicBlock *entry = NULL;
+ llvm::BasicBlock *exit = B->getSinglePredecessor();
+ add_predecessors(pending_blocks, B);
+
+#ifdef DEBUG_PR_CREATION
+ std::cerr << "createParallelRegionBefore " << B->getName().str() << std::endl;
+#endif
+
+ while (!pending_blocks.empty()) {
+ BasicBlock *current = pending_blocks.back();
+ pending_blocks.pop_back();
+
+#ifdef DEBUG_PR_CREATION
+ std::cerr << "considering " << current->getName().str() << std::endl;
+#endif
+
+ // avoid infinite recursion of loops
+ if (blocks_in_region.count(current) != 0)
+ {
+#ifdef DEBUG_PR_CREATION
+ std::cerr << "already in the region!" << std::endl;
+#endif
+ continue;
+ }
+
+ // If we reach another barrier this must be the
+ // parallel region entry.
+ if (isa<BarrierBlock>(current)) {
+ if (region_entry_barrier == NULL)
+ region_entry_barrier = cast<BarrierBlock>(current);
+#ifdef DEBUG_PR_CREATION
+ std::cerr << "### it's a barrier!" << std::endl;
+#endif
+ continue;
+ }
+
+
+ // The surrounding if repeats the asserted condition; it only gives a
+ // place for a breakpoint and has no effect when asserts are disabled.
+ if (!verify_no_barriers(current))
+ {
+ assert(verify_no_barriers(current) &&
+ "Barrier found in a non-barrier block! (forgot barrier canonicalization?)");
+ }
+
+#ifdef DEBUG_PR_CREATION
+ std::cerr << "added it to the region" << std::endl;
+#endif
+ // Non-barrier block, this must be on the region.
+ blocks_in_region.insert(current);
+
+ // Add predecessors to pending queue.
+ add_predecessors(pending_blocks, current);
+ }
+
+ if (blocks_in_region.empty())
+ return NULL;
+
+ // Find the entry node.
+ assert (region_entry_barrier != NULL);
+ for (unsigned suc = 0, num = region_entry_barrier->getTerminator()->getNumSuccessors();
+ suc < num; ++suc)
+ {
+ llvm::BasicBlock *entryCandidate =
+ region_entry_barrier->getTerminator()->getSuccessor(suc);
+ if (blocks_in_region.count(entryCandidate) == 0)
+ continue;
+ entry = entryCandidate;
+ break;
+ }
+ assert (blocks_in_region.count(entry) != 0);
+
+ // We got all the blocks in a region, create it.
+ return ParallelRegion::Create(blocks_in_region, entry, exit);
+}
+
+// Appends the predecessors of b to v.
+//
+// When both b and one of its predecessors are barrier blocks, that
+// predecessor is not appended; its own predecessors are added instead by
+// a recursive call.
+static void
+add_predecessors(SmallVectorImpl<BasicBlock *> &v, BasicBlock *b)
+{
+ for (pred_iterator i = pred_begin(b), e = pred_end(b);
+ i != e; ++i) {
+ if ((isa<BarrierBlock> (*i)) && isa<BarrierBlock> (b)) {
+ // Skip over barrier-to-barrier edges (original note: "Why? --Pekka").
+ add_predecessors(v, *i);
+ continue;
+ }
+ v.push_back(*i);
+ }
+}
+
+// Returns true when none of the instructions of B is a Barrier.
+static bool
+verify_no_barriers(const BasicBlock *B)
+{
+  BasicBlock::const_iterator inst = B->begin();
+  const BasicBlock::const_iterator inst_end = B->end();
+
+  // Stop at the first barrier, if there is one.
+  while (inst != inst_end && !isa<Barrier>(inst))
+    ++inst;
+
+  return inst == inst_end;
+}
+
+// Finds all the parallel regions of the kernel.
+//
+// The search starts from every exit block of the kernel and repeatedly
+// creates the region in front of the current barrier with
+// createParallelRegionBefore(), then continues from a not yet visited
+// barrier that precedes the entry block of the region just created.
+// LI is used to tell whether such a barrier lies in the same loop as the
+// region entry.
+//
+// The returned vector is allocated with new; this function does not
+// release it.
+ParallelRegion::ParallelRegionVector *
+Kernel::getParallelRegions(llvm::LoopInfo *LI) {
+ ParallelRegion::ParallelRegionVector *parallel_regions =
+ new ParallelRegion::ParallelRegionVector;
+
+ SmallVector<BarrierBlock *, 4> exit_blocks;
+ getExitBlocks(exit_blocks);
+
+ // We need to keep track of traversed barriers to detect back edges.
+ SmallPtrSet<BarrierBlock *, 8> found_barriers;
+
+ // First find all the ParallelRegions in the Function.
+ while (!exit_blocks.empty()) {
+
+ // We start on an exit block and process the parallel regions upwards
+ // (finding an execution trace).
+ BarrierBlock *exit = exit_blocks.back();
+ exit_blocks.pop_back();
+
+ while (ParallelRegion *PR = createParallelRegionBefore(exit)) {
+ assert(PR != NULL && !PR->empty() &&
+ "Empty parallel region in kernel (contiguous barriers)!");
+
+ found_barriers.insert(exit);
+ // exit is re-selected below from the predecessors of the region entry.
+ exit = NULL;
+ parallel_regions->push_back(PR);
+ BasicBlock *entry = PR->entryBB();
+ int found_predecessors = 0;
+ BarrierBlock *loop_barrier = NULL;
+ for (pred_iterator i = pred_begin(entry), e = pred_end(entry);
+ i != e; ++i) {
+ // Every predecessor of a region entry is cast to a barrier block.
+ BarrierBlock *barrier = cast<BarrierBlock> (*i);
+ if (!found_barriers.count(barrier)) {
+ /* If this is a loop header block we might have edges from two
+ unprocessed barriers. The one inside the loop (coming from a
+ computation block after a branch block) should be processed
+ first. */
+ std::string bbName = ""; // unused
+ const bool IS_IN_THE_SAME_LOOP =
+ LI->getLoopFor(barrier) != NULL &&
+ LI->getLoopFor(entry) != NULL &&
+ LI->getLoopFor(entry) == LI->getLoopFor(barrier);
+
+ if (IS_IN_THE_SAME_LOOP)
+ {
+#ifdef DEBUG_PR_CREATION
+ std::cout << "### found a barrier inside the loop:" << std::endl;
+ std::cout << barrier->getName().str() << std::endl;
+#endif
+ if (loop_barrier != NULL) {
+ // there can be multiple latches and each have their barrier,
+ // save the previously found inner loop barrier
+ exit_blocks.push_back(loop_barrier);
+ }
+ loop_barrier = barrier;
+ }
+ else
+ {
+#ifdef DEBUG_PR_CREATION
+ std::cout << "### found a barrier:" << std::endl;
+ std::cout << barrier->getName().str() << std::endl;
+#endif
+ exit = barrier;
+ }
+ ++found_predecessors;
+ }
+ }
+
+ if (loop_barrier != NULL)
+ {
+ /* The secondary barrier to process in case it was a loop
+ header. Push it for later processing. */
+ if (exit != NULL)
+ exit_blocks.push_back(exit);
+ /* always process the inner loop regions first */
+ if (!found_barriers.count(loop_barrier))
+ exit = loop_barrier;
+ }
+
+#ifdef DEBUG_PR_CREATION
+ std::cout << "### created a ParallelRegion:" << std::endl;
+ PR->dumpNames();
+ std::cout << std::endl;
+#endif
+
+ if (found_predecessors == 0)
+ {
+ /* This path has been traversed and we encountered no more
+ unprocessed regions. It means we have either traversed all
+ paths from the exit or have transformed a loop and thus
+ encountered only a barrier that was seen (and thus
+ processed) before. */
+ break;
+ }
+ assert ((exit != NULL) && "Parallel region without entry barrier!");
+ }
+ }
+ return parallel_regions;
+
+}
+
+// Inserts, at the first non-PHI instruction of the kernel entry block, code
+// that stores get_local_size(0), get_local_size(1) and get_local_size(2)
+// into the module globals _local_size_x, _local_size_y and _local_size_z.
+//
+// LocalSizeX, LocalSizeY and LocalSizeZ are not used: the sizes are
+// obtained by the generated calls when the kernel runs.
+//
+// get_local_size is declared as "size_t get_local_size(uint)": the result
+// has the pointer width of the module and the dimension index is always a
+// 32-bit integer. (The previous code used the size_t width for the call
+// argument while declaring a 32-bit parameter, which only matched on
+// 32-bit modules.) A size global that is missing from the module is
+// skipped.
+void
+Kernel::addLocalSizeInitCode(size_t LocalSizeX, size_t LocalSizeY, size_t LocalSizeZ) {
+
+  IRBuilder<> builder(getEntryBlock().getFirstNonPHI());
+
+  llvm::Module* M = getParent();
+
+  int size_t_width = 32;
+  if (M->getPointerSize() == llvm::Module::Pointer64)
+    size_t_width = 64;
+
+  Type *index_type = IntegerType::get(M->getContext(), 32);
+  FunctionType *ft = FunctionType::get
+    (/*Result=*/ IntegerType::get(M->getContext(), size_t_width),
+     /*Params=*/ index_type,
+     /*isVarArg=*/ false);
+
+  // The callee is used as returned: when the module already declares
+  // get_local_size with another type this is a cast expression rather
+  // than a Function, which a dyn_cast<Function> would turn into NULL.
+  Constant *localsize = M->getOrInsertFunction("get_local_size", ft);
+
+  static const char *const size_globals[3] =
+    { "_local_size_x", "_local_size_y", "_local_size_z" };
+
+  for (unsigned dim = 0; dim != 3; ++dim) {
+    GlobalVariable *gv = M->getGlobalVariable(size_globals[dim]);
+    if (gv == NULL)
+      continue;
+    builder.CreateStore(builder.CreateCall(localsize,
+                                           ConstantInt::get(index_type, dim)),
+                        gv);
+  }
+}
+
diff --git a/src/llvmopencl/Kernel.h b/src/llvmopencl/Kernel.h
new file mode 100644
index 0000000..5337b54
--- /dev/null
+++ b/src/llvmopencl/Kernel.h
@@ -0,0 +1,54 @@
+// Class for kernels, a special kind of function.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_KERNEL_H
+#define _POCL_KERNEL_H
+
+#include "ParallelRegion.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+
+namespace pocl {
+
+ // A kernel, modelled as an llvm::Function with extra helpers for barriers,
+ // parallel regions and local-size initialization. The class declares no
+ // data members of its own.
+ class Kernel : public llvm::Function {
+ public:
+   // --- LLVM-style type queries -------------------------------------------
+
+   static bool isKernel(const llvm::Function &F);
+
+   // Every Kernel is trivially a Kernel.
+   static bool classof(const Kernel *) { return true; }
+
+   // Any function is currently accepted as a kernel. This hook could check
+   // for kernel metadata instead, but it would then need to be overridable
+   // somehow to honor the forced kernel name(s) parameter on the command
+   // line.
+   static bool classof(const llvm::Function *) { return true; }
+
+   // --- Barrier / parallel region helpers ---------------------------------
+
+   // Output parameter B receives barrier blocks (presumably the exit
+   // blocks of the kernel -- confirm against Kernel.cc).
+   void getExitBlocks(llvm::SmallVectorImpl<BarrierBlock *> &B);
+
+   ParallelRegion *createParallelRegionBefore(BarrierBlock *B);
+
+   // Returns the vector of parallel regions found in the kernel.
+   ParallelRegion::ParallelRegionVector *
+   getParallelRegions(llvm::LoopInfo *LI);
+
+   // --- Code generation ----------------------------------------------------
+
+   void addLocalSizeInitCode(size_t LocalSizeX, size_t LocalSizeY,
+                             size_t LocalSizeZ);
+ };
+
+}
+
+#endif
diff --git a/src/llvmopencl/LLVMUtils.cc b/src/llvmopencl/LLVMUtils.cc
new file mode 100644
index 0000000..aeb02d7
--- /dev/null
+++ b/src/llvmopencl/LLVMUtils.cc
@@ -0,0 +1,90 @@
+// Implementation of LLVMUtils, useful common LLVM-related functionality.
+//
+// Copyright (c) 2013 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "LLVMUtils.h"
+
+#include "config.h"
+
+#ifdef LLVM_3_2
+#include <llvm/Module.h>
+#include <llvm/Metadata.h>
+#else
+#include <llvm/IR/Module.h>
+#include <llvm/IR/Metadata.h>
+#endif
+
+using namespace llvm;
+
+/**
+ * Regenerates the metadata that points to the original kernel
+ * (whose fingerprint was modified) so that it points to the new
+ * kernel.
+ *
+ * Two named metadata nodes are handled:
+ *  - opencl.kernel_wg_size_info: for every entry whose first operand is an
+ *    original kernel that was replaced by a different function, a copy of
+ *    the entry is appended with the first operand swapped for the new
+ *    kernel. The existing entries are left in place.
+ *  - opencl.kernels: erased (if present) and rebuilt with one entry per new
+ *    kernel in the mapping.
+ *
+ * Only the first operand of a metadata entry is checked for being the
+ * kernel function.
+ *
+ * @param M       module whose metadata is updated.
+ * @param kernels mapping from original kernel to the kernel replacing it.
+ */
+void
+regenerate_kernel_metadata(llvm::Module &M, FunctionMapping &kernels)
+{
+  // reproduce the opencl.kernel_wg_size_info metadata
+  NamedMDNode *wg_sizes = M.getNamedMetadata("opencl.kernel_wg_size_info");
+  if (wg_sizes != NULL)
+    {
+      // Entries are appended below; only the entries that existed on entry
+      // are examined, so freshly added ones are never re-processed.
+      const unsigned original_count = wg_sizes->getNumOperands();
+      for (unsigned mni = 0; mni < original_count; ++mni)
+        {
+          MDNode *wgsizeMD = dyn_cast<MDNode>(wg_sizes->getOperand(mni));
+          if (wgsizeMD == NULL || wgsizeMD->getNumOperands() == 0)
+            continue;
+
+          // The operand may be null (or not a function at all); neither can
+          // name a kernel of the mapping.
+          Function *md_kernel =
+            dyn_cast_or_null<Function>(wgsizeMD->getOperand(0));
+          if (md_kernel == NULL)
+            continue;
+
+          FunctionMapping::const_iterator match = kernels.find(md_kernel);
+          if (match == kernels.end() || match->first == match->second)
+            continue;
+
+          // found a wg size metadata that points to the old kernel, copy its
+          // operands except the first one to a new MDNode
+          SmallVector<Value*, 8> operands;
+          operands.push_back(match->second);
+          for (unsigned opr = 1; opr < wgsizeMD->getNumOperands(); ++opr)
+            operands.push_back(wgsizeMD->getOperand(opr));
+
+          MDNode *new_wg_md = MDNode::get(M.getContext(), operands);
+          wg_sizes->addOperand(new_wg_md);
+        }
+    }
+
+  // reproduce the opencl.kernels metadata
+  NamedMDNode *nmd = M.getNamedMetadata("opencl.kernels");
+  if (nmd)
+    M.eraseNamedMetadata(nmd);
+
+  nmd = M.getOrInsertNamedMetadata("opencl.kernels");
+  for (FunctionMapping::const_iterator i = kernels.begin(),
+         e = kernels.end();
+       i != e; ++i) {
+    MDNode *md = MDNode::get(M.getContext(), ArrayRef<Value *>((*i).second));
+    nmd->addOperand(md);
+  }
+}
+
diff --git a/src/llvmopencl/LLVMUtils.h b/src/llvmopencl/LLVMUtils.h
new file mode 100644
index 0000000..e6a89db
--- /dev/null
+++ b/src/llvmopencl/LLVMUtils.h
@@ -0,0 +1,38 @@
+// Header for LLVMUtils, useful common LLVM-related functionality.
+//
+// Copyright (c) 2013 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_LLVM_UTILS_H
+#define _POCL_LLVM_UTILS_H
+
+#include <map>
+
+namespace llvm {
+ class Module;
+ class Function;
+}
+
+// Associates a function with another function; regenerate_kernel_metadata()
+// reads the key as the original kernel and the value as its replacement.
+typedef std::map<llvm::Function*, llvm::Function*> FunctionMapping;
+
+// Updates the OpenCL kernel metadata of module M according to the
+// original-kernel -> new-kernel pairs in 'kernels'.
+void regenerate_kernel_metadata(llvm::Module &M, FunctionMapping &kernels);
+
+#endif
diff --git a/src/llvmopencl/LoopBarriers.cc b/src/llvmopencl/LoopBarriers.cc
new file mode 100644
index 0000000..5e4965f
--- /dev/null
+++ b/src/llvmopencl/LoopBarriers.cc
@@ -0,0 +1,194 @@
+// LLVM loop pass that adds required barriers to loops.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+// 2012-2014 Pekka Jääskeläinen / Tampere University of Technology
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#endif
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <iostream>
+
+#include "LoopBarriers.h"
+#include "Barrier.h"
+#include "Workgroup.h"
+
+//#define DEBUG_LOOP_BARRIERS
+
+using namespace llvm;
+using namespace pocl;
+
+// Registers the pass under the command line name "loop-barriers". The
+// anonymous namespace already gives the registration object internal
+// linkage.
+namespace {
+  RegisterPass<LoopBarriers> X("loop-barriers", "Add needed barriers to loops");
+}
+
+// Pass identifier handed to the LoopPass constructor.
+char LoopBarriers::ID = 0;
+
+// Declares the analyses this pass depends on: the dominator tree is both
+// required and reported as preserved.
+void LoopBarriers::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<DominatorTree>();
+  AU.addPreserved<DominatorTree>();
+}
+
+// Pass entry point. Loops of functions that Workgroup does not consider a
+// kernel to process are left alone; otherwise the loop is handed to
+// ProcessLoop() and the dominator tree is verified afterwards.
+// Returns whatever ProcessLoop() reported.
+bool LoopBarriers::runOnLoop(Loop *L, LPPassManager &LPM) {
+  Function *enclosing = L->getHeader()->getParent();
+  if (!Workgroup::isKernelToProcess(*enclosing))
+    return false;
+
+  DT = &getAnalysis<DominatorTree>();
+  const bool modified = ProcessLoop(L, LPM);
+  DT->verifyAnalysis();
+  return modified;
+}
+
+
+/**
+ * Adds the barriers a loop needs.
+ *
+ * If some block of the loop contains a barrier, barriers are created
+ *  - before the terminator of the preheader,
+ *  - after the PHI nodes of the header (only if the header starts with PHIs),
+ *  - before the terminator of the exiting block, when getExitingBlock()
+ *    returns one,
+ *  - before the terminator of the latch; if getLoopLatch() does not return
+ *    a latch distinct from the exiting block, every in-loop predecessor of
+ *    the header that is dominated by the barrier's block is treated as a
+ *    latch instead.
+ * The touched blocks are renamed with a descriptive suffix.
+ *
+ * If the loop contains no barrier but the preheader's terminator is directly
+ * preceded by a barrier, the preheader is split before its terminator so the
+ * loop gets a barrier-free preheader.
+ *
+ * @param L   the loop to process; must have a preheader.
+ * @param LPM unused.
+ * @return true if the IR was modified, false otherwise.
+ */
+bool
+LoopBarriers::ProcessLoop(Loop *L, LPPassManager &LPM)
+{
+  // Find the block holding the first barrier, in loop block order.
+  BasicBlock *barrier_block = NULL;
+  for (Loop::block_iterator i = L->block_begin(), e = L->block_end();
+       i != e && barrier_block == NULL; ++i) {
+    for (BasicBlock::iterator j = (*i)->begin(), je = (*i)->end();
+         j != je; ++j) {
+      if (isa<Barrier>(j)) {
+        barrier_block = *i;
+        break;
+      }
+    }
+  }
+
+  if (barrier_block != NULL) {
+    // Found a barrier in this loop:
+    // 1) add a barrier in the loop header.
+    // 2) add a barrier in the latches
+
+    // Add a barrier on the preheader to ensure all WIs reach
+    // the loop header with all the previous code already
+    // executed.
+    BasicBlock *preheader = L->getLoopPreheader();
+    assert((preheader != NULL) && "Non-canonicalized loop found!\n");
+#ifdef DEBUG_LOOP_BARRIERS
+    std::cerr << "### adding to preheader BB" << std::endl;
+    preheader->dump();
+    std::cerr << "### before instr" << std::endl;
+    preheader->getTerminator()->dump();
+#endif
+    Barrier::Create(preheader->getTerminator());
+    preheader->setName(preheader->getName() + ".loopbarrier");
+
+    // Add a barrier after the PHI nodes on the header (the replicated
+    // headers will be merged afterwards).
+    BasicBlock *header = L->getHeader();
+    if (header->getFirstNonPHI() != &header->front()) {
+      Barrier::Create(header->getFirstNonPHI());
+      header->setName(header->getName() + ".phibarrier");
+      // Split the block to create a replicable region of
+      // the loop contents in case the phi node contains a
+      // branch (that can be to inside the region).
+      // if (header->getTerminator()->getNumSuccessors() > 1)
+      //   SplitBlock(header, header->getTerminator(), this);
+    }
+
+    // Add the barriers on the exiting block and the latches,
+    // which might not always be the same if there is computation
+    // after the exit decision.
+    BasicBlock *brexit = L->getExitingBlock();
+    if (brexit != NULL) {
+      Barrier::Create(brexit->getTerminator());
+      brexit->setName(brexit->getName() + ".brexitbarrier");
+    }
+
+    BasicBlock *latch = L->getLoopLatch();
+    if (latch != NULL && brexit != latch) {
+      // This loop has only one latch. Do not check for dominance, we
+      // are probably running before BTR.
+      Barrier::Create(latch->getTerminator());
+      latch->setName(latch->getName() + ".latchbarrier");
+      // Barriers were inserted and blocks renamed above, so the change has
+      // to be reported to the pass manager.
+      return true;
+    }
+
+    // Modified code from llvm::LoopBase::getLoopLatch to
+    // go through all the latches.
+    typedef GraphTraits<Inverse<BasicBlock *> > InvBlockTraits;
+    InvBlockTraits::ChildIteratorType PI = InvBlockTraits::child_begin(header);
+    InvBlockTraits::ChildIteratorType PE = InvBlockTraits::child_end(header);
+    for (; PI != PE; ++PI) {
+      InvBlockTraits::NodeType *candidate = *PI;
+      if (!L->contains(candidate))
+        continue;
+      // Latch found in the loop, see if the barrier dominates it
+      // (otherwise it might not even belong to this "tail", see
+      // forifbarrier1 graph test).
+      if (DT->dominates(barrier_block, candidate)) {
+        Barrier::Create(candidate->getTerminator());
+        candidate->setName(candidate->getName() + ".latchbarrier");
+      }
+    }
+    return true;
+  }
+
+  /* This is a loop without a barrier. Ensure we have a non-barrier
+     block as a preheader so we can replicate the loop as a whole.
+
+     If the block has proper instructions after the barrier, it
+     will be split in CanonicalizeBarriers. */
+  BasicBlock *preheader = L->getLoopPreheader();
+  assert((preheader != NULL) && "Non-canonicalized loop found!\n");
+  TerminatorInst *t = preheader->getTerminator();
+  Instruction *prev = NULL;
+  if (&preheader->front() != t)
+    prev = t->getPrevNode();
+  if (prev && isa<Barrier>(prev))
+    {
+      BasicBlock *new_b = SplitBlock(preheader, t, this);
+      new_b->setName(preheader->getName() + ".postbarrier_dummy");
+      return true;
+    }
+
+  return false;
+}
+
diff --git a/src/llvmopencl/LoopBarriers.h b/src/llvmopencl/LoopBarriers.h
new file mode 100644
index 0000000..6d80de6
--- /dev/null
+++ b/src/llvmopencl/LoopBarriers.h
@@ -0,0 +1,47 @@
+// Header for the LoopBarriers.cc loop pass.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef POCL_LOOP_BARRIERS_H
+#define POCL_LOOP_BARRIERS_H
+
+#include "llvm/Analysis/LoopPass.h"
+#include <set>
+
+namespace pocl {
+  // Loop pass (registered as "loop-barriers" in LoopBarriers.cc) that adds
+  // the barriers needed by loops.
+  class LoopBarriers : public llvm::LoopPass {
+
+  public:
+    // Pass identifier handed to the LoopPass constructor.
+    static char ID;
+
+    // DT starts out null; it is set by runOnLoop() before it is used.
+    LoopBarriers() : LoopPass(ID), DT(0) {}
+
+    virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+    virtual bool runOnLoop(llvm::Loop *L, llvm::LPPassManager &LPM);
+
+  private:
+    // Dominator tree of the function being processed; refreshed on every
+    // runOnLoop() call that processes a loop.
+    llvm::DominatorTree *DT;
+
+    // Does the actual work for one loop; returns true if the IR changed.
+    bool ProcessLoop(llvm::Loop *L, llvm::LPPassManager &LPM);
+  };
+}
+
+#endif
diff --git a/src/llvmopencl/Makefile.am b/src/llvmopencl/Makefile.am
new file mode 100644
index 0000000..881a35c
--- /dev/null
+++ b/src/llvmopencl/Makefile.am
@@ -0,0 +1,53 @@
+# Process this file with automake to produce Makefile.in (in this,
+# and all subdirectories).
+# Makefile.am for pocl/lib/llvmopencl.
+#
+# Copyright (c) 2011 Universidad Rey Juan Carlos
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+pkglib_LTLIBRARIES = llvmopencl.la
+
+AM_CXXFLAGS = -I@top_srcdir@/fix-include -I@top_srcdir@/include `@LLVM_CONFIG@ --cxxflags`
+AM_LDFLAGS = -module -export-dynamic -version-info ${KERNEL_COMPILER_LIB_VERSION} `@LLVM_CONFIG@ --ldflags`
+llvmopencl_la_LIBADD = -lLLVM-${LLVM_VERSION}
+
+llvmopencl_la_SOURCES = Barrier.h \
+ BarrierBlock.h BarrierBlock.cc \
+ Kernel.h Kernel.cc \
+ ParallelRegion.h ParallelRegion.cc \
+ CanonicalizeBarriers.h CanonicalizeBarriers.cc \
+ LoopBarriers.h LoopBarriers.cc \
+ GenerateHeader.cc Workgroup.h Workgroup.cc \
+ BarrierTailReplication.h BarrierTailReplication.cc \
+ Flatten.cc IsolateRegions.h IsolateRegions.cc \
+ WorkitemReplication.h WorkitemReplication.cc \
+ ImplicitLoopBarriers.h ImplicitLoopBarriers.cc \
+ WorkItemAliasAnalysis.cc WIVectorize.cc \
+ WorkitemHandler.h WorkitemHandler.cc \
+ WorkitemLoops.h WorkitemLoops.cc \
+ PHIsToAllocas.h PHIsToAllocas.cc \
+ BreakConstantGEPs.h BreakConstantGEPs.cpp \
+ WorkitemHandlerChooser.h WorkitemHandlerChooser.cc \
+ AllocasToEntry.h AllocasToEntry.cc \
+ TargetAddressSpaces.h TargetAddressSpaces.cc \
+ LLVMUtils.cc LLVMUtils.h \
+ VariableUniformityAnalysis.h VariableUniformityAnalysis.cc
+
+#llvmopencl_la_LIBADD += @LIBS_LLVMTRANSFORMUTILS@
diff --git a/src/llvmopencl/Makefile.in b/src/llvmopencl/Makefile.in
new file mode 100644
index 0000000..e4dd24b
--- /dev/null
+++ b/src/llvmopencl/Makefile.in
@@ -0,0 +1,822 @@
+# Makefile.in generated by automake 1.14 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2014 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# Process this file with automake to produce Makefile.in (in this,
+# and all subdirectories).
+# Makefile.am for pocl/lib/llvmopencl.
+#
+# Copyright (c) 2011 Universidad Rey Juan Carlos
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+VPATH = @srcdir@
+am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
+am__make_running_with_option = \
+ case $${target_option-} in \
+ ?) ;; \
+ *) echo "am__make_running_with_option: internal error: invalid" \
+ "target option '$${target_option-}' specified" >&2; \
+ exit 1;; \
+ esac; \
+ has_opt=no; \
+ sane_makeflags=$$MAKEFLAGS; \
+ if $(am__is_gnu_make); then \
+ sane_makeflags=$$MFLAGS; \
+ else \
+ case $$MAKEFLAGS in \
+ *\\[\ \ ]*) \
+ bs=\\; \
+ sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+ | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
+ esac; \
+ fi; \
+ skip_next=no; \
+ strip_trailopt () \
+ { \
+ flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+ }; \
+ for flg in $$sane_makeflags; do \
+ test $$skip_next = yes && { skip_next=no; continue; }; \
+ case $$flg in \
+ *=*|--*) continue;; \
+ -*I) strip_trailopt 'I'; skip_next=yes;; \
+ -*I?*) strip_trailopt 'I';; \
+ -*O) strip_trailopt 'O'; skip_next=yes;; \
+ -*O?*) strip_trailopt 'O';; \
+ -*l) strip_trailopt 'l'; skip_next=yes;; \
+ -*l?*) strip_trailopt 'l';; \
+ -[dEDm]) skip_next=yes;; \
+ -[JT]) skip_next=yes;; \
+ esac; \
+ case $$flg in \
+ *$$target_option*) has_opt=yes; break;; \
+ esac; \
+ done; \
+ test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+target_triplet = @target@
+subdir = lib/llvmopencl
+DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
+ $(top_srcdir)/config/depcomp
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_boost_base.m4 \
+ $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
+ $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
+ $(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/acinclude.m4 \
+ $(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+ $(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
+am__vpath_adj = case $$p in \
+ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
+ *) f=$$p;; \
+ esac;
+am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
+am__install_max = 40
+am__nobase_strip_setup = \
+ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
+am__nobase_strip = \
+ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
+am__nobase_list = $(am__nobase_strip_setup); \
+ for p in $$list; do echo "$$p $$p"; done | \
+ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
+ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
+ if (++n[$$2] == $(am__install_max)) \
+ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
+ END { for (dir in files) print dir, files[dir] }'
+am__base_list = \
+ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
+ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+ test -z "$$files" \
+ || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+ $(am__cd) "$$dir" && rm -f $$files; }; \
+ }
+am__installdirs = "$(DESTDIR)$(pkglibdir)"
+LTLIBRARIES = $(pkglib_LTLIBRARIES)
+llvmopencl_la_DEPENDENCIES =
+am_llvmopencl_la_OBJECTS = BarrierBlock.lo Kernel.lo ParallelRegion.lo \
+ CanonicalizeBarriers.lo LoopBarriers.lo GenerateHeader.lo \
+ Workgroup.lo BarrierTailReplication.lo Flatten.lo \
+ IsolateRegions.lo WorkitemReplication.lo \
+ ImplicitLoopBarriers.lo WorkItemAliasAnalysis.lo \
+ WIVectorize.lo WorkitemHandler.lo WorkitemLoops.lo \
+ PHIsToAllocas.lo BreakConstantGEPs.lo \
+ WorkitemHandlerChooser.lo AllocasToEntry.lo \
+ TargetAddressSpaces.lo LLVMUtils.lo \
+ VariableUniformityAnalysis.lo
+llvmopencl_la_OBJECTS = $(am_llvmopencl_la_OBJECTS)
+AM_V_lt = $(am__v_lt_@AM_V@)
+am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
+am__v_lt_0 = --silent
+am__v_lt_1 =
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_@AM_V@)
+am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
+am__v_GEN_0 = @echo " GEN " $@;
+am__v_GEN_1 =
+AM_V_at = $(am__v_at_@AM_V@)
+am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 =
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/config/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS)
+LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
+ $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \
+ $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+ $(AM_CXXFLAGS) $(CXXFLAGS)
+AM_V_CXX = $(am__v_CXX_@AM_V@)
+am__v_CXX_ = $(am__v_CXX_@AM_DEFAULT_V@)
+am__v_CXX_0 = @echo " CXX " $@;
+am__v_CXX_1 =
+CXXLD = $(CXX)
+CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
+ $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \
+ $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+AM_V_CXXLD = $(am__v_CXXLD_@AM_V@)
+am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@)
+am__v_CXXLD_0 = @echo " CXXLD " $@;
+am__v_CXXLD_1 =
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+ $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
+ $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+ $(AM_CFLAGS) $(CFLAGS)
+AM_V_CC = $(am__v_CC_@AM_V@)
+am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
+am__v_CC_0 = @echo " CC " $@;
+am__v_CC_1 =
+CCLD = $(CC)
+LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+ $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+ $(AM_LDFLAGS) $(LDFLAGS) -o $@
+AM_V_CCLD = $(am__v_CCLD_@AM_V@)
+am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
+am__v_CCLD_0 = @echo " CCLD " $@;
+am__v_CCLD_1 =
+SOURCES = $(llvmopencl_la_SOURCES)
+DIST_SOURCES = $(llvmopencl_la_SOURCES)
+am__can_run_installinfo = \
+ case $$AM_UPDATE_INFO_DIR in \
+ n|no|NO) false;; \
+ *) (install-info --version) >/dev/null 2>&1;; \
+ esac
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+# Read a list of newline-separated strings from the standard input,
+# and print each of them once, without duplicates. Input order is
+# *not* preserved.
+am__uniquify_input = $(AWK) '\
+ BEGIN { nonempty = 0; } \
+ { items[$$0] = 1; nonempty = 1; } \
+ END { if (nonempty) { for (i in items) print i; }; } \
+'
+# Make sure the list of sources is unique. This is necessary because,
+# e.g., the same source file might be shared among _SOURCES variables
+# for different programs/libraries.
+am__define_uniq_tagged_files = \
+ list='$(am__tagged_files)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | $(am__uniquify_input)`
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+AMTAR = @AMTAR@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AR = @AR@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+BOOST_CPPFLAGS = @BOOST_CPPFLAGS@
+BOOST_LDFLAGS = @BOOST_LDFLAGS@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CLANG = @CLANG@
+CLANGXX = @CLANGXX@
+CLFLAGS = @CLFLAGS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+CXXDEPMODE = @CXXDEPMODE@
+CXXFLAGS = @CXXFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+FGREP = @FGREP@
+GLEW_CFLAGS = @GLEW_CFLAGS@
+GLEW_LIBS = @GLEW_LIBS@
+GREP = @GREP@
+HOST = @HOST@
+HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
+HOST_CPU = @HOST_CPU@
+HOST_LD_FLAGS = @HOST_LD_FLAGS@
+HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
+HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
+HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
+HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
+HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
+HWLOC_CFLAGS = @HWLOC_CFLAGS@
+HWLOC_LIBS = @HWLOC_LIBS@
+ICD_LD_FLAGS = @ICD_LD_FLAGS@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+KERNEL_COMPILER_LIB_VERSION = @KERNEL_COMPILER_LIB_VERSION@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LD_FLAGS_BIN = @LD_FLAGS_BIN@
+LIBOBJS = @LIBOBJS@
+LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
+LIBS = @LIBS@
+LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
+LIBSPE_LIBS = @LIBSPE_LIBS@
+LIBTOOL = @LIBTOOL@
+LIB_VERSION = @LIB_VERSION@
+LIPO = @LIPO@
+LLC = @LLC@
+LLVM_AR = @LLVM_AR@
+LLVM_AS = @LLVM_AS@
+LLVM_CONFIG = @LLVM_CONFIG@
+LLVM_LINK = @LLVM_LINK@
+LLVM_OPT = @LLVM_OPT@
+LLVM_RANLIB = @LLVM_RANLIB@
+LLVM_VERSION = @LLVM_VERSION@
+LN_S = @LN_S@
+LTDL_LIBS = @LTDL_LIBS@
+LTLIBOBJS = @LTLIBOBJS@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCL_ICD_CFLAGS = @OCL_ICD_CFLAGS@
+OCL_ICD_LIBS = @OCL_ICD_LIBS@
+OCL_KERNEL_TARGET = @OCL_KERNEL_TARGET@
+OCL_TARGETS = @OCL_TARGETS@
+OPENCL_CFLAGS = @OPENCL_CFLAGS@
+OPENCL_CMAKE = @OPENCL_CMAKE@
+OPENCL_EXTLIBS = @OPENCL_EXTLIBS@
+OPENCL_LIBS = @OPENCL_LIBS@
+OPT = @OPT@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+PKG_CONFIG = @PKG_CONFIG@
+PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
+PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
+POAT_TESTSUITES = @POAT_TESTSUITES@
+POCL_DEVICE_ADDRESS_BITS = @POCL_DEVICE_ADDRESS_BITS@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SDL_CFLAGS = @SDL_CFLAGS@
+SDL_LIBS = @SDL_LIBS@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+STRIP = @STRIP@
+TARGET = @TARGET@
+TARGET_CLANG_FLAGS = @TARGET_CLANG_FLAGS@
+TARGET_CPU = @TARGET_CPU@
+TARGET_LLC_FLAGS = @TARGET_LLC_FLAGS@
+TARGET_SIZEOF_DOUBLE = @TARGET_SIZEOF_DOUBLE@
+TARGET_SIZEOF_HALF = @TARGET_SIZEOF_HALF@
+TARGET_SIZEOF_LONG = @TARGET_SIZEOF_LONG@
+TARGET_SIZEOF_VOID_P = @TARGET_SIZEOF_VOID_P@
+TCECC = @TCECC@
+TCE_AVAILABLE = @TCE_AVAILABLE@
+TCE_CONFIG = @TCE_CONFIG@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_CXX = @ac_ct_CXX@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target = @target@
+target_alias = @target_alias@
+target_cpu = @target_cpu@
+target_os = @target_os@
+target_vendor = @target_vendor@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+pkglib_LTLIBRARIES = llvmopencl.la
+AM_CXXFLAGS = -I@top_srcdir@/fix-include -I@top_srcdir@/include `@LLVM_CONFIG@ --cxxflags`
+AM_LDFLAGS = -module -export-dynamic -version-info ${KERNEL_COMPILER_LIB_VERSION} `@LLVM_CONFIG@ --ldflags`
+llvmopencl_la_LIBADD = -lLLVM-${LLVM_VERSION}
+llvmopencl_la_SOURCES = Barrier.h \
+ BarrierBlock.h BarrierBlock.cc \
+ Kernel.h Kernel.cc \
+ ParallelRegion.h ParallelRegion.cc \
+ CanonicalizeBarriers.h CanonicalizeBarriers.cc \
+ LoopBarriers.h LoopBarriers.cc \
+ GenerateHeader.cc Workgroup.h Workgroup.cc \
+ BarrierTailReplication.h BarrierTailReplication.cc \
+ Flatten.cc IsolateRegions.h IsolateRegions.cc \
+ WorkitemReplication.h WorkitemReplication.cc \
+ ImplicitLoopBarriers.h ImplicitLoopBarriers.cc \
+ WorkItemAliasAnalysis.cc WIVectorize.cc \
+ WorkitemHandler.h WorkitemHandler.cc \
+ WorkitemLoops.h WorkitemLoops.cc \
+ PHIsToAllocas.h PHIsToAllocas.cc \
+ BreakConstantGEPs.h BreakConstantGEPs.cpp \
+ WorkitemHandlerChooser.h WorkitemHandlerChooser.cc \
+ AllocasToEntry.h AllocasToEntry.cc \
+ TargetAddressSpaces.h TargetAddressSpaces.cc \
+ LLVMUtils.cc LLVMUtils.h \
+ VariableUniformityAnalysis.h VariableUniformityAnalysis.cc
+
+all: all-am
+
+.SUFFIXES:
+.SUFFIXES: .cc .cpp .lo .o .obj
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
+ @for dep in $?; do \
+ case '$(am__configure_deps)' in \
+ *$$dep*) \
+ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+ && { if test -f $@; then exit 0; else break; fi; }; \
+ exit 1;; \
+ esac; \
+ done; \
+ echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign lib/llvmopencl/Makefile'; \
+ $(am__cd) $(top_srcdir) && \
+ $(AUTOMAKE) --foreign lib/llvmopencl/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+ @case '$?' in \
+ *config.status*) \
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+ *) \
+ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+ esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: $(am__configure_deps)
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+install-pkglibLTLIBRARIES: $(pkglib_LTLIBRARIES)
+ @$(NORMAL_INSTALL)
+ @list='$(pkglib_LTLIBRARIES)'; test -n "$(pkglibdir)" || list=; \
+ list2=; for p in $$list; do \
+ if test -f $$p; then \
+ list2="$$list2 $$p"; \
+ else :; fi; \
+ done; \
+ test -z "$$list2" || { \
+ echo " $(MKDIR_P) '$(DESTDIR)$(pkglibdir)'"; \
+ $(MKDIR_P) "$(DESTDIR)$(pkglibdir)" || exit 1; \
+ echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(pkglibdir)'"; \
+ $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(pkglibdir)"; \
+ }
+
+uninstall-pkglibLTLIBRARIES:
+ @$(NORMAL_UNINSTALL)
+ @list='$(pkglib_LTLIBRARIES)'; test -n "$(pkglibdir)" || list=; \
+ for p in $$list; do \
+ $(am__strip_dir) \
+ echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(pkglibdir)/$$f'"; \
+ $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(pkglibdir)/$$f"; \
+ done
+
+clean-pkglibLTLIBRARIES:
+ -test -z "$(pkglib_LTLIBRARIES)" || rm -f $(pkglib_LTLIBRARIES)
+ @list='$(pkglib_LTLIBRARIES)'; \
+ locs=`for p in $$list; do echo $$p; done | \
+ sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
+ sort -u`; \
+ test -z "$$locs" || { \
+ echo rm -f $${locs}; \
+ rm -f $${locs}; \
+ }
+
+llvmopencl.la: $(llvmopencl_la_OBJECTS) $(llvmopencl_la_DEPENDENCIES) $(EXTRA_llvmopencl_la_DEPENDENCIES)
+ $(AM_V_CXXLD)$(CXXLINK) -rpath $(pkglibdir) $(llvmopencl_la_OBJECTS) $(llvmopencl_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+ -rm -f *.$(OBJEXT)
+
+distclean-compile:
+ -rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/AllocasToEntry.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/BarrierBlock.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/BarrierTailReplication.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/BreakConstantGEPs.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/CanonicalizeBarriers.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Flatten.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/GenerateHeader.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ImplicitLoopBarriers.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/IsolateRegions.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Kernel.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/LLVMUtils.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/LoopBarriers.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/PHIsToAllocas.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ParallelRegion.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/TargetAddressSpaces.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/VariableUniformityAnalysis.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/WIVectorize.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/WorkItemAliasAnalysis.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Workgroup.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/WorkitemHandler.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/WorkitemHandlerChooser.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/WorkitemLoops.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/WorkitemReplication.Plo@am__quote@
+
+.cc.o:
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $<
+
+.cc.obj:
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+.cc.lo:
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $<
+
+.cpp.o:
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $<
+
+.cpp.obj:
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+.cpp.lo:
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+ -rm -f *.lo
+
+clean-libtool:
+ -rm -rf .libs _libs
+
+ID: $(am__tagged_files)
+ $(am__define_uniq_tagged_files); mkid -fID $$unique
+tags: tags-am
+TAGS: tags
+
+tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+ set x; \
+ here=`pwd`; \
+ $(am__define_uniq_tagged_files); \
+ shift; \
+ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+ test -n "$$unique" || unique=$$empty_fix; \
+ if test $$# -gt 0; then \
+ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+ "$$@" $$unique; \
+ else \
+ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+ $$unique; \
+ fi; \
+ fi
+ctags: ctags-am
+
+CTAGS: ctags
+ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+ $(am__define_uniq_tagged_files); \
+ test -z "$(CTAGS_ARGS)$$unique" \
+ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+ $$unique
+
+GTAGS:
+ here=`$(am__cd) $(top_builddir) && pwd` \
+ && $(am__cd) $(top_srcdir) \
+ && gtags -i $(GTAGS_ARGS) "$$here"
+cscopelist: cscopelist-am
+
+cscopelist-am: $(am__tagged_files)
+ list='$(am__tagged_files)'; \
+ case "$(srcdir)" in \
+ [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
+ *) sdir=$(subdir)/$(srcdir) ;; \
+ esac; \
+ for i in $$list; do \
+ if test -f "$$i"; then \
+ echo "$(subdir)/$$i"; \
+ else \
+ echo "$$sdir/$$i"; \
+ fi; \
+ done >> $(top_builddir)/cscope.files
+
+distclean-tags:
+ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+ list='$(DISTFILES)'; \
+ dist_files=`for file in $$list; do echo $$file; done | \
+ sed -e "s|^$$srcdirstrip/||;t" \
+ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+ case $$dist_files in \
+ */*) $(MKDIR_P) `echo "$$dist_files" | \
+ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+ sort -u` ;; \
+ esac; \
+ for file in $$dist_files; do \
+ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+ if test -d $$d/$$file; then \
+ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+ if test -d "$(distdir)/$$file"; then \
+ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+ fi; \
+ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+ fi; \
+ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+ else \
+ test -f "$(distdir)/$$file" \
+ || cp -p $$d/$$file "$(distdir)/$$file" \
+ || exit 1; \
+ fi; \
+ done
+check-am: all-am
+check: check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+ for dir in "$(DESTDIR)$(pkglibdir)"; do \
+ test -z "$$dir" || $(MKDIR_P) "$$dir"; \
+ done
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+ if test -z '$(STRIP)'; then \
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+ install; \
+ else \
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+ fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+ @echo "This command is intended for maintainers to use"
+ @echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-pkglibLTLIBRARIES \
+ mostlyclean-am
+
+distclean: distclean-am
+ -rm -rf ./$(DEPDIR)
+ -rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+ distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am: install-pkglibLTLIBRARIES
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+ -rm -rf ./$(DEPDIR)
+ -rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+ mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am: uninstall-pkglibLTLIBRARIES
+
+.MAKE: install-am install-strip
+
+.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \
+ clean-libtool clean-pkglibLTLIBRARIES cscopelist-am ctags \
+ ctags-am distclean distclean-compile distclean-generic \
+ distclean-libtool distclean-tags distdir dvi dvi-am html \
+ html-am info info-am install install-am install-data \
+ install-data-am install-dvi install-dvi-am install-exec \
+ install-exec-am install-html install-html-am install-info \
+ install-info-am install-man install-pdf install-pdf-am \
+ install-pkglibLTLIBRARIES install-ps install-ps-am \
+ install-strip installcheck installcheck-am installdirs \
+ maintainer-clean maintainer-clean-generic mostlyclean \
+ mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
+ pdf pdf-am ps ps-am tags tags-am uninstall uninstall-am \
+ uninstall-pkglibLTLIBRARIES
+
+
+#llvmopencl_la_LIBADD += @LIBS_LLVMTRANSFORMUTILS@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/src/llvmopencl/PHIsToAllocas.cc b/src/llvmopencl/PHIsToAllocas.cc
new file mode 100644
index 0000000..a414412
--- /dev/null
+++ b/src/llvmopencl/PHIsToAllocas.cc
@@ -0,0 +1,144 @@
+// LLVM function pass to convert all PHIs to allocas.
+//
+// Copyright (c) 2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "PHIsToAllocas.h"
+#include "Workgroup.h"
+#include "WorkitemHandlerChooser.h"
+#include "WorkitemLoops.h"
+
+#include "config.h"
+
+#ifdef LLVM_3_1
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/TypeBuilder.h"
+#elif defined LLVM_3_2
+#include "llvm/IRBuilder.h"
+#include "llvm/TypeBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/TypeBuilder.h"
+#endif
+
+// File-local registration object: registers pocl::PHIsToAllocas with LLVM's
+// pass registry under the pass argument name "phistoallocas".
+namespace {
+ static
+ llvm::RegisterPass<pocl::PHIsToAllocas> X(
+ "phistoallocas", "Convert all PHI nodes to allocas");
+}
+
+namespace pocl {
+
+char PHIsToAllocas::ID = 0;
+
+using namespace llvm;
+
+/**
+ * Declares the analysis dependencies of this pass.
+ *
+ * WorkitemHandlerChooser is required (runOnFunction queries its chosen
+ * handler) and is also declared as preserved by this pass.
+ *
+ * @param AU The analysis usage record to fill in.
+ */
+void
+PHIsToAllocas::getAnalysisUsage(AnalysisUsage &AU) const
+{
+ AU.addRequired<pocl::WorkitemHandlerChooser>();
+ AU.addPreserved<pocl::WorkitemHandlerChooser>();
+}
+
+/**
+ * Converts every PHI node of a kernel function to an alloca with
+ * stores and a load (see BreakPHIToAllocas).
+ *
+ * @param F The function to process.
+ * @return true if at least one PHI node was converted, false if the
+ *         function was left untouched.
+ */
+bool
+PHIsToAllocas::runOnFunction(Function &F)
+{
+  // Only kernels selected for processing are transformed.
+  if (!Workgroup::isKernelToProcess(F))
+    return false;
+
+  /* The conversion is skipped when work-item loops are not being created:
+     it leads to worse code without any benefit for the full replication
+     method. */
+  const bool creatingLoops =
+    getAnalysis<pocl::WorkitemHandlerChooser>().chosenHandler() ==
+    pocl::WorkitemHandlerChooser::POCL_WIH_LOOPS;
+  if (!creatingLoops)
+    return false;
+
+  /* Gather the PHI nodes first; the conversion erases instructions and
+     inserts new ones, so it is done in a separate walk over the worklist. */
+  std::vector<llvm::PHINode *> worklist;
+  for (Function::iterator block = F.begin(); block != F.end(); ++block) {
+    for (BasicBlock::iterator pos = block->begin(); pos != block->end();
+         ++pos) {
+      Instruction *candidate = pos;
+      if (PHINode *phi = dyn_cast<PHINode>(candidate))
+        worklist.push_back(phi);
+    }
+  }
+
+  for (std::vector<llvm::PHINode *>::size_type k = 0; k < worklist.size();
+       ++k)
+    BreakPHIToAllocas(worklist[k]);
+
+  // The function changed exactly when there was something to convert.
+  return !worklist.empty();
+}
+
+/**
+ * Replaces a PHI node with a stack slot: each incoming value is stored to
+ * the slot at the end of its incoming block, and the PHI itself is replaced
+ * by a load from the slot.
+ *
+ * This is used to fix context save/restore issues with regions that have
+ * PHI nodes in their entry node (usually due to the use of work group scope
+ * variables such as B-loop iteration variables). With PHI nodes at a region
+ * entry the context restore code cannot simply be inserted there, because
+ * it consists of non-PHI instructions and it is assumed that no non-PHI
+ * instruction precedes a PHI. Furthermore, when the PHI node is at a region
+ * entry (e.g. a B-loop), adding new basic blocks before it would break the
+ * assumption of single entry regions.
+ *
+ * @param phi The PHI node to convert; it is erased from its parent block.
+ * @return The load instruction that replaced the PHI node.
+ */
+llvm::Instruction *
+PHIsToAllocas::BreakPHIToAllocas(PHINode* phi)
+{
+  llvm::Function *parentFunction = phi->getParent()->getParent();
+
+  // The stack slot is created at the first insertion point of the
+  // function's entry block.
+  IRBuilder<> irb(parentFunction->getEntryBlock().getFirstInsertionPt());
+  const std::string slotName = phi->getName().str() + ".ex_phi";
+  llvm::Instruction *slot = irb.CreateAlloca(phi->getType(), 0, slotName);
+
+  // One store per incoming edge, placed right before the terminator of the
+  // corresponding incoming block.
+  const unsigned numIncoming = phi->getNumIncomingValues();
+  for (unsigned idx = 0; idx != numIncoming; ++idx) {
+    irb.SetInsertPoint(phi->getIncomingBlock(idx)->getTerminator());
+    irb.CreateStore(phi->getIncomingValue(idx), slot);
+  }
+
+  // The load takes the place of the PHI node.
+  irb.SetInsertPoint(phi);
+  llvm::Instruction *replacement = irb.CreateLoad(slot);
+  phi->replaceAllUsesWith(replacement);
+  phi->eraseFromParent();
+  return replacement;
+}
+
+
+}
diff --git a/src/llvmopencl/PHIsToAllocas.h b/src/llvmopencl/PHIsToAllocas.h
new file mode 100644
index 0000000..819dcfc
--- /dev/null
+++ b/src/llvmopencl/PHIsToAllocas.h
@@ -0,0 +1,56 @@
+// Header for PHIsToAllocas function pass.
+//
+// Copyright (c) 2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_PHIS_TO_ALLOCAS_H
+#define _POCL_PHIS_TO_ALLOCAS_H
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Function.h"
+#else
+#include "llvm/IR/Function.h"
+#endif
+#include "llvm/Pass.h"
+
+namespace llvm {
+ class Instruction;
+ class PHINode;
+}
+
+namespace pocl {
+ class Workgroup;
+
+ // Function pass that converts PHI nodes to allocas: incoming values are
+ // stored to a stack slot and the PHI is replaced by a load from it.
+ class PHIsToAllocas : public llvm::FunctionPass {
+ public:
+ // Pass identification member handed to the llvm::FunctionPass constructor.
+ static char ID;
+
+ PHIsToAllocas() : llvm::FunctionPass(ID) {}
+
+ // Declares the analyses this pass requires and preserves.
+ virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+ // Runs the conversion on F; returns true if F was modified.
+ virtual bool runOnFunction(llvm::Function &F);
+
+ // Converts a single PHI node and returns the load that replaced it.
+ llvm::Instruction *BreakPHIToAllocas(llvm::PHINode* phi);
+
+ };
+}
+
+#endif
diff --git a/src/llvmopencl/ParallelRegion.cc b/src/llvmopencl/ParallelRegion.cc
new file mode 100644
index 0000000..72d89c1
--- /dev/null
+++ b/src/llvmopencl/ParallelRegion.cc
@@ -0,0 +1,809 @@
+// Class definition for parallel regions, a group of BasicBlocks that
+// each kernel should run in parallel.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos and
+// 2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "ParallelRegion.h"
+#include "Barrier.h"
+#include "Kernel.h"
+#include "config.h"
+#ifdef LLVM_3_1
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/ValueSymbolTable.h"
+#elif defined LLVM_3_2
+#include "llvm/IRBuilder.h"
+#include "llvm/ValueSymbolTable.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#endif
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+#include <set>
+#include <sstream>
+#include <map>
+#include <algorithm>
+
+using namespace std;
+using namespace llvm;
+using namespace pocl;
+
+//#define DEBUG_REMAP
+//#define DEBUG_REPLICATE
+//#define DEBUG_PURGE
+
+#include <iostream>
+
+int ParallelRegion::idGen = 0;
+
+
+/**
+ * Creates an empty parallel region.
+ *
+ * @param forcedRegionId Region id to use; when it is -1 a fresh id is drawn
+ *                       from the class-wide idGen counter instead.
+ */
+ParallelRegion::ParallelRegion(int forcedRegionId) :
+  std::vector<llvm::BasicBlock *>(),
+  LocalIDXLoadInstr(NULL), LocalIDYLoadInstr(NULL), LocalIDZLoadInstr(NULL),
+  exitIndex_(0), entryIndex_(0),
+  pRegionId(forcedRegionId == -1 ? idGen++ : forcedRegionId)
+{
+}
+
+/**
+ * Ensure all variables are named so they will be replicated and renamed
+ * correctly.
+ *
+ * Every unnamed instruction of the block that is used outside of the block
+ * gets a name of the form ".pocl_temp.N", where N is the smallest counter
+ * value for which the name is not yet present in the value symbol table of
+ * the parent function.
+ *
+ * @param bb The basic block whose instructions are named.
+ */
+void
+ParallelRegion::GenerateTempNames(llvm::BasicBlock *bb)
+{
+  /* The counter is shared by all instructions of the block. Every name
+     below it has already been found taken (or was just assigned here), so
+     restarting from zero for each instruction would only re-probe known
+     collisions and make the naming quadratic. The resulting names are the
+     same as with a restart. */
+  int tempCounter = 0;
+
+  for (llvm::BasicBlock::iterator i = bb->begin(), e = bb->end(); i != e; ++i)
+    {
+      llvm::Instruction *instr = i;
+      if (instr->hasName() || !instr->isUsedOutsideOfBlock(bb)) continue;
+
+      std::string tempName;
+      do {
+        std::ostringstream name;
+        name << ".pocl_temp." << tempCounter;
+        ++tempCounter;
+        tempName = name.str();
+      } while (bb->getParent()->getValueSymbolTable().lookup(tempName) != NULL);
+      instr->setName(tempName);
+    }
+}
+
+// BarrierBlock *
+// ParallelRegion::getEntryBarrier()
+// {
+// BasicBlock *entry = front();
+// BasicBlock *barrier = entry->getSinglePredecessor();
+
+// return cast<BarrierBlock> (barrier);
+// }
+
+/**
+ * Creates a copy of this region by cloning each of its basic blocks.
+ *
+ * The copy is constructed with the same region id and the same entry/exit
+ * indices as this region. Each original block is first given temporary
+ * value names (GenerateTempNames), then cloned with CloneBasicBlock, and the
+ * original-to-clone block mapping is recorded in the map. Finally the
+ * instructions of the new region are remapped through the map.
+ *
+ * @param map    Value map filled during cloning; it also receives the
+ *               block-to-cloned-block entries and is used for the remap.
+ * @param suffix Suffix passed to CloneBasicBlock for the cloned names.
+ * @return The new region, allocated with new; nothing in this function
+ *         frees it.
+ */
+ParallelRegion *
+ParallelRegion::replicate(ValueToValueMapTy &map,
+ const Twine &suffix = "")
+{
+ ParallelRegion *new_region = new ParallelRegion(pRegionId);
+
+ /* Because ParallelRegions are all replicated before they
+ are attached to the function, it can happen that
+ the same BB is replicated multiple times and it gets
+ the same name (only the BB name will be autorenamed
+ by LLVM). This causes the variable references to become
+ broken. This hack ensures the BB suffixes are unique
+ before cloning so each path gets their own value
+ names. Split points can be such paths.*/
+ // Function-static: the counts persist across calls and across regions.
+ static std::map<std::string, int> cloneCounts;
+
+ for (iterator i = begin(), e = end(); i != e; ++i) {
+ BasicBlock *block = *i;
+ GenerateTempNames(block);
+ std::ostringstream suf;
+ suf << suffix.str();
+ // Key of the clone counter: "<block name>.<suffix>".
+ std::string block_name = block->getName().str() + "." + suffix.str();
+ if (cloneCounts[block_name] > 0)
+ {
+ // Already cloned with this suffix: append the count to the suffix.
+ suf << ".pocl_" << cloneCounts[block_name];
+ }
+ BasicBlock *new_block = CloneBasicBlock(block, map, suf.str());
+ cloneCounts[block_name]++;
+ // Insert the block itself into the map.
+ map[block] = new_block;
+ new_region->push_back(new_block);
+
+#ifdef DEBUG_REPLICATE
+ std::cerr << "### clonee block:" << std::endl;
+ block->dump();
+ std::cerr << endl << "### cloned block: " << std::endl;
+ new_block->dump();
+#endif
+ }
+
+ new_region->exitIndex_ = exitIndex_;
+ new_region->entryIndex_ = entryIndex_;
+ /* Remap here to get local variables fixed before they
+ are (possibly) overwritten by another clone of the
+ same BB. */
+ new_region->remap(map);
+
+#ifdef DEBUG_REPLICATE
+ Verify();
+#endif
+ // NOTE(review): this is invoked on the original region (this), not on
+ // new_region -- confirm that this is intended.
+ LocalizeIDLoads();
+
+ return new_region;
+}
+
+/**
+ * Remaps every instruction of every block in the region through the given
+ * value map, using the RF_IgnoreMissingEntries and RF_NoModuleLevelChanges
+ * remap flags.
+ *
+ * @param map The value map to remap the instructions with.
+ */
+void
+ParallelRegion::remap(ValueToValueMapTy &map)
+{
+  for (iterator blockIt = begin(), blockEnd = end(); blockIt != blockEnd;
+       ++blockIt) {
+    BasicBlock *block = *blockIt;
+
+#ifdef DEBUG_REMAP
+    std::cerr << "### block before remap:" << std::endl;
+    block->dump();
+#endif
+
+    BasicBlock::iterator inst = block->begin();
+    BasicBlock::iterator instEnd = block->end();
+    while (inst != instEnd) {
+      RemapInstruction(inst, map,
+                       RF_IgnoreMissingEntries | RF_NoModuleLevelChanges);
+      ++inst;
+    }
+
+#ifdef DEBUG_REMAP
+    std::cerr << endl << "### block after remap: " << std::endl;
+    block->dump();
+#endif
+  }
+}
+
+/**
+ * Links this region into the control flow directly after the given region.
+ *
+ * The tail block is the exit block of the given region, unless that block
+ * ends in an unreachable instruction, in which case the second to last
+ * block of the region is used. The tail's terminator must have exactly one
+ * successor. The blocks of this region are inserted into the function's
+ * block list after the tail, the tail is redirected to this region's entry
+ * block, and this region's exit block is redirected to the tail's old
+ * successor.
+ *
+ * @param region The region after which this region is chained.
+ */
+void
+ParallelRegion::chainAfter(ParallelRegion *region)
+{
+ /* If we are replicating a conditional barrier
+ region, the last block can be an unreachable
+ block to mark the impossible path. Skip
+ it and choose the correct branch instead.
+
+ TODO: why have the unreachable block there the
+ first place? Could we just not add it and fix
+ the branch? */
+ BasicBlock *tail = region->exitBB();
+ TerminatorInst *t = tail->getTerminator();
+ if (isa<UnreachableInst>(t))
+ {
+ // Assumes the block to chain after sits at index size() - 2.
+ tail = region->at(region->size() - 2);
+ t = tail->getTerminator();
+ }
+ if (t->getNumSuccessors() != 1)
+ {
+ // Print diagnostics to stdout before the assertion below fails.
+ std::cout << "!!! trying to chain region" << std::endl;
+ this->dumpNames();
+ std::cout << "!!! after region" << std::endl;
+ region->dumpNames();
+ t->getParent()->dump();
+
+ assert (t->getNumSuccessors() == 1);
+ }
+
+ BasicBlock *successor = t->getSuccessor(0);
+ Function::BasicBlockListType &bb_list =
+ successor->getParent()->getBasicBlockList();
+
+ // NOTE(review): every block is inserted directly after tail, so the blocks
+ // presumably end up in reverse region order in the list -- confirm.
+ for (iterator i = begin(), e = end(); i != e; ++i)
+ bb_list.insertAfter(tail, *i);
+
+ // tail -> entry of this region.
+ t->setSuccessor(0, entryBB());
+
+ // exit of this region -> former successor of tail.
+ t = exitBB()->getTerminator();
+ assert (t->getNumSuccessors() == 1);
+ t->setSuccessor(0, successor);
+}
+
+/**
+ * Cuts all edges that leave the region from blocks other than the exit
+ * block.
+ *
+ * For every non-exit block, each successor that is not part of the region
+ * is replaced by a newly created block containing only an unreachable
+ * instruction. The new blocks are placed in the function right before the
+ * block that follows the current one in the region (or at the end of the
+ * function when the current block is the last one of the region), and they
+ * are appended to the region once the walk is finished.
+ */
+void
+ParallelRegion::purge()
+{
+  SmallVector<BasicBlock *, 4> new_blocks;
+
+  for (iterator i = begin(), e = end(); i != e; ++i) {
+
+    // Exit block has a successor out of the region.
+    if (*i == exitBB())
+      continue;
+
+#ifdef DEBUG_PURGE
+    std::cerr << "### block before purge:" << std::endl;
+    (*i)->dump();
+#endif
+    TerminatorInst *t = (*i)->getTerminator();
+    for (unsigned ii = 0, ee = t->getNumSuccessors(); ii != ee; ++ii) {
+      BasicBlock *successor = t->getSuccessor(ii);
+      if (count(begin(), end(), successor) != 0)
+        continue;
+
+      // This successor is not on the parallel region, purge.
+      assert ((*i)->getParent() != NULL);
+
+      /* Insert before the next block of the region. When the current block
+         is the last one of the region there is no next block; the end
+         iterator must not be dereferenced, so a NULL insertion point is
+         passed, which appends the new block to the end of the function. */
+      iterator next_block = i;
+      ++next_block;
+      BasicBlock *insert_before = NULL;
+      if (next_block != e) {
+        assert (*next_block != NULL);
+        insert_before = *next_block;
+      }
+
+      BasicBlock *unreachable =
+        BasicBlock::Create((*i)->getContext(),
+                           (*i)->getName() + ".unreachable",
+                           (*i)->getParent(),
+                           insert_before);
+      new UnreachableInst(unreachable->getContext(),
+                          unreachable);
+      t->setSuccessor(ii, unreachable);
+      new_blocks.push_back(unreachable);
+    }
+#ifdef DEBUG_PURGE
+    std::cerr << std::endl << "### block after purge:" << std::endl;
+    (*i)->dump();
+#endif
+  }
+
+  // Add the new "unreachable" blocks to the
+  // region. We cannot do in the loop as it
+  // corrupts iterators.
+  insert(end(), new_blocks.begin(), new_blocks.end());
+}
+
+/**
+ * Inserts stores that initialize the local id globals at the first
+ * insertion point of the given block.
+ *
+ * The stored constants are 64 bits wide when the module reports 64-bit
+ * pointers and 32 bits wide otherwise. A store is only emitted for those
+ * of the POCL_LOCAL_ID_{X,Y,Z}_GLOBAL variables that exist in the module.
+ *
+ * @param entry The block that receives the stores.
+ * @param x     Value stored to the X local id global.
+ * @param y     Value stored to the Y local id global.
+ * @param z     Value stored to the Z local id global.
+ */
+void
+ParallelRegion::insertLocalIdInit(llvm::BasicBlock* entry,
+                                  unsigned x,
+                                  unsigned y,
+                                  unsigned z)
+{
+  IRBuilder<> builder(entry, entry->getFirstInsertionPt());
+
+  Module *M = entry->getParent()->getParent();
+
+  const int size_t_width =
+    (M->getPointerSize() == llvm::Module::Pointer64) ? 64 : 32;
+  IntegerType *idType = IntegerType::get(M->getContext(), size_t_width);
+
+  if (GlobalVariable *gvx = M->getGlobalVariable(POCL_LOCAL_ID_X_GLOBAL))
+    builder.CreateStore(ConstantInt::get(idType, x), gvx);
+
+  if (GlobalVariable *gvy = M->getGlobalVariable(POCL_LOCAL_ID_Y_GLOBAL))
+    builder.CreateStore(ConstantInt::get(idType, y), gvy);
+
+  if (GlobalVariable *gvz = M->getGlobalVariable(POCL_LOCAL_ID_Z_GLOBAL))
+    builder.CreateStore(ConstantInt::get(idType, z), gvz);
+}
+
+/**
+ * Inserts the local id initialization for the given work-item coordinates
+ * into the entry block of this region.
+ */
+void
+ParallelRegion::insertPrologue(unsigned x,
+                               unsigned y,
+                               unsigned z)
+{
+  ParallelRegion::insertLocalIdInit(entryBB(), x, y, z);
+}
+
+/**
+ * Dumps every basic block of the region, in region order.
+ */
+void
+ParallelRegion::dump()
+{
+  for (size_type idx = 0; idx < size(); ++idx)
+    (*this)[idx]->dump();
+}
+
+/**
+ * Prints the names of the region's blocks on one line of standard output.
+ * The entry block is tagged with "(EN)" and the exit block with "(EX)".
+ */
+void
+ParallelRegion::dumpNames()
+{
+  for (iterator it = begin(), last = end(); it != last; ++it) {
+    BasicBlock *block = *it;
+    std::cout << block->getName().str();
+    if (block == entryBB())
+      std::cout << "(EN)";
+    if (block == exitBB())
+      std::cout << "(EX)";
+    std::cout << " ";
+  }
+  std::cout << std::endl;
+}
+
+/**
+ * Builds a parallel region from a set of basic blocks.
+ *
+ * The blocks are added to the region in the order in which they appear in
+ * the function that contains the entry block, and the positions of the
+ * entry and exit blocks are recorded. Afterwards LocalizeIDLoads() is run
+ * on the region and the region is verified in builds with assertions.
+ *
+ * @param bbs   The basic blocks that form the region.
+ * @param entry The entry block of the region; must not be NULL.
+ * @param exit  The exit block of the region; must not be NULL. It may be
+ *              the same block as the entry.
+ * @return The new region, allocated with new; nothing in this function
+ *         frees it.
+ */
+ParallelRegion *
+ParallelRegion::Create(const SmallPtrSet<BasicBlock *, 8>& bbs, BasicBlock *entry, BasicBlock *exit)
+{
+  ParallelRegion *new_region = new ParallelRegion();
+
+  assert (entry != NULL);
+  assert (exit != NULL);
+
+  // Walk the function instead of the set so that the order of the vector
+  // is the same as the original function order.
+  Function *F = entry->getParent();
+  for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) {
+    BasicBlock *b = i;
+    if (bbs.count(b) == 0)
+      continue;
+
+    new_region->push_back(b);
+    /* Entry and exit are checked independently: a region whose entry block
+       is also its exit block must get both indices recorded, otherwise the
+       exit index would silently stay at its default value. */
+    if (b == entry)
+      new_region->setEntryBBIndex(new_region->size() - 1);
+    if (b == exit)
+      new_region->setExitBBIndex(new_region->size() - 1);
+  }
+
+  new_region->LocalizeIDLoads();
+
+  assert(new_region->Verify());
+
+  return new_region;
+}
+
+/**
+ * Checks that the region fulfills the parallel region conditions.
+ *
+ * Every failed check hits an assert(0) first; false is returned only
+ * when assertions are compiled out.
+ *
+ * @return true if all the checks passed, false otherwise.
+ */
+bool
+ParallelRegion::Verify()
+{
+ // Parallel region conditions:
+ // 1) Single entry, in entry block.
+ // 2) Single outgoing edge from exit block
+ // (other outgoing edges allowed, will be purged in replicas).
+ // 3) No barriers inside the region.
+ //
+ // NOTE(review): the exit block check below requires exactly one
+ // successor, which does not seem to match the parenthesized remark
+ // in 2) -- confirm which one is intended.
+
+ // Counted for the single-entry check, which is currently commented out.
+ int entry_edges = 0;
+
+ for (iterator i = begin(), e = end(); i != e; ++i) {
+ for (pred_iterator ii(*i), ee(*i, true); ii != ee; ++ii) {
+ // A predecessor that is not a member of this region.
+ if (count(begin(), end(), *ii) == 0) {
+ if ((*i) != entryBB()) {
+ dumpNames();
+ std::cerr << "suspicious block: " << (*i)->getName().str() << std::endl;
+ std::cerr << "the entry is: " << entryBB()->getName().str() << std::endl;
+
+#if 0
+ (*i)->getParent()->viewCFG();
+#endif
+ assert(0 && "Incoming edges to non-entry block!");
+ return false;
+ } else if (!Barrier::hasBarrier(*ii)) {
+ // Unlike above, the CFG viewer is invoked unconditionally here.
+ (*i)->getParent()->viewCFG();
+ assert (0 && "Entry has edges from non-barrier blocks!");
+ return false;
+ }
+ ++entry_edges;
+ }
+ }
+
+ // if (entry_edges != 1) {
+ // assert(0 && "Parallel regions must be single entry!");
+ // return false;
+ // }
+
+ // This check does not depend on the current block, but it is
+ // evaluated once per block of the region.
+ if (exitBB()->getTerminator()->getNumSuccessors() != 1) {
+ assert(0 && "Multiple outgoing edges from exit block!");
+ return false;
+ }
+
+ for (BasicBlock::iterator ii = (*i)->begin(), ee = (*i)->end();
+ ii != ee; ++ii) {
+ if (isa<Barrier> (ii)) {
+ assert(0 && "Barrier found inside parallel region!");
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Tags every instruction of the region that may read or write memory
+ * with "llvm.mem.parallel_loop_access" metadata, marking it as
+ * originating from a parallel loop.
+ *
+ * Parallel loops can be nested, so an instruction may already carry
+ * loop references. Those are kept and the new identifier is appended
+ * after them.
+ *
+ * Format:
+ * llvm.mem.parallel_loop_access !0
+ *
+ * !0 { metadata !0 }
+ *
+ * In a 2-nested loop:
+ *
+ * llvm.mem.parallel_loop_access !0
+ *
+ * !0 { metadata !1, metadata !2}
+ * !1 { metadata !1 }
+ * !2 { metadata !2 }
+ *
+ * @param identifier The metadata node that identifies the parallel loop.
+ */
+void
+ParallelRegion::AddParallelLoopMetadata(llvm::MDNode *identifier) {
+
+  for (std::size_t blockIdx = 0; blockIdx < size(); ++blockIdx) {
+    BasicBlock *block = at(blockIdx);
+    for (BasicBlock::iterator instr = block->begin(), instrEnd = block->end();
+         instr != instrEnd; ++instr) {
+      if (!instr->mayReadOrWriteMemory())
+        continue;
+
+      // Start from the loop ids already attached to the instruction
+      // and append the new one last.
+      std::vector<Value*> loopIds;
+      MDNode *previous = instr->getMetadata("llvm.mem.parallel_loop_access");
+      if (previous != NULL) {
+        for (unsigned op = 0; op < previous->getNumOperands(); ++op)
+          loopIds.push_back(previous->getOperand(op));
+      }
+      loopIds.push_back(identifier);
+
+      instr->setMetadata("llvm.mem.parallel_loop_access",
+                         MDNode::get(block->getContext(), loopIds));
+    }
+  }
+}
+
+/**
+ * Attaches work-item identification metadata to every instruction of
+ * the region.
+ *
+ * Each instruction gets:
+ *  - "wi": a shared node { "WI_data", { "WI_region", region id },
+ *    { "WI_xyz", x, y, z } }
+ *  - "wi_counter": a node { "WI_counter", n } where n is a running
+ *    number starting from 1, in region and block order.
+ *
+ * All the numbers are emitted as 32-bit integer constants.
+ */
+void
+ParallelRegion::AddIDMetadata(
+    llvm::LLVMContext& context,
+    std::size_t x,
+    std::size_t y,
+    std::size_t z) {
+
+  IntegerType *int32Ty = Type::getInt32Ty(context);
+
+  Value *regionOps[] = {
+    MDString::get(context, "WI_region"),
+    ConstantInt::get(int32Ty, pRegionId)};
+  MDNode *regionNode = MDNode::get(context, regionOps);
+
+  Value *xyzOps[] = {
+    MDString::get(context, "WI_xyz"),
+    ConstantInt::get(int32Ty, x),
+    ConstantInt::get(int32Ty, y),
+    ConstantInt::get(int32Ty, z)};
+  MDNode *xyzNode = MDNode::get(context, xyzOps);
+
+  Value *dataOps[] = {
+    MDString::get(context, "WI_data"),
+    regionNode,
+    xyzNode};
+  MDNode *dataNode = MDNode::get(context, dataOps);
+
+  int nextCounter = 1;
+  for (std::size_t blockIdx = 0; blockIdx < size(); ++blockIdx) {
+    BasicBlock *block = at(blockIdx);
+    for (BasicBlock::iterator instr = block->begin();
+         instr != block->end(); ++instr) {
+      Value *counterOps[] = {
+        MDString::get(context, "WI_counter"),
+        ConstantInt::get(int32Ty, nextCounter)};
+      MDNode *counterNode = MDNode::get(context, counterOps);
+      ++nextCounter;
+
+      instr->setMetadata("wi", dataNode);
+      instr->setMetadata("wi_counter", counterNode);
+    }
+  }
+}
+
+
+/**
+ * Inserts a new basic block to the region, before an old basic block in
+ * the region.
+ *
+ * Assumes the inserted block to be before the other block in control
+ * flow, that is, there should be a direct CFG edge from the block to the
+ * other.
+ *
+ * The exit block keeps its role. If 'before' is the entry block, the new
+ * block takes over its index and thus becomes the new entry block.
+ *
+ * @param block  The block to add to the region.
+ * @param before A block of the region; the new block is placed right
+ *               before it in the vector.
+ */
+void
+ParallelRegion::AddBlockBefore(llvm::BasicBlock *block, llvm::BasicBlock *before)
+{
+  ParallelRegion::iterator beforePos = find(begin(), end(), before);
+  assert (beforePos != end());
+
+  const std::size_t insertIndex =
+    static_cast<std::size_t>(beforePos - begin());
+
+  /* Inserting at insertIndex pushes every block at or after that index
+     one position further. The exit block must stay the exit also when
+     it is the 'before' block itself, so its index follows the shift. */
+  if (insertIndex <= exitIndex_) ++exitIndex_;
+
+  /* An entry block strictly after the insertion point is pushed forward
+     the same way. In case the 'before' block is the entry node, the
+     index is left alone on purpose: the new block replaces it as the
+     entry node at the same index and the old one gets pushed forward. */
+  if (insertIndex < entryIndex_) ++entryIndex_;
+
+  insert(beforePos, block);
+}
+
+
+/**
+ * Inserts a new basic block to the region, right after an old basic
+ * block of the region in the vector.
+ *
+ * The entry and exit blocks keep their roles: their indices are
+ * adjusted if the insertion pushes them forward.
+ *
+ * @param block The block to add to the region.
+ * @param after A block of the region; the new block is placed right
+ *              after it in the vector.
+ */
+void
+ParallelRegion::AddBlockAfter(llvm::BasicBlock *block, llvm::BasicBlock *after)
+{
+  ParallelRegion::iterator afterPos = find(begin(), end(), after);
+  assert (afterPos != end());
+
+  const std::size_t insertIndex =
+    static_cast<std::size_t>(afterPos - begin()) + 1;
+
+  /* Every block at or after insertIndex is pushed one position further.
+     That is the case for the exit (entry) node if and only if
+     indexof(after) < indexof(exit) (indexof(entry)). */
+  if (insertIndex <= exitIndex_) ++exitIndex_;
+  if (insertIndex <= entryIndex_) ++entryIndex_;
+
+  insert(begin() + insertIndex, block);
+}
+
+/** Returns true if the given basic block is a member of the region. */
+bool
+ParallelRegion::HasBlock(llvm::BasicBlock *bb)
+{
+  for (std::size_t idx = 0; idx < size(); ++idx) {
+    if (at(idx) == bb)
+      return true;
+  }
+  return false;
+}
+
+/**
+ * Returns the region's load of the Z dimension local id.
+ *
+ * The load is created on the first call, at the first insertion point
+ * of the entry block, and cached for the later calls.
+ */
+llvm::Instruction*
+ParallelRegion::LocalIDZLoad()
+{
+  if (LocalIDZLoadInstr == NULL) {
+    IRBuilder<> builder(entryBB()->getFirstInsertionPt());
+    Module *M = entryBB()->getParent()->getParent();
+    GlobalVariable *idVar = M->getGlobalVariable(POCL_LOCAL_ID_Z_GLOBAL);
+    LocalIDZLoadInstr = builder.CreateLoad(idVar);
+  }
+  return LocalIDZLoadInstr;
+}
+
+/**
+ * Returns the region's load of the Y dimension local id.
+ *
+ * The load is created on the first call, at the first insertion point
+ * of the entry block, and cached for the later calls.
+ */
+llvm::Instruction*
+ParallelRegion::LocalIDYLoad()
+{
+  if (LocalIDYLoadInstr == NULL) {
+    IRBuilder<> builder(entryBB()->getFirstInsertionPt());
+    Module *M = entryBB()->getParent()->getParent();
+    GlobalVariable *idVar = M->getGlobalVariable(POCL_LOCAL_ID_Y_GLOBAL);
+    LocalIDYLoadInstr = builder.CreateLoad(idVar);
+  }
+  return LocalIDYLoadInstr;
+}
+
+/**
+ * Returns the region's load of the X dimension local id.
+ *
+ * The load is created on the first call, at the first insertion point
+ * of the entry block, and cached for the later calls.
+ */
+llvm::Instruction*
+ParallelRegion::LocalIDXLoad()
+{
+  if (LocalIDXLoadInstr == NULL) {
+    IRBuilder<> builder(entryBB()->getFirstInsertionPt());
+    Module *M = entryBB()->getParent()->getParent();
+    GlobalVariable *idVar = M->getGlobalVariable(POCL_LOCAL_ID_X_GLOBAL);
+    LocalIDXLoadInstr = builder.CreateLoad(idVar);
+  }
+  return LocalIDXLoadInstr;
+}
+
+/**
+ * Injects a call to printf() before the given instruction.
+ *
+ * Declares printf in the module in case it has not been declared yet.
+ *
+ * @param before    The instruction before which the call is inserted.
+ * @param formatStr The printf format string; emitted as a global string.
+ * @param params    The values passed after the format string, in order.
+ */
+void
+ParallelRegion::InjectPrintF
+(llvm::Instruction *before, std::string formatStr,
+ std::vector<Value*>& params)
+{
+  IRBuilder<> builder(before);
+  llvm::Module *M = before->getParent()->getParent()->getParent();
+
+  llvm::Value *stringArg =
+    builder.CreateGlobalString(formatStr);
+
+  Function* printfFunc = M->getFunction("printf");
+  if (printfFunc == NULL) {
+    /* Declare "i32 printf(i8*, ...)" with C calling convention. */
+    PointerType* charPtrTy =
+      PointerType::get(IntegerType::get(M->getContext(), 8), 0);
+
+    std::vector<Type*> printfParamTys;
+    printfParamTys.push_back(charPtrTy);
+
+    FunctionType* printfTy =
+      FunctionType::get
+      (/*Result=*/IntegerType::get(M->getContext(), 32),
+       /*Params=*/printfParamTys,
+       /*isVarArg=*/true);
+
+    printfFunc =
+      Function::Create
+      (/*Type=*/printfTy,
+       /*Linkage=*/GlobalValue::ExternalLinkage,
+       /*Name=*/"printf", M);
+    printfFunc->setCallingConv(CallingConv::C);
+
+    /* Attributes: nocapture for the format string (index 1) and
+       nounwind for the function itself (index 4294967295, i.e. ~0U). */
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+    AttrListPtr func_printf_PAL;
+#else
+    AttributeSet func_printf_PAL;
+#endif
+    {
+#ifdef LLVM_3_1
+      SmallVector<AttributeWithIndex, 4> Attrs;
+      AttributeWithIndex PAWI;
+      PAWI.Index = 1U;
+      PAWI.Attrs = Attribute::NoCapture;
+      Attrs.push_back(PAWI);
+      PAWI.Index = 4294967295U;
+      PAWI.Attrs = Attribute::NoUnwind;
+      Attrs.push_back(PAWI);
+      func_printf_PAL = AttrListPtr::get(Attrs.begin(), Attrs.end());
+#elif defined LLVM_3_2
+      SmallVector<AttributeWithIndex, 4> Attrs;
+      Attrs.push_back(AttributeWithIndex::get(M->getContext(), 1U, Attributes::NoCapture));
+      Attrs.push_back(AttributeWithIndex::get(M->getContext(), 4294967295U, Attributes::NoUnwind));
+      func_printf_PAL = AttrListPtr::get(M->getContext(), Attrs);
+#else
+      /* AttributeSet is immutable: addAttribute() returns the extended
+         set instead of modifying the object it is called on, so the
+         result has to be assigned back. */
+      func_printf_PAL =
+        func_printf_PAL.addAttribute(M->getContext(), 1U, Attribute::NoCapture);
+      func_printf_PAL =
+        func_printf_PAL.addAttribute(M->getContext(), 4294967295U, Attribute::NoUnwind);
+#endif
+    }
+    printfFunc->setAttributes(func_printf_PAL);
+  }
+
+  /* The global string is an array; the printf argument is a pointer to
+     its first character, formed with a constant GEP with indices 0, 0. */
+  assert (isa<Constant>(stringArg));
+  ConstantInt* zero64 =
+    ConstantInt::get(M->getContext(), APInt(64, StringRef("0"), 10));
+  std::vector<Constant*> gepIndices;
+  gepIndices.push_back(zero64);
+  gepIndices.push_back(zero64);
+  Constant* formatPtr =
+    ConstantExpr::getGetElementPtr
+    (cast<Constant>(stringArg), gepIndices);
+
+  std::vector<Value*> args;
+  args.push_back(formatPtr);
+  args.insert(args.end(), params.begin(), params.end());
+
+  CallInst::Create(printfFunc, args, "", before);
+}
+
+/**
+ * Makes the given block the exit block of the region.
+ *
+ * The block must be a member of the region; otherwise an assertion
+ * fails and the exit block is left unchanged.
+ */
+void
+ParallelRegion::SetExitBB(llvm::BasicBlock *block)
+{
+  ParallelRegion::iterator pos = find(begin(), end(), block);
+  if (pos != end())
+    {
+      setExitBBIndex(pos - begin());
+      return;
+    }
+  assert (false && "The block was not found in the PRegion!");
+}
+
+/**
+ * Adds a printf to the end of the parallel region that prints the
+ * region ID and the work item ID.
+ *
+ * The call is inserted before the terminator of the exit block.
+ *
+ * Useful for debugging control flow bugs.
+ */
+void
+ParallelRegion::InjectRegionPrintF()
+{
+  llvm::Module *M = entryBB()->getParent()->getParent();
+
+  /* The third APInt constructor argument is the isSigned flag, not a
+     radix: pRegionId is a signed int, so create a signed constant. */
+  ConstantInt* pRID =
+    ConstantInt::get(M->getContext(), APInt(32, pRegionId, true));
+
+  std::vector<Value*> params;
+  params.push_back(pRID);
+  params.push_back(LocalIDXLoad());
+  params.push_back(LocalIDYLoad());
+  params.push_back(LocalIDZLoad());
+
+  /* NOTE(review): the id loads have the type of the local id globals.
+     If those are 64 bits wide, "%u" does not match the argument
+     width -- confirm on 64-bit targets. */
+  InjectPrintF(exitBB()->getTerminator(), "PR %d WI %u %u %u\n", params);
+}
+
+/**
+ * Adds printfs to the parallel region that print the hex contents of
+ * all named non-pointer variables.
+ *
+ * One printf is injected per variable, before the terminator of the
+ * basic block that defines the variable.
+ *
+ * Useful for debugging data flow bugs.
+ */
+void
+ParallelRegion::InjectVariablePrintouts()
+{
+ for (ParallelRegion::iterator i = begin();
+ i != end(); ++i)
+ {
+ llvm::BasicBlock *bb = *i;
+ // The block is extended while it is iterated: each injected call is
+ // placed before the terminator. The calls are created with an empty
+ // name, so the hasName() test below skips them when they are reached.
+ for (llvm::BasicBlock::iterator instr = bb->begin();
+ instr != bb->end(); ++instr)
+ {
+ llvm::Instruction *instruction = instr;
+ // Only named values of a non-pointer type are printed.
+ if (isa<PointerType>(instruction->getType()) ||
+ !instruction->hasName()) continue;
+ std::string name = instruction->getName().str();
+ std::vector<Value*> args;
+ // This builder is used only for creating the global string that
+ // holds the variable name.
+ IRBuilder<> builder(exitBB()->getTerminator());
+ args.push_back(builder.CreateGlobalString(name));
+ args.push_back(instruction);
+ // NOTE(review): "%x" is used for every value type and the name is
+ // passed as the raw global string value -- confirm that the output
+ // is what is expected.
+ InjectPrintF(instruction->getParent()->getTerminator(), "variable %s == %x\n", args);
+ }
+ }
+}
+
+/**
+ * Localizes all the loads of the work-item identifiers.
+ *
+ * Code inside the region that queries the WI id must not (re)use a
+ * value loaded in another region; it has to use one loaded in this
+ * region. Otherwise it ends up with the last id the work-item loop of
+ * the previous region got. This caused problems in cases where the
+ * local id was stored to a temporary variable in an earlier region and
+ * that temporary was reused later.
+ *
+ * The function scans the operands of all instructions of the region
+ * for loads from the local id variables and redirects them to the
+ * region's own loads.
+ */
+void
+ParallelRegion::LocalizeIDLoads()
+{
+  /* The local id loads inside the parallel region. */
+  llvm::Instruction *regionLoadX = LocalIDXLoad();
+  llvm::Instruction *regionLoadY = LocalIDYLoad();
+  llvm::Instruction *regionLoadZ = LocalIDZLoad();
+
+  llvm::Module *M = regionLoadX->getParent()->getParent()->getParent();
+  llvm::Value *localIdZ = M->getNamedGlobal(POCL_LOCAL_ID_Z_GLOBAL);
+  llvm::Value *localIdY = M->getNamedGlobal(POCL_LOCAL_ID_Y_GLOBAL);
+  llvm::Value *localIdX = M->getNamedGlobal(POCL_LOCAL_ID_X_GLOBAL);
+
+  assert (localIdZ != NULL && localIdY != NULL && localIdX != NULL &&
+          "The local id globals were not created.");
+
+  for (std::size_t blockIdx = 0; blockIdx < size(); ++blockIdx)
+    {
+      llvm::BasicBlock *block = at(blockIdx);
+      for (llvm::BasicBlock::iterator pos = block->begin();
+           pos != block->end(); ++pos)
+        {
+          llvm::Instruction *user = pos;
+
+          /* The region's own id loads need no rewriting. */
+          const bool isRegionLoad =
+            user == regionLoadX ||
+            user == regionLoadY ||
+            user == regionLoadZ;
+          if (isRegionLoad) continue;
+
+          /* Replace every operand that is a load from a local id
+             variable with the matching intra-PR load. */
+          for (unsigned opr = 0; opr < user->getNumOperands(); ++opr)
+            {
+              llvm::LoadInst *load =
+                dyn_cast<llvm::LoadInst>(user->getOperand(opr));
+              if (load == NULL ||
+                  load == regionLoadX ||
+                  load == regionLoadY ||
+                  load == regionLoadZ) continue;
+
+              llvm::Value *loadedFrom = load->getPointerOperand();
+              if (loadedFrom == localIdZ)
+                user->setOperand(opr, regionLoadZ);
+              if (loadedFrom == localIdY)
+                user->setOperand(opr, regionLoadY);
+              if (loadedFrom == localIdX)
+                user->setOperand(opr, regionLoadX);
+            }
+        }
+    }
+}
diff --git a/src/llvmopencl/ParallelRegion.h b/src/llvmopencl/ParallelRegion.h
new file mode 100644
index 0000000..9313983
--- /dev/null
+++ b/src/llvmopencl/ParallelRegion.h
@@ -0,0 +1,127 @@
+// Class definition for parallel regions, a group of BasicBlocks that
+// each kernel should run in parallel.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_PARALLEL_REGION_H
+#define _POCL_PARALLEL_REGION_H
+
+#include "BarrierBlock.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/BasicBlock.h"
+#include "llvm/LLVMContext.h"
+#else
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/LLVMContext.h"
+#endif
+#include "llvm/Support/CFG.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/ADT/SmallVector.h"
+#include <vector>
+
+namespace pocl {
+
+// Names of the module-level global variables used for the local
+// (work-item) id, one per dimension.
+#define POCL_LOCAL_ID_X_GLOBAL "_local_id_x"
+#define POCL_LOCAL_ID_Y_GLOBAL "_local_id_y"
+#define POCL_LOCAL_ID_Z_GLOBAL "_local_id_z"
+
+class Kernel;
+
+ // A parallel region: a vector of basic blocks with one of them marked
+ // as the entry block and one as the exit block.
+ //
+ // TODO Cleanup: this should not inherit vector but contain it.
+ // It now exposes too much to the clients and leads to hard
+ // to track errors when the API is changed.
+ class ParallelRegion : public std::vector<llvm::BasicBlock *> {
+ public:
+ typedef llvm::SmallVector<ParallelRegion *, 8> ParallelRegionVector;
+
+ // NOTE(review): presumably forcedRegionId overrides the generated
+ // region id unless it is -1 -- the definition is not shown here.
+ ParallelRegion(int forcedRegionId=-1);
+
+ /* BarrierBlock *getEntryBarrier(); */
+ ParallelRegion *replicate(llvm::ValueToValueMapTy &map,
+ const llvm::Twine &suffix);
+ void remap(llvm::ValueToValueMapTy &map);
+ void purge();
+ void chainAfter(ParallelRegion *region);
+ // Stores the given local id at the start of the entry block.
+ void insertPrologue(unsigned x, unsigned y, unsigned z);
+ // Stores the given local id to the local id globals at the first
+ // insertion point of the given block.
+ static void insertLocalIdInit(llvm::BasicBlock* entry,
+ unsigned x,
+ unsigned y,
+ unsigned z);
+ // Dumps all the blocks of the region.
+ void dump();
+ // Prints the names of the blocks to stdout, marking the entry and
+ // the exit.
+ void dumpNames();
+ void setEntryBBIndex(std::size_t index) { entryIndex_ = index; }
+ void setExitBBIndex(std::size_t index) { exitIndex_ = index; }
+ // Makes the given member block the exit block.
+ void SetExitBB(llvm::BasicBlock *block);
+ // Insert a block to the vector right before/after a member block.
+ void AddBlockBefore(llvm::BasicBlock *block, llvm::BasicBlock *before);
+ void AddBlockAfter(llvm::BasicBlock *block, llvm::BasicBlock *after);
+
+ // Entry and exit are looked up by index with bounds-checked at().
+ llvm::BasicBlock* exitBB() { return at(exitIndex_); }
+ llvm::BasicBlock* entryBB() { return at(entryIndex_); }
+ // Attaches "wi" and "wi_counter" metadata to every instruction.
+ void AddIDMetadata(llvm::LLVMContext& context,
+ std::size_t x = 0,
+ std::size_t y = 0,
+ std::size_t z = 0);
+
+ // Appends the identifier to the "llvm.mem.parallel_loop_access"
+ // metadata of every instruction that may access memory.
+ void AddParallelLoopMetadata(llvm::MDNode *identifier);
+
+ // Returns true if the block is a member of the region.
+ bool HasBlock(llvm::BasicBlock *bb);
+
+ // Debugging aids that inject printf calls to the region.
+ void InjectRegionPrintF();
+ void InjectVariablePrintouts();
+
+ void InjectPrintF
+ (llvm::Instruction *before, std::string formatStr,
+ std::vector<llvm::Value*>& params);
+
+ // Creates a region from the given blocks, stored in function order.
+ static ParallelRegion *
+ Create(const llvm::SmallPtrSet<llvm::BasicBlock *, 8>& bbs,
+ llvm::BasicBlock *entry, llvm::BasicBlock *exit);
+
+ static void GenerateTempNames(llvm::BasicBlock *bb);
+
+ // Return the region's own load of the local id of the dimension,
+ // creating it in the entry block on the first call.
+ llvm::Instruction* LocalIDXLoad();
+ llvm::Instruction* LocalIDYLoad();
+ llvm::Instruction* LocalIDZLoad();
+
+ // Redirects the uses of local id loads inside the region to the
+ // region's own loads.
+ void LocalizeIDLoads();
+
+ private:
+ // The cached local id loads of the region; NULL until created by the
+ // LocalID*Load() functions.
+ llvm::Instruction* LocalIDXLoadInstr;
+ llvm::Instruction* LocalIDYLoadInstr;
+ llvm::Instruction* LocalIDZLoadInstr;
+
+ // Checks the parallel region conditions.
+ bool Verify();
+ /// The indices of entry and exit, not pointers, for finding the BBs in the
+ /// replicated PRs too.
+ std::size_t exitIndex_;
+ std::size_t entryIndex_;
+
+ /// Identifier for the parallel region.
+ int pRegionId;
+ static int idGen;
+
+ };
+
+}
+
+#endif
diff --git a/src/llvmopencl/TargetAddressSpaces.cc b/src/llvmopencl/TargetAddressSpaces.cc
new file mode 100644
index 0000000..bd860cc
--- /dev/null
+++ b/src/llvmopencl/TargetAddressSpaces.cc
@@ -0,0 +1,220 @@
+// Implementation of the TargetAddressSpaces pass.
+//
+// Copyright (c) 2013 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#include <iostream>
+#include <string>
+
+#ifdef LLVM_3_2
+# include <llvm/Instructions.h>
+#else
+# include <llvm/IR/Instructions.h>
+# include <llvm/IR/Module.h>
+
+#endif
+#include <llvm/Transforms/Utils/ValueMapper.h>
+#include "llvm/Transforms/Utils/Cloning.h"
+
+#include "TargetAddressSpaces.h"
+#include "Workgroup.h"
+#include "LLVMUtils.h"
+#include "pocl.h"
+
+#define DEBUG_TARGET_ADDRESS_SPACES
+
+namespace pocl {
+
+using namespace llvm;
+
+/* Registers the pass under the name "target-address-spaces". */
+namespace {
+ static
+ RegisterPass<pocl::TargetAddressSpaces> X
+ ("target-address-spaces",
+ "Convert the 'fake' address space ids to the target specific ones.");
+}
+
+/* The pass ID; its address is handed to the ModulePass constructor below. */
+char TargetAddressSpaces::ID = 0;
+
+TargetAddressSpaces::TargetAddressSpaces() : ModulePass(ID) {
+}
+
+/**
+ * Returns the given type with its address space ids converted according
+ * to the given map.
+ *
+ * Pointer types are converted recursively through their pointee type and
+ * array types through their element type. All other types are returned
+ * as they are.
+ *
+ * @param type         The type to convert.
+ * @param addrSpaceMap Maps the address space ids to the target specific
+ *                     ones. Ids without a mapping are kept unchanged.
+ * @return The converted type; 'type' itself if nothing was converted.
+ */
+static Type *
+ConvertedType(llvm::Type *type, std::map<unsigned, unsigned> &addrSpaceMap) {
+
+  if (type->isPointerTy()) {
+    const unsigned AS = type->getPointerAddressSpace();
+    /* Look the id up with find(): operator[] would insert a zero entry
+       for an unmapped id, which silently rewrites the pointer to address
+       space 0 and grows the caller's map as a side effect. */
+    std::map<unsigned, unsigned>::iterator mapped = addrSpaceMap.find(AS);
+    const unsigned newAS = (mapped != addrSpaceMap.end()) ? mapped->second : AS;
+    return PointerType::get
+      (ConvertedType(type->getPointerElementType(), addrSpaceMap), newAS);
+  } else if (type->isArrayTy()) {
+    return ArrayType::get
+      (ConvertedType(type->getArrayElementType(), addrSpaceMap),
+       type->getArrayNumElements());
+  } else { /* TODO: pointers inside structs */
+    return type;
+  }
+}
+
+/**
+ * Converts the address space ids in the type of a pointer-typed value
+ * in place.
+ *
+ * @return true if the type of the value was changed, false if the value
+ *         is not a pointer or its converted type equals the old one.
+ */
+static bool
+UpdateAddressSpace(llvm::Value& val, std::map<unsigned, unsigned> &addrSpaceMap) {
+  Type *oldType = val.getType();
+  if (oldType->isPointerTy()) {
+    Type *converted = ConvertedType(oldType, addrSpaceMap);
+    if (converted != oldType) {
+      val.mutateType(converted);
+      return true;
+    }
+  }
+  return false;
+}
+
+
+/**
+ * Converts the address space ids used in the module to the target
+ * specific ones.
+ *
+ * The architecture is taken from the part of the target triple that
+ * precedes the first dash. Only "tce" is remapped; for all the other
+ * architectures the module is left untouched.
+ *
+ * The types of the globals are mutated in place. Each function with a
+ * body is re-created with converted types, the direct calls are
+ * redirected to the new functions, and the old functions are erased.
+ *
+ * @return false if the module was left untouched, true otherwise.
+ */
+bool
+TargetAddressSpaces::runOnModule(llvm::Module &M) {
+
+ std::string triple = M.getTargetTriple();
+ std::string arch = triple;
+ size_t dash = triple.find("-");
+ if (dash != std::string::npos) {
+ arch = triple.substr(0, dash);
+ }
+
+ std::map<unsigned, unsigned> addrSpaceMap;
+
+ if (arch == "x86_64") {
+ /* For x86_64 the default isel seems to work with the
+ fake address spaces. Skip the processing as it causes
+ an overhead and is not fully implemented.
+ */
+ return false;
+ } else if (arch == "tce") {
+ /* TCE requires the remapping. */
+ addrSpaceMap[POCL_ADDRESS_SPACE_GLOBAL] = 3;
+ addrSpaceMap[POCL_ADDRESS_SPACE_LOCAL] = 4;
+ /* LLVM 3.2 detects 'constant' as cuda_constant (5) in the fake
+ address space map. Add it for compatibility. */
+ addrSpaceMap[5] = addrSpaceMap[POCL_ADDRESS_SPACE_CONSTANT] = 5;
+
+ } else {
+ /* Assume the fake address space map works directly in case not
+ overridden here. */
+ return false;
+ }
+
+ /* NOTE(review): 'changed' is updated below but never read; the
+ function returns true whenever this point is reached. */
+ bool changed = false;
+ /* Handle global variables. */
+ llvm::Module::global_iterator globalI = M.global_begin();
+ llvm::Module::global_iterator globalE = M.global_end();
+ for (; globalI != globalE; ++globalI) {
+ llvm::Value &global = *globalI;
+ changed |= UpdateAddressSpace(global, addrSpaceMap);
+ }
+
+ /* Maps each original function to its re-created version. */
+ FunctionMapping funcReplacements;
+ std::vector<llvm::Function*> unhandledFuncs;
+
+ /* Collect the functions to process first because we add
+ a new function per modified function which invalidates
+ the Module's function iterator. */
+ for (llvm::Module::iterator functionI = M.begin(), functionE = M.end();
+ functionI != functionE; ++functionI) {
+ /* Skip the functions without a body and the ones whose name starts
+ with "_GLOBAL". */
+ if (functionI->empty() || functionI->getName().startswith("_GLOBAL"))
+ continue;
+ unhandledFuncs.push_back(functionI);
+ }
+
+ for (std::vector<llvm::Function*>::iterator i = unhandledFuncs.begin(),
+ e = unhandledFuncs.end(); i != e; ++i) {
+ llvm::Function &F = **i;
+
+ /* Convert the FunctionType. Because there is no mutator API in
+ LLVM for this, we need to recreate the whole darn function :( */
+ SmallVector<Type *, 8> parameters;
+ /* Note: this 'i' and 'e' shadow the iterators of the outer loop. */
+ for (Function::const_arg_iterator i = F.arg_begin(),
+ e = F.arg_end();
+ i != e; ++i)
+ parameters.push_back(ConvertedType(i->getType(), addrSpaceMap));
+
+ llvm::FunctionType *ft = FunctionType::get
+ (ConvertedType(F.getReturnType(), addrSpaceMap),
+ parameters, F.isVarArg());
+
+ /* The new function is created unnamed and then takes over the name
+ of the old one. */
+ llvm::Function *newFunc = Function::Create(ft, F.getLinkage(), "", &M);
+ newFunc->takeName(&F);
+
+ /* Map the old arguments to the new ones for the cloning. */
+ ValueToValueMapTy vv;
+ Function::arg_iterator j = newFunc->arg_begin();
+ for (Function::const_arg_iterator i = F.arg_begin(),
+ e = F.arg_end();
+ i != e; ++i) {
+ j->setName(i->getName());
+ vv[i] = j;
+ ++j;
+ }
+
+ SmallVector<ReturnInst *, 1> ri;
+
+ /* Type remapper handed to CloneFunctionInto(): converts each type
+ with ConvertedType(). */
+ class AddressSpaceReMapper : public ValueMapTypeRemapper {
+ public:
+ AddressSpaceReMapper(std::map<unsigned, unsigned> &addrSpaceMap) :
+ addrSpaceMap_(addrSpaceMap) {}
+ Type* remapType(Type *type) {
+ Type *newType = ConvertedType(type, addrSpaceMap_);
+ if (newType == type) return type;
+ return newType;
+ }
+ private:
+ std::map<unsigned, unsigned>& addrSpaceMap_;
+ } asvtm(addrSpaceMap);
+
+ CloneFunctionInto(newFunc, &F, vv, true, ri, "", NULL, &asvtm);
+ funcReplacements[&F] = newFunc;
+ }
+
+ /* Replace all references to the old function to the new one.
+ Only the called function of call instructions is updated here. */
+ llvm::Module::iterator fI = M.begin();
+ llvm::Module::iterator fE = M.end();
+ for (; fI != fE; ++fI) {
+ llvm::Function &F = *fI;
+ for (llvm::Function::iterator bbi = F.begin(), bbe = F.end(); bbi != bbe;
+ ++bbi)
+ for (llvm::BasicBlock::iterator ii = bbi->begin(), ie = bbi->end(); ii != ie;
+ ++ii) {
+ llvm::Instruction *instr = ii;
+ if (!isa<CallInst>(instr)) continue;
+ llvm::CallInst *call = dyn_cast<CallInst>(instr);
+ llvm::Function *calledF = call->getCalledFunction();
+ /* Calls to functions that were not re-created are left alone. */
+ if (funcReplacements.find(calledF) == funcReplacements.end()) continue;
+
+ call->setCalledFunction(funcReplacements[calledF]);
+ }
+ }
+
+ regenerate_kernel_metadata(M, funcReplacements);
+
+ /* Delete the old functions. */
+ for (FunctionMapping::iterator i = funcReplacements.begin(),
+ e = funcReplacements.end(); i != e; ++i) {
+ i->first->eraseFromParent();
+ }
+
+ return true;
+}
+
+}
diff --git a/src/llvmopencl/TargetAddressSpaces.h b/src/llvmopencl/TargetAddressSpaces.h
new file mode 100644
index 0000000..1a080c8
--- /dev/null
+++ b/src/llvmopencl/TargetAddressSpaces.h
@@ -0,0 +1,54 @@
+// Header for TargetAddressSpaces, an LLVM pass that converts the
+// generic address space ids to the target specific ones.
+//
+// Copyright (c) 2013 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_TARGET_ADDRESS_SPACES_H
+#define _POCL_TARGET_ADDRESS_SPACES_H
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Function.h"
+#else
+#include "llvm/IR/Function.h"
+#endif
+
+#include "llvm/Pass.h"
+
+namespace pocl {
+ /* pocl uses the fixed address space ids forced by the clang's
+ -ffake-address-space-map internally until the end to be able to
+ detect the different OpenCL address spaces unambiguously, regardless
+ of the target. This pass converts the fake address space ids to
+ the target-specific ones, if required by the code generator of that
+ target. */
+ class TargetAddressSpaces : public llvm::ModulePass {
+ public:
+ /* The pass identifier passed to the ModulePass constructor. */
+ static char ID;
+
+ TargetAddressSpaces();
+ virtual ~TargetAddressSpaces() {};
+
+ /* Runs the conversion on the given module. */
+ virtual bool runOnModule(llvm::Module &M);
+ };
+}
+
+#endif
diff --git a/src/llvmopencl/VariableUniformityAnalysis.cc b/src/llvmopencl/VariableUniformityAnalysis.cc
new file mode 100644
index 0000000..4362524
--- /dev/null
+++ b/src/llvmopencl/VariableUniformityAnalysis.cc
@@ -0,0 +1,382 @@
+// Implementation for VariableUniformityAnalysis function pass.
+//
+// Copyright (c) 2013 Pekka Jääskeläinen / Tampere University of Technology
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#include <sstream>
+#include <iostream>
+
+#ifdef LLVM_3_2
+#include "llvm/Metadata.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/ValueSymbolTable.h"
+#include "llvm/DataLayout.h"
+#else
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/IR/DataLayout.h"
+#endif
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Analysis/PostDominators.h"
+
+#include "WorkitemHandler.h"
+#include "Kernel.h"
+#include "VariableUniformityAnalysis.h"
+#include "Barrier.h"
+
+//#define DEBUG_UNIFORMITY_ANALYSIS
+
+namespace pocl {
+
+// Definition of the static pass identifier declared in the class; it is
+// handed to the FunctionPass base class by the constructor below.
+char VariableUniformityAnalysis::ID = 0;
+
+using namespace llvm;
+
+// Static registration object: makes the pass known under the name
+// "uniformity" together with its human-readable description.
+// NOTE(review): the two trailing 'false' values are RegisterPass's CFGOnly
+// and is_analysis parameters as far as the LLVM API goes -- the pass is
+// therefore not registered as an analysis; confirm this is intended.
+static
+RegisterPass<VariableUniformityAnalysis> X(
+ "uniformity",
+ "Analyses the variables of the function for uniformity (same value across WIs).",
+ false, false);
+
+// Constructs the pass; only forwards the static ID to FunctionPass.
+VariableUniformityAnalysis::VariableUniformityAnalysis() : FunctionPass(ID) {
+}
+
+
+/**
+ * Declares the analyses this pass needs.
+ *
+ * PostDominatorTree is fetched in analyzeBBDivergence() and LoopInfo in
+ * runOnFunction(); DominatorTree is requested because LoopInfo depends
+ * on it. Every required analysis is also marked as preserved.
+ *
+ * @param AU the usage record to fill in.
+ */
+void
+VariableUniformityAnalysis::getAnalysisUsage(llvm::AnalysisUsage &AU) const {
+ AU.addRequired<PostDominatorTree>();
+ AU.addPreserved<PostDominatorTree>();
+ AU.addRequired<LoopInfo>();
+ AU.addPreserved<LoopInfo>();
+ // required by LoopInfo:
+ AU.addRequired<DominatorTree>();
+ AU.addPreserved<DominatorTree>();
+
+// TODO This was turned off because of compilation error
+// (the whole TargetData/DataLayout request below is compiled out by the
+// '#if 0', so neither analysis is currently required nor preserved).
+#if 0
+#ifdef LLVM_3_1
+ AU.addRequired<TargetData>();
+ AU.addPreserved<TargetData>();
+#else
+ AU.addRequired<DataLayout>();
+ AU.addPreserved<DataLayout>();
+#endif
+#endif
+}
+
+/**
+ * Pass entry point: resets the per-function cache and runs the eager part
+ * of the analysis (basic block divergence). Uniformity of individual
+ * values is computed lazily by isUniform().
+ *
+ * @param F the function to analyse.
+ * @return always false; the function itself is never modified.
+ */
+bool
+VariableUniformityAnalysis::runOnFunction(Function &F) {
+
+  llvm::Function *func = &F;
+
+  /* Forget everything learned about this function in an earlier run. */
+  uniformityCache_[func].clear();
+
+  /* Seed the cache with the canonical induction variable PHIs: when a loop
+     has one, its per-iteration update is the same for every work-item.
+     This says nothing about whether all the work-items execute the loop
+     the same number of times. Only the loops LoopInfo's iterator hands
+     out are inspected here. */
+  llvm::LoopInfo &loops = getAnalysis<LoopInfo>();
+  llvm::LoopInfo::iterator loopIt = loops.begin();
+  llvm::LoopInfo::iterator loopEnd = loops.end();
+  while (loopIt != loopEnd) {
+    llvm::PHINode *canonicalIV = (*loopIt)->getCanonicalInductionVariable();
+    ++loopIt;
+    if (canonicalIV == NULL)
+      continue;
+#ifdef DEBUG_UNIFORMITY_ANALYSIS
+    std::cerr << "### canonical induction variable, assuming uniform:";
+    canonicalIV->dump();
+#endif
+    setUniform(func, canonicalIV, true);
+  }
+
+  /* The entry block is uniform by definition; propagate from there. */
+  llvm::BasicBlock *entry = &F.getEntryBlock();
+  setUniform(func, entry, true);
+  analyzeBBDivergence(func, entry, entry);
+
+  return false;
+}
+
+/**
+ * BB divergence analysis.
+ *
+ * Define:
+ * Uniform BB. A basic block which is known to be executed by all or none
+ * of the work-items, that is, a BB where it's known safe to add a barrier.
+ *
+ * Divergent/varying BB. A basic block where work-items *might* diverge.
+ * That is, it cannot be proven that all work-items execute the BB.
+ *
+ * Propagate the information from the entry downwards. The implementation
+ * recurses into each not-yet-analyzed successor in turn (i.e. the walk is
+ * depth first); successors that already have a cache entry are skipped,
+ * which avoids infinite recursion with loop back edges. The "last seen"
+ * uniform BB is handed down to the callee as previousUniformBB.
+ *
+ * The conditions to mark a BB 'uniform':
+ *
+ * a) the function entry
+ * b) BBs that post-dominate at least one uniform BB (try the previously
+ * found one)
+ * c) BBs that are branched to directly from a uniform BB using a uniform branch.
+ *
+ * Otherwise, assume divergent (might not be *proven* to be one!).
+ *
+ * @param f the function being analysed.
+ * @param bb the basic block to classify.
+ * @param previousUniformBB the uniform block most recently seen on the path
+ * to bb (the entry block for the initial call).
+ */
+void
+VariableUniformityAnalysis::analyzeBBDivergence
+(llvm::Function *f, llvm::BasicBlock *bb, llvm::BasicBlock *previousUniformBB) {
+
+
+ // Becomes bb itself as soon as bb has been classified as uniform.
+ llvm::BasicBlock *newPreviousUniformBB = previousUniformBB;
+
+ llvm::BranchInst *br =
+ dyn_cast<llvm::BranchInst>(previousUniformBB->getTerminator());
+
+ if (br == NULL) {
+ // this is most likely a function with a single basic block, the entry node, which
+ // ends with a ret
+ // Note: the same early exit is taken for any other non-branch terminator;
+ // in that case bb gets no cache entry here and its successors are not
+ // visited.
+ return;
+ }
+
+ // Condition c)
+ // The branch leaving the previous uniform BB is unconditional or has a
+ // uniform condition, and bb is one of its direct successors.
+ if ((!br->isConditional() || isUniform(f, br->getCondition()))) {
+ for (unsigned suc = 0, end = br->getNumSuccessors(); suc < end; ++suc) {
+ if (br->getSuccessor(suc) == bb) {
+ setUniform(f, bb, true);
+ newPreviousUniformBB = bb;
+ break;
+ }
+ }
+ }
+
+ // Condition b)
+ // Only tried when condition c) did not already classify bb as uniform.
+ if (newPreviousUniformBB != bb) {
+ llvm::PostDominatorTree *PDT = &getAnalysis<PostDominatorTree>();
+ if (PDT->dominates(bb, previousUniformBB)) {
+ setUniform(f, bb, true);
+ newPreviousUniformBB = bb;
+ }
+ }
+
+ /* Assume diverging. */
+ // A block that already has a cache entry (e.g. the entry block, marked
+ // uniform by runOnFunction) keeps its classification.
+ if (!isUniformityAnalyzed(f, bb))
+ setUniform(f, bb, false);
+
+ llvm::BranchInst *nextbr = dyn_cast<llvm::BranchInst>(bb->getTerminator());
+
+ if (nextbr == NULL) return; /* ret (or any other non-branch terminator) */
+
+ /* Propagate the data downward. */
+ for (unsigned suc = 0, end = nextbr->getNumSuccessors(); suc < end; ++suc) {
+ llvm::BasicBlock *nextbb = nextbr->getSuccessor(suc);
+ // Skipping analyzed blocks is what terminates the recursion on back edges.
+ if (!isUniformityAnalyzed(f, nextbb)) {
+ analyzeBBDivergence(f, nextbb, newPreviousUniformBB);
+ }
+ }
+}
+
+/**
+ * Tells whether a uniformity verdict (uniform or varying) has already been
+ * recorded for the value.
+ *
+ * @param f the function whose cache is consulted (an empty per-function
+ *          cache is created on first access).
+ * @param v the value or basic block to look up.
+ * @return true when the cache of f holds an entry for v.
+ */
+bool
+VariableUniformityAnalysis::isUniformityAnalyzed(llvm::Function *f, llvm::Value *v) const {
+  const UniformityIndex &perFunction = uniformityCache_[f];
+  return perFunction.find(v) != perFunction.end();
+}
+
+/**
+ * Simple uniformity analysis that recursively analyses all the
+ * operands affecting the value. Results are memoised per function.
+ *
+ * Known uniform Values:
+ *  a) the entry basic block
+ *  b) kernel arguments
+ *  c) integer constants
+ *  d) white-listed allocas (see below)
+ *  e) loads of the work-group level id/size globals
+ *  f) any other instruction all of whose operands are uniform
+ *
+ * PHI nodes and everything not covered above are classified as varying,
+ * unless a verdict was stored beforehand with setUniform().
+ *
+ * @param f the function the value belongs to.
+ * @param v the value (or basic block) to query.
+ * @return true if v is known to be uniform, false if it has to be treated
+ *         as varying.
+ */
+bool
+VariableUniformityAnalysis::isUniform(llvm::Function *f, llvm::Value* v) {
+
+  {
+    /* The reference is kept in its own scope on purpose: the alloca
+       handling below may replace the whole cache, which would leave a
+       longer-lived reference dangling. */
+    const UniformityIndex &cache = uniformityCache_[f];
+    UniformityIndex::const_iterator cached = cache.find(v);
+    if (cached != cache.end()) {
+      return cached->second;
+    }
+  }
+
+  if (llvm::BasicBlock *bb = dyn_cast<llvm::BasicBlock>(v)) {
+    if (bb == &f->getEntryBlock()) {
+      setUniform(f, v, true);
+      return true;
+    }
+  }
+
+  if (isa<llvm::Argument>(v) || isa<llvm::ConstantInt>(v)) {
+    setUniform(f, v, true);
+    return true;
+  }
+
+  if (llvm::AllocaInst *allocaInst = dyn_cast<llvm::AllocaInst>(v)) {
+    /* Allocas might or might not be divergent. These are produced
+       from work-item private arrays or the PHIsToAllocas. It depends
+       what is written to them whether they are really divergent.
+
+       We need to figure out if any of the stores to the alloca contain
+       work-item id dependent data. Take a white listing approach that
+       detects the ex-phi allocas of loop iteration variables of
+       non-diverging loops.
+
+       An alloca is white listed when
+        a) it is accessed only with loads and stores (its address is not
+           taken; in particular it is never itself the stored value),
+        b) all the stored data is uniform, and
+        c) all the stores are in uniform basic blocks. A uniform value
+           written in a divergent block still leaves the work-items with
+           different contents, as only some of them execute the store.
+           Blocks without a verdict from analyzeBBDivergence() count as
+           varying here, which is the safe assumption.
+
+       Because alloca data can be modified in loops and thus be dependent on
+       itself, we need a bit involved mechanism to handle it. First create
+       a copy of the uniformity cache, then assume the alloca itself is
+       uniform, then check the stores. If our initial assumption was wrong,
+       restore the cache from the backup; this also drops every verdict
+       that was derived from the wrong assumption.
+    */
+    UniformityCache backupCache(uniformityCache_);
+    setUniform(f, v);
+
+    bool isUniformAlloca = true;
+    for (Instruction::use_iterator ui = allocaInst->use_begin(),
+           ue = allocaInst->use_end();
+         ui != ue; ++ui) {
+      Instruction *user = dyn_cast<Instruction>(*ui);
+      if (user == NULL) continue;
+
+      if (llvm::StoreInst *store = dyn_cast<llvm::StoreInst>(user)) {
+        llvm::Value *stored = store->getValueOperand();
+        if (stored == v ||                        /* address escapes */
+            !isUniform(f, stored) ||              /* varying data */
+            !isUniform(f, store->getParent())) {  /* divergent store */
+          isUniformAlloca = false;
+          break;
+        }
+      } else if (!isa<llvm::LoadInst>(user)) {
+#ifdef DEBUG_UNIFORMITY_ANALYSIS
+        std::cerr << "### alloca has a suspicious user" << std::endl;
+        user->dump();
+#endif
+        isUniformAlloca = false;
+        break;
+      }
+    }
+
+    if (!isUniformAlloca) {
+      // restore the old uniform data as our guess was wrong
+      uniformityCache_ = backupCache;
+    }
+    setUniform(f, v, isUniformAlloca);
+
+    return isUniformAlloca;
+  }
+
+  /* TODO: global memory loads are uniform in case they are accessing
+     the higher scope ids (group_id_?). */
+  if (llvm::LoadInst *load = dyn_cast<llvm::LoadInst>(v)) {
+    /* Globals holding values that are identical for every work-item of
+       a work-group. */
+    static const char *const uniformGlobals[] = {
+      "_group_id_x", "_group_id_y", "_group_id_z",
+      "_work_dim",
+      "_num_groups_x", "_num_groups_y", "_num_groups_z",
+      "_global_offset_x", "_global_offset_y", "_global_offset_z",
+      "_local_size_x", "_local_size_y", "_local_size_z"
+    };
+    const unsigned numUniformGlobals =
+      sizeof(uniformGlobals) / sizeof(uniformGlobals[0]);
+
+    llvm::Value *pointer = load->getPointerOperand();
+    llvm::Module *M = load->getParent()->getParent()->getParent();
+
+    for (unsigned g = 0; g < numUniformGlobals; ++g) {
+      /* getGlobalVariable() yields NULL for a missing global, which can
+         never compare equal to the (non-NULL) pointer operand. */
+      if (pointer == M->getGlobalVariable(uniformGlobals[g])) {
+        setUniform(f, v, true);
+        return true;
+      }
+    }
+  }
+
+  if (isa<llvm::PHINode>(v)) {
+    /* TODO: PHINodes need control flow analysis:
+       even if the values are uniform, the selected
+       value depends on the preceding basic block which
+       might depend on the ID. Assume they are not uniform
+       for now in general and treat the loop iteration
+       variable as a special case (seeded in runOnFunction()).
+
+       TODO: PHINodes can depend (indirectly or directly) on itself in loops
+       so it would need infinite recursion checking.
+    */
+    setUniform(f, v, false);
+    return false;
+  }
+
+  llvm::Instruction *instr = dyn_cast<llvm::Instruction>(v);
+  if (instr == NULL) {
+    /* Neither an instruction nor one of the known uniform kinds. */
+    setUniform(f, v, false);
+    return false;
+  }
+
+  // not computed previously, scan all operands of the instruction
+  // and figure out their uniformity recursively
+  for (unsigned opr = 0; opr < instr->getNumOperands(); ++opr) {
+    if (!isUniform(f, instr->getOperand(opr))) {
+      setUniform(f, v, false);
+      return false;
+    }
+  }
+  setUniform(f, v, true);
+  return true;
+}
+
+/**
+ * Records a uniformity verdict in the cache, overwriting any earlier one.
+ *
+ * @param f the function the value belongs to.
+ * @param v the value or basic block being classified.
+ * @param isUniform true for uniform, false for varying.
+ */
+void
+VariableUniformityAnalysis::setUniform(llvm::Function *f,
+                                       llvm::Value *v,
+                                       bool isUniform) {
+
+  uniformityCache_[f][v] = isUniform;
+
+#ifdef DEBUG_UNIFORMITY_ANALYSIS
+  std::cerr << "### " << (isUniform ? "uniform " : "varying ");
+
+  if (!isa<llvm::BasicBlock>(v)) {
+    v->dump();
+  } else {
+    std::cerr << "BB: " << v->getName().str() << std::endl;
+  }
+#endif
+}
+
+}
diff --git a/src/llvmopencl/VariableUniformityAnalysis.h b/src/llvmopencl/VariableUniformityAnalysis.h
new file mode 100644
index 0000000..88175a8
--- /dev/null
+++ b/src/llvmopencl/VariableUniformityAnalysis.h
@@ -0,0 +1,70 @@
+// Header for VariableUniformityAnalysis function pass.
+//
+// Copyright (c) 2013 Pekka Jääskeläinen / Tampere University of Technology
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef POCL_VARIABLE_UNIFORMITY_ANALYSIS_H
+#define POCL_VARIABLE_UNIFORMITY_ANALYSIS_H
+
+#include "config.h"
+
+#include <map>
+
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Function.h"
+#else
+#include "llvm/IR/Function.h"
+#endif
+
+#include "llvm/Pass.h"
+
+namespace pocl {
+ /**
+ * Analyses the variables in the function to figure out if a variable
+ * value is
+ *
+ * a) 'uniform', i.e., always same for all work-items in the *same work-group*
+ * b) 'varying', i.e., somehow dependent on the work-item id
+ *
+ * For safety, 'varying' is assumed, unless certain of a).
+ */
+ class VariableUniformityAnalysis : public llvm::FunctionPass {
+ public:
+ // Static pass identifier; defined in the implementation file.
+ static char ID;
+
+ VariableUniformityAnalysis();
+
+ // Declares the analyses this pass requires and preserves.
+ virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+ // Pass entry point, invoked for function F.
+ virtual bool runOnFunction(llvm::Function &F);
+ // Returns whether v is classified as uniform within function f.
+ virtual bool isUniform(llvm::Function *f, llvm::Value* v);
+ // Records the classification of v within f; marks it uniform when the
+ // third argument is omitted.
+ virtual void setUniform(llvm::Function *f, llvm::Value *v, bool isUniform=true);
+ // Classifies basic block bb of f as uniform or varying;
+ // previousUniformBB is a block already known to be uniform.
+ virtual void analyzeBBDivergence(llvm::Function *f,
+ llvm::BasicBlock *bb,
+ llvm::BasicBlock *previousUniformBB);
+
+ private:
+
+ // True when a classification for val has already been stored for f.
+ bool isUniformityAnalyzed(llvm::Function *f, llvm::Value *val) const;
+
+ // Value -> "is uniform" verdicts of one function.
+ typedef std::map<llvm::Value*, bool> UniformityIndex;
+ // One UniformityIndex per analysed function.
+ typedef std::map<llvm::Function *, UniformityIndex> UniformityCache;
+ // Declared mutable so that const members can access (and lazily create)
+ // per-function entries.
+ mutable UniformityCache uniformityCache_;
+
+ };
+}
+
+#endif
diff --git a/src/llvmopencl/WIVectorize.cc b/src/llvmopencl/WIVectorize.cc
new file mode 100644
index 0000000..e234392
--- /dev/null
+++ b/src/llvmopencl/WIVectorize.cc
@@ -0,0 +1,3252 @@
+//===- WIVectorize.cpp - A Work Item Vectorizer -------------------------===//
+//
+// This code has been adapted from BBVectorize of the LLVM project.
+// The original file comment:
+//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// This file implements a basic-block vectorization pass. The algorithm was
+// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral,
+// et al. It works by looking for chains of pairable operations and then
+// pairing them.
+//
+//===----------------------------------------------------------------------===//
+//
+// WIVectorize:
+//
+// Additional options are provided to vectorize only candidates from different
+// work items according to metadata provided by 'pocl' frontend
+// (launchpad.net/pocl).
+//
+// Additional option is also available to vectorize loads and stores only.
+// Still work in progress by vladimir guzma [at] tut fi.
+//
+//===----------------------------------------------------------------------===//
+
+#define WIV_NAME "wi-vectorize"
+#define DEBUG_TYPE WIV_NAME
+#include "config.h"
+#ifdef LLVM_3_1
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/TypeBuilder.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Type.h"
+#include "llvm/Metadata.h"
+#elif defined LLVM_3_2
+#include "llvm/IRBuilder.h"
+#include "llvm/TypeBuilder.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Type.h"
+#include "llvm/Metadata.h"
+#include "llvm/TargetTransformInfo.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/TypeBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#endif
+#include "llvm/Pass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Transforms/Vectorize.h"
+#include <algorithm>
+#include <map>
+#include <iostream>
+using namespace llvm;
+
+// Command line options of the pass. All of them are hidden (cl::Hidden)
+// and use the "wi-vectorize-" prefix.
+
+// When set (the default), runOnFunction() does not query
+// TargetTransformInfo at all.
+static cl::opt<bool>
+IgnoreTargetInfo("wi-vectorize-ignore-target-info", cl::init(true),
+ cl::Hidden, cl::desc("Ignore target information"));
+
+// Also used by getDepthFactor() as the weight of a load or a store.
+static cl::opt<unsigned>
+ReqChainDepth("wi-vectorize-req-chain-depth", cl::init(3), cl::Hidden,
+ cl::desc("The required chain depth for vectorization"));
+
+// Bounds the number of fusing rounds in runOnBasicBlock() and is the lane
+// count used by getVecTypeForVector().
+static cl::opt<unsigned>
+VectorWidth("wi-vectorize-vector-width", cl::init(8), cl::Hidden,
+ cl::desc("The width of the machine vector in words."));
+
+static cl::opt<bool>
+NoMath("wi-vectorize-no-math", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize floating-point math intrinsics"));
+
+static cl::opt<bool>
+NoFMA("wi-vectorize-no-fma", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
+
+// Note: runOnBasicBlock() temporarily overrides this option (and
+// MemOpsOnly below) while it works on a block and restores it afterwards.
+static cl::opt<bool>
+NoMemOps("wi-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize loads and stores"));
+
+static cl::opt<bool>
+AlignedOnly("wi-vectorize-aligned-only", cl::init(false), cl::Hidden,
+ cl::desc("Only generate aligned loads and stores"));
+
+static cl::opt<bool>
+MemOpsOnly("wi-vectorize-mem-ops-only", cl::init(false), cl::Hidden,
+ cl::desc("Try to vectorize loads and stores only"));
+
+static cl::opt<bool>
+NoFP("wi-vectorize-no-fp", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize floating-point operations"));
+
+static cl::opt<bool>
+NoCMP("wi-vectorize-no-cmp", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize comparison operations"));
+
+// Hidden option: when set, vectorization that is based on loop counter
+// arithmetic is forbidden.
+static cl::opt<bool>
+NoCount("wi-vectorize-no-counters", cl::init(false), cl::Hidden,
+  cl::desc("Forbid vectorization based on loop counter "
+           "arithmetic"));
+static cl::opt<bool>
+NoGEP("wi-vectorize-no-GEP", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize getelementpointer operations"));
+
+// Extra tracing switches; they exist only in builds without NDEBUG.
+#ifndef NDEBUG
+static cl::opt<bool>
+DebugInstructionExamination("wi-vectorize-debug-instruction-examination",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, output information on the"
+ " instruction-examination process"));
+static cl::opt<bool>
+DebugCandidateSelection("wi-vectorize-debug-candidate-selection",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, output information on the"
+ " candidate-selection process"));
+static cl::opt<bool>
+DebugPairSelection("wi-vectorize-debug-pair-selection",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, output information on the"
+ " pair-selection process"));
+static cl::opt<bool>
+DebugCycleCheck("wi-vectorize-debug-cycle-check",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, output information on the"
+ " cycle-checking process"));
+#endif
+
+// Statistics counter of the pass (see the description string).
+STATISTIC(NumFusedOps, "Number of operations fused by wi-vectorize");
+
+// Forward declaration of the factory function of this pass; its definition
+// is not part of this section of the file.
+namespace llvm {
+ FunctionPass* createWIVectorizePass();
+}
+namespace {
+ struct WIVectorize : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ WIVectorize() : FunctionPass(ID) {}
+
+ typedef std::pair<Value *, Value *> ValuePair;
+ typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
+ typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
+ typedef std::pair<std::multimap<Value *, Value *>::iterator,
+ std::multimap<Value *, Value *>::iterator> VPIteratorPair;
+ typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator,
+ std::multimap<ValuePair, ValuePair>::iterator>
+ VPPIteratorPair;
+ typedef std::vector<Value *> ValueVector;
+ typedef DenseMap<Value*, ValueVector*> ValueVectorMap;
+
+ AliasAnalysis *AA;
+ ScalarEvolution *SE;
+#ifdef LLVM_3_1
+ TargetData *TD;
+#elif defined LLVM_3_2
+ DataLayout *TD;
+ TargetTransformInfo *TTI;
+ const VectorTargetTransformInfo *VTTI;
+#else
+ DataLayout *TD;
+ TargetTransformInfo *TTI;
+ const TargetTransformInfo *VTTI;
+#endif
+ DenseMap<Value*, Value*> storedSources;
+ DenseMap<std::pair<int,int>, ValueVector*> stridedOps;
+ std::multimap<Value*, Value*> flippedStoredSources;
+ // FIXME: const correct?
+
+ bool vectorizePairs(BasicBlock &BB);
+
+ bool vectorizePhiNodes(BasicBlock &BB);
+
+ bool vectorizeAllocas(BasicBlock& BB);
+
+ void replaceUses(BasicBlock& BB,
+ AllocaInst& oldAlloca,
+ AllocaInst& newAlloca,
+ int indx);
+
+ Type* newAllocaType(Type* start, unsigned int width);
+
+ bool removeDuplicates(BasicBlock &BB);
+
+ void dropUnused(BasicBlock& BB);
+
+ bool getCandidatePairs(BasicBlock &BB,
+ BasicBlock::iterator &Start,
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts);
+
+ bool getCandidateAllocas(BasicBlock &BB,
+ std::multimap<int, ValueVector *>& candidateAllocas);
+
+ void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs);
+
+ void buildDepMap(BasicBlock &BB,
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &PairableInstUsers);
+
+ void choosePairs(std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<Value *, Value *>& ChosenPairs);
+
+ void fuseChosenPairs(BasicBlock &BB,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<Value *, Value *>& ChosenPairs);
+
+ bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
+
+ bool areInstsCompatible(Instruction *I, Instruction *J,
+ bool IsSimpleLoadStore);
+
+ bool areInstsCompatibleFromDifferentWi(Instruction *I, Instruction *J);
+
+ bool trackUsesOfI(DenseSet<Value *> &Users,
+ AliasSetTracker &WriteSet, Instruction *I,
+ Instruction *J, bool UpdateUsers = true,
+ std::multimap<Value *, Value *> *LoadMoveSet = 0);
+
+ void computePairsConnectedTo(
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ ValuePair P);
+
+ bool pairsConflict(ValuePair P, ValuePair Q,
+ DenseSet<ValuePair> &PairableInstUsers,
+ std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0);
+
+ bool pairWillFormCycle(ValuePair P,
+ std::multimap<ValuePair, ValuePair> &PairableInstUsers,
+ DenseSet<ValuePair> &CurrentPairs);
+
+ void pruneTreeFor(
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<ValuePair, size_t> &Tree,
+ DenseSet<ValuePair> &PrunedTree, ValuePair J,
+ bool UseCycleCheck);
+
+ void buildInitialTreeFor(
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<ValuePair, size_t> &Tree, ValuePair J);
+
+ void findBestTreeFor(
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
+ size_t &BestEffSize, VPIteratorPair ChoiceRange,
+ bool UseCycleCheck);
+
+ Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
+ Instruction *J, unsigned o, bool FlipMemInputs);
+
+ void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
+ unsigned NumElem, unsigned MaskOffset, unsigned NumInElem,
+ unsigned IdxOffset, std::vector<Constant*> &Mask);
+
+ Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
+ Instruction *J);
+
+ Value *getReplacementInput(LLVMContext& Context, Instruction *I,
+ Instruction *J, unsigned o, bool FlipMemInputs);
+
+ Value* CommonShuffleSource(Instruction *I, Instruction *J);
+
+ void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
+ Instruction *J, SmallVector<Value *, 3> &ReplacedOperands,
+ bool FlipMemInputs);
+
+ void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
+ Instruction *J, Instruction *K,
+ Instruction *&InsertionPt, Instruction *&K1,
+ Instruction *&K2, bool FlipMemInputs);
+
+ void collectPairLoadMoveSet(BasicBlock &BB,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ std::multimap<Value *, Value *> &LoadMoveSet,
+ Instruction *I);
+
+ void collectLoadMoveSet(BasicBlock &BB,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ std::multimap<Value *, Value *> &LoadMoveSet);
+
+ void moveUsesOfIAfterJ(BasicBlock &BB,
+ std::multimap<Value *, Value *> &LoadMoveSet,
+ Instruction *&InsertionPt,
+ Instruction *I, Instruction *J);
+
+ void collectPtrInfo(std::vector<Value *> &PairableInsts,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseSet<Value *> &LowPtrInsts);
+
+ // Per-module set-up hook: nothing to prepare, the module argument is
+ // unused and false is returned unconditionally.
+ bool doInitialization(Module& /*m*/) {
+ return false;
+ }
+ // Per-module tear-down hook: nothing to clean up, the module argument is
+ // unused and false is returned unconditionally.
+ bool doFinalization(Module& /*m*/) {
+ return false;
+ }
+ // Pass entry point. Caches the analyses needed by the helpers in the
+ // member fields (AA, SE, TD and, depending on the LLVM version, TTI and
+ // VTTI) and then vectorizes the basic blocks of Func one by one.
+ // Returns true if at least one basic block was changed.
+ virtual bool runOnFunction(Function &Func) {
+
+   AA = &getAnalysis<AliasAnalysis>();
+   SE = &getAnalysis<ScalarEvolution>();
+
+#ifdef LLVM_3_1
+   TD = getAnalysisIfAvailable<TargetData>();
+#else
+   TD = getAnalysisIfAvailable<DataLayout>();
+   // Target information is looked up only when it is not to be ignored.
+   if (IgnoreTargetInfo)
+     TTI = 0;
+   else
+     TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+#ifdef LLVM_3_2
+   VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0;
+#else
+   VTTI = TTI;
+#endif
+#endif
+
+   bool anyBlockChanged = false;
+   for (Function::iterator block = Func.begin();
+        block != Func.end(); ++block) {
+     if (runOnBasicBlock(*block))
+       anyBlockChanged = true;
+   }
+   return anyBlockChanged;
+ }
+
+ // Vectorizes a single basic block.
+ //
+ // Steps: (1) vectorizeAllocas(), (2) one or two series of
+ // vectorizePairs() rounds (see the long comment below), each successful
+ // round followed by dropUnused(), (3) when anything changed,
+ // vectorizePhiNodes() and removeDuplicates().
+ //
+ // The global options MemOpsOnly and NoMemOps are temporarily overwritten
+ // while the block is processed and restored before returning.
+ //
+ // Returns true if the basic block was changed.
+ virtual bool runOnBasicBlock(BasicBlock &BB) {
+
+ bool changed = false;
+
+ // First try to create vectors of all allocas, if there are any
+ changed |= vectorizeAllocas(BB);
+ // Iterate a sufficient number of times to merge types of size 1 bit,
+ // then 2 bits, then 4, etc. up to half of the target vector width of the
+ // target vector register.
+ // (This describes the fusing loops further below.)
+ bool vectorizeTwice = false;
+
+
+ // There are 3 possible cases of vectorization in regards to memory
+ // operations:
+ // 1: Explicitly forbid vectorization of mem ops (NoMemOps)
+ // 2: Allow only vectorization of mem ops (MemOpsOnly)
+ // 3: Vectorize mem ops as well as everything else
+ // In cases 1 and 2, following test makes sure vectorization is
+ // run only once.
+ // For case 3, we first run vectorization of memory operations only
+ // and then we run vectorization of everything else. In between
+ // we remove unused operations, which are typically memory
+ // access computations that are not needed anymore and their vectorization
+ // is waste of resources. Instruction combiner is not able to get rid
+ // of those on its own once they are in vectors.
+
+ // Store original values of two variables. They can be changed below
+ // but have to be restored before calling this for next BB.
+ bool originalMemOpsOnly = MemOpsOnly;
+ bool originalNoMemOps = NoMemOps;
+ if (!MemOpsOnly && !NoMemOps) {
+ MemOpsOnly = true;
+ vectorizeTwice = true;
+ }
+#if 0
+#ifdef LLVM_3_3
+ if (TTI) {
+ std::cerr << " settign new vector width" << std::endl;
+ unsigned WidestRegister = TTI->getRegisterBitWidth(true);
+ VectorWidth = WidestRegister/32;
+ std::cerr << VectorWidth << std::endl;
+ }
+#endif
+#endif
+
+ // First series of rounds: v doubles from 2 up to VectorWidth, n only
+ // numbers the rounds for the debug output. The series ends early as soon
+ // as a round does not fuse anything. Note that this loop also runs (with
+ // the same debug text) in cases 1 and 2 above.
+ for (unsigned v = 2, n = 1; v <= VectorWidth;
+ v *= 2, ++n) {
+ DEBUG(dbgs() << "WIV: fusing memm only in loop #" << n <<
+ " for " << BB.getName() << " in " <<
+ BB.getParent()->getName() << "...\n");
+ if (vectorizePairs(BB)) {
+ dropUnused(BB);
+ changed = true;
+ }
+ else
+ break;
+ }
+ // Second series (case 3 only): everything except the memory operations.
+ if (vectorizeTwice) {
+ MemOpsOnly = false;
+ NoMemOps = true;
+ for (unsigned v = 2, n = 1; v <= VectorWidth;
+ v *= 2, ++n) {
+ DEBUG(dbgs() << "WIV: fusing loop #" << n <<
+ " for " << BB.getName() << " in " <<
+ BB.getParent()->getName() << "...\n");
+ if (vectorizePairs(BB)) {
+ dropUnused(BB);
+ changed = true;
+ }
+ else
+ break;
+ }
+ }
+
+ // Post-processing; the return values of both helpers are ignored.
+ if (changed) {
+ vectorizePhiNodes(BB);
+ removeDuplicates(BB);
+ }
+
+ DEBUG(dbgs() << "WIV: done!\n");
+ // Undo the temporary option overrides made above.
+ MemOpsOnly = originalMemOpsOnly;
+ NoMemOps = originalNoMemOps;
+ return changed;
+ }
+
+ // Declares the analyses used by the pass: AliasAnalysis and
+ // ScalarEvolution are required (they are fetched in runOnFunction()) and
+ // are marked as preserved; the pass also declares that it preserves the
+ // CFG. The requirements of the FunctionPass base class are added first.
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AliasAnalysis>();
+ AU.addRequired<ScalarEvolution>();
+ AU.addPreserved<AliasAnalysis>();
+ AU.addPreserved<ScalarEvolution>();
+ AU.setPreservesCFG();
+ }
+ // Returns the vector type that holds VectorWidth copies of the provided
+ // type: for a scalar type the result has VectorWidth elements, for a
+ // vector type the element count is multiplied by VectorWidth. The element
+ // type of the result is the scalar type of ElemTy.
+ static inline VectorType *getVecTypeForVector(Type *ElemTy) {
+   unsigned numElem = VectorWidth;
+   if (VectorType *VTy = dyn_cast<VectorType>(ElemTy))
+     numElem *= VTy->getNumElements();
+
+   return VectorType::get(ElemTy->getScalarType(), numElem);
+ }
+ // Returns the vector type able to hold the two provided types side by
+ // side. Each argument contributes its element count when it is a vector
+ // and one element when it is a scalar; both must share the same scalar
+ // type (asserted).
+ static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) {
+   assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() &&
+          "Cannot form vector from incompatible scalar types");
+
+   VectorType *firstVec = dyn_cast<VectorType>(ElemTy);
+   VectorType *secondVec = dyn_cast<VectorType>(Elem2Ty);
+
+   const unsigned firstLanes = firstVec ? firstVec->getNumElements() : 1;
+   const unsigned secondLanes = secondVec ? secondVec->getNumElements() : 1;
+
+   return VectorType::get(ElemTy->getScalarType(), firstLanes + secondLanes);
+ }
+
+ // Builds the name of a value derived from instruction I:
+ //   <name of I> + ".v.i" (input) or ".v.r" (result) + o [+ "." + n]
+ // The ".n" suffix is appended only for n > 0. Unnamed instructions yield
+ // an empty name.
+ std::string getReplacementName(Instruction *I, bool IsInput, unsigned o,
+                                unsigned n = 0) {
+   std::string replacement;
+   if (I->hasName()) {
+     replacement = I->getName().str();
+     replacement += IsInput ? ".v.i" : ".v.r";
+     replacement += utostr(o);
+     if (n > 0) {
+       replacement += ".";
+       replacement += utostr(n);
+     }
+   }
+   return replacement;
+ }
+
+ // Returns the weight that the provided value contributes to the length of
+ // a chain of candidate pairs. The length of a chain is the sum of the
+ // weights of its members (one weight per pair; both members of a pair are
+ // assumed to weigh the same). That length is compared against the
+ // chain-length threshold to decide whether a chain is worth vectorizing,
+ // and longer chains are preferred over shorter ones.
+ //
+ // Weights:
+ //  - loads and stores: ReqChainDepth, presumably so that a load/store pair
+ //    is long enough to be vectorized without further chain members;
+ //  - insertelement / extractelement: 0. They cannot be usefully fused, and
+ //    because the pass itself generates many of them they would distort the
+ //    metric in later iterations. A zero weight lets later iterations
+ //    ignore them while dependency chains through them are still tracked.
+ //    Instructions with weight 0 are not actually fused;
+ //  - everything else: 1.
+ static inline size_t getDepthFactor(Value *V) {
+   if (isa<LoadInst>(V) || isa<StoreInst>(V))
+     return ReqChainDepth;
+
+   const bool isVectorShuffling =
+     isa<InsertElementInst>(V) || isa<ExtractElementInst>(V);
+   return isVectorShuffling ? 0 : 1;
+ }
+ // Returns the cost of an instruction with the given opcode working on the
+ // types T1 and T2, as reported by VTTI. Loads and stores are not handled
+ // here. Opcodes that are not listed cost 1; when built with LLVM_3_1 every
+ // opcode costs 1.
+ unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) {
+#ifdef LLVM_3_1
+   return 1;
+#else
+   unsigned cost = 1;
+   switch (Opcode) {
+   // Scalar GEPs are usually lowered to the instruction addressing mode and
+   // no vector GEPs are generated at the moment, so they count as free.
+   // PHI nodes are counted as free as well.
+   case Instruction::GetElementPtr:
+   case Instruction::PHI:
+     cost = 0;
+     break;
+
+   // Control flow.
+   case Instruction::Br:
+     cost = VTTI->getCFInstrCost(Opcode);
+     break;
+
+   // Comparisons and selects.
+   case Instruction::ICmp:
+   case Instruction::FCmp:
+   case Instruction::Select:
+     cost = VTTI->getCmpSelInstrCost(Opcode, T1, T2);
+     break;
+
+   // Integer and floating-point arithmetic, shifts and bitwise logic.
+   case Instruction::Add:
+   case Instruction::Sub:
+   case Instruction::Mul:
+   case Instruction::UDiv:
+   case Instruction::SDiv:
+   case Instruction::URem:
+   case Instruction::SRem:
+   case Instruction::FAdd:
+   case Instruction::FSub:
+   case Instruction::FMul:
+   case Instruction::FDiv:
+   case Instruction::FRem:
+   case Instruction::Shl:
+   case Instruction::LShr:
+   case Instruction::AShr:
+   case Instruction::And:
+   case Instruction::Or:
+   case Instruction::Xor:
+     cost = VTTI->getArithmeticInstrCost(Opcode, T1);
+     break;
+
+   // Conversions; shuffles are priced through the same query.
+   case Instruction::Trunc:
+   case Instruction::ZExt:
+   case Instruction::SExt:
+   case Instruction::FPTrunc:
+   case Instruction::FPExt:
+   case Instruction::FPToUI:
+   case Instruction::FPToSI:
+   case Instruction::UIToFP:
+   case Instruction::SIToFP:
+   case Instruction::PtrToInt:
+   case Instruction::IntToPtr:
+   case Instruction::BitCast:
+   case Instruction::ShuffleVector:
+     cost = VTTI->getCastInstrCost(Opcode, T1, T2);
+     break;
+
+   default:
+     break;
+   }
+   return cost;
+#endif
+ }
+ // Determines the relative offset between the addresses used by I and J.
+ // Both instructions must be of the same kind: two loads, two stores or two
+ // getelementptr instructions.
+ //
+ // Loads and stores: the pointer operands, alignments and address spaces
+ // are written to the output parameters. When the distance between the two
+ // pointers is a constant, OffsetInElmts receives that distance divided by
+ // the store size of the accessed type, and the function returns true if
+ // the distance is an exact multiple of that size. For example,
+ // OffsetInElmts == 1 means J accesses the memory directly after I, and
+ // OffsetInElmts == -1 means I accesses the memory directly after J. If the
+ // two pointee types differ and the distance is negative, the store size of
+ // J's pointee type is used instead of I's.
+ //
+ // GEPs: IPtr and JPtr receive the first index operand of each GEP, and the
+ // alignments and address spaces are set to 0. OffsetInElmts receives the
+ // constant difference between the two indices and the function returns
+ // true only when the absolute difference is greater than 1. If an index
+ // type cannot be analysed by scalar evolution, the GEP is assumed to be a
+ // vector GEP already; OffsetInElmts is then set to 2 and true is returned.
+ //
+ // Returns false whenever no constant offset can be established.
+ bool getPairPtrInfo(Instruction *I, Instruction *J,
+     Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
+     unsigned &IAddressSpace, unsigned &JAddressSpace,
+     int64_t &OffsetInElmts) {
+   OffsetInElmts = 0;
+   // The GEP path below has no address space to report. Give the outputs a
+   // defined value so callers never read uninitialized data.
+   IAddressSpace = 0;
+   JAddressSpace = 0;
+
+   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+     LoadInst *LJ = cast<LoadInst>(J);
+     IPtr = LI->getPointerOperand();
+     JPtr = LJ->getPointerOperand();
+     IAlignment = LI->getAlignment();
+     JAlignment = LJ->getAlignment();
+     IAddressSpace = LI->getPointerAddressSpace();
+     JAddressSpace = LJ->getPointerAddressSpace();
+   } else if (isa<GetElementPtrInst>(I)) {
+     // For GEPs the first index operand is compared, not the base pointer.
+     Instruction::op_iterator it = cast<GetElementPtrInst>(I)->idx_begin();
+     IPtr = *it;
+     Instruction::op_iterator jt = cast<GetElementPtrInst>(J)->idx_begin();
+     JPtr = *jt;
+     if (!IPtr || !JPtr)
+       return false;
+     IAlignment = 0;
+     JAlignment = 0;
+   } else {
+     StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J);
+     IPtr = SI->getPointerOperand();
+     JPtr = SJ->getPointerOperand();
+     IAlignment = SI->getAlignment();
+     JAlignment = SJ->getAlignment();
+     IAddressSpace = SI->getPointerAddressSpace();
+     JAddressSpace = SJ->getPointerAddressSpace();
+   }
+
+   if ((isa<GetElementPtrInst>(I) && !SE->isSCEVable(IPtr->getType()))
+       || (isa<GetElementPtrInst>(J) && !SE->isSCEVable(JPtr->getType()))) {
+     // Assume the getelementptr is already a vector GEP, so its index
+     // operand is a vector as well and scalar evolution cannot analyse it.
+     OffsetInElmts = 2;
+     return true;
+   }
+   const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
+   const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
+
+   // If this is a trivial offset, then we'll get something like
+   // 1*sizeof(type). With target data, which we need anyway, this will get
+   // constant folded into a number.
+   const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
+   if (const SCEVConstant *ConstOffSCEV =
+         dyn_cast<SCEVConstant>(OffsetSCEV)) {
+     ConstantInt *IntOff = ConstOffSCEV->getValue();
+     int64_t Offset = IntOff->getSExtValue();
+     if (isa<GetElementPtrInst>(I)) {
+       // Index difference, not a byte distance: report it unscaled.
+       OffsetInElmts = Offset;
+       return (abs64(Offset)) > 1;
+     }
+     Type *VTy = cast<PointerType>(IPtr->getType())->getElementType();
+     int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy);
+
+     Type *VTy2 = cast<PointerType>(JPtr->getType())->getElementType();
+     if (VTy != VTy2 && Offset < 0) {
+       // J comes first in memory and has a different pointee type, so the
+       // distance is measured in units of J's type.
+       int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2);
+       OffsetInElmts = Offset/VTy2TSS;
+       return (abs64(Offset) % VTy2TSS) == 0;
+     }
+     OffsetInElmts = Offset/VTyTSS;
+
+     return (abs64(Offset) % VTyTSS) == 0;
+   }
+   return false;
+ }
+
+ // Tells whether the provided call is a call to an intrinsic that this pass
+ // may vectorize. The math intrinsics (sqrt, powi, sin, cos, log, log2,
+ // log10, exp, exp2, pow) are accepted unless NoMath is set; fma is accepted
+ // unless NoFMA is set. Indirect calls, calls to ordinary functions and all
+ // other intrinsics are rejected.
+ bool isVectorizableIntrinsic(CallInst* I) {
+   Function *Callee = I->getCalledFunction();
+   if (Callee == NULL)
+     return false;
+
+   const unsigned IID = Callee->getIntrinsicID();
+   if (IID == Intrinsic::fma)
+     return !NoFMA;
+
+   const bool isMathIntrinsic =
+     IID == Intrinsic::sqrt || IID == Intrinsic::powi ||
+     IID == Intrinsic::sin || IID == Intrinsic::cos ||
+     IID == Intrinsic::log || IID == Intrinsic::log2 ||
+     IID == Intrinsic::log10 || IID == Intrinsic::exp ||
+     IID == Intrinsic::exp2 || IID == Intrinsic::pow;
+
+   return isMathIntrinsic && !NoMath;
+ }
+
+ // Searches the half-open multimap range [PairRange.first, PairRange.second)
+ // for an entry whose mapped value equals J and returns true if one exists.
+ template <typename V>
+ bool isSecondInIteratorPair(V J, std::pair<
+     typename std::multimap<V, V>::iterator,
+     typename std::multimap<V, V>::iterator> PairRange) {
+   typedef typename std::multimap<V, V>::iterator MapIter;
+
+   MapIter Cur = PairRange.first;
+   while (Cur != PairRange.second && !(Cur->second == J))
+     ++Cur;
+
+   return Cur != PairRange.second;
+ }
+ };
+ // Removes duplicated instructions from the basic block.
+ //
+ // In some cases instructions are not combined correctly by previous
+ // passes. For example, with a large number of replicated work items a
+ // scalar load of a constant happened for the first work item and then the
+ // exact same load again in the 15th and 30th work item, while the work
+ // items in between reused the previous value. Vectorization also leads to
+ // situations where a scalar value has to be replicated to create a vector,
+ // but a separate vector is created each time the value is used.
+ //
+ // This is fixed by searching for instructions that are exactly identical
+ // (same operation, same type, same operands), erasing the later one and
+ // replacing all of its uses with the earlier one. Allocas are never merged.
+ //
+ // Returns true if at least one instruction was removed.
+ //
+ // NOTE(review): identity is checked with isIdenticalTo() only; nothing
+ // here checks for intervening writes between two identical memory
+ // operations - confirm this is safe for the inputs this pass sees.
+ bool WIVectorize::removeDuplicates(BasicBlock &BB) {
+   bool removedAny = false;
+   BasicBlock::iterator End = BB.end();
+
+   for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != End; ++I) {
+     if (isa<AllocaInst>(I))
+       continue;
+
+     BasicBlock::iterator J = llvm::next(I);
+     while (J != End) {
+       // Step past the candidate first so that erasing it below does not
+       // invalidate the iterator used to continue the scan.
+       BasicBlock::iterator Dup = J++;
+       if (!I->isIdenticalTo(Dup))
+         continue;
+
+       Dup->replaceAllUsesWith(I);
+       AA->replaceWithNewValue(Dup, I);
+       SE->forgetValue(Dup);
+       Dup->eraseFromParent();
+       removedAny = true;
+     }
+   }
+
+   return removedAny;
+ }
+ // Replaces groups of scalar phi nodes with a single vector phi node.
+ //
+ // Two phi nodes belong to the same group when they have the same number of
+ // incoming values, the same incoming blocks, and for every incoming edge
+ // either both values were extracted from the same vector (as recorded in
+ // storedSources) or both values are the very same constant. A group is
+ // only replaced when it contains exactly VectorWidth phi nodes.
+ //
+ // For each such group a vector phi is created whose incoming values are
+ // the recorded source vectors (or a splat of the constant). Every scalar
+ // phi is then replaced by an extractelement from the vector phi, so that a
+ // later instruction combiner can fold the extractelement/insertelement
+ // pairs and use the vector directly.
+ //
+ // Always returns true.
+ bool WIVectorize::vectorizePhiNodes(BasicBlock &BB) {
+   BasicBlock::iterator Start = BB.begin();
+   BasicBlock::iterator End = BB.getFirstInsertionPt();
+
+   ValueVectorMap valueMap;
+   for (BasicBlock::iterator I = Start; I != End; ++I) {
+     PHINode* node = dyn_cast<PHINode>(I);
+     if (!node)
+       continue;
+
+     ValueVector* candidateVector = new ValueVector;
+     for (BasicBlock::iterator J = llvm::next(I); J != End; ++J) {
+       PHINode* node2 = dyn_cast<PHINode>(J);
+       if (!node2)
+         continue;
+       if (node->getNumIncomingValues() != node2->getNumIncomingValues())
+         continue;
+
+       bool match = true;
+       for (unsigned int i = 0; i < node->getNumIncomingValues(); i++) {
+         Value* v1 = node->getIncomingValue(i);
+         Value* v2 = node2->getIncomingValue(i);
+         if (node->getIncomingBlock(i) != node2->getIncomingBlock(i)) {
+           match = false;
+         }
+         // storedSources maps a value to the original vector it was
+         // extracted from. Both incoming values have to originate from
+         // the same vector.
+         DenseMap<Value*, Value*>::iterator vi = storedSources.find(v1);
+         if (vi != storedSources.end()) {
+           DenseMap<Value*, Value*>::iterator ji = storedSources.find(v2);
+           if (ji == storedSources.end() ||
+               (*vi).second != (*ji).second) {
+             match = false;
+           }
+         } else {
+           // An incoming value can also be a constant. The vector phi
+           // built below splats the constant of the first node over all
+           // lanes, so both phis have to use the very same constant.
+           // LLVM constants are uniqued, hence pointer equality is a
+           // value comparison.
+           Constant* const1 = dyn_cast<Constant>(v1);
+           Constant* const2 = dyn_cast<Constant>(v2);
+           if (!const1 || !const2 || const1 != const2) {
+             match = false;
+           }
+         }
+       }
+       if (match)
+         candidateVector->push_back(node2);
+     }
+
+     if (candidateVector->size() == VectorWidth -1) {
+       Value* newV = cast<Value>(node);
+       valueMap[newV] = candidateVector;
+     } else {
+       // Not a complete group: nothing will refer to this list again.
+       delete candidateVector;
+     }
+   }
+
+   // Actually create the new phi nodes.
+   for (DenseMap<Value*, ValueVector*>::iterator entry =
+        valueMap.begin(); entry != valueMap.end(); entry++) {
+     ValueVector& v = *(*entry).second;
+     PHINode* orig = cast<PHINode>((*entry).first);
+     Type *IType = orig->getType();
+     Type *VType = getVecTypeForVector(IType);
+     PHINode* phi = PHINode::Create(VType, orig->getNumIncomingValues(),
+                                    getReplacementName(orig, false,0), orig);
+
+     // Add the incoming values to the vector phi node.
+     for (unsigned int k = 0; k < orig->getNumIncomingValues(); k++) {
+       Value* inc = orig->getIncomingValue(k);
+       BasicBlock* incBB = orig->getIncomingBlock(k);
+       DenseMap<Value*, Value*>::iterator iter = storedSources.find(inc);
+       if (iter != storedSources.end()) {
+         phi->addIncoming((*iter).second, incBB);
+       } else {
+         Constant* origConst = cast<Constant>(inc);
+         Constant* cons = ConstantVector::getSplat(VectorWidth, origConst);
+         phi->addIncoming(cons, incBB);
+       }
+     }
+
+     // Extract scalar values from the phi node to be used in the body of
+     // the basic block. Replacing their uses lets the instruction combiner
+     // find extractelement -> insertelement pairs and drop them, leaving
+     // a direct use of the vector.
+     LLVMContext& Context = BB.getContext();
+     BasicBlock::iterator toFill = BB.getFirstInsertionPt();
+     int index = 0;
+
+     // Find, from the users of the original phi node, the position at
+     // which it is inserted into a vector before being used by a vector
+     // instruction. The value has to be extracted from the same position
+     // of the vector phi node.
+     Instruction::use_iterator useiter = orig->use_begin();
+     while (useiter != orig->use_end()) {
+       llvm::User* tmp = *useiter;
+       if (isa<InsertElementInst>(tmp)) {
+         Value* in = tmp->getOperand(2);
+         if (isa<ConstantInt>(in)) {
+           index = cast<ConstantInt>(in)->getZExtValue();
+           break;
+         }
+       }
+       useiter++;
+     }
+
+     Value *X = ConstantInt::get(Type::getInt32Ty(Context), index);
+     Instruction* other = ExtractElementInst::Create(phi, X,
+                                   getReplacementName(phi, false, 0));
+     // Place the extract in front of the first non-phi instruction so that
+     // it is defined before every instruction that may use it.
+     other->insertBefore(toFill);
+     orig->replaceAllUsesWith(other);
+     AA->replaceWithNewValue(orig, other);
+     SE->forgetValue(orig);
+     orig->eraseFromParent();
+
+     Instruction* ins = other;
+     for (unsigned int k = 0; k < v.size(); k++) {
+       Instruction* tmp = cast<Instruction>(v[k]);
+       // Same lane lookup as above, for the next phi of the group.
+       // NOTE(review): if no insertelement user is found, index keeps the
+       // value of the previous phi - confirm this is intended.
+       Instruction::use_iterator ui = tmp->use_begin();
+       while (ui != tmp->use_end()) {
+         llvm::User* user = *ui;
+         if (isa<InsertElementInst>(user)) {
+           Value* in = user->getOperand(2);
+           if (isa<ConstantInt>(in)) {
+             index = cast<ConstantInt>(in)->getZExtValue();
+             break;
+           }
+         }
+         ui++;
+       }
+       X = ConstantInt::get(Type::getInt32Ty(Context), index);
+       Instruction* lane = ExtractElementInst::Create(phi, X,
+                                   getReplacementName(phi, false, index));
+       lane->insertAfter(ins);
+
+       tmp->replaceAllUsesWith(lane);
+       AA->replaceWithNewValue(tmp, lane);
+       SE->forgetValue(tmp);
+       tmp->eraseFromParent();
+       ins = lane;
+     }
+
+     // The candidate list was allocated above and is owned by this
+     // function only.
+     delete (*entry).second;
+   }
+   return true;
+ }
+ // Runs one vectorization iteration over the provided basic block and
+ // returns true if the block was changed. The iteration consists of:
+ //  1. collecting the candidate pairs and the pairable instructions;
+ //  2. connecting the pairs: two pairs are connected iff the second pair
+ //     uses the first, which defines a (directed) forest over the pairs. It
+ //     only matters that both members of the second pair use some element
+ //     of the first pair (to allow for splatting);
+ //  3. building the dependency map of the pairable instructions;
+ //  4. choosing, for each variable, the pairing with the largest tree that
+ //     meets the depth requirement on at least one branch;
+ //  5. fusing the chosen pairs into vector instructions.
+ bool WIVectorize::vectorizePairs(BasicBlock &BB) {
+   BasicBlock::iterator Start = BB.getFirstInsertionPt();
+
+   // Step 1.
+   std::vector<Value *> PairableInsts;
+   std::multimap<Value *, Value *> CandidatePairs;
+   getCandidatePairs(BB, Start, CandidatePairs, PairableInsts);
+   if (PairableInsts.empty())
+     return false;
+
+   // Step 2.
+   std::multimap<ValuePair, ValuePair> ConnectedPairs;
+   computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs);
+
+   // Step 3.
+   DenseSet<ValuePair> PairableInstUsers;
+   buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
+
+   // Step 4.
+   DenseMap<Value *, Value *> ChosenPairs;
+   choosePairs(CandidatePairs, PairableInsts, ConnectedPairs,
+               PairableInstUsers, ChosenPairs);
+   if (ChosenPairs.empty())
+     return false;
+
+   // Step 5. Each operand of a fused pair is replaced with a vector operand
+   // built from the old operands, and the replaced values are replaced with
+   // an extract of the vector result. Subsequent optimization passes should
+   // coalesce the build/extract combinations.
+   std::vector<Value *> AllPairableInsts(PairableInsts.begin(),
+                                         PairableInsts.end());
+   DenseMap<Value *, Value *> AllChosenPairs;
+   AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
+
+   NumFusedOps += AllChosenPairs.size();
+   fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs);
+
+   return true;
+ }
+
+ // Decides whether a single instruction may take part in vector fusion,
+ // looking only at the instruction's kind, types and other attributes.
+ // IsSimpleLoadStore is set to true when I is a simple load or a simple
+ // store, and to false otherwise.
+ bool WIVectorize::isInstVectorizable(Instruction *I,
+                                      bool &IsSimpleLoadStore) {
+   IsSimpleLoadStore = false;
+
+   const bool isLoad = isa<LoadInst>(I);
+   const bool isStore = isa<StoreInst>(I);
+   const bool isGEP = isa<GetElementPtrInst>(I);
+
+   if (MemOpsOnly && !isLoad && !isStore && !isGEP)
+     return false;
+
+   // Per-kind restrictions.
+   if (CallInst *Call = dyn_cast<CallInst>(I)) {
+     if (!isVectorizableIntrinsic(Call))
+       return false;
+   } else if (isLoad || isStore) {
+     // Only simple loads and stores are vectorized.
+     IsSimpleLoadStore = isLoad ? cast<LoadInst>(I)->isSimple()
+                                : cast<StoreInst>(I)->isSimple();
+     if (NoMemOps || !IsSimpleLoadStore)
+       return false;
+   } else if (CastInst *Cast = dyn_cast<CastInst>(I)) {
+     // Casts can be vectorized, but not casts from or to pointer types,
+     // and both sides have to be single value types.
+     Type *SrcTy = Cast->getSrcTy();
+     Type *DestTy = Cast->getDestTy();
+     if (!SrcTy->isSingleValueType() || SrcTy->isPointerTy() ||
+         !DestTy->isSingleValueType() || DestTy->isPointerTy())
+       return false;
+   } else if (isGEP) {
+     // Currently, vector GEPs exist only with one index.
+     if (cast<GetElementPtrInst>(I)->getNumIndices() != 1 ||
+         NoMemOps || NoGEP)
+       return false;
+   } else if (isa<CmpInst>(I)) {
+     if (NoCMP)
+       return false;
+   } else if (!I->isBinaryOp()) {
+     return false;
+   }
+
+   // Memory operations cannot be vectorized without target data.
+   if (IsSimpleLoadStore && TD == 0)
+     return false;
+
+   // For stores it is the value type, not the pointer type, that matters
+   // because the value is what will come from a vector register.
+   Type *T1 = isStore ? cast<StoreInst>(I)->getValueOperand()->getType()
+                      : I->getType();
+   Type *T2 = I->isCast() ? cast<CastInst>(I)->getSrcTy() : T1;
+
+   // Not every type can be vectorized...
+   if (!VectorType::isValidElementType(T1) && !T1->isVectorTy())
+     return false;
+   if (!VectorType::isValidElementType(T2) && !T2->isVectorTy())
+     return false;
+
+   // ... and the type must not be too wide.
+   if (T1->getPrimitiveSizeInBits() > (VectorWidth*32)/2)
+     return false;
+   if (T2->getPrimitiveSizeInBits() > (VectorWidth*32)/2)
+     return false;
+
+   // Floating point vectorization can be disabled.
+   if (NoFP && I->getType()->isFloatingPointTy())
+     return false;
+
+   // Do not vectorize pointer types. They currently do not work with
+   // LLVM 3.1.
+   if (!isGEP && (T1->getScalarType()->isPointerTy() ||
+                  T2->getScalarType()->isPointerTy()))
+     return false;
+
+   // Check whether the instruction can be a loop counter. Those are not
+   // vectorized since they have to be the same for all work items being
+   // vectorized, and the computation of load/store indexes usually depends
+   // on them. The instruction combiner pass will remove the duplicates.
+   //
+   // A loop counter instruction is used by the comparison before the branch
+   // and by the phi node. Any further use indicates that the instruction is
+   // also part of some computation and possibly needs to be vectorized.
+   if (SE->isSCEVable(I->getType()) &&
+       isa<SCEVAddRecExpr>(SE->getSCEV(I)) &&
+       I->hasNUses(2)) {
+     bool usedByCompare = false;
+     bool usedByPhi = false;
+     for (Value::use_iterator it = I->use_begin(), ie = I->use_end();
+          it != ie; it++) {
+       usedByCompare = usedByCompare || isa<CmpInst>(*it);
+       usedByPhi = usedByPhi || isa<PHINode>(*it);
+     }
+     if (usedByCompare && usedByPhi)
+       return false;
+   }
+
+   return true;
+ }
+ // Returns true if the two provided instructions are the same operation of
+ // two different work items, judged from their "wi" and "wi_counter"
+ // metadata:
+ //  - both instructions must carry "wi" metadata whose operand 2 is a node
+ //    holding the work-item coordinates in its operands 1..3 (x, y, z);
+ //  - the coordinates of I and J must differ in at least one component;
+ //  - operand 1 of the "wi_counter" metadata of both instructions must be
+ //    equal, i.e. both come from the same line of the original work item.
+ // When MemOpsOnly is set, only load/load, store/store and GEP/GEP
+ // combinations are accepted. Instructions with missing or malformed
+ // metadata are reported as incompatible.
+ bool WIVectorize::areInstsCompatibleFromDifferentWi(Instruction *I,
+                                                      Instruction *J) {
+   MDNode* mi = I->getMetadata("wi");
+   MDNode* mj = J->getMetadata("wi");
+   if (mi == NULL || mj == NULL) {
+     return false;
+   }
+   if (MemOpsOnly &&
+       !((isa<LoadInst>(I) && isa<LoadInst>(J)) ||
+         (isa<StoreInst>(I) && isa<StoreInst>(J)) ||
+         (isa<GetElementPtrInst>(I) && isa<GetElementPtrInst>(J)))) {
+     return false;
+   }
+   assert(mi->getNumOperands() == 3);
+   assert(mj->getNumOperands() == 3);
+   // The asserts vanish in release builds; never index past the operands.
+   if (mi->getNumOperands() != 3 || mj->getNumOperands() != 3)
+     return false;
+
+   // Operand 2 of the "wi" node holds the node with the XYZ triplet.
+   MDNode* iXYZ = dyn_cast_or_null<MDNode>(mi->getOperand(2));
+   MDNode* jXYZ = dyn_cast_or_null<MDNode>(mj->getOperand(2));
+   if (iXYZ == NULL || jXYZ == NULL)
+     return false;
+   assert(iXYZ->getNumOperands() == 4);
+   assert(jXYZ->getNumOperands() == 4);
+   if (iXYZ->getNumOperands() != 4 || jXYZ->getNumOperands() != 4)
+     return false;
+
+   ConstantInt *CIX = dyn_cast_or_null<ConstantInt>(iXYZ->getOperand(1));
+   ConstantInt *CJX = dyn_cast_or_null<ConstantInt>(jXYZ->getOperand(1));
+
+   ConstantInt *CIY = dyn_cast_or_null<ConstantInt>(iXYZ->getOperand(2));
+   ConstantInt *CJY = dyn_cast_or_null<ConstantInt>(jXYZ->getOperand(2));
+
+   ConstantInt *CIZ = dyn_cast_or_null<ConstantInt>(iXYZ->getOperand(3));
+   ConstantInt *CJZ = dyn_cast_or_null<ConstantInt>(jXYZ->getOperand(3));
+
+   if (!CIX || !CJX || !CIY || !CJY || !CIZ || !CJZ)
+     return false;
+
+   if (   CIX->getValue() == CJX->getValue()
+       && CIY->getValue() == CJY->getValue()
+       && CIZ->getValue() == CJZ->getValue()) {
+     // Same work item, no vectorizing.
+     return false;
+   }
+
+   MDNode* ci = I->getMetadata("wi_counter");
+   MDNode* cj = J->getMetadata("wi_counter");
+   if (ci == NULL || cj == NULL)
+     return false;
+   if (ci->getNumOperands() < 2 || cj->getNumOperands() < 2)
+     return false;
+
+   ConstantInt *CI = dyn_cast_or_null<ConstantInt>(ci->getOperand(1));
+   ConstantInt *CJ = dyn_cast_or_null<ConstantInt>(cj->getOperand(1));
+   if (CI == NULL || CJ == NULL)
+     return false;
+
+   // A different counter means a different line in the original work item;
+   // operations that do not match are not vectorized.
+   return CI->getValue() == CJ->getValue();
+ }
+ // Reports the two types that characterise instruction I.
+ // T1 is the stored value's type for a store and the instruction's own type
+ // otherwise. T2 is the condition type for a select, the type of operand 0
+ // for a shufflevector or a comparison, the source type for a cast, and
+ // equal to T1 in every other case.
+ static inline void getInstructionTypes(Instruction *I,
+                                        Type *&T1, Type *&T2) {
+   // For stores it is the value type, not the pointer type, that matters
+   // because the value is what will come from a vector register.
+   T1 = isa<StoreInst>(I) ? cast<StoreInst>(I)->getValueOperand()->getType()
+                          : I->getType();
+
+   if (SelectInst *Sel = dyn_cast<SelectInst>(I))
+     T2 = Sel->getCondition()->getType();
+   else if (isa<ShuffleVectorInst>(I) || isa<CmpInst>(I))
+     T2 = I->getOperand(0)->getType();
+   else if (I->isCast())
+     T2 = cast<CastInst>(I)->getSrcTy();
+   else
+     T2 = T1;
+ }
+
+ // Returns true if the two provided instructions are compatible, meaning
+ // that they can be fused into a vector instruction. I must already have
+ // been determined to be vectorizable and J must not be in the use tree
+ // of I. IsSimpleLoadStore tells whether I is a simple load or store.
+ bool WIVectorize::areInstsCompatible(Instruction *I, Instruction *J,
+ bool IsSimpleLoadStore) {
+ DEBUG( if (DebugInstructionExamination) dbgs() << "WIV: looking at " << *I <<
+ " <-> " << *J << "\n");
+
+ // Loads and stores can be merged if they have different alignments,
+ // but are otherwise the same.
+ LoadInst *LI, *LJ; // NOTE(review): not referenced again in this function
+ StoreInst *SI, *SJ; // NOTE(review): not referenced again in this function
+ if (!J->isSameOperationAs(I)) {
+ return false;
+ }
+ Type *IT1, *IT2, *JT1, *JT2;
+ getInstructionTypes(I, IT1, IT2);
+ getInstructionTypes(J, JT1, JT2);
+
+ if (IsSimpleLoadStore || isa<GetElementPtrInst>(I)) { // memory access or address computation
+ Value *IPtr, *JPtr;
+ unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
+ int64_t OffsetInElmts = 0;
+ bool foundPointer =
+ getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
+ IAddressSpace, JAddressSpace, OffsetInElmts);
+ if ( foundPointer && abs64(OffsetInElmts) == 1) { // the two accesses are adjacent
+ Type *aTypeI = isa<StoreInst>(I) ?
+ cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
+ Type *aTypeJ = isa<StoreInst>(J) ?
+ cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
+ Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
+ // An aligned load or store is possible only if the instruction
+ // with the lower offset has an alignment suitable for the
+ // vector type.
+
+ unsigned BottomAlignment = IAlignment;
+ if (OffsetInElmts < 0) BottomAlignment = JAlignment; // J is the lower address
+
+ unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
+ if (AlignedOnly) {
+ if (BottomAlignment < VecAlignment) {
+ return false;
+ }
+ }
+#ifndef LLVM_3_1
+ if (VTTI) {
+ unsigned ICost = VTTI->getMemoryOpCost(I->getOpcode(), I->getType(),
+ IAlignment, IAddressSpace);
+ unsigned JCost = VTTI->getMemoryOpCost(J->getOpcode(), J->getType(),
+ JAlignment, JAddressSpace);
+ unsigned VCost = VTTI->getMemoryOpCost(I->getOpcode(), VType,
+ BottomAlignment,
+ IAddressSpace);
+ if (VCost > ICost + JCost) // the fused access is more expensive than the two scalar ones
+ return false;
+
+ // We don't want to fuse to a type that will be split, even
+ // if the two input types will also be split and there is no other
+ // associated cost.
+ unsigned VParts = VTTI->getNumberOfParts(VType);
+ if (VParts > 1)
+ return false;
+ else if (!VParts && VCost == ICost + JCost)
+ return false;
+
+ }
+#endif
+ } else if(foundPointer && abs64(OffsetInElmts)>1){ // constant stride larger than one element
+ if (isa<GetElementPtrInst>(I)) {
+ return true;
+ }
+ // Collect information on memory accesses with stride.
+ // This is not useful for anything, just to analyze the code a bit.
+ if (I->getMetadata("wi") != NULL) {
+ MDNode* md = I->getMetadata("wi");
+ MDNode* mdCounter = I->getMetadata("wi_counter");
+ MDNode* mdRegion = dyn_cast<MDNode>(md->getOperand(1));
+
+ unsigned CI =
+ cast<ConstantInt>(mdCounter->getOperand(1))->getZExtValue();
+ unsigned RI =
+ cast<ConstantInt>(mdRegion->getOperand(1))->getZExtValue();
+ std::pair<int, int> index = std::pair<int,int>(RI,CI); // key: (region, counter)
+ DenseMap<std::pair<int,int>, ValueVector*>::iterator it =
+ stridedOps.find(index);
+ ValueVector* v = NULL;
+ if (it != stridedOps.end()) {
+ v = (*it).second;
+ } else {
+ v = new ValueVector;
+ }
+ v->push_back(I);
+ v->push_back(J);
+ stridedOps.insert(
+ std::pair< std::pair<int, int>, ValueVector*>(index, v));
+ }
+ return false; // strided loads and stores are recorded but not fused
+ } else {
+ return false;
+ }
+ } else if (isa<ShuffleVectorInst>(I)) {
+ // Only merge two shuffles if they're both constant
+ return isa<Constant>(I->getOperand(2)) &&
+ isa<Constant>(J->getOperand(2));
+ // FIXME: We may want to vectorize non-constant shuffles also.
+#ifdef LLVM_3_1
+ }
+#else
+ } else if (VTTI) {
+ unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
+ unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
+ Type *VT1 = getVecTypeForPair(IT1, JT1),
+ *VT2 = getVecTypeForPair(IT2, JT2);
+ unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2);
+
+ if (VCost > ICost + JCost) {
+ return false;
+ }
+ // We don't want to fuse to a type that will be split, even
+ // if the two input types will also be split and there is no other
+ // associated cost.
+ unsigned VParts1 = VTTI->getNumberOfParts(VT1),
+ VParts2 = VTTI->getNumberOfParts(VT2);
+ if (VParts1 > 1 || VParts2 > 1)
+ return false;
+ else if ((!VParts1 || !VParts2) && VCost == ICost + JCost)
+ return false;
+
+ //CostSavings = ICost + JCost - VCost;
+ }
+#endif
+ // The powi intrinsic is special because only the first argument is
+ // vectorized, the second arguments must be equal.
+ CallInst *CI = dyn_cast<CallInst>(I);
+ Function *FI;
+ if (CI && (FI = CI->getCalledFunction()) &&
+ FI->getIntrinsicID() == Intrinsic::powi) {
+
+ Value *A1I = CI->getArgOperand(1),
+ *A1J = cast<CallInst>(J)->getArgOperand(1);
+ const SCEV *A1ISCEV = SE->getSCEV(A1I),
+ *A1JSCEV = SE->getSCEV(A1J);
+ return (A1ISCEV == A1JSCEV); // same SCEV expression for both exponents
+ }
+ return true;
+ }
+
+ // Figures out whether J uses I and, if requested, updates the users and
+ // write-set structures associated with I.
+ //
+ // Users is the set of instructions known to depend on I. J is considered
+ // a user of I when
+ //  - J is already a member of Users (for example because it belongs to a
+ //    selected pair), or
+ //  - one of J's operands is I or a member of Users, or
+ //  - J may read from memory and LoadMoveSet records I for the key J.
+ //
+ // LoadMoveSet is a previously-computed multimap whose key is the
+ // memory-based user instruction and whose value is the instruction to be
+ // compared with I. It is used instead of alias analysis because this
+ // function is called while instructions are being moved during
+ // vectorization, and the results of alias analysis are not stable during
+ // that process. In this implementation no memory dependence check is done
+ // at all when LoadMoveSet is null.
+ //
+ // If J uses I and UpdateUsers is true, J is added to Users, and to
+ // WriteSet as well when J may write to memory.
+ //
+ // Returns true if J uses I.
+ bool WIVectorize::trackUsesOfI(DenseSet<Value *> &Users,
+                                AliasSetTracker &WriteSet, Instruction *I,
+                                Instruction *J, bool UpdateUsers,
+                                std::multimap<Value *, Value *> *LoadMoveSet) {
+   // Already marked as a user?
+   bool UsesI = Users.count(J) != 0;
+
+   // Direct use, or use of something that depends on I.
+   for (User::op_iterator Op = J->op_begin(), OpEnd = J->op_end();
+        !UsesI && Op != OpEnd; ++Op) {
+     Value *V = *Op;
+     UsesI = (I == V) || Users.count(V) != 0;
+   }
+
+   // Recorded memory dependence.
+   if (!UsesI && LoadMoveSet != NULL && J->mayReadFromMemory()) {
+     VPIteratorPair JPairRange = LoadMoveSet->equal_range(J);
+     UsesI = isSecondInIteratorPair<Value*>(I, JPairRange);
+   }
+
+   if (!UsesI)
+     return false;
+
+   if (UpdateUsers) {
+     if (J->mayWriteToMemory())
+       WriteSet.add(J);
+     Users.insert(J);
+   }
+   return true;
+ }
+
+ // This function iterates over all instruction pairs in the provided
+ // basic block and collects all candidate pairs for vectorization.
+ bool WIVectorize::getCandidatePairs(BasicBlock &BB,
+ BasicBlock::iterator &Start,
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts) {
+ BasicBlock::iterator E = BB.end();
+ LLVMContext& context = BB.getContext();
+
+ if (Start == E) return false;
+
+ std::multimap<int, ValueVector*> temporary;
+ for (BasicBlock::iterator I = Start++; I != E; ++I) {
+
+ if (I->getMetadata("wi") == NULL)
+ continue;
+ bool IsSimpleLoadStore;
+ if (!isInstVectorizable(I, IsSimpleLoadStore)) {
+ continue;
+ }
+
+ MDNode* md = I->getMetadata("wi");
+ MDNode* mdCounter = I->getMetadata("wi_counter");
+ MDNode* mdRegion = dyn_cast<MDNode>(md->getOperand(1));
+
+ unsigned CI = cast<ConstantInt>(mdCounter->getOperand(1))->getZExtValue();
+ unsigned RI = cast<ConstantInt>(mdRegion->getOperand(1))->getZExtValue();
+
+ std::multimap<int,ValueVector*>::iterator itb = temporary.lower_bound(CI);
+ std::multimap<int,ValueVector*>::iterator ite = temporary.upper_bound(CI);
+ ValueVector* tmpVec = NULL;
+ while(itb != ite) {
+ if (I->isSameOperationAs(cast<Instruction>((*(*itb).second)[0]))) {
+ // Test also if instructions are from same region.
+ MDNode* tmpMD =
+ cast<Instruction>((*(*itb).second)[0])->getMetadata("wi");
+ MDNode* tmpRINode = dyn_cast<MDNode>(tmpMD->getOperand(1));
+ unsigned tmpRI =
+ cast<ConstantInt>(tmpRINode->getOperand(1))->getZExtValue();
+ if (RI == tmpRI)
+ tmpVec = (*itb).second;
+ }
+ itb++;
+ }
+ if (tmpVec == NULL) {
+ tmpVec = new ValueVector;
+ temporary.insert(std::pair<int, ValueVector*>(CI, tmpVec));
+ }
+ tmpVec->push_back(I);
+ }
+ DenseSet<Value *> Users;
+ AliasSetTracker WriteSet(*AA);
+ for (std::multimap<int, ValueVector*>::iterator insIt = temporary.begin();
+ insIt != temporary.end(); insIt++) {
+ ValueVector* tmpVec = (*insIt).second;
+ // Prevent creation of vectors shorter then the vector width in case
+ // vectorization of asymetric counters is disabled.
+ if (tmpVec->size() % 2 != 0 && NoCount) {
+ continue;
+ }
+
+ if (tmpVec->size() % 2 != 0 && !MemOpsOnly) {
+
+ // Ok, this is extremely ugly, however this code is specific for
+ // for situation where the base address of some array is computed
+ // one way and the addresses for the rest of the work items are
+ // computed other way. E.g.
+ // id_0 = x*y*z
+ // id_1 = id_0 + const
+ // id_2 = id_0 + const + const
+ // ...
+ // Therefore only applicable to add operation.
+ // It should bring some performance improvements when targetting TTA.
+
+ // NOTE: results are opposide of what is expected.
+ // With NoCount set to true, the vectorization of loop counter arithmetic
+ // operations is actually prevented. The ProgramPartitioner is assigning
+ // them to the lanes. This seems to provide better performance.
+ // With NoCount set to false, the vectorization of loop counter
+ // arithmetic is allowed, creating better bitcode, but when mapped
+ // to TTA, performance is much worse.
+
+ Instruction* tmp = cast<Instruction>((*tmpVec)[0]);
+ if ( !(tmpVec->size() == 1 ||
+ tmp->getType()->isVectorTy() ||
+ tmp->getOpcode() != Instruction::Add)) {
+
+ bool identity = false;
+ bool argumentOperand = false;
+ // If none of the arguments to add is constant
+ // we do not replace it with identity, neither if operand
+ // is function argument since that can be used in different
+ // blocks.
+ for (unsigned o = 0; o < tmp->getNumOperands(); ++o) {
+ if (isa<ConstantInt>(tmp->getOperand(o))) {
+ identity = true;
+ }
+ if (isa<Argument>(tmp->getOperand(o))) {
+ argumentOperand = true;
+ }
+ }
+ if (!identity || argumentOperand)
+ continue;
+
+ Instruction* K = tmp->clone();
+ if ((*tmpVec)[0]->hasName()) {
+ std::string name = (*tmpVec)[0]->getName().str() + "_temp_0";
+ K->setName(name);
+ }
+
+ if (tmp->getMetadata("wi") != NULL) {
+ MDNode* md = tmp->getMetadata("wi");
+ MDNode* xyz = dyn_cast<MDNode>(md->getOperand(2));
+ MDNode* region = dyn_cast<MDNode>(md->getOperand(1));
+ ConstantInt *CIX =
+ dyn_cast<ConstantInt>(xyz->getOperand(1));
+ ConstantInt *CIY =
+ dyn_cast<ConstantInt>(xyz->getOperand(2));
+ ConstantInt *CIZ =
+ dyn_cast<ConstantInt>(xyz->getOperand(3));
+ if (CIX->getValue() == 1) {
+ Value *v2[] = {
+ MDString::get(context, "WI_xyz"),
+ ConstantInt::get(Type::getInt32Ty(context), 0),
+ CIY,
+ CIZ};
+ MDNode* newXYZ = MDNode::get(context, v2);
+ Value *v[] = {
+ MDString::get(context, "WI_data"),
+ region,
+ newXYZ};
+ MDNode* mdNew = MDNode::get(context, v);
+ K->setMetadata("wi", mdNew);
+ K->setMetadata("wi_counter", tmp->getMetadata("wi_counter"));
+ }
+ }
+ for (unsigned o = 0; o < K->getNumOperands(); ++o) {
+ if (isa<ConstantInt>(K->getOperand(o))) {
+ K->setOperand(o,
+ ConstantInt::get(K->getOperand(o)->getType(), 0));
+ }
+ }
+
+ Value* original = NULL;
+ for (unsigned o = 0; o < K->getNumOperands(); ++o) {
+ if (!isa<PHINode>(K->getOperand(o)) &&
+ isa<Instruction>(K->getOperand(o))) {
+ original = K->getOperand(o);
+ }
+ }
+ if (original != NULL) {
+ K->insertAfter(cast<Instruction>(original));
+ std::vector<User*> usesToReplace;
+ for (Value::use_iterator it = original->use_begin();
+ it != original->use_end();
+ it++) {
+ bool usedInVec = false;
+ if (*it != K) {
+ if (!NoCount) {
+ for (unsigned int j = 0; j < tmpVec->size(); j++) {
+ if ((*it) == (*tmpVec)[j]) {
+ usedInVec = true;
+ break;
+ }
+ }
+ }
+ if (!usedInVec) {
+ usesToReplace.push_back(*it);
+ }
+ }
+ }
+ for (unsigned int j = 0; j < usesToReplace.size(); j++) {
+ usesToReplace[j]->replaceUsesOfWith(original, K);
+ }
+ } else {
+ K->insertBefore(tmp);
+ }
+ tmpVec->insert(tmpVec->begin(), K);
+ }
+ }
+
+ // Create actual candidate pairs
+ for (unsigned j = 0; j < tmpVec->size()/2; j++) {
+ Instruction* I = cast<Instruction>((*tmpVec)[2*j]);
+ Instruction* J = cast<Instruction>((*tmpVec)[2*j+1]);
+ if (!areInstsCompatibleFromDifferentWi(I,J)) continue;
+ bool IsSimpleLoadStore;
+
+ if (!isInstVectorizable(I, IsSimpleLoadStore)) {
+ break;
+ }
+
+ if (!areInstsCompatible(I, J, IsSimpleLoadStore)) {
+ break;
+ }
+
+ // Determine if J uses I, if so, exit the loop.
+ bool UsesI = trackUsesOfI(Users, WriteSet, I, J, true);
+ if (UsesI) {
+ break;
+ }
+
+ if (!PairableInsts.size() ||
+ PairableInsts[PairableInsts.size()-1] != I) {
+ PairableInsts.push_back(I);
+ }
+ CandidatePairs.insert(ValuePair(I, J));
+ }
+ }
+ return false;
+ }
+
+ // Records in ConnectedPairs every candidate pair that is connected to the
+ // pair P = <PI, PJ>: a candidate pair whose two members are both users of
+ // PI and/or PJ. Each hit is stored as the edge P -> <user, user>.
+ //
+ // Three combinations are examined, in this order:
+ //   1. one member uses PI and the other uses PJ (both orientations),
+ //   2. both members use PI (splat of the first value),
+ //   3. both members use PJ (splat of the second value).
+ void WIVectorize::computePairsConnectedTo(
+     std::multimap<Value *, Value *> &CandidatePairs,
+     std::vector<Value *>& /*PairableInsts*/,
+     std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+     ValuePair P) {
+   Value *const Lo = P.first;
+   Value *const Hi = P.second;
+
+   // Cases 1 and 2: the outer user is a user of the first member of P.
+   Value::use_iterator LoEnd = Lo->use_end();
+   for (Value::use_iterator UI = Lo->use_begin(); UI != LoEnd; ++UI) {
+     Value *LoUser = *UI;
+
+     // A pair cannot be connected to a load: its only operand is the
+     // address, which stays scalar even after vectorization.
+     if (isa<LoadInst>(LoUser))
+       continue;
+
+     // Likewise, a pair cannot be connected to a store through the store's
+     // pointer operand.
+     StoreInst *LoStore = dyn_cast<StoreInst>(LoUser);
+     if (LoStore && Lo == LoStore->getPointerOperand())
+       continue;
+
+     VPIteratorPair LoUserRange = CandidatePairs.equal_range(LoUser);
+
+     // Case 1: combine this user with every user of the second member.
+     Value::use_iterator HiEnd = Hi->use_end();
+     for (Value::use_iterator UJ = Hi->use_begin(); UJ != HiEnd; ++UJ) {
+       Value *HiUser = *UJ;
+
+       StoreInst *HiStore = dyn_cast<StoreInst>(HiUser);
+       if (HiStore && Hi == HiStore->getPointerOperand())
+         continue;
+
+       VPIteratorPair HiUserRange = CandidatePairs.equal_range(HiUser);
+
+       // Is <LoUser, HiUser> a candidate pair?
+       if (isSecondInIteratorPair<Value*>(HiUser, LoUserRange))
+         ConnectedPairs.insert(VPPair(P, ValuePair(LoUser, HiUser)));
+
+       // Is <HiUser, LoUser> a candidate pair?
+       if (isSecondInIteratorPair<Value*>(LoUser, HiUserRange))
+         ConnectedPairs.insert(VPPair(P, ValuePair(HiUser, LoUser)));
+     }
+
+     // Case 2: only the first value of P feeds both members of another
+     // pair (splatting).
+     for (Value::use_iterator UJ = Lo->use_begin(); UJ != LoEnd; ++UJ) {
+       Value *OtherLoUser = *UJ;
+
+       StoreInst *OtherStore = dyn_cast<StoreInst>(OtherLoUser);
+       if (OtherStore && Lo == OtherStore->getPointerOperand())
+         continue;
+
+       if (isSecondInIteratorPair<Value*>(OtherLoUser, LoUserRange))
+         ConnectedPairs.insert(VPPair(P, ValuePair(LoUser, OtherLoUser)));
+     }
+   }
+
+   // Case 3: only the second value of P feeds both members of another pair
+   // (splatting).
+   Value::use_iterator HiUsesEnd = Hi->use_end();
+   for (Value::use_iterator UI = Hi->use_begin(); UI != HiUsesEnd; ++UI) {
+     Value *HiUser = *UI;
+
+     if (isa<LoadInst>(HiUser))
+       continue;
+
+     StoreInst *HiStore = dyn_cast<StoreInst>(HiUser);
+     if (HiStore && Hi == HiStore->getPointerOperand())
+       continue;
+
+     VPIteratorPair HiUserRange = CandidatePairs.equal_range(HiUser);
+
+     for (Value::use_iterator UJ = Hi->use_begin(); UJ != HiUsesEnd; ++UJ) {
+       Value *OtherHiUser = *UJ;
+
+       StoreInst *OtherStore = dyn_cast<StoreInst>(OtherHiUser);
+       if (OtherStore && Hi == OtherStore->getPointerOperand())
+         continue;
+
+       if (isSecondInIteratorPair<Value*>(OtherHiUser, HiUserRange))
+         ConnectedPairs.insert(VPPair(P, ValuePair(HiUser, OtherHiUser)));
+     }
+   }
+ }
+
+ // Fills ConnectedPairs with the connections between candidate pairs. Two
+ // pairs are connected when an output of the first pair is an input to both
+ // members of the second pair. Every candidate pair whose first member is
+ // listed in PairableInsts is handed to computePairsConnectedTo().
+ void WIVectorize::computeConnectedPairs(
+     std::multimap<Value *, Value *> &CandidatePairs,
+     std::vector<Value *> &PairableInsts,
+     std::multimap<ValuePair, ValuePair> &ConnectedPairs) {
+   typedef std::multimap<Value *, Value *>::iterator CandIter;
+
+   const size_t NumInsts = PairableInsts.size();
+   for (size_t Idx = 0; Idx != NumInsts; ++Idx) {
+     // All candidate pairs that have this instruction as their first member.
+     VPIteratorPair Choices = CandidatePairs.equal_range(PairableInsts[Idx]);
+
+     CandIter Cur = Choices.first;
+     while (Cur != Choices.second) {
+       computePairsConnectedTo(CandidatePairs, PairableInsts,
+                               ConnectedPairs, *Cur);
+       ++Cur;
+     }
+   }
+
+   DEBUG(dbgs() << "WIV: found " << ConnectedPairs.size()
+                << " pair connections.\n");
+ }
+
+ // Builds the set of use tuples PairableInstUsers: <A, B> is inserted when
+ // A is a member of some candidate pair and trackUsesOfI() reports B as a
+ // user of A among the instructions that follow A in BB (i.e. B depends on
+ // the output of A).
+ void WIVectorize::buildDepMap(
+     BasicBlock &BB,
+     std::multimap<Value *, Value *> &CandidatePairs,
+     std::vector<Value *>& /*PairableInsts*/,
+     DenseSet<ValuePair> &PairableInstUsers) {
+   typedef std::multimap<Value *, Value *>::iterator CandIter;
+
+   // Every value that takes part in at least one candidate pair.
+   DenseSet<Value *> PairMembers;
+   for (CandIter CP = CandidatePairs.begin(); CP != CandidatePairs.end();
+        ++CP) {
+     PairMembers.insert(CP->first);
+     PairMembers.insert(CP->second);
+   }
+
+   // Walk the block; for each pair member, collect its users among the
+   // instructions that come after it.
+   BasicBlock::iterator BlockEnd = BB.end();
+   for (BasicBlock::iterator Cur = BB.getFirstInsertionPt(); Cur != BlockEnd;
+        ++Cur) {
+     if (PairMembers.find(Cur) != PairMembers.end()) {
+       DenseSet<Value *> FollowingUsers;
+       AliasSetTracker Writes(*AA);
+
+       BasicBlock::iterator Later = llvm::next(Cur);
+       while (Later != BlockEnd) {
+         (void) trackUsesOfI(FollowingUsers, Writes, Cur, Later);
+         ++Later;
+       }
+
+       for (DenseSet<Value *>::iterator U = FollowingUsers.begin(),
+            UEnd = FollowingUsers.end(); U != UEnd; ++U)
+         PairableInstUsers.insert(ValuePair(Cur, *U));
+     }
+   }
+ }
+
+ // Returns true when pairs P and Q are mutual users of each other: some
+ // member of Q uses a member of P and some member of P uses a member of Q
+ // (according to PairableInstUsers). Such pairs cannot be fused together.
+ //
+ // When PairableInstUserMap is non-null, the one-directional dependencies
+ // that were found are also recorded there as edges (user-of relation),
+ // unless the edge is already present.
+ bool WIVectorize::pairsConflict(ValuePair P, ValuePair Q,
+     DenseSet<ValuePair> &PairableInstUsers,
+     std::multimap<ValuePair, ValuePair> *PairableInstUserMap) {
+   Value *PMembers[2] = { P.first, P.second };
+   Value *QMembers[2] = { Q.first, Q.second };
+
+   // Probe all four member combinations in each direction.
+   bool QDependsOnP = false;
+   bool PDependsOnQ = false;
+   for (unsigned a = 0; a < 2; ++a) {
+     for (unsigned b = 0; b < 2; ++b) {
+       if (PairableInstUsers.count(ValuePair(PMembers[a], QMembers[b])))
+         QDependsOnP = true;
+       if (PairableInstUsers.count(ValuePair(QMembers[a], PMembers[b])))
+         PDependsOnQ = true;
+     }
+   }
+
+   if (PairableInstUserMap) {
+     // FIXME: The expensive part of the cycle check is not so much the cycle
+     // check itself but this edge insertion procedure. This needs some
+     // profiling and probably a different data structure (same is true of
+     // most uses of std::multimap).
+     if (PDependsOnQ) {
+       VPPIteratorPair EdgesFromQ = PairableInstUserMap->equal_range(Q);
+       const bool Known = isSecondInIteratorPair(P, EdgesFromQ);
+       if (!Known)
+         PairableInstUserMap->insert(VPPair(Q, P));
+     }
+     if (QDependsOnP) {
+       VPPIteratorPair EdgesFromP = PairableInstUserMap->equal_range(P);
+       const bool Known = isSecondInIteratorPair(Q, EdgesFromP);
+       if (!Known)
+         PairableInstUserMap->insert(VPPair(P, Q));
+     }
+   }
+
+   return QDependsOnP && PDependsOnQ;
+ }
+
+ // Walks the user graph stored in PairableInstUserMap, starting at P and
+ // following only edges into pairs contained in CurrentPairs. Returns true
+ // as soon as an edge leading back to P is found, false if the walk ends
+ // without reaching P again.
+ bool WIVectorize::pairWillFormCycle(ValuePair P,
+     std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
+     DenseSet<ValuePair> &CurrentPairs) {
+   typedef std::multimap<ValuePair, ValuePair>::iterator UserMapIter;
+
+   DEBUG(if (DebugCycleCheck)
+     dbgs() << "WIV: starting cycle check for : " << *P.first << " <-> "
+            << *P.second << "\n");
+
+   // The map contains non-direct associations as well, so remember which
+   // pairs have been visited to avoid walking them again.
+   DenseSet<ValuePair> Seen;
+   SmallVector<ValuePair, 32> Worklist;
+   Worklist.push_back(P);
+
+   while (!Worklist.empty()) {
+     ValuePair Cur = Worklist.pop_back_val();
+     Seen.insert(Cur);
+
+     DEBUG(if (DebugCycleCheck)
+       dbgs() << "WIV: cycle check visiting: " << *Cur.first << " <-> "
+              << *Cur.second << "\n");
+
+     VPPIteratorPair Edges = PairableInstUserMap.equal_range(Cur);
+     for (UserMapIter Edge = Edges.first; Edge != Edges.second; ++Edge) {
+       const ValuePair &Next = Edge->second;
+
+       if (Next == P) {
+         DEBUG(dbgs()
+           << "WIV: rejected to prevent non-trivial cycle formation: "
+           << *Edge->first.first << " <-> " << *Edge->first.second << "\n");
+         return true;
+       }
+
+       // Only continue through pairs that are currently selected and that
+       // have not been visited yet.
+       if (CurrentPairs.count(Next) && !Seen.count(Next))
+         Worklist.push_back(Next);
+     }
+   }
+
+   return false;
+ }
+
+ // Builds the initial (unpruned) tree of connected pairs rooted at pair J.
+ // On return, Tree maps every reachable pair that is still a candidate to
+ // a depth value: the pair's own accumulated depth, or the largest depth
+ // recorded for one of its children if that is greater.
+ void WIVectorize::buildInitialTreeFor(
+     std::multimap<Value *, Value *> &CandidatePairs,
+     std::vector<Value *>& /*PairableInsts*/,
+     std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+     DenseSet<ValuePair>& /*PairableInstUsers*/,
+     DenseMap<Value *, Value *>& /*ChosenPairs*/,
+     DenseMap<ValuePair, size_t> &Tree, ValuePair J) {
+   typedef std::multimap<ValuePair, ValuePair>::iterator ConnIter;
+   typedef std::multimap<Value *, Value *>::iterator CandIter;
+
+   // Explicit stack for a depth-first walk. A pair stays on the stack until
+   // all of its children have been recorded in Tree; only then is the pair
+   // itself recorded and popped.
+   SmallVector<ValuePairWithDepth, 32> Stack;
+   Stack.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
+
+   while (!Stack.empty()) {
+     // Work on a copy: pushing children below may grow the stack.
+     ValuePairWithDepth Top = Stack.back();
+
+     bool PushedChild = false;
+     size_t DeepestChild = Top.second;
+
+     VPPIteratorPair Children = ConnectedPairs.equal_range(Top.first);
+     for (ConnIter Child = Children.first; Child != Children.second;
+          ++Child) {
+       const ValuePair &ChildPair = Child->second;
+
+       // Skip children that are no longer candidate pairs.
+       VPIteratorPair Partners = CandidatePairs.equal_range(ChildPair.first);
+       CandIter Match = Partners.first;
+       while (Match != Partners.second && Match->second != ChildPair.second)
+         ++Match;
+       if (Match == Partners.second)
+         continue;
+
+       DenseMap<ValuePair, size_t>::iterator Known = Tree.find(ChildPair);
+       if (Known != Tree.end()) {
+         // Child already recorded: fold its depth into the running maximum.
+         DeepestChild = std::max(DeepestChild, Known->second);
+         continue;
+       }
+
+       // Child not recorded yet: schedule it before finishing this pair.
+       size_t ChildFactor = getDepthFactor(ChildPair.first);
+       Stack.push_back(ValuePairWithDepth(ChildPair,
+                                          Top.second + ChildFactor));
+       PushedChild = true;
+     }
+
+     if (PushedChild)
+       continue;
+
+     // All children are done: record the current pair as part of the Tree.
+     Tree.insert(ValuePairWithDepth(Top.first, DeepestChild));
+     Stack.pop_back();
+   }
+ }
+
+ // Given the initial Tree rooted at J, prune it by removing conflicting pairs (pairs
+ // that cannot be simultaneously chosen for vectorization); the surviving pairs are collected in PrunedTree.
+ void WIVectorize::pruneTreeFor(
+ std::multimap<Value *, Value *> &/*CandidatePairs*/,
+ std::vector<Value *> &/*PairableInsts*/,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<ValuePair, size_t> &Tree,
+ DenseSet<ValuePair> &PrunedTree, ValuePair J,
+ bool UseCycleCheck) {
+ SmallVector<ValuePairWithDepth, 32> Q;
+ // Worklist-driven depth-first traversal; a pair is added to PrunedTree when it is popped:
+ Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
+ do {
+ ValuePairWithDepth QTop = Q.pop_back_val();
+ PrunedTree.insert(QTop.first);
+
+ // Visit each child, pruning as necessary...
+ DenseMap<ValuePair, size_t> BestChildren; // children of QTop kept so far, with their Tree depth
+ VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first);
+ for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first;
+ K != QTopRange.second; ++K) {
+ DenseMap<ValuePair, size_t>::iterator C = Tree.find(K->second);
+ if (C == Tree.end()) continue; // connected pair is not part of the unpruned Tree
+
+ // This child is in the Tree, now we need to make sure it is the
+ // best of any conflicting children. There could be multiple
+ // conflicting children, so first, determine if we're keeping
+ // this child, then delete conflicting children as necessary.
+
+ // It is also necessary to guard against pairing-induced
+ // dependencies. Consider instructions a .. x .. y .. b
+ // such that (a,b) are to be fused and (x,y) are to be fused
+ // but a is an input to x and b is an output from y. This
+ // means that y cannot be moved after b but x must be moved
+ // after b for (a,b) to be fused. In other words, after
+ // fusing (a,b) we have y .. a/b .. x where y is an input
+ // to a/b and x is an output to a/b: x and y can no longer
+ // be legally fused. To prevent this condition, we must
+ // make sure that a child pair added to the Tree is not
+ // both an input and output of an already-selected pair.
+
+ // Pairing-induced dependencies can also form from more complicated
+ // cycles. The pair vs. pair conflicts are easy to check, and so
+ // that is done explicitly for "fast rejection", and because for
+ // child vs. child conflicts, we may prefer to keep the current
+ // pair in preference to the already-selected child.
+ DenseSet<ValuePair> CurrentPairs; // pairs handed to the cycle check below
+
+ bool CanAdd = true;
+ for (DenseMap<ValuePair, size_t>::iterator C2
+ = BestChildren.begin(), E2 = BestChildren.end();
+ C2 != E2; ++C2) {
+ if (C2->first.first == C->first.first ||
+ C2->first.first == C->first.second ||
+ C2->first.second == C->first.first ||
+ C2->first.second == C->first.second ||
+ pairsConflict(C2->first, C->first, PairableInstUsers,
+ UseCycleCheck ? &PairableInstUserMap : 0)) {
+ if (C2->second >= C->second) { // the kept child is at least as deep: it wins (ties favor the earlier child)
+ CanAdd = false;
+ break;
+ }
+
+ CurrentPairs.insert(C2->first);
+ }
+ }
+ if (!CanAdd) continue;
+
+ // Even worse, this child could conflict with another node already
+ // selected for the Tree. If that is the case, ignore this child.
+ for (DenseSet<ValuePair>::iterator T = PrunedTree.begin(),
+ E2 = PrunedTree.end(); T != E2; ++T) {
+ if (T->first == C->first.first ||
+ T->first == C->first.second ||
+ T->second == C->first.first ||
+ T->second == C->first.second ||
+ pairsConflict(*T, C->first, PairableInstUsers,
+ UseCycleCheck ? &PairableInstUserMap : 0)) {
+ CanAdd = false;
+ break;
+ }
+
+ CurrentPairs.insert(*T);
+ }
+ if (!CanAdd) continue;
+
+ // And check the queue too...
+ for (SmallVector<ValuePairWithDepth, 32>::iterator C2 = Q.begin(),
+ E2 = Q.end(); C2 != E2; ++C2) {
+ if (C2->first.first == C->first.first ||
+ C2->first.first == C->first.second ||
+ C2->first.second == C->first.first ||
+ C2->first.second == C->first.second ||
+ pairsConflict(C2->first, C->first, PairableInstUsers,
+ UseCycleCheck ? &PairableInstUserMap : 0)) {
+ CanAdd = false;
+ break;
+ }
+
+ CurrentPairs.insert(C2->first);
+ }
+ if (!CanAdd) continue;
+
+ // Last but not least, check for a conflict with any of the
+ // already-chosen pairs.
+ for (DenseMap<Value *, Value *>::iterator C2 =
+ ChosenPairs.begin(), E2 = ChosenPairs.end();
+ C2 != E2; ++C2) {
+ if (pairsConflict(*C2, C->first, PairableInstUsers,
+ UseCycleCheck ? &PairableInstUserMap : 0)) {
+ CanAdd = false;
+ break;
+ }
+
+ CurrentPairs.insert(*C2);
+ }
+ if (!CanAdd) continue;
+
+ // To check for non-trivial cycles formed by the addition of the
+ // current pair we've formed a list of all relevant pairs, now use a
+ // graph walk to check for a cycle. We start from the current pair and
+ // walk the use tree to see if we again reach the current pair. If we
+ // do, then the current pair is rejected.
+
+ // FIXME: It may be more efficient to use a topological-ordering
+ // algorithm to improve the cycle check. This should be investigated.
+ if (UseCycleCheck &&
+ pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
+ continue;
+
+ // This child can be added, but we may have chosen it in preference
+ // to an already-selected child. Check for this here, and if a
+ // conflict is found, then remove the previously-selected child
+ // before adding this one in its place.
+ for (DenseMap<ValuePair, size_t>::iterator C2
+ = BestChildren.begin(); C2 != BestChildren.end();) {
+ if (C2->first.first == C->first.first ||
+ C2->first.first == C->first.second ||
+ C2->first.second == C->first.first ||
+ C2->first.second == C->first.second ||
+ pairsConflict(C2->first, C->first, PairableInstUsers)) // no user map passed here; the first BestChildren scan above already passed it for these pairs
+ BestChildren.erase(C2++); // step the iterator past the entry before it is erased
+ else
+ ++C2;
+ }
+
+ BestChildren.insert(ValuePairWithDepth(C->first, C->second));
+ }
+
+ for (DenseMap<ValuePair, size_t>::iterator C
+ = BestChildren.begin(), E2 = BestChildren.end();
+ C != E2; ++C) {
+ size_t DepthF = getDepthFactor(C->first.first);
+ Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF)); // queue the surviving child with its accumulated depth
+ }
+ } while (!Q.empty());
+ }
+
+ // Finds the best tree of mutually-compatible connected pairs among the
+ // root pairs in ChoiceRange. For each root that does not conflict with an
+ // already-chosen pair, the tree is built, pruned and scored; BestTree,
+ // BestMaxDepth and BestEffSize are overwritten whenever a tree passes the
+ // depth requirement and has a larger effective size than the best so far.
+ void WIVectorize::findBestTreeFor(
+     std::multimap<Value *, Value *> &CandidatePairs,
+     std::vector<Value *> &PairableInsts,
+     std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+     DenseSet<ValuePair> &PairableInstUsers,
+     std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
+     DenseMap<Value *, Value *> &ChosenPairs,
+     DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
+     size_t &BestEffSize, VPIteratorPair ChoiceRange,
+     bool UseCycleCheck) {
+   // The user map is only maintained when the cycle check is enabled.
+   std::multimap<ValuePair, ValuePair> *UserMapOrNull =
+     UseCycleCheck ? &PairableInstUserMap : 0;
+
+   std::multimap<Value *, Value *>::iterator RootIt = ChoiceRange.first;
+   for (; RootIt != ChoiceRange.second; ++RootIt) {
+     const ValuePair Root(RootIt->first, RootIt->second);
+
+     // Reject roots that conflict with an already-selected pair (the
+     // reasoning is spelled out in the tree pruning code).
+     DenseSet<ValuePair> AlreadyChosen;
+     DenseMap<Value *, Value *>::iterator Chosen = ChosenPairs.begin();
+     DenseMap<Value *, Value *>::iterator ChosenEnd = ChosenPairs.end();
+     for (; Chosen != ChosenEnd; ++Chosen) {
+       if (pairsConflict(*Chosen, Root, PairableInstUsers, UserMapOrNull))
+         break;
+       AlreadyChosen.insert(*Chosen);
+     }
+     if (Chosen != ChosenEnd)
+       continue;
+
+     if (UseCycleCheck &&
+         pairWillFormCycle(Root, PairableInstUserMap, AlreadyChosen))
+       continue;
+
+     DenseMap<ValuePair, size_t> Tree;
+     buildInitialTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
+                         PairableInstUsers, ChosenPairs, Tree, Root);
+
+     // Pruning keeps the child with the largest depth, so the depth of the
+     // unpruned Tree is still valid afterwards.
+     size_t MaxDepth = Tree.lookup(Root);
+
+     DEBUG(if (DebugPairSelection) dbgs() << "WIV: found Tree for pair {"
+           << *Root.first << " <-> " << *Root.second << "} of depth " <<
+           MaxDepth << " and size " << Tree.size() << "\n");
+
+     // The Tree may still contain contradictory children (different
+     // children of one node trying to fuse the same instruction). Walk it
+     // again and, on a conflict, keep only the child with the largest
+     // depth; ties go to the first child.
+     DenseSet<ValuePair> PrunedTree;
+     pruneTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
+                  PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree,
+                  PrunedTree, Root, UseCycleCheck);
+
+     // Effective size: sum of the depth factors of the surviving pairs.
+     size_t EffSize = 0;
+     for (DenseSet<ValuePair>::iterator S = PrunedTree.begin();
+          S != PrunedTree.end(); ++S)
+       EffSize += getDepthFactor(S->first);
+
+     DEBUG(if (DebugPairSelection)
+           dbgs() << "WIV: found pruned Tree for pair {"
+           << *Root.first << " <-> " << *Root.second << "} of depth " <<
+           MaxDepth << " and size " << PrunedTree.size() <<
+           " (effective size: " << EffSize << ")\n");
+
+#if defined LLVM_3_1
+     const bool DepthOK = MaxDepth >= ReqChainDepth;
+#else
+     const bool DepthOK = VTTI || MaxDepth >= ReqChainDepth;
+#endif
+     if (DepthOK && EffSize > BestEffSize) {
+       BestMaxDepth = MaxDepth;
+       BestEffSize = EffSize;
+       BestTree = PrunedTree;
+     }
+   }
+ }
+
+ // Selects, from the candidate pairs, those that will be fused into vector
+ // instructions. For every pairable instruction the best tree is searched;
+ // its pairs are added to ChosenPairs, and every other candidate pair that
+ // shares a value with a chosen pair is removed from CandidatePairs.
+ void WIVectorize::choosePairs(
+     std::multimap<Value *, Value *> &CandidatePairs,
+     std::vector<Value *> &PairableInsts,
+     std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+     DenseSet<ValuePair> &PairableInstUsers,
+     DenseMap<Value *, Value *>& ChosenPairs) {
+   typedef std::multimap<Value *, Value *>::iterator CandIter;
+   typedef DenseSet<ValuePair>::iterator TreeIter;
+
+   bool UseCycleCheck = true;
+   std::multimap<ValuePair, ValuePair> PairableInstUserMap;
+
+   for (size_t Idx = 0, N = PairableInsts.size(); Idx != N; ++Idx) {
+     Value *Inst = PairableInsts[Idx];
+
+     // Nothing to do when no pairing is left for this instruction.
+     if (!CandidatePairs.count(Inst))
+       continue;
+
+     VPIteratorPair ChoiceRange = CandidatePairs.equal_range(Inst);
+
+     // The best pair to choose and its tree:
+     size_t BestMaxDepth = 0;
+     size_t BestEffSize = 0;
+     DenseSet<ValuePair> BestTree;
+     findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
+                     PairableInstUsers, PairableInstUserMap, ChosenPairs,
+                     BestTree, BestMaxDepth, BestEffSize, ChoiceRange,
+                     UseCycleCheck);
+
+     // If BestTree is empty, no tree was chosen and this instruction cannot
+     // be paired; it is simply not considered any further.
+
+     DEBUG(if (BestTree.size() > 0)
+           dbgs() << "WIV: selected pairs in the best tree for: "
+                  << *cast<Instruction>(Inst) << "\n");
+
+     for (TreeIter S = BestTree.begin(), SEnd = BestTree.end(); S != SEnd;
+          ++S) {
+       // Commit this member of the tree to the list of chosen pairs.
+       ChosenPairs.insert(ValuePair(S->first, S->second));
+       DEBUG(dbgs() << "WIV: selected pair: " << *S->first << " <-> " <<
+             *S->second << "\n");
+
+       // Drop every candidate pair that shares a value with the chosen
+       // pair. The chosen pair itself is kept so that it can still be used
+       // in subsequent tree selections.
+       CandIter K = CandidatePairs.begin();
+       while (K != CandidatePairs.end()) {
+         const bool SharesValue =
+           K->first == S->first || K->second == S->first ||
+           K->second == S->second || K->first == S->second;
+         const bool IsChosenPair =
+           K->first == S->first && K->second == S->second;
+
+         if (SharesValue && !IsChosenPair)
+           CandidatePairs.erase(K++);
+         else
+           ++K;
+       }
+     }
+   }
+
+   DEBUG(dbgs() << "WIV: selected " << ChosenPairs.size() << " pairs.\n");
+ }
+
+ // Creates and returns the pointer operand for the vector instruction that
+ // fuses I with J: a bitcast of the selected scalar pointer (J's pointer if
+ // FlipMemInputs is set, I's otherwise) to a pointer to the fused vector
+ // type. The bitcast is inserted before the instruction whose pointer was
+ // selected and inherits I's "wi"/"wi_counter" metadata when present.
+ Value *WIVectorize::getReplacementPointerInput(LLVMContext& /*Context*/,
+     Instruction *I, Instruction *J, unsigned o,
+     bool FlipMemInputs) {
+   Value *IPtr, *JPtr;
+   unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
+   int64_t OffsetInElmts;
+
+   // The offset analysis may fail here, which is why the pair order was
+   // precomputed by the caller; OffsetInElmts must stay unused.
+   (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
+                         IAddressSpace, JAddressSpace,
+                         OffsetInElmts);
+
+   // The pointer with the lowest offset is used as the base; the new cast
+   // is placed in front of the instruction that owns it.
+   Value *BasePtr = FlipMemInputs ? JPtr : IPtr;
+   Instruction *InsertPt = FlipMemInputs ? J : I;
+
+   // When the base is itself a bitcast, cast from its source directly.
+   if (BitCastInst *Cast = dyn_cast<BitCastInst>(BasePtr))
+     BasePtr = Cast->getOperand(0);
+
+   PointerType *IPtrTy = cast<PointerType>(IPtr->getType());
+   PointerType *JPtrTy = cast<PointerType>(JPtr->getType());
+   Type *FusedTy = getVecTypeForPair(IPtrTy->getElementType(),
+                                     JPtrTy->getElementType());
+   Type *FusedPtrTy = PointerType::get(FusedTy, IPtrTy->getAddressSpace());
+
+   BitCastInst *Fused = new BitCastInst(BasePtr, FusedPtrTy,
+                                        getReplacementName(I, true, o),
+                                        InsertPt);
+
+   // Carry over the work-item annotations of I.
+   if (MDNode *WI = I->getMetadata("wi")) {
+     Fused->setMetadata("wi", WI);
+     Fused->setMetadata("wi_counter", I->getMetadata("wi_counter"));
+   }
+   return Fused;
+ }
+
+ // Copies the first NumElem/2 mask entries of the shuffle J into
+ // Mask[MaskOffset ...]. Undefined entries stay undefined. Every other
+ // index is shifted by IdxOffset, and additionally by NumInElem when it is
+ // at or beyond NumInElem.
+ void WIVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
+     unsigned NumElem, unsigned MaskOffset, unsigned NumInElem,
+     unsigned IdxOffset, std::vector<Constant*> &Mask) {
+   const unsigned HalfElems = NumElem/2;
+   for (unsigned Lane = 0; Lane < HalfElems; ++Lane) {
+     const unsigned Slot = Lane + MaskOffset;
+     int OldIdx = cast<ShuffleVectorInst>(J)->getMaskValue(Lane);
+
+     // A negative mask value denotes an undefined lane.
+     if (OldIdx < 0) {
+       Mask[Slot] = UndefValue::get(Type::getInt32Ty(Context));
+       continue;
+     }
+
+     unsigned NewIdx = OldIdx + (int) IdxOffset;
+     if (OldIdx >= (int) NumInElem)
+       NewIdx += (int) NumInElem;
+
+     Mask[Slot] = ConstantInt::get(Type::getInt32Ty(Context), NewIdx);
+   }
+ }
+
+ // Builds the shuffle mask for the vector instruction that fuses the
+ // shuffles I and J: J's mask is appended to I's mask, with the indices
+ // adjusted by fillNewShuffleMask().
+ Value *WIVectorize::getReplacementShuffleMask(LLVMContext& Context,
+     Instruction *I, Instruction *J) {
+   // The fused result type fixes the length of the combined mask.
+   Type *FusedTy = getVecTypeForPair(I->getType(), J->getType());
+   const unsigned MaskLen = cast<VectorType>(FusedTy)->getNumElements();
+   std::vector<Constant*> Combined(MaskLen);
+
+   // Number of elements in each input vector of the original shuffle.
+   Type *InputTy = I->getOperand(0)->getType();
+   const unsigned InputLen = cast<VectorType>(InputTy)->getNumElements();
+
+   // First half: the mask taken from I.
+   fillNewShuffleMask(Context, I, MaskLen, 0, InputLen, 0, Combined);
+
+   // Second half: the mask taken from J, shifted past I's inputs.
+   fillNewShuffleMask(Context, J, MaskLen, MaskLen/2, InputLen, InputLen,
+                      Combined);
+
+   return ConstantVector::get(Combined);
+ }
+
+ // Returns the value recorded in storedSources for both I and J when the
+ // two entries exist and are identical; returns NULL otherwise.
+ Value *WIVectorize::CommonShuffleSource(Instruction *I, Instruction *J) {
+   DenseMap<Value*, Value*>::iterator SrcOfI = storedSources.find(I);
+   if (SrcOfI == storedSources.end())
+     return NULL;
+
+   DenseMap<Value*, Value*>::iterator SrcOfJ = storedSources.find(J);
+   if (SrcOfJ == storedSources.end())
+     return NULL;
+
+   if (SrcOfI->second != SrcOfJ->second)
+     return NULL;
+
+   return SrcOfI->second;
+ }
+ // Returns the value to be used as the specified operand of the vector
+ // instruction that fuses I with J.
+ Value *WIVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
+ Instruction *J, unsigned o, bool FlipMemInputs) {
+ Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
+ Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
+
+ // Compute the fused vector type for this operand
+ Type *ArgType = I->getOperand(o)->getType();
+ Type *ArgTypeJ = J->getOperand(o)->getType();
+ VectorType *VArgType = getVecTypeForPair(ArgType, ArgTypeJ);
+ Instruction *L = I, *H = J;
+ if (FlipMemInputs) {
+ L = J;
+ H = I;
+ }
+
+ if (ArgType->isVectorTy()) {
+ ShuffleVectorInst *LSV
+ = dyn_cast<ShuffleVectorInst>(L->getOperand(o));
+ ShuffleVectorInst *HSV
+ = dyn_cast<ShuffleVectorInst>(H->getOperand(o));
+ if (LSV && HSV &&
+ LSV->getOperand(0)->getType() == HSV->getOperand(0)->getType() &&
+ LSV->getOperand(1)->getType() == HSV->getOperand(1)->getType() &&
+ LSV->getOperand(2)->getType() == HSV->getOperand(2)->getType()) {
+ if (LSV->getOperand(0) == HSV->getOperand(0) &&
+ LSV->getOperand(1) == HSV->getOperand(1)) {
+ if (LSV->getOperand(2)->getType()->getVectorNumElements() ==
+ HSV->getOperand(2)->getType()->getVectorNumElements()) {
+ unsigned elems =
+ LSV->getOperand(2)->getType()->getVectorNumElements();
+ bool continous = true;
+ bool identical = true;
+ unsigned start = cast<ShuffleVectorInst>(LSV)->getMaskValue(0);
+ for (unsigned i = 0; i < elems; i++) {
+ unsigned m = cast<ShuffleVectorInst>(LSV)->getMaskValue(i);
+ if (m != i)
+ continous = false;
+ if (m != start)
+ identical = false;
+ unsigned n = cast<ShuffleVectorInst>(HSV)->getMaskValue(i);
+ if (n != i + elems)
+ continous = false;
+ if (n != start)
+ identical = false;
+ }
+ // This is the case where both sources come from same value and
+ // are in order. e.g. 0,1,2,3,4,5,6,7, as produced when
+ // replacing outputs of vector operation.
+ if (continous && VArgType->getVectorNumElements() == elems*2) {
+ return LSV->getOperand(0);
+ }
+ // This is case where single value of input vector is replicated
+ // to whole output. Eventually should turn to buildvector MI.
+ if (identical) {
+ unsigned numElem =
+ cast<VectorType>(VArgType)->getNumElements();
+ std::vector<Constant*> Mask(numElem);
+ for (unsigned v = 0; v < numElem; ++v)
+ Mask[v] =
+ ConstantInt::get(Type::getInt32Ty(Context), start);
+
+ Instruction *BV = new ShuffleVectorInst(
+ (start < numElem/2) ?
+ LSV->getOperand(0):
+ LSV->getOperand(1),
+ UndefValue::get(LSV->getOperand(0)->getType()),
+ ConstantVector::get(Mask),
+ getReplacementName(I, true, o));
+ if (LSV->getMetadata("wi") != NULL) {
+ BV->setMetadata("wi", LSV->getMetadata("wi"));
+ BV->setMetadata("wi_counter", LSV->getMetadata("wi_counter"));
+ }
+ BV->insertBefore(J);
+ return BV;
+ }
+ }
+ }
+#if 0
+ // This was made obsolete by the test for continuity of shuffle indexes above
+ // and should be removed after further tests for performance degradation.
+ Value* res = CommonShuffleSource(LSV, HSV);
+ if (res &&
+ res->getType()->getVectorNumElements() ==
+ VArgType->getVectorNumElements()) {
+ return res;
+ }
+#endif
+ }
+ InsertElementInst *LIN
+ = dyn_cast<InsertElementInst>(L->getOperand(o));
+ InsertElementInst *HIN
+ = dyn_cast<InsertElementInst>(H->getOperand(o));
+
+ unsigned numElem = cast<VectorType>(VArgType)->getNumElements();
+ if (LIN && HIN) {
+ Instruction *newIn = InsertElementInst::Create(
+ UndefValue::get(VArgType),
+ LIN->getOperand(1),
+ LIN->getOperand(2),
+ getReplacementName(I, true, o, 1));
+ if (I->getMetadata("wi")) {
+ newIn->setMetadata("wi", I->getMetadata("wi"));
+ newIn->setMetadata("wi_counter", I->getMetadata("wi_counter"));
+ }
+ newIn->insertBefore(J);
+
+ LIN = dyn_cast<InsertElementInst>(LIN->getOperand(0));
+ int counter = 2;
+ int rounds = 0;
+ while (rounds < 2) {
+ while(LIN) {
+ unsigned Indx = cast<ConstantInt>(LIN->getOperand(2))->getZExtValue();
+ Indx += rounds * (numElem/2);
+ Value *newIndx = ConstantInt::get(Type::getInt32Ty(Context), Indx);
+ newIn = InsertElementInst::Create(
+ newIn,
+ LIN->getOperand(1),
+ newIndx,
+ getReplacementName(I, true, o ,counter));
+ counter++;
+ if (I->getMetadata("wi")) {
+ newIn->setMetadata("wi", I->getMetadata("wi"));
+ newIn->setMetadata("wi_counter", I->getMetadata("wi_counter"));
+ }
+ newIn->insertBefore(J);
+ LIN = dyn_cast<InsertElementInst>(LIN->getOperand(0));
+ }
+ rounds ++;
+ LIN = HIN;
+ }
+ return newIn;
+
+ }
+ std::vector<Constant*> Mask(numElem);
+ for (unsigned v = 0; v < numElem; ++v)
+ Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+
+ Instruction *BV = new ShuffleVectorInst(L->getOperand(o),
+ H->getOperand(o),
+ ConstantVector::get(Mask),
+ getReplacementName(I, true, o));
+ if (L->getMetadata("wi") != NULL) {
+ BV->setMetadata("wi", L->getMetadata("wi"));
+ BV->setMetadata("wi_counter", L->getMetadata("wi_counter"));
+ }
+ BV->insertBefore(J);
+ return BV;
+ }
+
+ // If these two inputs are the output of another vector instruction,
+ // then we should use that output directly. It might be necessary to
+ // permute it first. [When pairings are fused recursively, you can
+ // end up with cases where a large vector is decomposed into scalars
+ // using extractelement instructions, then built into size-2
+ // vectors using insertelement and then into larger vectors using
+ // shuffles. InstCombine does not simplify all of these cases well,
+ // and so we make sure that shuffles are generated here when possible.]
+ ExtractElementInst *LEE
+ = dyn_cast<ExtractElementInst>(L->getOperand(o));
+ ExtractElementInst *HEE
+ = dyn_cast<ExtractElementInst>(H->getOperand(o));
+
+ if (LEE && HEE &&
+ LEE->getOperand(0)->getType() == HEE->getOperand(0)->getType()) {
+ VectorType *EEType = cast<VectorType>(LEE->getOperand(0)->getType());
+ unsigned LowIndx = cast<ConstantInt>(LEE->getOperand(1))->getZExtValue();
+ unsigned HighIndx = cast<ConstantInt>(HEE->getOperand(1))->getZExtValue();
+ if (LEE->getOperand(0) == HEE->getOperand(0)) {
+ if (LowIndx == 0 && HighIndx == 1)
+ return LEE->getOperand(0);
+
+ std::vector<Constant*> Mask(2);
+ Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx);
+ Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx);
+
+ Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0),
+ UndefValue::get(EEType),
+ ConstantVector::get(Mask),
+ getReplacementName(I, true, o));
+ if (I->getMetadata("wi") != NULL) {
+ BV->setMetadata("wi", I->getMetadata("wi"));
+ BV->setMetadata("wi_counter", I->getMetadata("wi_counter"));
+ }
+ BV->insertBefore(J);
+ return BV;
+ }
+
+ std::vector<Constant*> Mask(2);
+ HighIndx += EEType->getNumElements();
+ Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx);
+ Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx);
+
+ Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0),
+ HEE->getOperand(0),
+ ConstantVector::get(Mask),
+ getReplacementName(I, true, o));
+ if (I->getMetadata("wi") != NULL) {
+ BV->setMetadata("wi", I->getMetadata("wi"));
+ BV->setMetadata("wi_counter", I->getMetadata("wi_counter"));
+ }
+ BV->insertBefore(J);
+ return BV;
+ }
+
+ Instruction *BV1 = InsertElementInst::Create(
+ UndefValue::get(VArgType),
+ L->getOperand(o), CV0,
+ getReplacementName(I, true, o, 1));
+ if (I->getMetadata("wi") != NULL) {
+ BV1->setMetadata("wi", I->getMetadata("wi"));
+ BV1->setMetadata("wi_counter", I->getMetadata("wi_counter"));
+ }
+
+ BV1->insertBefore(I);
+
+ Instruction *BV2 = InsertElementInst::Create(BV1, H->getOperand(o),
+ CV1,
+ getReplacementName(I, true, o, 2));
+ if (J->getMetadata("wi") != NULL) {
+ BV2->setMetadata("wi",J->getMetadata("wi"));
+ BV2->setMetadata("wi_counter",J->getMetadata("wi_counter"));
+ }
+ BV2->insertBefore(J);
+ return BV2;
+ }
+
+ // This function creates an array of values that will be used as the inputs
+ // to the vector instruction that fuses I with J.
+ void WIVectorize::getReplacementInputsForPair(LLVMContext& Context,
+                      Instruction *I, Instruction *J,
+                      SmallVector<Value *, 3> &ReplacedOperands,
+                      bool FlipMemInputs) {
+   // Fills ReplacedOperands[o], for every operand slot o of I, with the value
+   // that the instruction fusing I and J should use in that slot.
+   const unsigned OperandCount = I->getNumOperands();
+
+   // Walk the operand slots from the last one down to slot 0 so that the
+   // store pointer is looked at first and we know whether or not the inputs
+   // need to be flipped.
+   for (unsigned o = OperandCount; o-- > 0; ) {
+     const bool IsLastOperand = (o == OperandCount - 1);
+
+     // Pointer operand of a load/store instruction.
+     if (isa<LoadInst>(I) || (isa<StoreInst>(I) && o == 1)) {
+       ReplacedOperands[o] =
+         getReplacementPointerInput(Context, I, J, o, FlipMemInputs);
+       continue;
+     }
+
+     if (isa<CallInst>(I)) {
+       Function *Callee = cast<CallInst>(I)->getCalledFunction();
+       unsigned IntrinsicId = Callee->getIntrinsicID();
+
+       if (IsLastOperand) {
+         // The last slot receives the declaration of the same intrinsic,
+         // specialised for the vector type of the pair.
+         Module *M = I->getParent()->getParent()->getParent();
+         Type *VArgType = getVecTypeForPair(I->getType(), J->getType());
+
+         // FIXME: is it safe to do this here?
+         ReplacedOperands[o] =
+           Intrinsic::getDeclaration(M, (Intrinsic::ID) IntrinsicId, VArgType);
+         continue;
+       }
+
+       if (IntrinsicId == Intrinsic::powi && o == 1) {
+         // The second argument of powi is a single integer and we've already
+         // checked that both arguments are equal, so I's own argument is
+         // simply reused.
+         ReplacedOperands[o] = I->getOperand(o);
+         continue;
+       }
+     } else if (isa<ShuffleVectorInst>(I) && IsLastOperand) {
+       // The last operand of a shuffle is its mask.
+       ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
+       continue;
+     }
+
+     // Every remaining operand is combined by the generic input builder.
+     ReplacedOperands[o] =
+       getReplacementInput(Context, I, J, o, FlipMemInputs);
+   }
+ }
+ // As with the aliasing information, SCEV can also change because of
+ // vectorization. This information is used to compute relative pointer
+ // offsets; the necessary information will be cached here prior to
+ // fusion.
+ void WIVectorize::collectPtrInfo(std::vector<Value *> &PairableInsts,
+                      DenseMap<Value *, Value *> &ChosenPairs,
+                      DenseSet<Value *> &LowPtrInsts) {
+   // For every chosen pair whose first member is a load, a store or a
+   // getelementptr, run the pointer analysis now and remember in LowPtrInsts
+   // which member of the pair is treated as the low one: I when the reported
+   // element offset is positive, J otherwise.
+   //
+   // PairableInsts - candidate instructions, visited in order.
+   // ChosenPairs   - maps the first member of each chosen pair to the second.
+   // LowPtrInsts   - output set receiving the low member of each pair.
+   for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
+        PIE = PairableInsts.end(); PI != PIE; ++PI) {
+     DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
+     if (P == ChosenPairs.end()) continue;
+
+     Instruction *I = cast<Instruction>(P->first);
+     Instruction *J = cast<Instruction>(P->second);
+
+     if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<GetElementPtrInst>(I))
+       continue;
+
+     Value *IPtr, *JPtr;
+     unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
+     // Initialised because, for getelementptr pairs, a failed analysis is
+     // tolerated below and the offset is still read afterwards; it must not
+     // be left indeterminate on that path.
+     int64_t OffsetInElmts = 0;
+     if (!getPairPtrInfo(
+           I, J, IPtr, JPtr, IAlignment, JAlignment, IAddressSpace,
+           JAddressSpace, OffsetInElmts) || abs64(OffsetInElmts) != 1) {
+       // Loads and stores must have been analysable (with an offset of
+       // exactly one element) to have been paired at all.
+       if (!isa<GetElementPtrInst>(I))
+         llvm_unreachable("Pre-fusion pointer analysis failed");
+     }
+     Value *LowPI = (OffsetInElmts > 0) ? I : J;
+     LowPtrInsts.insert(LowPI);
+   }
+ }
+
+ // This function creates two values that represent the outputs of the
+ // original I and J instructions. These are generally vector shuffles
+ // or extracts. In many cases, these will end up being unused and, thus,
+ // eliminated by later passes.
+ void WIVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
+ Instruction *J, Instruction *K,
+ Instruction *&InsertionPt,
+ Instruction *&K1, Instruction *&K2,
+ bool FlipMemInputs) {
+ // I, J          - the two original instructions of the pair.
+ // K             - the fused instruction that replaces them.
+ // InsertionPt   - set to K2 when replacement values are created;
+ //                 left untouched for stores.
+ // K1, K2        - receive the values standing in for I's and J's results;
+ //                 left untouched for stores.
+ // FlipMemInputs - when true, K1 is taken from the high half/lane of K and
+ //                 K2 from the low one.
+ Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
+ Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
+
+ if (isa<StoreInst>(I)) {
+ // Stores produce no value: only the alias analysis is told about K.
+ AA->replaceWithNewValue(I, K);
+ AA->replaceWithNewValue(J, K);
+ } else {
+ Type *IType = I->getType();
+ Type *JType = J->getType();
+
+ VectorType *VType = getVecTypeForPair(IType, JType);
+
+ if (IType->isVectorTy()) {
+ // Vector results: K1/K2 are shuffles picking one half of K each.
+ // Mask1 selects lanes [0, numElem), Mask2 lanes [numElem, 2*numElem).
+ // NOTE(review): numElem comes from IType only; this presumably relies
+ // on J having the same element count - confirm.
+ unsigned numElem = cast<VectorType>(IType)->getNumElements();
+ std::vector<Constant*> Mask1(numElem), Mask2(numElem);
+ for (unsigned v = 0; v < numElem; ++v) {
+ Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+ Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElem+v);
+ }
+
+ K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
+ ConstantVector::get(
+ FlipMemInputs ? Mask2 : Mask1),
+ getReplacementName(K, false, 1));
+ K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
+ ConstantVector::get(
+ FlipMemInputs ? Mask1 : Mask2),
+ getReplacementName(K, false, 2));
+ // Record both directions of the relation between K and its two halves.
+ // Whatever FlipMemInputs is, both K1 and K2 end up inserted; only the
+ // insertion order differs.
+ storedSources.insert(ValuePair(FlipMemInputs ? K1 : K2, K));
+ storedSources.insert(ValuePair(FlipMemInputs ? K2 : K1, K));
+ flippedStoredSources.insert(ValuePair(K, FlipMemInputs ? K1 : K2));
+ flippedStoredSources.insert(ValuePair(K, FlipMemInputs ? K2 : K1));
+ // L/H: the pair members in low/high order after applying the flip.
+ Instruction* L = I;
+ Instruction* H = J;
+ if (FlipMemInputs) {
+ L = J;
+ H = I;
+ }
+ // Everything previously recorded under L is re-pointed at K, after
+ // which L's own entries are dropped from both containers.
+ VPIteratorPair v1 =
+ flippedStoredSources.equal_range(L);
+ for (std::multimap<Value*, Value*>::iterator ii = v1.first;
+ ii != v1.second; ii++) {
+ storedSources.erase((*ii).second);
+ storedSources.insert(ValuePair((*ii).second,K));
+ flippedStoredSources.insert(ValuePair(K, (*ii).second));
+ storedSources.erase(L);
+ }
+ flippedStoredSources.erase(L);
+ // Same re-pointing for everything previously recorded under H.
+ VPIteratorPair v2 = flippedStoredSources.equal_range(H);
+ for (std::multimap<Value*, Value*>::iterator ji = v2.first;
+ ji != v2.second; ji++) {
+ storedSources.erase((*ji).second);
+ storedSources.insert(ValuePair((*ji).second,K));
+ flippedStoredSources.insert(ValuePair(K, (*ji).second));
+ storedSources.erase(H);
+ }
+ flippedStoredSources.erase(H);
+ } else {
+ // Scalar results: K1/K2 are extractelement of lane 0 and lane 1 of K
+ // (swapped when FlipMemInputs is set).
+ K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0,
+ getReplacementName(K, false, 1));
+ K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1,
+ getReplacementName(K, false, 2));
+ storedSources.insert(ValuePair(K1,K));
+ storedSources.insert(ValuePair(K2,K));
+ flippedStoredSources.insert(ValuePair(K, K1));
+ flippedStoredSources.insert(ValuePair(K, K2));
+ }
+ // K1 inherits the "wi"/"wi_counter" metadata of I, K2 that of J.
+ if (I->getMetadata("wi") != NULL) {
+ K1->setMetadata("wi", I->getMetadata("wi"));
+ K1->setMetadata("wi_counter", I->getMetadata("wi_counter"));
+ }
+ if (J->getMetadata("wi") != NULL) {
+ K2->setMetadata("wi", J->getMetadata("wi"));
+ K2->setMetadata("wi_counter", J->getMetadata("wi_counter"));
+ }
+
+ // Resulting order in the block: K, K1, K2.
+ K1->insertAfter(K);
+ K2->insertAfter(K1);
+ InsertionPt = K2;
+ }
+ }
+
+ // Move all uses of the function I (including pairing-induced uses) after J.
+ void WIVectorize::moveUsesOfIAfterJ(BasicBlock &/*BB*/,
+                      std::multimap<Value *, Value *> &LoadMoveSet,
+                      Instruction *&InsertionPt,
+                      Instruction *I, Instruction *J) {
+   // Scans the instructions strictly between I and J. Each one that
+   // trackUsesOfI() reports is unlinked and re-inserted right after
+   // InsertionPt, which then advances to the moved instruction so that the
+   // relative order of the moved instructions is kept.
+   DenseSet<Value *> Users;
+   AliasSetTracker WriteSet(*AA);
+
+   // Start at the first instruction past I.
+   BasicBlock::iterator Cursor = llvm::next(BasicBlock::iterator(I));
+   while (cast<Instruction>(Cursor) != J) {
+     BasicBlock::iterator Current = Cursor;
+     const bool MustMove =
+       trackUsesOfI(Users, WriteSet, I, Current, true, &LoadMoveSet);
+
+     // Step past the instruction before it is possibly unlinked.
+     ++Cursor;
+     if (!MustMove)
+       continue;
+
+     Instruction *Moved = Current;
+     Moved->removeFromParent();
+     Moved->insertAfter(InsertionPt);
+     InsertionPt = Moved;
+   }
+ }
+
+
+ // Collect all load instruction that are in the move set of a given first
+ // pair member. These loads depend on the first instruction, I, and so need
+ // to be moved after J (the second instruction) when the pair is fused.
+ void WIVectorize::collectPairLoadMoveSet(BasicBlock &BB,
+                      DenseMap<Value *, Value *> &/*ChosenPairs*/,
+                      std::multimap<Value *, Value *> &LoadMoveSet,
+                      Instruction *I) {
+   // Adds an entry (X, I) to LoadMoveSet for every instruction X after I in
+   // BB that trackUsesOfI() reports and that may read from memory.
+   DenseSet<Value *> Users;
+   AliasSetTracker WriteSet(*AA);
+
+   // The scan deliberately runs to the end of the block rather than stopping
+   // at J: J could be moved farther down the use chain by another pairing,
+   // and J could even precede I if this is an inverted input.
+   BasicBlock::iterator Cursor = llvm::next(BasicBlock::iterator(I));
+   for (BasicBlock::iterator BlockEnd = BB.end();
+        cast<Instruction>(Cursor) != BlockEnd; ++Cursor) {
+     if (!trackUsesOfI(Users, WriteSet, I, Cursor))
+       continue;
+     if (!Cursor->mayReadFromMemory())
+       continue;
+     LoadMoveSet.insert(ValuePair(Cursor, I));
+   }
+ }
+
+ // In cases where both load/stores and the computation of their pointers
+ // are chosen for vectorization, we can end up in a situation where the
+ // aliasing analysis starts returning different query results as the
+ // process of fusing instruction pairs continues. Because the algorithm
+ // relies on finding the same use trees here as were found earlier, we'll
+ // need to precompute the necessary aliasing information here and then
+ // manually update it during the fusion process.
+ void WIVectorize::collectLoadMoveSet(BasicBlock &BB,
+                      std::vector<Value *> &PairableInsts,
+                      DenseMap<Value *, Value *> &ChosenPairs,
+                      std::multimap<Value *, Value *> &LoadMoveSet) {
+   // Runs collectPairLoadMoveSet() for every pairable instruction that is
+   // the first member (key) of a chosen pair, in PairableInsts order.
+   typedef std::vector<Value *>::size_type Index;
+   for (Index Idx = 0, Count = PairableInsts.size(); Idx != Count; ++Idx) {
+     DenseMap<Value *, Value *>::iterator Chosen =
+       ChosenPairs.find(PairableInsts[Idx]);
+     if (Chosen != ChosenPairs.end()) {
+       Instruction *First = cast<Instruction>(Chosen->first);
+       collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, First);
+     }
+   }
+ }
+
+ // This function fuses the chosen instruction pairs into vector instructions,
+ // taking care to preserve any needed scalar outputs and, then, it reorders the
+ // remaining instructions as needed (users of the first member of the pair
+ // need to be moved to after the location of the second member of the pair
+ // because the vector instruction is inserted in the location of the pair's
+ // second member).
+ void WIVectorize::fuseChosenPairs(BasicBlock &BB,
+                      std::vector<Value *> &PairableInsts,
+                      DenseMap<Value *, Value *> &ChosenPairs) {
+   // BB            - block whose chosen pairs are fused in place.
+   // PairableInsts - candidate instructions (used for the pre-fusion
+   //                 load-move-set and pointer-info collection).
+   // ChosenPairs   - first -> second map of the pairs to fuse; entries are
+   //                 added (flipped pairs) and erased here as fusion proceeds.
+   LLVMContext& Context = BB.getContext();
+
+   // During the vectorization process, the order of the pairs to be fused
+   // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
+   // list. After a pair is fused, the flipped pair is removed from the list.
+   // (Collected into a side vector first so that the map is not modified
+   // while it is being iterated.)
+   std::vector<ValuePair> FlippedPairs;
+   FlippedPairs.reserve(ChosenPairs.size());
+   for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
+        E = ChosenPairs.end(); P != E; ++P)
+     FlippedPairs.push_back(ValuePair(P->second, P->first));
+   for (std::vector<ValuePair>::iterator P = FlippedPairs.begin(),
+        E = FlippedPairs.end(); P != E; ++P)
+     ChosenPairs.insert(*P);
+
+   // Alias and pointer information is cached up front because it can change
+   // while pairs are being fused.
+   std::multimap<Value *, Value *> LoadMoveSet;
+   collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet);
+   DenseSet<Value *> LowPtrInsts;
+   collectPtrInfo(PairableInsts, ChosenPairs, LowPtrInsts);
+
+   DEBUG(dbgs() << "WIV: initial: \n" << BB << "\n");
+
+   for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
+     DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI);
+     if (P == ChosenPairs.end()) {
+       ++PI;
+       continue;
+     }
+
+     if (getDepthFactor(P->first) == 0) {
+       // These instructions are not really fused, but are tracked as though
+       // they are. Any case in which it would be interesting to fuse them
+       // will be taken care of by InstCombine.
+       --NumFusedOps;
+       ++PI;
+       continue;
+     }
+
+     Instruction *I = cast<Instruction>(P->first),
+                 *J = cast<Instruction>(P->second);
+
+     DEBUG(dbgs() << "WIV: fusing: " << *I <<
+           " <-> " << *J << "\n");
+
+     // Remove the pair and flipped pair from the list.
+     DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second);
+     assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
+     ChosenPairs.erase(FP);
+     ChosenPairs.erase(P);
+
+     // Memory-related pairs are flipped when I is not the low member
+     // recorded by collectPtrInfo().
+     bool FlipMemInputs = false;
+     if (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<GetElementPtrInst>(I))
+       FlipMemInputs = (LowPtrInsts.find(I) == LowPtrInsts.end());
+     unsigned NumOperands = I->getNumOperands();
+     SmallVector<Value *, 3> ReplacedOperands(NumOperands);
+     getReplacementInputsForPair(Context, I, J, ReplacedOperands,
+                                 FlipMemInputs);
+
+     // Make a copy of the original operation, change its type to the vector
+     // type and replace its operands with the vector operands.
+     Instruction *K = I->clone();
+     if (I->hasName()) K->takeName(I);
+
+     if (I->getMetadata("wi") != NULL) {
+       K->setMetadata("wi", I->getMetadata("wi"));
+       K->setMetadata("wi_counter", I->getMetadata("wi_counter"));
+     }
+     if (!isa<StoreInst>(K))
+       K->mutateType(getVecTypeForPair(I->getType(), J->getType()));
+
+     for (unsigned o = 0; o < NumOperands; ++o)
+       K->setOperand(o, ReplacedOperands[o]);
+
+     // If we've flipped the memory inputs, make sure that we take the correct
+     // alignment. FlipMemInputs can also be set for getelementptr pairs,
+     // which carry no alignment, so the fused instruction is only touched
+     // when it really is a store or a load.
+     if (FlipMemInputs) {
+       if (StoreInst *StoreK = dyn_cast<StoreInst>(K))
+         StoreK->setAlignment(cast<StoreInst>(J)->getAlignment());
+       else if (LoadInst *LoadK = dyn_cast<LoadInst>(K))
+         LoadK->setAlignment(cast<LoadInst>(J)->getAlignment());
+     }
+
+     K->insertAfter(J);
+
+     // Instruction insertion point:
+     Instruction *InsertionPt = K;
+     Instruction *K1 = 0, *K2 = 0;
+     replaceOutputsOfPair(Context, I, J, K, InsertionPt, K1, K2,
+                          FlipMemInputs);
+
+     // The use tree of the first original instruction must be moved to after
+     // the location of the second instruction. The entire use tree of the
+     // first instruction is disjoint from the input tree of the second
+     // (by definition), and so commutes with it.
+
+     moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J);
+
+     if (!isa<StoreInst>(I)) {
+       I->replaceAllUsesWith(K1);
+       J->replaceAllUsesWith(K2);
+       AA->replaceWithNewValue(I, K1);
+       AA->replaceWithNewValue(J, K2);
+     }
+
+     // Instructions that may read from memory may be in the load move set.
+     // Once an instruction is fused, we no longer need its move set, and so
+     // the values of the map never need to be updated. However, when a load
+     // is fused, we need to merge the entries from both instructions in the
+     // pair in case those instructions were in the move set of some other
+     // yet-to-be-fused pair. The loads in question are the keys of the map.
+     if (I->mayReadFromMemory()) {
+       std::vector<ValuePair> NewSetMembers;
+       VPIteratorPair IPairRange = LoadMoveSet.equal_range(I);
+       VPIteratorPair JPairRange = LoadMoveSet.equal_range(J);
+       for (std::multimap<Value *, Value *>::iterator N = IPairRange.first;
+            N != IPairRange.second; ++N)
+         NewSetMembers.push_back(ValuePair(K, N->second));
+       for (std::multimap<Value *, Value *>::iterator N = JPairRange.first;
+            N != JPairRange.second; ++N)
+         NewSetMembers.push_back(ValuePair(K, N->second));
+       for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(),
+            AE = NewSetMembers.end(); A != AE; ++A)
+         LoadMoveSet.insert(*A);
+     }
+
+     // Before removing I, set the iterator to the next instruction
+     // (skipping J as well, since it is about to be erased too).
+     PI = llvm::next(BasicBlock::iterator(I));
+     if (cast<Instruction>(PI) == J)
+       ++PI;
+
+     SE->forgetValue(I);
+     SE->forgetValue(J);
+     I->eraseFromParent();
+     J->eraseFromParent();
+   }
+
+   DEBUG(dbgs() << "WIV: final: \n" << BB << "\n");
+ }
+ void WIVectorize::dropUnused(BasicBlock& BB) {
+   // Repeatedly sweeps BB from its last instruction towards its first one
+   // (the first instruction itself is never examined) and erases every
+   // shufflevector / extractelement / insertelement / bitcast that has no
+   // uses. Sweeping is repeated until a full sweep erases nothing.
+   bool ErasedAny;
+   do {
+     ErasedAny = false;
+     BasicBlock::iterator Cur = llvm::prior(BB.end());
+     while (Cur != BB.begin()) {
+       const bool IsGlueInst = isa<ShuffleVectorInst>(*Cur) ||
+                               isa<ExtractElementInst>(*Cur) ||
+                               isa<InsertElementInst>(*Cur) ||
+                               isa<BitCastInst>(*Cur);
+
+       if (IsGlueInst && Cur->use_empty()) {
+         SE->forgetValue(&(*Cur));
+         Cur->eraseFromParent();
+         ErasedAny = true;
+         // The erased instruction could have messed things up, so the
+         // sweep starts again from the end of the block.
+         Cur = llvm::prior(BB.end());
+         continue;
+       }
+
+       Cur = llvm::prior(Cur);
+     }
+   } while (ErasedAny);
+ }
+
+ // Replace uses of alloca with new alloca.
+ // This includes getelementpointer, bitcast, load and store only
+ // atm.
+ // In case original alloca was array, the getelementpointer and bitcast apply.
+ void WIVectorize::replaceUses(BasicBlock& BB,
+ AllocaInst& oldAlloca,
+ AllocaInst& newAlloca,
+ int indx) {
+ // Rewrites the bitcast, getelementptr, store and load users of oldAlloca
+ // so that they refer to newAlloca instead. Users of any other kind are
+ // stepped over and left untouched.
+ //
+ // BB        - only used to obtain the LLVMContext.
+ // oldAlloca - alloca whose users are rewritten.
+ // newAlloca - replacement alloca.
+ // indx      - constant index appended to (or used as the only index of)
+ //             the GEPs created on newAlloca.
+
+ LLVMContext& Context = BB.getContext();
+ Instruction::use_iterator useiter = oldAlloca.use_begin();
+
+ // Every rewrite below changes the use list of oldAlloca, so after each
+ // one the iteration restarts from use_begin().
+ while (useiter != oldAlloca.use_end()) {
+ llvm::User* tmp = *useiter;
+
+ if (isa<BitCastInst>(tmp)) {
+ // Create a new bitcast from the new alloca to the same type
+ // as the old bitcast had. This is the situation where the
+ // alloca is cast to i8* followed by
+ // call void @llvm.lifetime.start(i64 -1, i8* %XYZ) nounwind
+ BitCastInst* bitCast = cast<BitCastInst>(tmp);
+ IRBuilder<> builder(bitCast);
+ BitCastInst* newBitcast =
+ cast<BitCastInst>(builder.CreateBitCast(
+ &newAlloca, bitCast->getDestTy(), bitCast->getName()));
+
+ if (bitCast->getMetadata("wi") != NULL) {
+ newBitcast->setMetadata("wi", bitCast->getMetadata("wi"));
+ newBitcast->setMetadata("wi_counter", bitCast->getMetadata("wi_counter"));
+ }
+
+ bitCast->replaceAllUsesWith(newBitcast);
+ AA->replaceWithNewValue(bitCast, newBitcast);
+ SE->forgetValue(bitCast);
+ bitCast->eraseFromParent();
+
+ useiter = oldAlloca.use_begin();
+ continue;
+ }
+
+ if (isa<GetElementPtrInst>(tmp)) {
+ // The original getelementptr contains a number of indexes
+ // that indicate how to access an element of the allocated
+ // memory. Since we changed the innermost type to an
+ // array, we add an index into that array, such as:
+ // Original alloca:
+ // %A = alloca [20 x [8 x i32]], align 4
+ // Original getelementptr:
+ // %68 = getelementptr inbounds [20 x [8 x i32]]* %A, i32 0, i32 %X, i32 0
+ // New alloca:
+ // %A = alloca [20 x [8 x [2 x i32]]], align 4
+ // New getelementptr:
+ // %68 = getelementptr inbounds [20 x [8 x [2 x i32]]]* %A, i32 0, i32 %X, i32 0, i32 0
+
+ GetElementPtrInst* gep = cast<GetElementPtrInst>(tmp);
+ std::vector<llvm::Value *> gepArgs;
+ // Collect the original indexes of the getelementptr
+ // (operand 0 is the pointer, so indexes start at operand 1).
+ for (unsigned int i = 1; i <= gep->getNumIndices(); i++) {
+ gepArgs.push_back(gep->getOperand(i));
+ }
+ // Add the index into the newly created innermost array
+ Value *V = ConstantInt::get(Type::getInt32Ty(Context), indx);
+ gepArgs.push_back(V);
+ IRBuilder<> builder(gep);
+ GetElementPtrInst* newGep =
+ cast<GetElementPtrInst>(
+ builder.CreateGEP(&newAlloca, gepArgs, gep->getName()));
+ newGep->setIsInBounds(gep->isInBounds());
+
+ if (gep->getMetadata("wi") != NULL) {
+ newGep->setMetadata("wi", gep->getMetadata("wi"));
+ newGep->setMetadata("wi_counter", gep->getMetadata("wi_counter"));
+ }
+
+ gep->replaceAllUsesWith(newGep);
+ AA->replaceWithNewValue(gep, newGep);
+ SE->forgetValue(gep);
+ gep->eraseFromParent();
+ useiter = oldAlloca.use_begin();
+ continue;
+ }
+ if (isa<StoreInst>(tmp)) {
+ // This is tricky: the original alloca was for a base type such
+ // as i32 or float, so the variable was used directly.
+ // Now it is an array, so we have to add a getelementptr.
+ StoreInst* store = cast<StoreInst>(tmp);
+ std::vector<llvm::Value *> gepArgs;
+ Value *V = ConstantInt::get(Type::getInt32Ty(Context), indx);
+ gepArgs.push_back(V);
+ IRBuilder<> builder(store);
+ // NOTE(review): the GEP built here has indx as its only index.
+ GetElementPtrInst* newGep =
+ cast<GetElementPtrInst>(builder.CreateGEP(&newAlloca, gepArgs));
+ if (store->getMetadata("wi") != NULL) {
+ newGep->setMetadata("wi", store->getMetadata("wi"));
+ newGep->setMetadata("wi_counter", store->getMetadata("wi_counter"));
+ }
+
+ for (unsigned int i = 0; i < store->getNumOperands(); i++) {
+ // Either of the store operands could be the alloca: we either
+ // store to the allocated memory, or we are storing the pointer
+ // to that memory (this is a rather dumb thing to do).
+ if (store->getOperand(i) == &oldAlloca) {
+ // This inner builder shadows the outer one; both insert before
+ // the store. The GEP result is cast back to the operand's
+ // original type so that the store itself stays well-typed.
+ IRBuilder<> builder(store);
+ BitCastInst* newBitcast =
+ cast<BitCastInst>(builder.CreateBitCast(
+ newGep, store->getOperand(i)->getType()));
+ if (store->getMetadata("wi") != NULL) {
+ newBitcast->setMetadata("wi", store->getMetadata("wi"));
+ newBitcast->setMetadata("wi_counter", store->getMetadata("wi_counter"));
+ }
+ store->setOperand(i, newBitcast);
+ }
+ }
+ useiter = oldAlloca.use_begin();
+ continue;
+ }
+ if (isa<LoadInst>(tmp)) {
+ // This is tricky: the original alloca was for a base type such
+ // as i32 or float, so the variable was used directly.
+ // Now it is an array, so we have to add a getelementptr.
+
+ LoadInst* load = cast<LoadInst>(tmp);
+ std::vector<llvm::Value *> gepArgs;
+ Value *V = ConstantInt::get(Type::getInt32Ty(Context), indx);
+ gepArgs.push_back(V);
+ IRBuilder<> builder(load);
+ GetElementPtrInst* newGep =
+ cast<GetElementPtrInst>(builder.CreateGEP(&newAlloca, gepArgs));
+ if (load->getMetadata("wi") != NULL) {
+ newGep->setMetadata("wi", load->getMetadata("wi"));
+ newGep->setMetadata("wi_counter", load->getMetadata("wi_counter"));
+ }
+
+ for (unsigned int i = 0; i < load->getNumOperands(); i++) {
+ // Find the operand of the load that was the old alloca and
+ // use a bitcast to point it to the getelementptr result.
+ // There must be a better way to do this.
+ if (load->getOperand(i) == &oldAlloca) {
+ IRBuilder<> builder(load);
+ BitCastInst* newBitcast =
+ cast<BitCastInst>(builder.CreateBitCast(
+ newGep, load->getOperand(i)->getType()));
+ if (load->getMetadata("wi") != NULL) {
+ newBitcast->setMetadata("wi", load->getMetadata("wi"));
+ newBitcast->setMetadata("wi_counter", load->getMetadata("wi_counter"));
+ }
+ load->setOperand(i, newBitcast);
+ }
+ }
+ useiter = oldAlloca.use_begin();
+ continue;
+ }
+ // Any other kind of user is left as it is.
+ useiter++;
+ }
+ }
+
+ // Find new type for the vector alloca instruction
+ Type* WIVectorize::newAllocaType(Type* start, unsigned int width) {
+   // Builds the allocated type of the combined alloca by wrapping the
+   // innermost element type of `start` into an array of `width` elements,
+   // e.g. [20 x [8 x i32]] with width 2 becomes [20 x [8 x [2 x i32]]].
+   //
+   // start - type to transform (array dimensions are recursed through).
+   // width - number of elements of the new innermost array.
+   // Returns the new type, or `start` itself when the type is not supported
+   // (callers compare the result against `start` to detect this).
+
+   if (ArrayType* arrayTy = dyn_cast<ArrayType>(start)) {
+     // Still an array: keep this dimension and transform the element type.
+     // The element count is kept in its full 64-bit width instead of being
+     // narrowed to int on the way through.
+     uint64_t numElm = arrayTy->getNumElements();
+     return ArrayType::get(
+       newAllocaType(arrayTy->getElementType(), width), numElm);
+   }
+
+   if (start->isFirstClassType() && !start->isPointerTy()) {
+     // Recursion stopping point: converts e.g. i32 to [width x i32],
+     // which becomes the base type of the array.
+     return ArrayType::get(start, width);
+   }
+
+   // Unrecognized type, just return it; the alloca won't be replaced.
+   return start;
+ }
+
+ // In case there is private variable in the kernel that does not fit into
+ // register (multidimensional array for example), there are alloca
+ // defined to create necessary memory space for variable.
+ // Those are defined then for each of the work items replicated.
+ // This pass attempts to combine those allocas to create 'interleaved'
+ // memory allocation that then can be accessed by vector loads and stores
+ // as described below:
+ //
+ // __kernel xyz() {
+ //
+ // int A[100][100][100][100];
+ // ...
+ //}
+ // Will become after replication with 2 work items:
+ //
+ // %A = alloca [100 x [100 x [100 x i32]]], align 4
+ // %A_wi_1_0_0 = alloca [100 x [100 x [100 x i32]]], align 4
+ //
+ // This will in turn be converted here to:
+ // %A = alloca [100 x [100 x [100 x [2 x i32]]]], align 4
+ // and each respective getelementptr instruction will be given
+ // an additional index to select the correct member of the pair.
+ //
+ // NOTE: This does work only for arrays ATM, the scalar type allocas
+ // as produced by phistoallocas pass required for the work loops
+ // are skipped for now.
+
+ bool WIVectorize::vectorizeAllocas(BasicBlock& BB) {
+   // Replaces each group of candidate array allocas found in BB by a single
+   // alloca whose innermost dimension has one slot per member of the group,
+   // and redirects the users of the old allocas to it.
+   // Returns true when at least one group was replaced.
+   typedef std::multimap<int, ValueVector*> AllocaGroups;
+
+   AllocaGroups allocas;
+   getCandidateAllocas(BB, allocas);
+   bool changed = false;
+
+   for (AllocaGroups::iterator insIt = allocas.begin();
+        insIt != allocas.end(); ++insIt) {
+     ValueVector* tmpVec = insIt->second;
+     // Create as 'wide' an alloca as the number of elements found; this
+     // could be smaller than the vector width or larger.
+     // Should be same as work group dimensions for work item replicas or
+     // same as number of unrolled loops with work item loops.
+     unsigned int allocaWidth = tmpVec->size();
+     // No point vectorizing one alloca only
+     if (allocaWidth <= 1)
+       continue;
+
+     AllocaInst* I = cast<AllocaInst>((*tmpVec)[0]);
+     Type* startType = I->getAllocatedType();
+     if (!startType->isArrayTy())
+       continue;
+     // Find new type for alloca by recursively searching through multiple
+     // dimensions of array
+     Type* newType = newAllocaType(startType, allocaWidth);
+
+     // No new type was found, alloca type not supported.
+     if (newType == startType)
+       continue;
+
+     changed = true;
+     // The combined alloca is created at the first insertion point of the
+     // function's entry block.
+     IRBuilder<> builder(
+       BB.getParent()->getEntryBlock().getFirstInsertionPt());
+     llvm::AllocaInst *alloca =
+       builder.CreateAlloca(newType, 0, I->getName().str() + "_allocamix");
+     alloca->setAlignment(I->getAlignment());
+
+     if (I->getMetadata("wi") != NULL) {
+       alloca->setMetadata("wi", I->getMetadata("wi"));
+       alloca->setMetadata("wi_counter", I->getMetadata("wi_counter"));
+     }
+
+     // Replace the uses of every alloca of the group (the first one
+     // included) with the newly created one, then erase the old alloca.
+     for (unsigned int i = 0; i < allocaWidth; i++) {
+       AllocaInst* J = cast<AllocaInst>((*tmpVec)[i]);
+       MDNode* mj = J->getMetadata("wi");
+       assert(mj && mj->getNumOperands() == 3);
+       // Operand 2 of the "wi" node is itself an MDNode with four operands;
+       // its operand 1 is the constant used as the slot index of this
+       // alloca (presumably one component of the work-item XYZ triplet).
+       // cast<> is used because both results are dereferenced
+       // unconditionally.
+       MDNode* jXYZ = cast<MDNode>(mj->getOperand(2));
+       assert(jXYZ->getNumOperands() == 4);
+       int index =
+         cast<ConstantInt>(jXYZ->getOperand(1))->getZExtValue();
+
+       replaceUses(BB, *J, *alloca, index);
+       SE->forgetValue(J);
+       J->eraseFromParent();
+     }
+   }
+
+   // The group vectors were heap-allocated by getCandidateAllocas() and are
+   // referenced only from this local map, so they are released here.
+   for (AllocaGroups::iterator insIt = allocas.begin();
+        insIt != allocas.end(); ++insIt)
+     delete insIt->second;
+
+   return changed;
+ }
+
+ // Pass closely related to getCandidatePairs, except this one only
+ // picks AllocaInst and makes sure they are from different work items.
+ // It also returns all instances of AllocaInst at the same time.
+ bool WIVectorize::getCandidateAllocas(BasicBlock &BB,
+                      std::multimap<int, ValueVector*>& temporary) {
+   // Groups the allocas of BB that carry "wi" metadata. Allocas land in the
+   // same group when they have the same "wi_counter" value, are the same
+   // operation (isSameOperationAs) and belong to the same region.
+   //
+   // temporary - output multimap, keyed by the counter value; each mapped
+   //             ValueVector is allocated here with `new` and must be
+   //             released by the caller.
+   // Always returns true.
+   typedef std::multimap<int, ValueVector*> AllocaGroups;
+
+   BasicBlock::iterator E = BB.end();
+   for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
+
+     if (!isa<AllocaInst>(I))
+       continue;
+     // TODO: This is bit tricky, should it be possible
+     // to create vector of allocas that do not have metadata?
+     MDNode* md = I->getMetadata("wi");
+     if (md == NULL)
+       continue;
+     // Without a counter the alloca cannot be keyed; skip it rather than
+     // dereference a null node.
+     MDNode* mdCounter = I->getMetadata("wi_counter");
+     if (mdCounter == NULL)
+       continue;
+     MDNode* mdRegion = cast<MDNode>(md->getOperand(1));
+
+     unsigned CI = cast<ConstantInt>(mdCounter->getOperand(1))->getZExtValue();
+     unsigned RI = cast<ConstantInt>(mdRegion->getOperand(1))->getZExtValue();
+
+     // Look through the groups already recorded under this counter for one
+     // whose first member matches this alloca. There is deliberately no
+     // early exit: if several groups match, the last one wins.
+     AllocaGroups::iterator itb = temporary.lower_bound(CI);
+     AllocaGroups::iterator ite = temporary.upper_bound(CI);
+     ValueVector* tmpVec = NULL;
+     for (; itb != ite; ++itb) {
+       Instruction* first = cast<Instruction>((*itb->second)[0]);
+       if (!I->isSameOperationAs(first))
+         continue;
+       // Test also if instructions are from same region.
+       MDNode* tmpRINode =
+         cast<MDNode>(first->getMetadata("wi")->getOperand(1));
+       unsigned tmpRI =
+         cast<ConstantInt>(tmpRINode->getOperand(1))->getZExtValue();
+       if (RI == tmpRI)
+         tmpVec = itb->second;
+     }
+     if (tmpVec == NULL) {
+       // First alloca of its kind: open a new group.
+       tmpVec = new ValueVector;
+       temporary.insert(std::pair<int, ValueVector*>(CI, tmpVec));
+     }
+     tmpVec->push_back(I);
+   }
+
+   // NOTE(review): this loop only evaluates the compatibility predicate for
+   // consecutive members of each group and discards the result; it looks
+   // like an unfinished filter. It is kept in case the predicate has side
+   // effects - confirm and either act on the result or remove the loop.
+   for (AllocaGroups::iterator insIt = temporary.begin();
+        insIt != temporary.end(); ++insIt) {
+     ValueVector* tmpVec = insIt->second;
+     for (unsigned j = 0; j < tmpVec->size()/2; j++) {
+       Instruction* I = cast<Instruction>((*tmpVec)[2*j]);
+       Instruction* J = cast<Instruction>((*tmpVec)[2*j+1]);
+       if (!areInstsCompatibleFromDifferentWi(I,J))
+         continue;
+     }
+   }
+   return true;
+ }
+
+}
+// Pass identification; the value itself is not meaningful, LLVM identifies
+// the pass by the address of this variable.
+char WIVectorize::ID = 0;
+// Registers the pass under the command-line name "wi-vectorize".
+RegisterPass<WIVectorize>
+ X("wi-vectorize", "Work item vectorization.");
+
+// Factory returning a newly allocated WIVectorize pass.
+FunctionPass *createWIVectorizePass() {
+ return new WIVectorize();
+}
+
diff --git a/src/llvmopencl/WorkItemAliasAnalysis.cc b/src/llvmopencl/WorkItemAliasAnalysis.cc
new file mode 100644
index 0000000..1d1fba7
--- /dev/null
+++ b/src/llvmopencl/WorkItemAliasAnalysis.cc
@@ -0,0 +1,119 @@
+/*
+ Copyright (c) 2012 Tampere University of Technology.
+ Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files (the "Software"),
+ to deal in the Software without restriction, including without limitation
+ the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ and/or sell copies of the Software, and to permit persons to whom the
+ Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
+ */
+/**
+ * @file WorkItemAliasAnalysis.cc
+ *
+ * Definition of WorkItemAliasAnalysis class.
+ *
+ * @author Vladimír Guzma 2012
+ */
+
+#include "WorkItemAliasAnalysis.h"
+using namespace pocl;
+
+// Register this pass under the "wi-aa" name.
+char WorkItemAliasAnalysis::ID = 0;
+// static: the WIVectorize pass also defines a global X; keep both file-local.
+static RegisterPass<WorkItemAliasAnalysis> X("wi-aa", "Work item alias analysis.", false, false);
+// Register it also to pass group
+static RegisterAnalysisGroup<AliasAnalysis> Y(X);
+
+ImmutablePass *createWorkItemAliasAnalysisPass() {
+  return new WorkItemAliasAnalysis();
+}
+
+extern "C" {
+  // Same factory, exported with C linkage.
+  ImmutablePass *create_workitem_aa_plugin() {
+    return new WorkItemAliasAnalysis();
+  }
+}
+
+void
+WorkItemAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { // reports this pass's analysis usage to the pass manager
+ AU.setPreservesAll(); // marks every other analysis as preserved by this pass
+ AliasAnalysis::getAnalysisUsage(AU); // lets the AliasAnalysis base record its own usage
+}
+
+/**
+ * Test if memory locations are from different work items from same region.
+ * Then they can not alias.
+ *
+ * Relies on the "wi" metadata attached to instructions: operand 1 is read
+ * as the region node and operand 2 as the work-item (x, y, z) node.  Any
+ * query whose metadata does not have that shape is forwarded to the next
+ * alias analysis instead of being dereferenced unchecked.
+ *
+ * @param LocA first memory location.
+ * @param LocB second memory location.
+ * @return NoAlias when provably disjoint, else the chained AA's answer.
+ */
+AliasAnalysis::AliasResult
+WorkItemAliasAnalysis::alias(const Location &LocA,
+                             const Location &LocB) {
+  // If either of the memory references is empty, it doesn't matter what the
+  // pointer values are.
+  if (LocA.Size == 0 || LocB.Size == 0)
+    return NoAlias;
+
+  // Pointers from different address spaces do not alias
+  if (cast<PointerType>(LocA.Ptr->getType())->getAddressSpace() !=
+      cast<PointerType>(LocB.Ptr->getType())->getAddressSpace())
+    return NoAlias;
+
+  // In case code is created by pocl, we can also use metadata.
+  const Instruction *valA = dyn_cast<Instruction>(LocA.Ptr);
+  const Instruction *valB = dyn_cast<Instruction>(LocB.Ptr);
+  const MDNode *mdA = valA ? valA->getMetadata("wi") : NULL;
+  const MDNode *mdB = valB ? valB->getMetadata("wi") : NULL;
+  if (mdA && mdB && mdA->getNumOperands() > 2 && mdB->getNumOperands() > 2) {
+    // Compare region ID. If they are same, different work items imply no
+    // aliasing. If regions are different or work items are same anything
+    // can happen, so fall back to other AAs.
+    // Operand 1 is the region node, operand 2 the work-item id node.
+    const MDNode *regA = dyn_cast_or_null<MDNode>(mdA->getOperand(1));
+    const MDNode *regB = dyn_cast_or_null<MDNode>(mdB->getOperand(1));
+    const MDNode *iXYZ = dyn_cast_or_null<MDNode>(mdA->getOperand(2));
+    const MDNode *jXYZ = dyn_cast_or_null<MDNode>(mdB->getOperand(2));
+    if (regA && regB && iXYZ && jXYZ &&
+        regA->getNumOperands() > 1 && regB->getNumOperands() > 1 &&
+        iXYZ->getNumOperands() == 4 && jXYZ->getNumOperands() == 4) {
+      const ConstantInt *rA = dyn_cast_or_null<ConstantInt>(regA->getOperand(1));
+      const ConstantInt *rB = dyn_cast_or_null<ConstantInt>(regB->getOperand(1));
+      if (rA && rB && rA->getValue() == rB->getValue()) {
+        // Same region: operands 1..3 hold the x, y and z ids; the locations
+        // cannot alias as soon as one of them differs.
+        for (unsigned d = 1; d < 4; ++d) {
+          const ConstantInt *ci = dyn_cast_or_null<ConstantInt>(iXYZ->getOperand(d));
+          const ConstantInt *cj = dyn_cast_or_null<ConstantInt>(jXYZ->getOperand(d));
+          if (!ci || !cj)
+            break; // malformed id: stay conservative and ask the next AA
+          if (ci->getValue() != cj->getValue())
+            return NoAlias;
+        }
+      }
+    }
+  }
+
+  // Forward the query to the next analysis.
+  return AliasAnalysis::alias(LocA, LocB);
+}
diff --git a/src/llvmopencl/WorkItemAliasAnalysis.h b/src/llvmopencl/WorkItemAliasAnalysis.h
new file mode 100644
index 0000000..5c07a02
--- /dev/null
+++ b/src/llvmopencl/WorkItemAliasAnalysis.h
@@ -0,0 +1,75 @@
+/*
+ Copyright (c) 2012 Tampere University of Technology.
+ Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files (the "Software"),
+ to deal in the Software without restriction, including without limitation
+ the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ and/or sell copies of the Software, and to permit persons to whom the
+ Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
+ */
+/**
+ * @file WorkItemAliasAnalysis.h
+ *
+ * Declaration of WorkItemAliasAnalysis class.
+ *
+ * @author Vladimír Guzma 2012
+ */
+
+#include "config.h"
+#include <iostream>
+
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Pass.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Metadata.h"
+#include "llvm/Constants.h"
+#else
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Constants.h"
+#endif
+
+using namespace llvm;
+
+namespace pocl {
+/// Alias analysis that consults pocl's work-item metadata so that memory
+/// accesses belonging to different work items are reported as not
+/// aliasing.
+class WorkItemAliasAnalysis : public llvm::ImmutablePass,
+                              public llvm::AliasAnalysis {
+public:
+  static char ID;
+  WorkItemAliasAnalysis() : ImmutablePass(ID) {}
+
+  /// Needed because the analysis interface is implemented through multiple
+  /// inheritance: hands out the AliasAnalysis sub-object when that
+  /// interface is requested, and the pass object itself otherwise.
+  virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
+    const bool wantsAliasAnalysis = (PI == &AliasAnalysis::ID);
+    if (wantsAliasAnalysis)
+      return static_cast<AliasAnalysis*>(this);
+    return this;
+  }
+
+  /// Initializes the AliasAnalysis part of this pass.
+  virtual void initializePass() { InitializeAliasAnalysis(this); }
+
+private:
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+  virtual AliasResult alias(const Location &LocA, const Location &LocB);
+};
+}
+
diff --git a/src/llvmopencl/Workgroup.cc b/src/llvmopencl/Workgroup.cc
new file mode 100644
index 0000000..85cd84f
--- /dev/null
+++ b/src/llvmopencl/Workgroup.cc
@@ -0,0 +1,619 @@
+// LLVM module pass to create the single function (fully inlined)
+// and parallelized kernel for an OpenCL workgroup.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "Workgroup.h"
+
+#include "CanonicalizeBarriers.h"
+#include "BarrierTailReplication.h"
+#include "WorkitemReplication.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "config.h"
+#ifdef LLVM_3_1
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/TypeBuilder.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Module.h"
+#elif defined LLVM_3_2
+#include "llvm/IRBuilder.h"
+#include "llvm/TypeBuilder.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/TypeBuilder.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Module.h"
+#endif
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cstdio>
+#include <map>
+#include <iostream>
+
+#include "pocl.h"
+
+#define STRING_LENGTH 32
+
+using namespace std;
+using namespace llvm;
+using namespace pocl;
+
+static void noaliasArguments(Function *F); // marks pointer parameters of F as noalias
+static Function *createLauncher(Module &M, Function *F); // builds the "_<kernel>" launcher function
+static void privatizeContext(Module &M, Function *F); // replaces uses of the context globals in F with allocas
+static void createWorkgroup(Module &M, Function *F); // emits "<name>_workgroup"
+static void createWorkgroupFast(Module &M, Function *F); // emits "<name>_workgroup_fast"
+
+// extern cl::opt<string> Header;
+// extern cl::list<int> LocalSize;
+
+/* The kernel to process in this kernel compiler launch. */
+cl::opt<string>
+KernelName("kernel",
+ cl::desc("Kernel function name"),
+ cl::value_desc("kernel"),
+ cl::init("")); // empty default: without kernel metadata, isKernelToProcess() then accepts every function
+
+namespace llvm {
+
+  typedef struct _pocl_context PoclContext;
+
+  /// TypeBuilder specialization yielding the LLVM struct type used for the
+  /// pocl context: an i32 followed by three arrays of 3 size_t-wide integers.
+  template<bool xcompile> class TypeBuilder<PoclContext, xcompile> {
+  public:
+    /// Returns the struct type for the width given to setSizeTWidth(), or
+    /// NULL (after asserting) when that width is neither 32 nor 64.
+    static StructType *get(LLVMContext &Context) {
+      if (size_t_width == 64)
+        return StructType::get
+          (TypeBuilder<types::i<32>, xcompile>::get(Context),
+           TypeBuilder<types::i<64>[3], xcompile>::get(Context),
+           TypeBuilder<types::i<64>[3], xcompile>::get(Context),
+           TypeBuilder<types::i<64>[3], xcompile>::get(Context),
+           NULL);
+      if (size_t_width == 32)
+        return StructType::get
+          (TypeBuilder<types::i<32>, xcompile>::get(Context),
+           TypeBuilder<types::i<32>[3], xcompile>::get(Context),
+           TypeBuilder<types::i<32>[3], xcompile>::get(Context),
+           TypeBuilder<types::i<32>[3], xcompile>::get(Context),
+           NULL);
+      // With asserts compiled out, control used to fall off the end of this
+      // non-void function; return NULL explicitly instead.
+      assert (false && "Unsupported size_t width.");
+      return NULL;
+    }
+
+    /**
+     * We compile for various targets with various widths for the size_t
+     * type that depends on the pointer type.
+     *
+     * This should be set when the correct type is known. This is a hack
+     * until a better way is found. It's not thread safe, e.g. if one
+     * compiles multiple Modules for multiple different pointer widths in
+     * a same process with multiple threads. */
+    static void setSizeTWidth(int width) {
+      size_t_width = width;
+    }
+
+    enum Fields { // struct field indices, in the order built by get()
+      WORK_DIM,
+      NUM_GROUPS,
+      GROUP_ID,
+      GLOBAL_OFFSET
+    };
+  private:
+    static int size_t_width; // 0 until setSizeTWidth() is called
+
+  };
+
+  template<bool xcompile>
+  int TypeBuilder<PoclContext, xcompile>::size_t_width = 0;
+
+} // namespace llvm
+
+char Workgroup::ID = 0;
+static RegisterPass<Workgroup> X("workgroup", "Workgroup creation pass");
+
+/**
+ * Internalizes all defined functions, then builds for each kernel its
+ * launcher plus the "_workgroup" and "_workgroup_fast" entry points, and
+ * finally gives "barrier" an empty body.
+ * @return always true.
+ */
+bool
+Workgroup::runOnModule(Module &M)
+{
+  if (M.getPointerSize() == llvm::Module::Pointer64)
+    TypeBuilder<PoclContext, true>::setSizeTWidth(64);
+  else if (M.getPointerSize() == llvm::Module::Pointer32)
+    TypeBuilder<PoclContext, true>::setSizeTWidth(32);
+  else
+    assert (false && "Target has an unsupported pointer width.");
+
+  // Collect the kernels before generating anything: the builders below add
+  // functions to M, and walking the function list while it grows would
+  // visit them too (endlessly, when every function counts as a kernel).
+  SmallVector<Function *, 8> kernels;
+  for (Module::iterator i = M.begin(), e = M.end(); i != e; ++i) {
+    if (!i->isDeclaration())
+      i->setLinkage(Function::InternalLinkage);
+    if (isKernelToProcess(*i))
+      kernels.push_back(&*i);
+  }
+
+  for (unsigned k = 0, n = kernels.size(); k != n; ++k) {
+    Function *L = createLauncher(M, kernels[k]);
+#if defined LLVM_3_2
+    L->addFnAttr(Attributes::NoInline);
+#else
+    L->addFnAttr(Attribute::NoInline);
+#endif
+    privatizeContext(M, L);
+    createWorkgroup(M, L);
+    createWorkgroupFast(M, L);
+  }
+
+  // Give barrier(i32) a body consisting of a single block returning void.
+  Function *barrier = cast<Function>
+    (M.getOrInsertFunction("barrier",
+                           Type::getVoidTy(M.getContext()),
+                           Type::getInt32Ty(M.getContext()),
+                           NULL));
+  BasicBlock *bb = BasicBlock::Create(M.getContext(), "", barrier);
+  ReturnInst::Create(M.getContext(), 0, bb);
+  return true;
+}
+
+/** Flags each pointer-typed parameter of the kernel function as noalias;
+ *  parameter i uses attribute index i + 1 (index 0 is the return type). */
+static void
+noaliasArguments(Function *F)
+{
+  FunctionType *fnTy = F->getFunctionType();
+  for (unsigned idx = 0, count = fnTy->getNumParams(); idx < count; ++idx)
+    if (fnTy->getParamType(idx)->isPointerTy())
+      F->setDoesNotAlias(idx + 1);
+}
+
+static Function *
+createLauncher(Module &M, Function *F) // builds launcher "_<F>": copies context fields into module globals, then calls and inlines F
+{
+ SmallVector<Type *, 8> sv;
+
+ for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+ i != e; ++i)
+ sv.push_back (i->getType()); // the launcher starts with the same parameter types as F
+ sv.push_back(TypeBuilder<PoclContext*, true>::get(M.getContext())); // plus one trailing PoclContext pointer
+
+ FunctionType *ft = FunctionType::get(Type::getVoidTy(M.getContext()),
+ ArrayRef<Type *> (sv),
+ false);
+
+ std::string funcName = "";
+ funcName = F->getName().str();
+
+ Function *L = Function::Create(ft,
+ Function::ExternalLinkage,
+ "_" + funcName,
+ &M);
+
+ SmallVector<Value *, 8> arguments;
+ Function::arg_iterator ai = L->arg_begin();
+ for (unsigned i = 0, e = F->getArgumentList().size(); i != e; ++i) {
+ arguments.push_back(ai); // forward L's leading arguments to F unchanged
+ ++ai;
+ } // ai now designates L's trailing context parameter
+
+ /* Copy the function attributes to transfer noalias etc. from the
+ original kernel which will be inlined into the launcher. */
+ L->setAttributes(F->getAttributes());
+
+ Value *ptr, *v;
+ char s[STRING_LENGTH]; // scratch buffer for the per-dimension global names
+ GlobalVariable *gv;
+
+ IRBuilder<> builder(BasicBlock::Create(M.getContext(), "", L));
+
+ ptr = builder.CreateStructGEP(ai,
+ TypeBuilder<PoclContext, true>::WORK_DIM);
+ gv = M.getGlobalVariable("_work_dim");
+ if (gv != NULL) { // the copy is emitted only when the module has this global
+ v = builder.CreateLoad(builder.CreateConstGEP1_32(ptr, 0));
+ builder.CreateStore(v, gv);
+ }
+
+
+ int size_t_width = 32;
+ if (M.getPointerSize() == llvm::Module::Pointer64)
+ size_t_width = 64; // selects the 64-bit GEP helper in the loops below
+
+ ptr = builder.CreateStructGEP(ai,
+ TypeBuilder<PoclContext, true>::GROUP_ID);
+ for (int i = 0; i < 3; ++i) {
+ snprintf(s, STRING_LENGTH, "_group_id_%c", 'x' + i); // _group_id_x, _group_id_y, _group_id_z
+ gv = M.getGlobalVariable(s);
+ if (gv != NULL) {
+ if (size_t_width == 64)
+ {
+ v = builder.CreateLoad(builder.CreateConstGEP2_64(ptr, 0, i));
+ }
+ else
+ {
+ v = builder.CreateLoad(builder.CreateConstGEP2_32(ptr, 0, i));
+ }
+ builder.CreateStore(v, gv); // context element i -> matching global
+ }
+ }
+
+ ptr = builder.CreateStructGEP(ai,
+ TypeBuilder<PoclContext, true>::NUM_GROUPS);
+ for (int i = 0; i < 3; ++i) {
+ snprintf(s, STRING_LENGTH, "_num_groups_%c", 'x' + i); // _num_groups_x, _num_groups_y, _num_groups_z
+ gv = M.getGlobalVariable(s);
+ if (gv != NULL) {
+ if (size_t_width == 64)
+ {
+ v = builder.CreateLoad(builder.CreateConstGEP2_64(ptr, 0, i));
+ }
+ else
+ {
+ v = builder.CreateLoad(builder.CreateConstGEP2_32(ptr, 0, i));
+ }
+ builder.CreateStore(v, gv);
+ }
+ }
+
+ ptr = builder.CreateStructGEP(ai,
+ TypeBuilder<PoclContext, true>::GLOBAL_OFFSET);
+ for (int i = 0; i < 3; ++i) {
+ snprintf(s, STRING_LENGTH, "_global_offset_%c", 'x' + i); // _global_offset_x, _global_offset_y, _global_offset_z
+ gv = M.getGlobalVariable(s);
+ if (gv != NULL) {
+ if (size_t_width == 64)
+ {
+ v = builder.CreateLoad(builder.CreateConstGEP2_64(ptr, 0, i));
+ }
+ else
+ {
+ v = builder.CreateLoad(builder.CreateConstGEP2_32(ptr, 0, i));
+ }
+ builder.CreateStore(v, gv);
+ }
+ }
+
+ CallInst *c = builder.CreateCall(F, ArrayRef<Value*>(arguments)); // call the original kernel after the stores above
+ builder.CreateRetVoid();
+
+ InlineFunctionInfo IFI;
+ InlineFunction(c, IFI); // inline the kernel body into the launcher; the result is not checked
+
+ return L;
+}
+
+static void
+privatizeContext(Module &M, Function *F) // makes F use function-local allocas instead of the context globals
+{
+ char s[STRING_LENGTH]; // scratch buffer for the per-dimension global names
+ GlobalVariable *gv[3];
+ AllocaInst *ai[3] = {NULL, NULL, NULL}; // initialized once; entries are only overwritten when the matching global exists
+
+ IRBuilder<> builder(F->getEntryBlock().getFirstNonPHI()); // allocas and stores are inserted before this instruction of the entry block
+
+ // Privatize _local_id
+ for (int i = 0; i < 3; ++i) {
+ snprintf(s, STRING_LENGTH, "_local_id_%c", 'x' + i);
+ gv[i] = M.getGlobalVariable(s);
+ if (gv[i] != NULL) {
+ ai[i] = builder.CreateAlloca(gv[i]->getType()->getElementType(),
+ 0, s); // alloca of the global's value type, named like the global
+ if(gv[i]->hasInitializer()) {
+ Constant *c = gv[i]->getInitializer();
+ builder.CreateStore(c, ai[i]); // start the private copy from the global's initializer
+ }
+ }
+ }
+ for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) {
+ for (BasicBlock::iterator ii = i->begin(), ee = i->end();
+ ii != ee; ++ii) {
+ for (int j = 0; j < 3; ++j)
+ ii->replaceUsesOfWith(gv[j], ai[j]); // rewrite every instruction of F that uses the global
+ }
+ }
+
+ // Privatize _local_size
+ for (int i = 0; i < 3; ++i) {
+ snprintf(s, STRING_LENGTH, "_local_size_%c", 'x' + i);
+ gv[i] = M.getGlobalVariable(s);
+ if (gv[i] != NULL) {
+ ai[i] = builder.CreateAlloca(gv[i]->getType()->getElementType(),
+ 0, s);
+ if(gv[i]->hasInitializer()) {
+ Constant *c = gv[i]->getInitializer();
+ builder.CreateStore(c, ai[i]);
+ }
+ }
+ }
+ for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) {
+ for (BasicBlock::iterator ii = i->begin(), ee = i->end();
+ ii != ee; ++ii) {
+ for (int j = 0; j < 3; ++j)
+ ii->replaceUsesOfWith(gv[j], ai[j]); // NOTE(review): if gv[j] is NULL here, ai[j] may still hold the previous group's alloca
+ }
+ }
+
+ // Privatize _work_dim
+ gv[0] = M.getGlobalVariable("_work_dim");
+ if (gv[0] != NULL) {
+ ai[0] = builder.CreateAlloca(gv[0]->getType()->getElementType(),
+ 0, "_work_dim");
+ if(gv[0]->hasInitializer()) {
+ Constant *c = gv[0]->getInitializer();
+ builder.CreateStore(c, ai[0]);
+ }
+ }
+ for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) {
+ for (BasicBlock::iterator ii = i->begin(), ee = i->end();
+ ii != ee; ++ii) {
+ ii->replaceUsesOfWith(gv[0], ai[0]); // single scalar global, so only slot 0 is used
+ }
+ }
+
+ // Privatize _num_groups
+ for (int i = 0; i < 3; ++i) {
+ snprintf(s, STRING_LENGTH, "_num_groups_%c", 'x' + i);
+ gv[i] = M.getGlobalVariable(s);
+ if (gv[i] != NULL) {
+ ai[i] = builder.CreateAlloca(gv[i]->getType()->getElementType(),
+ 0, s);
+ if(gv[i]->hasInitializer()) {
+ Constant *c = gv[i]->getInitializer();
+ builder.CreateStore(c, ai[i]);
+ }
+ }
+ }
+ for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) {
+ for (BasicBlock::iterator ii = i->begin(), ee = i->end();
+ ii != ee; ++ii) {
+ for (int j = 0; j < 3; ++j)
+ ii->replaceUsesOfWith(gv[j], ai[j]);
+ }
+ }
+
+ // Privatize _group_id
+ for (int i = 0; i < 3; ++i) {
+ snprintf(s, STRING_LENGTH, "_group_id_%c", 'x' + i);
+ gv[i] = M.getGlobalVariable(s);
+ if (gv[i] != NULL) {
+ ai[i] = builder.CreateAlloca(gv[i]->getType()->getElementType(),
+ 0, s);
+ if(gv[i]->hasInitializer()) {
+ Constant *c = gv[i]->getInitializer();
+ builder.CreateStore(c, ai[i]);
+ }
+ }
+ }
+ for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) {
+ for (BasicBlock::iterator ii = i->begin(), ee = i->end();
+ ii != ee; ++ii) {
+ for (int j = 0; j < 3; ++j)
+ ii->replaceUsesOfWith(gv[j], ai[j]);
+ }
+ }
+
+ // Privatize _global_offset
+ for (int i = 0; i < 3; ++i) {
+ snprintf(s, STRING_LENGTH, "_global_offset_%c", 'x' + i);
+ gv[i] = M.getGlobalVariable(s);
+ if (gv[i] != NULL) {
+ ai[i] = builder.CreateAlloca(gv[i]->getType()->getElementType(),
+ 0, s);
+ if(gv[i]->hasInitializer()) {
+ Constant *c = gv[i]->getInitializer();
+ builder.CreateStore(c, ai[i]);
+ }
+ }
+ }
+ for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) {
+ for (BasicBlock::iterator ii = i->begin(), ee = i->end();
+ ii != ee; ++ii) {
+ for (int j = 0; j < 3; ++j)
+ ii->replaceUsesOfWith(gv[j], ai[j]);
+ }
+ }
+}
+
+/**
+ * Creates a work group launcher function (called KERNELNAME_workgroup)
+ * that assumes kernel pointer arguments are stored as pointers to the
+ * actual buffers and that scalar data is loaded from the default memory.
+ *
+ * F's last parameter is the pocl context; it is forwarded from the
+ * generated function's second parameter, never read from the args array.
+ *
+ * @param M module that receives the new function.
+ * @param F launcher to call; needs at least the context parameter.
+ */
+static void
+createWorkgroup(Module &M, Function *F)
+{
+  IRBuilder<> builder(M.getContext());
+  FunctionType *ft =
+    TypeBuilder<void(types::i<8>*[],
+                     PoclContext*), true>::get(M.getContext());
+
+  std::string funcName = F->getName().str();
+  Function *workgroup =
+    dyn_cast<Function>(M.getOrInsertFunction(funcName + "_workgroup", ft));
+  assert(workgroup != NULL);
+
+  builder.SetInsertPoint(BasicBlock::Create(M.getContext(), "", workgroup));
+  Function::arg_iterator ai = workgroup->arg_begin();
+  const unsigned numParams = F->getFunctionType()->getNumParams();
+  assert(numParams > 0 && "launcher must take the pocl context");
+
+  SmallVector<Value*, 8> arguments;
+  Function::const_arg_iterator ii = F->arg_begin();
+  // Stop before the context slot: its value was always overwritten below,
+  // yet loads dereferencing whatever sat in that args entry were emitted.
+  for (unsigned i = 0; i + 1 < numParams; ++i, ++ii) {
+    Type *t = ii->getType();
+    Value *gep = builder.CreateGEP(ai,
+      ConstantInt::get(IntegerType::get(M.getContext(), 32), i));
+    Value *pointer = builder.CreateLoad(gep);
+    /* If it's a pass by value pointer argument, we just pass the pointer
+     * as is to the function, no need to load from it first. */
+    Value *value;
+    if (ii->hasByValAttr()) {
+      value = builder.CreateBitCast(pointer, t);
+    } else {
+      value = builder.CreateBitCast(pointer, t->getPointerTo());
+      value = builder.CreateLoad(value);
+    }
+    arguments.push_back(value);
+  }
+
+  arguments.push_back(++ai); // second parameter: the PoclContext pointer
+  builder.CreateCall(F, ArrayRef<Value*>(arguments));
+  builder.CreateRetVoid();
+}
+
+/**
+ * Creates a work group launcher more suitable for the heterogeneous
+ * host-device setup (called KERNELNAME_workgroup_fast).
+ *
+ * 1) Pointer arguments are stored directly as pointers to the
+ *    buffers in the argument buffer.
+ *
+ * 2) Scalar values are loaded from the global memory address
+ *    space.
+ *
+ * This should minimize copying of data and memory allocation
+ * at the device.
+ *
+ * F's last parameter is the pocl context; it is forwarded from the
+ * generated function's second parameter, never read from the args array.
+ */
+static void
+createWorkgroupFast(Module &M, Function *F)
+{
+  IRBuilder<> builder(M.getContext());
+
+  FunctionType *ft =
+    TypeBuilder<void(types::i<8>*[],
+                     PoclContext*), true>::get(M.getContext());
+
+  std::string funcName = F->getName().str();
+  Function *workgroup =
+    dyn_cast<Function>(M.getOrInsertFunction(funcName + "_workgroup_fast", ft));
+  assert(workgroup != NULL);
+
+  builder.SetInsertPoint(BasicBlock::Create(M.getContext(), "", workgroup));
+
+  Function::arg_iterator ai = workgroup->arg_begin();
+  const unsigned numParams = F->getFunctionType()->getNumParams();
+  assert(numParams > 0 && "launcher must take the pocl context");
+
+  SmallVector<Value*, 8> arguments;
+  Function::const_arg_iterator ii = F->arg_begin();
+  // Stop before the context slot: its value was always overwritten below,
+  // so the load emitted for that args entry was dead.
+  for (unsigned i = 0; i + 1 < numParams; ++i, ++ii) {
+    Type *t = ii->getType();
+    Value *gep = builder.CreateGEP(ai,
+      ConstantInt::get(IntegerType::get(M.getContext(), 32), i));
+    Value *pointer = builder.CreateLoad(gep);
+
+    if (t->isPointerTy()) {
+      if (!ii->hasByValAttr()) {
+        /* Assume the pointer is directly in the arg array. */
+        arguments.push_back(builder.CreateBitCast(pointer, t));
+        continue;
+      }
+
+      /* It's a pass by value pointer argument, use the underlying
+       * element type in subsequent load. */
+      t = t->getPointerElementType();
+    }
+
+    /* Assume the pointer points to data in the global memory space.
+     * (This bitcast used to be built twice, one copy being unused.) */
+    Value *value = builder.CreateBitCast(
+      pointer, t->getPointerTo(POCL_ADDRESS_SPACE_GLOBAL));
+    /* If it's a pass by value pointer argument, we just pass the pointer
+     * as is to the function, no need to load from it first. */
+    if (!ii->hasByValAttr()) {
+      value = builder.CreateLoad(value);
+    }
+    arguments.push_back(value);
+  }
+
+  arguments.push_back(++ai); // second parameter: the PoclContext pointer
+
+  builder.CreateCall(F, ArrayRef<Value*>(arguments));
+  builder.CreateRetVoid();
+}
+
+
+/**
+ * Tells whether the given function is a kernel the kernel compiler
+ * has to process.
+ *
+ * With "opencl.kernels" metadata, F has to be one of the listed functions.
+ * Without it, F is accepted when the "kernel" option is empty or names F.
+ */
+bool
+Workgroup::isKernelToProcess(const Function &F)
+{
+  NamedMDNode *kernelList =
+    F.getParent()->getNamedMetadata("opencl.kernels");
+
+  // No kernel metadata: decide from the "kernel" command line option alone.
+  if (kernelList == NULL)
+    return KernelName == "" || F.getName() == KernelName;
+
+  const unsigned count = kernelList->getNumOperands();
+  for (unsigned idx = 0; idx < count; ++idx) {
+    MDNode *node = kernelList->getOperand(idx);
+    // globaldce might have removed uncalled kernels
+    if (node->getOperand(0) == NULL)
+      continue;
+    if (cast<Function>(node->getOperand(0)) == &F)
+      return true;
+  }
+
+  return false;
+}
diff --git a/src/llvmopencl/Workgroup.h b/src/llvmopencl/Workgroup.h
new file mode 100644
index 0000000..26d7bfd
--- /dev/null
+++ b/src/llvmopencl/Workgroup.h
@@ -0,0 +1,48 @@
+// Header for Workgroup.cc module pass.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_WORKGROUP_H
+#define _POCL_WORKGROUP_H
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Module.h"
+#endif
+#include "llvm/Pass.h"
+
+namespace pocl {
+ class Workgroup : public llvm::ModulePass { // module pass that builds the work group entry points (see Workgroup.cc)
+ public:
+ static char ID; // pass identification; defined in Workgroup.cc
+
+ Workgroup() : ModulePass(ID) {}
+
+ virtual bool runOnModule(llvm::Module &M); // pass entry point, run once per module
+
+ static bool isKernelToProcess(const llvm::Function &F); // true when F is a kernel this compiler launch should process
+
+ };
+}
+
+#endif
diff --git a/src/llvmopencl/WorkitemHandler.cc b/src/llvmopencl/WorkitemHandler.cc
new file mode 100644
index 0000000..90ed294
--- /dev/null
+++ b/src/llvmopencl/WorkitemHandler.cc
@@ -0,0 +1,278 @@
+// LLVM function pass to replicate the kernel body for all work items
+// in a work group.
+//
+// Copyright (c) 2011-2012 Carlos Sánchez de La Lama / URJC and
+// Pekka Jääskeläinen / TUT
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#include <sstream>
+#include <iostream>
+
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Metadata.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/ValueSymbolTable.h"
+#else
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#endif
+#include "llvm/Support/CommandLine.h"
+#include "WorkitemHandler.h"
+#include "Kernel.h"
+
+//#define DEBUG_REFERENCE_FIXING
+
+namespace pocl {
+
+using namespace llvm;
+
+// Hidden boolean command line switch "add-wi-metadata", off by default.
+// It is declared extern in WorkitemHandler.h so other passes can read it.
+cl::opt<bool>
+AddWIMetadata("add-wi-metadata", cl::init(false), cl::Hidden,
+ cl::desc("Adds a work item identifier to each of the instruction in work items."));
+
+
+// Forwards the pass ID of the concrete subclass to llvm::FunctionPass.
+// No other state is set up here; see Initialize() for the per-kernel setup.
+WorkitemHandler::WorkitemHandler(char& ID) : FunctionPass(ID)
+{
+}
+
+// Default implementation: does nothing and reports the function as
+// unmodified. Subclasses override this with the actual transformation.
+bool
+WorkitemHandler::runOnFunction(Function &F)
+{
+ return false;
+}
+
+/**
+ * Caches per-kernel work-group information and makes sure the local id
+ * globals exist in the kernel's module.
+ *
+ *  - Seeds LocalSizeX/Y/Z with defaults and overrides them from every
+ *    "opencl.kernel_wg_size_info" metadata entry whose first operand is K
+ *    (operands 1..3 hold the x, y and z sizes).
+ *  - Derives size_t_width (32 or 64) from the module's pointer size.
+ *    Any other pointer size is a fatal error.
+ *  - Gets or creates the three local id globals with an integer type of
+ *    that width and puts them in the "far" section.
+ *
+ * @param K the kernel that is about to be processed.
+ */
+void
+WorkitemHandler::Initialize(Kernel *K)
+{
+  llvm::Module *M = K->getParent();
+
+  /* NOTE(review): the X default is 3 while Y and Z default to 1. This looks
+     odd for a "no size information" fallback; confirm whether it is a
+     deliberate "more than one work item" marker before changing it. */
+  LocalSizeX = 3;
+  LocalSizeY = 1;
+  LocalSizeZ = 1;
+
+// TODO: are we searching reqd_workgroup_size here?  If so, we need to enforce it.
+  llvm::NamedMDNode *size_info = M->getNamedMetadata("opencl.kernel_wg_size_info");
+  if (size_info) {
+    for (unsigned i = 0, e = size_info->getNumOperands(); i != e; ++i) {
+      llvm::MDNode *KernelSizeInfo = size_info->getOperand(i);
+      if (KernelSizeInfo->getOperand(0) == K) {
+        LocalSizeX = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(1)))->getLimitedValue();
+        LocalSizeY = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(2)))->getLimitedValue();
+        LocalSizeZ = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(3)))->getLimitedValue();
+      }
+    }
+  }
+
+  if (M->getPointerSize() == llvm::Module::Pointer64)
+    size_t_width = 64;
+  else if (M->getPointerSize() == llvm::Module::Pointer32)
+    size_t_width = 32;
+  else
+    {
+      /* An assert() alone vanishes in NDEBUG builds and would leave
+         size_t_width uninitialized for the IntegerType::get() below, so
+         fail loudly in every build configuration. */
+      std::cerr << "Only 32 and 64 bit size_t widths supported." << std::endl;
+      abort();
+    }
+
+  llvm::Type *localIdType = IntegerType::get(K->getContext(), size_t_width);
+
+  localIdZ = M->getOrInsertGlobal(POCL_LOCAL_ID_Z_GLOBAL, localIdType);
+  localIdY = M->getOrInsertGlobal(POCL_LOCAL_ID_Y_GLOBAL, localIdType);
+  localIdX = M->getOrInsertGlobal(POCL_LOCAL_ID_X_GLOBAL, localIdType);
+
+  /* Look the globals up again as GlobalVariables to be able to assign
+     their section. */
+  GlobalVariable *gvx = M->getNamedGlobal(POCL_LOCAL_ID_X_GLOBAL);
+  GlobalVariable *gvy = M->getNamedGlobal(POCL_LOCAL_ID_Y_GLOBAL);
+  GlobalVariable *gvz = M->getNamedGlobal(POCL_LOCAL_ID_Z_GLOBAL);
+  gvx->setSection(StringRef("far"));
+  gvy->setSection(StringRef("far"));
+  gvz->setSection(StringRef("far"));
+
+  //Value *lsx = M->getOrInsertGlobal("_local_size_x", localIdType);
+  //Value *lsy = M->getOrInsertGlobal("_local_size_y", localIdType);
+  //Value *lsz = M->getOrInsertGlobal("_local_size_z", localIdType);
+  //GlobalVariable *gsx = M->getNamedGlobal("_local_size_x");
+  //GlobalVariable *gsy = M->getNamedGlobal("_local_size_y");
+  //GlobalVariable *gsz = M->getNamedGlobal("_local_size_z");
+  //gsx->setSection(StringRef("far"));
+  //gsy->setSection(StringRef("far"));
+  //gsz->setSection(StringRef("far"));
+}
+
+/**
+ * Checks whether the definition feeding operand i of I dominates that use.
+ *
+ * The operand must be an Instruction; it is cast<> unconditionally.
+ *
+ * @param DT dominator tree used for the queries.
+ * @param I  the using instruction.
+ * @param i  index of the operand of I to check.
+ * @return true when the operand's definition dominates the use.
+ */
+bool
+WorkitemHandler::dominatesUse
+(llvm::DominatorTree *DT, Instruction &I, unsigned i) {
+  Instruction *def = cast<Instruction>(I.getOperand(i));
+  BasicBlock *defBlock = def->getParent();
+
+  if (PHINode *phi = dyn_cast<PHINode>(&I))
+    {
+      /* A PHI node "uses" its incoming value in the corresponding
+         predecessor block, so the defining block has to dominate that
+         predecessor rather than the PHI itself. */
+      unsigned incoming = PHINode::getIncomingValueNumForOperand(i);
+      BasicBlock *incomingBB = phi->getIncomingBlock(incoming);
+      if (incomingBB == NULL)
+        return false;
+      return DT->dominates(defBlock, incomingBB);
+    }
+
+  /* Non-PHI user: a definition in the same block is accepted as is,
+     everything else is left to the dominator tree. */
+  if (defBlock == I.getParent())
+    return true;
+  return DT->dominates(def, &I);
+}
+
+/* Fixes the undominated variable uses.
+
+ These appear when a conditional barrier kernel is replicated to
+ form a copy of the *same basic block* in the alternative
+ "barrier path".
+
+ E.g., from
+
+ A -> [exit], A -> B -> [exit]
+
+ a replicated CFG as follows, is created:
+
+ A1 -> (T) A2 -> [exit1], A1 -> (F) A2' -> B1, B2 -> [exit2]
+
+ The regions are correct because of the barrier semantics
+ of "all or none". In case any barrier enters the [exit1]
+ from A1, all must (because there's a barrier in the else
+ branch).
+
+ Here at A2 and A2' one creates the same variables.
+ However, B2 does not know which copy
+ to refer to, the ones created in A2 or ones in A2' (correct).
+ The mapping data contains only one possibility, the
+ one that was placed there last. Thus, the instructions in B2
+ might end up referring to the variables defined in A2
+ which do not dominate them.
+
+ The variable references are fixed by exploiting the knowledge
+ of the naming convention of the cloned variables.
+
+ One potential alternative way would be to collect the refmaps per BB,
+ not globally. Then as a final phase traverse through the
+ basic blocks starting from the beginning and propagating the
+ reference data downwards, the data from the new BB overwriting
+ the old one. This should ensure the reachability without
+ the costly dominance analysis.
+
+ Implementation summary: for every instruction operand in F that is
+ itself an instruction and fails dominatesUse(), the names
+ <base>, <base>.pocl_1, <base>.pocl_2, ... are looked up in F's value
+ symbol table and tried as replacement operands until one dominates
+ the use. <base> is the operand's name with a trailing ".pocl_*"
+ component removed, if it has one.
+
+ DT is recomputed for F on entry. Returns true if at least one operand
+ was replaced. Prints an error and calls abort() when no dominating
+ alternative can be found.
+*/
+bool
+WorkitemHandler::fixUndominatedVariableUses(llvm::DominatorTree *DT,
+ llvm::Function &F)
+{
+ bool changed = false;
+ /* Recompute the dominator tree for the current shape of F. */
+ DT->runOnFunction(F);
+
+ for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i)
+ {
+ llvm::BasicBlock *bb = i;
+ for (llvm::BasicBlock::iterator ins = bb->begin(), inse = bb->end();
+ ins != inse; ++ins)
+ {
+ for (unsigned opr = 0; opr < ins->getNumOperands(); ++opr)
+ {
+ /* Only operands that are instructions are checked for dominance. */
+ if (!isa<Instruction>(ins->getOperand(opr))) continue;
+ Instruction *operand = cast<Instruction>(ins->getOperand(opr));
+ if (dominatesUse(DT, *ins, opr))
+ continue;
+#ifdef DEBUG_REFERENCE_FIXING
+ std::cout << "### dominance error!" << std::endl;
+ operand->dump();
+ std::cout << "### does not dominate:" << std::endl;
+ ins->dump();
+#endif
+ /* Strip the last '.'-separated name component if it starts with
+ "pocl_"; otherwise the full name is the base name. */
+ StringRef baseName;
+ std::pair< StringRef, StringRef > pieces =
+ operand->getName().rsplit('.');
+ if (pieces.second.startswith("pocl_"))
+ baseName = pieces.first;
+ else
+ baseName = operand->getName();
+
+ Value *alternative = NULL;
+
+ /* copy_i == 0 tries the bare base name, copy_i > 0 tries
+ "<base>.pocl_<copy_i>". */
+ unsigned int copy_i = 0;
+ do {
+ std::ostringstream alternativeName;
+ alternativeName << baseName.str();
+ if (copy_i > 0)
+ alternativeName << ".pocl_" << copy_i;
+
+ alternative =
+ F.getValueSymbolTable().lookup(alternativeName.str());
+
+ /* The candidate is installed as the operand first because
+ dominatesUse() inspects the operand currently set on ins. */
+ if (alternative != NULL)
+ {
+ ins->setOperand(opr, alternative);
+ if (dominatesUse(DT, *ins, opr))
+ break;
+ }
+
+ /* Missing names are tolerated until the index exceeds 10000; the
+ loop gives up only on a failed lookup, so a NULL alternative
+ after the loop means that no dominating copy was found. */
+ if (copy_i > 10000 && alternative == NULL)
+ break; /* ran out of possibilities */
+ ++copy_i;
+ } while (true);
+
+ if (alternative != NULL)
+ {
+#ifdef DEBUG_REFERENCE_FIXING
+ std::cout << "### found the alternative:" << std::endl;
+ alternative->dump();
+#endif
+ changed |= true;
+ } else {
+#ifdef DEBUG_REFERENCE_FIXING
+ std::cout << "### didn't fiund an alternative for" << std::endl;
+ operand->dump();
+ std::cerr << "### BB:" << std::endl;
+ operand->getParent()->dump();
+ std::cerr << "### the user BB:" << std::endl;
+ ins->getParent()->dump();
+#endif
+ std::cerr << "Could not find a dominating alternative variable." << std::endl;
+ abort();
+ }
+ }
+ }
+ }
+ return changed;
+}
+
+/**
+ * Transfers every PHI node found at the start of src to dst, placing each
+ * one right in front of dst's first non-PHI instruction.
+ *
+ * This is needed because llvm's MergeBlockIntoPredecessor discards the
+ * phi nodes of the replicated BB, as it has only one entry.
+ */
+void
+WorkitemHandler::movePhiNodes(llvm::BasicBlock* src, llvm::BasicBlock* dst)
+{
+  for (;;)
+    {
+      /* Each moved PHI leaves src, so the head of src is inspected anew
+         on every round. */
+      PHINode *phi = dyn_cast<PHINode>(src->begin());
+      if (phi == NULL)
+        break;
+      phi->moveBefore(dst->getFirstNonPHI());
+    }
+}
+
+
+} // namespace pocl
diff --git a/src/llvmopencl/WorkitemHandler.h b/src/llvmopencl/WorkitemHandler.h
new file mode 100644
index 0000000..6654fa8
--- /dev/null
+++ b/src/llvmopencl/WorkitemHandler.h
@@ -0,0 +1,73 @@
+// Header for WorkitemHandler, a parent class for all implementations of
+// work item handling.
+//
+// Copyright (c) 2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_WORKITEM_HANDLER_H
+#define _POCL_WORKITEM_HANDLER_H
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Function.h"
+#else
+#include "llvm/IR/Function.h"
+#endif
+
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+
+namespace llvm {
+ class DominatorTree;
+}
+
+namespace pocl {
+ class Workgroup;
+ class Kernel;
+
+ // Common base class of the work item handling function passes. It holds
+ // the per-kernel state filled in by Initialize() and a few helpers shared
+ // by the subclasses.
+ class WorkitemHandler : public llvm::FunctionPass {
+ public:
+
+ // ID is the pass identifier of the concrete subclass.
+ WorkitemHandler(char& ID);
+
+ // Pure virtual: every subclass has to state its analysis requirements.
+ virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const = 0;
+ // The base implementation (WorkitemHandler.cc) returns false.
+ virtual bool runOnFunction(llvm::Function &F);
+
+ // Fills in the members below for kernel K.
+ virtual void Initialize(pocl::Kernel *K);
+
+ protected:
+
+ // Moves the leading PHI nodes of src to the PHI section of dst.
+ void movePhiNodes(llvm::BasicBlock* src, llvm::BasicBlock* dst);
+ // Rewrites operands that are not dominated by their definition to refer
+ // to a dominating copy; returns true if anything was changed.
+ bool fixUndominatedVariableUses(llvm::DominatorTree *DT, llvm::Function &F);
+ // True if the definition of operand i of I dominates that use.
+ bool dominatesUse(llvm::DominatorTree *DT, llvm::Instruction &I, unsigned i);
+
+ // Work-group dimensions; set by Initialize() from defaults or from the
+ // "opencl.kernel_wg_size_info" metadata.
+ int LocalSizeX, LocalSizeY, LocalSizeZ;
+
+ // Bit width of size_t for the module (32 or 64); set by Initialize().
+ unsigned size_t_width;
+
+ /* The global variables that store the current local id. */
+ llvm::Value *localIdZ, *localIdY, *localIdX;
+
+ };
+
+ // Command line switch defined in WorkitemHandler.cc.
+ extern llvm::cl::opt<bool> AddWIMetadata;
+}
+
+#endif
diff --git a/src/llvmopencl/WorkitemHandlerChooser.cc b/src/llvmopencl/WorkitemHandlerChooser.cc
new file mode 100644
index 0000000..4fcd226
--- /dev/null
+++ b/src/llvmopencl/WorkitemHandlerChooser.cc
@@ -0,0 +1,111 @@
+// LLVM function pass to select the best way to create a work group
+// function for a kernel and work group size.
+//
+// Copyright (c) 2012 Pekka Jääskeläinen / Tampere University of Technology
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#define DEBUG_TYPE "workitem-loops"
+
+#include "WorkitemHandlerChooser.h"
+#include "WorkitemLoops.h"
+#include "WorkitemReplication.h"
+#include "Workgroup.h"
+#include "CanonicalizeBarriers.h"
+#include "Kernel.h"
+
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+
+#include <iostream>
+
+using namespace llvm;
+using namespace pocl;
+
+namespace {
+ // Registers the pass under the name "workitem-handler-chooser". The two
+ // trailing false arguments are RegisterPass's CFGOnly and is_analysis flags.
+ static
+ RegisterPass<WorkitemHandlerChooser> X(
+ "workitem-handler-chooser",
+ "Finds the best way to handle work-items to produce a multi-WG function.",
+ false, false);
+
+}
+
+namespace pocl {
+
+char WorkitemHandlerChooser::ID = 0;
+
+// The chooser only records a decision and never modifies the IR, so it
+// declares that it preserves all analyses.
+void
+WorkitemHandlerChooser::getAnalysisUsage(AnalysisUsage &AU) const
+{
+ AU.setPreservesAll();
+}
+
+
+// Decides which work item handler is to be used for F and stores the
+// decision in chosenHandler_ (read through chosenHandler()).
+//
+// Functions rejected by Workgroup::isKernelToProcess() are skipped. In the
+// current build configuration the decision is hard-wired to POCL_WIH_LOOPS.
+// The environment variable driven selection (POCL_WORK_GROUP_METHOD,
+// POCL_FULL_REPLICATION_THRESHOLD) is compiled out with "#if 0".
+//
+// Always returns false: the function itself is not modified here.
+bool
+WorkitemHandlerChooser::runOnFunction(Function &F)
+{
+ if (!Workgroup::isKernelToProcess(F))
+ return false;
+
+ Kernel *K = cast<Kernel> (&F);
+ // Refreshes the per-kernel members inherited from WorkitemHandler
+ // (local sizes, size_t width, local id globals).
+ Initialize(K);
+
+#if 0
+ std::string method = "auto";
+ if (getenv("POCL_WORK_GROUP_METHOD") != NULL)
+ {
+ method = getenv("POCL_WORK_GROUP_METHOD");
+ if (method == "repl" || method == "workitemrepl")
+ chosenHandler_ = POCL_WIH_FULL_REPLICATION;
+ else if (method == "loops" || method == "workitemloops")
+ chosenHandler_ = POCL_WIH_LOOPS;
+ else if (method != "auto")
+ {
+ std::cerr << "Unknown work group generation method. Using 'auto'." << std::endl;
+ method = "auto";
+ }
+ }
+
+ if (method == "auto")
+ {
+ size_t ReplThreshold = 2;
+ if (getenv("POCL_FULL_REPLICATION_THRESHOLD") != NULL)
+ {
+ ReplThreshold = atoi(getenv("POCL_FULL_REPLICATION_THRESHOLD"));
+ }
+
+ if (LocalSizeX*LocalSizeY*LocalSizeZ <= ReplThreshold)
+ {
+ chosenHandler_ = POCL_WIH_FULL_REPLICATION;
+ }
+ else
+ {
+ chosenHandler_ = POCL_WIH_LOOPS;
+ }
+ }
+#else
+ // Active branch: work item loops are used unconditionally.
+ chosenHandler_ = POCL_WIH_LOOPS;
+#endif
+
+ return false;
+}
+
+}
diff --git a/src/llvmopencl/WorkitemHandlerChooser.h b/src/llvmopencl/WorkitemHandlerChooser.h
new file mode 100644
index 0000000..ae317e3
--- /dev/null
+++ b/src/llvmopencl/WorkitemHandlerChooser.h
@@ -0,0 +1,52 @@
+// Header for WorkitemHandlerChooser function pass.
+//
+// Copyright (c) 2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_WORKITEM_HANDLER_CHOOSER_H
+#define _POCL_WORKITEM_HANDLER_CHOOSER_H
+
+#include "WorkitemHandler.h"
+
+namespace pocl {
+ class Workgroup;
+
+ // Function pass that records which work item handling strategy the
+ // later passes should apply to a kernel.
+ class WorkitemHandlerChooser : public pocl::WorkitemHandler {
+ public:
+ // Pass identification; defined in WorkitemHandlerChooser.cc.
+ static char ID;
+
+ // The available strategies.
+ enum WorkitemHandlerType {
+ POCL_WIH_FULL_REPLICATION,
+ POCL_WIH_LOOPS
+ };
+
+ // The handler defaults to POCL_WIH_LOOPS until runOnFunction() has run.
+ WorkitemHandlerChooser() : pocl::WorkitemHandler(ID),
+ chosenHandler_(POCL_WIH_LOOPS) {}
+
+ virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+ virtual bool runOnFunction(llvm::Function &F);
+
+ // Returns the strategy stored by the most recent runOnFunction() call
+ // that got past its kernel check, or the constructor default.
+ WorkitemHandlerType chosenHandler() { return chosenHandler_; }
+ private:
+ WorkitemHandlerType chosenHandler_;
+ };
+}
+
+#endif
diff --git a/src/llvmopencl/WorkitemLoops.cc b/src/llvmopencl/WorkitemLoops.cc
new file mode 100644
index 0000000..91eb055
--- /dev/null
+++ b/src/llvmopencl/WorkitemLoops.cc
@@ -0,0 +1,1061 @@
+// LLVM function pass to create loops that run all the work items
+// in a work group while respecting barrier synchronization points.
+//
+// Copyright (c) 2012-2014 Pekka Jääskeläinen / Tampere University of Technology
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#define DEBUG_TYPE "workitem-loops"
+
+#include "WorkitemLoops.h"
+#include "Workgroup.h"
+#include "Barrier.h"
+#include "Kernel.h"
+#include "config.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Support/CommandLine.h"
+#ifdef LLVM_3_1
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/TypeBuilder.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/ValueSymbolTable.h"
+#elif defined LLVM_3_2
+#include "llvm/IRBuilder.h"
+#include "llvm/TypeBuilder.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/ValueSymbolTable.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/TypeBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#endif
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#include <llvm/Support/InstIterator.h>
+#include "WorkitemHandlerChooser.h"
+
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <vector>
+
+//#define DUMP_RESULT_CFG
+
+#ifdef DUMP_RESULT_CFG
+#include "llvm/Analysis/CFGPrinter.h"
+#endif
+
+//#define DEBUG_WORK_ITEM_LOOPS
+
+using namespace llvm;
+using namespace pocl;
+
+namespace {
+ // Registers the pass under the command line name "workitemloops".
+ static
+ RegisterPass<WorkitemLoops> X("workitemloops",
+ "Workitem loop generation pass");
+}
+
+// Pass identification of WorkitemLoops.
+char WorkitemLoops::ID = 0;
+
+// Declares the analyses this pass needs: dominator tree, post dominator
+// tree, loop info and the WorkitemHandlerChooser decision. The
+// TargetData/DataLayout requirement is compiled out with "#if 0".
+void
+WorkitemLoops::getAnalysisUsage(AnalysisUsage &AU) const
+{
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<PostDominatorTree>();
+ AU.addRequired<LoopInfo>();
+// TODO - Removed due to compilation error
+#if 0
+#ifdef LLVM_3_1
+ AU.addRequired<TargetData>();
+#else
+ AU.addRequired<DataLayout>();
+#endif
+#endif
+ AU.addRequired<pocl::WorkitemHandlerChooser>();
+}
+
+// Pass entry point.
+//
+// Returns false without touching F unless F passes
+// Workgroup::isKernelToProcess() and the WorkitemHandlerChooser selected
+// POCL_WIH_LOOPS. Otherwise it fetches the required analyses, runs
+// ProcessFunction(), repairs undominated uses and clears the per-function
+// bookkeeping.
+//
+// Returns true if ProcessFunction() or fixUndominatedVariableUses()
+// reported a change.
+bool
+WorkitemLoops::runOnFunction(Function &F)
+{
+ if (!Workgroup::isKernelToProcess(F))
+ return false;
+
+ if (getAnalysis<pocl::WorkitemHandlerChooser>().chosenHandler() !=
+ pocl::WorkitemHandlerChooser::POCL_WIH_LOOPS)
+ return false;
+
+ DT = &getAnalysis<DominatorTree>();
+ LI = &getAnalysis<LoopInfo>();
+ PDT = &getAnalysis<PostDominatorTree>();
+
+ tempInstructionIndex = 0;
+
+#if 0
+ std::cerr << "### original:" << std::endl;
+ F.viewCFG();
+#endif
+
+ bool changed = ProcessFunction(F);
+#ifdef DUMP_RESULT_CFG
+ FunctionPass* cfgPrinter = createCFGOnlyPrinterPass();
+ cfgPrinter->runOnFunction(F);
+#endif
+
+#if 0
+ std::cerr << "### after:" << std::endl;
+ F.viewCFG();
+#endif
+
+ // Redirect operands to a dominating copy where the definition they
+ // currently refer to does not dominate the use.
+ changed |= fixUndominatedVariableUses(DT, F);
+
+#if 0
+ /* Split large BBs so we can print the Dot without it crashing. */
+ bool fchanged = false;
+ const int MAX_INSTRUCTIONS_PER_BB = 70;
+ do {
+ fchanged = false;
+ for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
+ BasicBlock *b = i;
+
+ if (b->size() > MAX_INSTRUCTIONS_PER_BB + 1)
+ {
+ int count = 0;
+ BasicBlock::iterator splitPoint = b->begin();
+ while (count < MAX_INSTRUCTIONS_PER_BB || isa<PHINode>(splitPoint))
+ {
+ ++splitPoint;
+ ++count;
+ }
+ SplitBlock(b, splitPoint, this);
+ fchanged = true;
+ break;
+ }
+ }
+
+ } while (fchanged);
+
+ F.viewCFG();
+#endif
+ // Drop the per-function bookkeeping so it does not leak into the next
+ // function handled by this pass object.
+ contextArrays.clear();
+ tempInstructionIds.clear();
+
+ return changed;
+}
+
+// Wraps the code between entryBB and exitBB into one work item loop level.
+//
+// Parameters:
+//   region          - parallel region the loop belongs to; it receives the
+//                     parallel loop metadata node created here.
+//   entryBB         - first block of the loop body (renamed to
+//                     "pregion.for.body").
+//   exitBB          - last block of the loop body; its terminator must have
+//                     exactly one successor.
+//   peeledFirst     - if true, the loop counter starts from the value stored
+//                     in localIdXFirstVar (which is then reset to 0) instead
+//                     of from 0.
+//   localIdVar      - the local id variable used as the loop counter; must
+//                     not be NULL.
+//   LocalSizeForDim - constant trip count, used only if lsizeDim is NULL.
+//   addIncBlock     - if true, AppendIncBlock(exitBB, localIdVar) is called.
+//                     NOTE(review): presumably this adds the "for.inc" block
+//                     shown below -- AppendIncBlock is defined elsewhere.
+//   lsizeDim        - if not NULL, the value the counter is compared with.
+//
+// Returns the pair (for.init block, for.end block), i.e. the new entry and
+// exit of the loop, which can be passed on to wrap another loop level.
+std::pair<llvm::BasicBlock *, llvm::BasicBlock *>
+WorkitemLoops::CreateLoopAround
+(ParallelRegion &region,
+ llvm::BasicBlock *entryBB, llvm::BasicBlock *exitBB,
+ bool peeledFirst, llvm::Value *localIdVar, size_t LocalSizeForDim,
+ bool addIncBlock, llvm::Instruction *lsizeDim)
+{
+ assert (localIdVar != NULL);
+
+ /*
+
+ Generate a structure like this for each loop level (x,y,z):
+
+ for.init:
+
+ ; if peeledFirst is false:
+ store i32 0, i32* %_local_id_x, align 4
+
+ ; if peeledFirst is true (assume the 0,0,0 iteration has been executed earlier)
+ ; assume _local_id_x_first is initialized to 1 in the peeled pregion copy
+ store _local_id_x_first, i32* %_local_id_x, align 4
+ store i32 0, %_local_id_x_first
+
+ br label %for.body
+
+ for.body:
+
+ ; the parallel region code here
+
+ br label %for.inc
+
+ for.inc:
+
+ ; Separated inc and cond check blocks for easier loop unrolling later on.
+ ; Can then chain N times for.body+for.inc to unroll.
+
+ %2 = load i32* %_local_id_x, align 4
+ %inc = add nsw i32 %2, 1
+
+ store i32 %inc, i32* %_local_id_x, align 4
+ br label %for.cond
+
+ for.cond:
+
+ ; loop header, compare the id to the local size
+ %0 = load i32* %_local_id_x, align 4
+ %cmp = icmp ult i32 %0, i32 123
+ br i1 %cmp, label %for.body, label %for.end
+
+ for.end:
+
+ OPTIMIZE: Use a separate iteration variable across all the loops to iterate the context
+ data arrays to avoid needing multiplications to find the correct location, and to
+ enable easy vectorization of loading the context data when there are parallel iterations.
+ */
+
+ llvm::BasicBlock *loopBodyEntryBB = entryBB;
+ llvm::LLVMContext &C = loopBodyEntryBB->getContext();
+ llvm::Function *F = loopBodyEntryBB->getParent();
+ loopBodyEntryBB->setName("pregion.for.body");
+
+ assert (exitBB->getTerminator()->getNumSuccessors() == 1);
+
+ /* The block the region originally continued to; the loop end block
+ branches there. */
+ llvm::BasicBlock *oldExit = exitBB->getTerminator()->getSuccessor(0);
+
+ llvm::BasicBlock *forInitBB =
+ BasicBlock::Create(C, "pregion.for.init", F, loopBodyEntryBB);
+
+ llvm::BasicBlock *loopEndBB =
+ BasicBlock::Create(C, "pregion.for.end", F, exitBB);
+
+ llvm::BasicBlock *forCondBB =
+ BasicBlock::Create(C, "pregion.for.cond", F, exitBB);
+
+ /* Recompute the dominator tree; it is queried below. */
+ DT->runOnFunction(*F);
+
+ // F->viewCFG();
+ /* Fix the old edges jumping to the region to jump to the basic block
+ that starts the created loop. Back edges should still point to the
+ old basic block so we preserve the old loops. */
+ /* The predecessors are copied first because the terminators are
+ rewritten while walking over them. */
+ BasicBlockVector preds;
+ llvm::pred_iterator PI =
+ llvm::pred_begin(entryBB),
+ E = llvm::pred_end(entryBB);
+
+ for (; PI != E; ++PI)
+ {
+ llvm::BasicBlock *bb = *PI;
+ preds.push_back(bb);
+ }
+
+ for (BasicBlockVector::iterator i = preds.begin();
+ i != preds.end(); ++i)
+ {
+ llvm::BasicBlock *bb = *i;
+ /* Do not fix loop edges inside the region. The loop
+ is replicated as a whole to the body of the wi-loop.*/
+ if (DT->dominates(loopBodyEntryBB, bb))
+ continue;
+ bb->getTerminator()->replaceUsesOfWith(loopBodyEntryBB, forInitBB);
+ }
+
+ IRBuilder<> builder(forInitBB);
+
+ /* for.init: set the start value of the loop counter. */
+ if (peeledFirst)
+ {
+ builder.CreateStore(builder.CreateLoad(localIdXFirstVar), localIdVar);
+ builder.CreateStore
+ (ConstantInt::get(IntegerType::get(C, size_t_width), 0), localIdXFirstVar);
+ }
+ else
+ {
+ builder.CreateStore
+ (ConstantInt::get(IntegerType::get(C, size_t_width), 0), localIdVar);
+ }
+
+ builder.CreateBr(loopBodyEntryBB);
+
+ /* The body now continues to the condition check, not to the old exit. */
+ exitBB->getTerminator()->replaceUsesOfWith(oldExit, forCondBB);
+ if (addIncBlock)
+ {
+ AppendIncBlock(exitBB, localIdVar);
+ }
+
+ builder.SetInsertPoint(forCondBB);
+
+ /* for.cond: unsigned compare of the counter against either the constant
+ LocalSizeForDim or the lsizeDim value. */
+ llvm::Value *cmpResult;
+ if (lsizeDim == NULL)
+ {
+ cmpResult =
+ builder.CreateICmpULT
+ (builder.CreateLoad(localIdVar),
+ (ConstantInt::get
+ (IntegerType::get(C, size_t_width),
+ LocalSizeForDim))
+ );
+ }
+ else
+ {
+ cmpResult =
+ builder.CreateICmpULT
+ (builder.CreateLoad(localIdVar),
+ lsizeDim
+ );
+ }
+
+ Instruction *loopBranch =
+ builder.CreateCondBr(cmpResult, loopBodyEntryBB, loopEndBB);
+
+ /* Add the metadata to mark a parallel loop. The metadata
+ refer to a loop-unique dummy metadata that is not merged
+ automatically. */
+
+ /* This creation of the identifier metadata is copied from
+ LLVM's MDBuilder::createAnonymousTBAARoot(). */
+ MDNode *Dummy = MDNode::getTemporary(C, ArrayRef<Value*>());
+ MDNode *Root = MDNode::get(C, Dummy);
+ // At this point we have
+ // !0 = metadata !{} <- dummy
+ // !1 = metadata !{metadata !0} <- root
+ // Replace the dummy operand with the root node itself and delete the dummy.
+ Root->replaceOperandWith(0, Root);
+ MDNode::deleteTemporary(Dummy);
+ // We now have
+ // !1 = metadata !{metadata !1} <- self-referential root
+
+ loopBranch->setMetadata("llvm.loop.parallel", Root);
+ region.AddParallelLoopMetadata(Root);
+
+ /* for.end: continue where the region originally went. */
+ builder.SetInsertPoint(loopEndBB);
+ builder.CreateBr(oldExit);
+
+ return std::make_pair(forInitBB, loopEndBB);
+}
+
+/**
+ * Looks up the parallel region that holds the given basic block.
+ *
+ * @param bb the block to search for.
+ * @return the first region in original_parallel_regions for which
+ *         HasBlock(bb) is true, or NULL when no region contains it.
+ */
+ParallelRegion*
+WorkitemLoops::RegionOfBlock(llvm::BasicBlock *bb)
+{
+  typedef ParallelRegion::ParallelRegionVector::iterator RegionIter;
+
+  RegionIter cur = original_parallel_regions->begin();
+  const RegionIter last = original_parallel_regions->end();
+  while (cur != last)
+    {
+      ParallelRegion *candidate = *cur;
+      if (candidate->HasBlock(bb))
+        return candidate;
+      ++cur;
+    }
+  return NULL;
+}
+
+// Pre-analysis of the kernel function (borrowed from wga):
+//  - finds the number of dimensions in use (maxDim) from the constant
+//    arguments of the get_local_id / get_global_id calls in F, and
+//  - pre-creates the get_local_size calls (lsizeX/Y/Z) for those
+//    dimensions, as the local sizes are work-group invariant. The calls are
+//    only created here, not inserted into any basic block.
+void WorkitemLoops::FindKernelDim(Function &F)
+{
+  maxDim = 1;
+
+  for (inst_iterator it = inst_begin(&F), end = inst_end(&F); it != end; ++it)
+    {
+      CallInst *call = dyn_cast<CallInst>(&*it);
+      if (call == NULL)
+        continue;
+
+      Function *callee = call->getCalledFunction();
+      if (callee == NULL)
+        continue;
+
+      std::string calleeName(callee->getName());
+      if (calleeName != "get_local_id" && calleeName != "get_global_id")
+        continue;
+
+      ConstantInt *dimArg = dyn_cast<ConstantInt>(call->getArgOperand(0));
+      if (dimArg == NULL)
+        {
+          /*-------------------------------------------------------------
+           * The dimension is not a compile time constant: assume the
+           * worst case, all 3 loop levels, and stop scanning.
+           *------------------------------------------------------------*/
+          maxDim = 3;
+          break;
+        }
+
+      /* Clamp the index to the last valid dimension, then grow maxDim if
+         this call refers to a higher dimension than seen so far. */
+      unsigned int dimIdx = dimArg->getSExtValue();
+      if (MAX_DIMENSIONS-1 < dimIdx)
+        dimIdx = MAX_DIMENSIONS-1;
+      if (maxDim < dimIdx + 1)
+        maxDim = dimIdx + 1;
+    }
+
+  llvm::Module *M = F.getParent();
+  llvm::Type *Int32 = IntegerType::get(M->getContext(), 32);
+  FunctionType *ft = FunctionType::get
+    (/*Result=*/   Int32,
+     /*Params=*/   Int32,
+     /*isVarArg=*/ false);
+  Function *f_localsize =
+    dyn_cast<Function>(M->getOrInsertFunction("get_local_size", ft));
+
+  /* One call per dimension in use; the Y and Z members are left untouched
+     when their dimension is not needed. */
+  Value *dimX = ConstantInt::get(Int32, 0);
+  lsizeX = CallInst::Create(f_localsize, ArrayRef<Value *>(dimX));
+  if (maxDim > 1)
+    {
+      Value *dimY = ConstantInt::get(Int32, 1);
+      lsizeY = CallInst::Create(f_localsize, ArrayRef<Value *>(dimY));
+    }
+  if (maxDim > 2)
+    {
+      Value *dimZ = ConstantInt::get(Int32, 2);
+      lsizeZ = CallInst::Create(f_localsize, ArrayRef<Value *>(dimZ));
+    }
+}
+
+bool
+WorkitemLoops::ProcessFunction(Function &F)
+{
+ Kernel *K = cast<Kernel> (&F);
+ Initialize(K);
+
+#if 0 // TODO: do something for reqd_work_group_size
+ unsigned workItemCount = LocalSizeX*LocalSizeY*LocalSizeZ;
+ if (workItemCount == 1)
+ {
+ K->addLocalSizeInitCode(LocalSizeX, LocalSizeY, LocalSizeZ);
+ ParallelRegion::insertLocalIdInit(&F.getEntryBlock(), 0, 0, 0);
+ return true;
+ }
+#endif
+
+ FindKernelDim(F);
+
+ original_parallel_regions =
+ K->getParallelRegions(LI);
+
+ IRBuilder<> builder(F.getEntryBlock().getFirstInsertionPt());
+ localIdXFirstVar =
+ builder.CreateAlloca
+ (IntegerType::get(F.getContext(), size_t_width), 0, ".pocl.local_id_x_init");
+
+ // F.viewCFGOnly();
+
+#if 0
+ std::cerr << "### Original" << std::endl;
+ F.viewCFG();
+#endif
+
+#if 0
+ for (ParallelRegion::ParallelRegionVector::iterator
+ i = original_parallel_regions->begin(),
+ e = original_parallel_regions->end();
+ i != e; ++i)
+ {
+ ParallelRegion *region = (*i);
+ region->InjectRegionPrintF();
+ region->InjectVariablePrintouts();
+ }
+#endif
+
+ /* Count how many parallel regions share each entry node to
+ detect diverging regions that need to be peeled. */
+ std::map<llvm::BasicBlock*, int> entryCounts;
+
+ for (ParallelRegion::ParallelRegionVector::iterator
+ i = original_parallel_regions->begin(),
+ e = original_parallel_regions->end();
+ i != e; ++i)
+ {
+ ParallelRegion *region = (*i);
+#ifdef DEBUG_WORK_ITEM_LOOPS
+ std::cerr << "### Adding context save/restore for PR: ";
+ region->dumpNames();
+#endif
+ FixMultiRegionVariables(region);
+ entryCounts[region->entryBB()]++;
+ }
+
+#if 0
+ std::cerr << "### After context code addition:" << std::endl;
+ F.viewCFG();
+#endif
+ std::map<ParallelRegion*, bool> peeledRegion;
+ for (ParallelRegion::ParallelRegionVector::iterator
+ i = original_parallel_regions->begin(),
+ e = original_parallel_regions->end();
+ i != e; ++i)
+ {
+
+ llvm::ValueToValueMapTy reference_map;
+ ParallelRegion *original = (*i);
+
+#ifdef DEBUG_WORK_ITEM_LOOPS
+ std::cerr << "### handling region:" << std::endl;
+ original->dumpNames();
+ //F.viewCFGOnly();
+#endif
+
+ /* In case of conditional barriers, the first iteration
+ has to be peeled so we know which branch to execute
+ with the work item loop. In case there are more than one
+ parallel region sharing an entry BB, it's a diverging
+ region.
+
+ Post dominance of entry by exit does not work in case the
+ region is inside a loop and the exit block is in the path
+ towards the loop exit (and the function exit).
+ */
+ bool peelFirst = entryCounts[original->entryBB()] > 1;
+
+ peeledRegion[original] = peelFirst;
+
+ std::pair<llvm::BasicBlock *, llvm::BasicBlock *> l;
+ // the original predecessor nodes of which successor
+ // should be fixed if not peeling
+ BasicBlockVector preds;
+
+ bool unrolled = false;
+ if (peelFirst)
+ {
+#ifdef DEBUG_WORK_ITEM_LOOPS
+ std::cerr << "### conditional region, peeling the first iteration" << std::endl;
+#endif
+ ParallelRegion *replica =
+ original->replicate(reference_map, ".peeled_wi");
+ replica->chainAfter(original);
+ replica->purge();
+
+ l = std::make_pair(replica->entryBB(), replica->exitBB());
+ }
+ else
+ {
+ llvm::pred_iterator PI =
+ llvm::pred_begin(original->entryBB()),
+ E = llvm::pred_end(original->entryBB());
+
+ for (; PI != E; ++PI)
+ {
+ llvm::BasicBlock *bb = *PI;
+ if (DT->dominates(original->entryBB(), bb) &&
+ (RegionOfBlock(original->entryBB()) ==
+ RegionOfBlock(bb)))
+ continue;
+ preds.push_back(bb);
+ }
+
+#if 0
+ int unrollCount;
+ if (getenv("POCL_WILOOPS_MAX_UNROLL_COUNT") != NULL)
+ unrollCount = atoi(getenv("POCL_WILOOPS_MAX_UNROLL_COUNT"));
+ else
+ unrollCount = 1;
+ /* Find a two's exponent unroll count, if available. */
+ while (unrollCount >= 1)
+ {
+ if (LocalSizeX % unrollCount == 0 &&
+ unrollCount <= LocalSizeX)
+ {
+ break;
+ }
+ unrollCount /= 2;
+ }
+
+ if (unrollCount > 1) {
+ ParallelRegion *prev = original;
+ llvm::BasicBlock *lastBB =
+ AppendIncBlock(original->exitBB(), localIdX);
+ original->AddBlockAfter(lastBB, original->exitBB());
+ original->SetExitBB(lastBB);
+
+ if (AddWIMetadata)
+ original->AddIDMetadata(F.getContext(), 0);
+
+ for (int c = 1; c < unrollCount; ++c)
+ {
+ ParallelRegion *unrolled =
+ original->replicate(reference_map, ".unrolled_wi");
+ unrolled->chainAfter(prev);
+ prev = unrolled;
+ lastBB = unrolled->exitBB();
+ if (AddWIMetadata)
+ unrolled->AddIDMetadata(F.getContext(), c);
+ }
+ unrolled = true;
+ l = std::make_pair(original->entryBB(), lastBB);
+ } else {
+ l = std::make_pair(original->entryBB(), original->exitBB());
+ }
+#else
+ l = std::make_pair(original->entryBB(), original->exitBB());
+#endif
+ }
+
+ l = CreateLoopAround(*original, l.first, l.second, peelFirst, localIdX,
+ LocalSizeX, true, lsizeX);
+ if (maxDim > 1)
+ l = CreateLoopAround(*original, l.first, l.second, false, localIdY,
+ LocalSizeY, true, lsizeY);
+ if (maxDim > 2)
+ l = CreateLoopAround(*original, l.first, l.second, false, localIdZ,
+ LocalSizeZ, true, lsizeZ);
+
+ /* Loop edges coming from another region mean B-loops which means
+ we have to fix the loop edge to jump to the beginning of the wi-loop
+ structure, not its body. This has to be done only for non-peeled
+ blocks as the semantics is correct in the other case (the jump is
+ to the beginning of the peeled iteration). */
+ if (!peelFirst)
+ {
+ for (BasicBlockVector::iterator i = preds.begin();
+ i != preds.end(); ++i)
+ {
+ llvm::BasicBlock *bb = *i;
+ bb->getTerminator()->replaceUsesOfWith
+ (original->entryBB(), l.first);
+ }
+ }
+ }
+
+ // for the peeled regions we need to add a prologue
+ // that initializes the local ids and the first iteration
+ // counter
+ for (ParallelRegion::ParallelRegionVector::iterator
+ i = original_parallel_regions->begin(),
+ e = original_parallel_regions->end();
+ i != e; ++i)
+ {
+ ParallelRegion *pr = (*i);
+
+ if (!peeledRegion[pr]) continue;
+ pr->insertPrologue(0, 0, 0);
+ builder.SetInsertPoint(pr->entryBB()->getFirstInsertionPt());
+ builder.CreateStore
+ (ConstantInt::get(IntegerType::get(F.getContext(), size_t_width), 1),
+ localIdXFirstVar);
+ }
+
+ // Creating lsize* values have been hoisted up
+ // K->addLocalSizeInitCode(LocalSizeX, LocalSizeY, LocalSizeZ);
+ llvm::Instruction *inspt = F.getEntryBlock().getFirstNonPHI();
+ inspt->getParent()->getInstList().insert(inspt, lsizeX);
+ if (maxDim > 1)
+ inspt->getParent()->getInstList().insert(inspt, lsizeY);
+ if (maxDim > 2)
+ inspt->getParent()->getInstList().insert(inspt, lsizeZ);
+ // llvm::GlobalVariable *gvx = M->getGlobalVariable("_local_size_x");
+ // llvm::GlobalVariable *gvy = M->getGlobalVariable("_local_size_y");
+ // llvm::GlobalVariable *gvz = M->getGlobalVariable("_local_size_z");
+ // llvm::Instruction *storex = new StoreInst(lsizeX, gvx, inspt);
+ // llvm::Instruction *storey = new StoreInst(lsizeY, gvy, inspt);
+ // llvm::Instruction *storez = new StoreInst(lsizeZ, gvz, inspt);
+
+
+ ParallelRegion::insertLocalIdInit(&F.getEntryBlock(), 0, 0, 0);
+
+#if 0
+ F.viewCFG();
+#endif
+
+ return true;
+}
+
+/*
+ * Context-saves every value that is defined inside the given region and
+ * has at least one user that lives outside of it in a block belonging to
+ * some parallel region.
+ *
+ * Works in three steps: index the instructions of the region, collect
+ * the definitions that escape it, and hand each collected definition to
+ * AddContextSaveRestore().
+ */
+void
+WorkitemLoops::FixMultiRegionVariables(ParallelRegion *region)
+{
+  InstructionIndex regionInstrs;
+  InstructionVec escaping;
+
+  /* Step 1: a lookup set of everything in the region, so the
+     "is this user inside the region?" question is cheap to answer. */
+  for (BasicBlockVector::iterator bbIt = region->begin();
+       bbIt != region->end(); ++bbIt)
+    {
+      llvm::BasicBlock *block = *bbIt;
+      for (llvm::BasicBlock::iterator it = block->begin();
+           it != block->end(); ++it)
+        {
+          llvm::Instruction *member = it;
+          regionInstrs.insert(member);
+        }
+    }
+
+  /* Step 2: pick the definitions that are used from another region. */
+  for (BasicBlockVector::iterator bbIt = region->begin();
+       bbIt != region->end(); ++bbIt)
+    {
+      llvm::BasicBlock *block = *bbIt;
+      for (llvm::BasicBlock::iterator it = block->begin();
+           it != block->end(); ++it)
+        {
+          llvm::Instruction *def = it;
+
+          if (ShouldNotBeContextSaved(def)) continue;
+
+          for (Instruction::use_iterator useIt = def->use_begin(),
+                 useEnd = def->use_end();
+               useIt != useEnd; ++useIt)
+            {
+              Instruction *user = dyn_cast<Instruction> (*useIt);
+              if (user == NULL) continue;
+              /* Users inside this very region need no context data. */
+              if (regionInstrs.find(user) != regionInstrs.end()) continue;
+              /* Users in regionless blocks (like the B-loop construct
+                 BBs) do not trigger a context save either. */
+              if (RegionOfBlock(user->getParent()) == NULL) continue;
+
+              /* One escaping use is enough to decide. */
+              escaping.push_back(def);
+              break;
+            }
+        }
+    }
+
+  /* Step 3: generate the save/restore code for the collected values. */
+  for (InstructionVec::iterator fixIt = escaping.begin();
+       fixIt != escaping.end(); ++fixIt)
+    {
+      llvm::Instruction *toFix = *fixIt;
+#ifdef DEBUG_WORK_ITEM_LOOPS
+      std::cerr << "### adding context/save restore for" << std::endl;
+      toFix->dump();
+#endif
+      AddContextSaveRestore(toFix);
+    }
+}
+
+/*
+ * Emits a store that writes the value produced by `instruction` into the
+ * element of the context array `alloca` selected by the linearized local id.
+ *
+ * Returns the created store, or NULL when `instruction` is itself an
+ * alloca (nothing is stored in that case).
+ */
+llvm::Instruction *
+WorkitemLoops::AddContextSave
+(llvm::Instruction *instruction, llvm::Instruction *alloca)
+{
+  /* If the variable to be context saved is itself an alloca,
+     we have created one big alloca that stores the data of all the
+     work-items and return pointers to that array. Thus, we need
+     no initialization code other than the context data alloca itself. */
+  if (isa<AllocaInst>(instruction))
+    return NULL;
+
+  /* The store is placed right after the definition; any PHI nodes that
+     follow the definition are skipped first. */
+  BasicBlock::iterator insertPos = instruction;
+  do
+    {
+      ++insertPos;
+    }
+  while (isa<PHINode>(insertPos));
+
+  IRBuilder<> builder(insertPos);
+
+  /* The id loads are taken from the region that contains the definition. */
+  ParallelRegion *region = RegionOfBlock(instruction->getParent());
+  assert ("Adding context save outside any region produces illegal code." &&
+          region != NULL);
+
+  /* Linearized element index:
+     idz * sizey * sizex + idy * sizex + idx */
+  llvm::Value *slot = region->LocalIDXLoad();
+  if (maxDim > 1)
+    {
+      llvm::Value *rowOffset =
+        builder.CreateMul(region->LocalIDYLoad(), lsizeX);
+      slot = builder.CreateAdd(slot, rowOffset);
+    }
+  if (maxDim > 2)
+    {
+      llvm::Value *idZ = region->LocalIDZLoad();
+      llvm::Value *planeSize = builder.CreateMul(lsizeY, lsizeX);
+      slot = builder.CreateAdd(slot, builder.CreateMul(idZ, planeSize));
+    }
+
+  std::vector<llvm::Value *> gepArgs(1, slot);
+  llvm::Value *element = builder.CreateGEP(alloca, gepArgs);
+  return builder.CreateStore(instruction, element);
+}
+
+/*
+ * Emits code that addresses (and normally loads) the context array element
+ * selected by the linearized local id.
+ *
+ * val      - value the restore is generated for; only used to pick the
+ *            insertion point when `before` is NULL, in which case it must
+ *            be an Instruction.
+ * alloca   - the context array to read from.
+ * before   - instruction in front of which the code is inserted; when NULL
+ *            the code is inserted in front of `val`.
+ * isAlloca - true when the context-saved instruction was an alloca; the
+ *            element address is then returned instead of a load from it.
+ *
+ * Returns the GEP into the context array when isAlloca is set, otherwise
+ * the load of the addressed element.
+ */
+llvm::Instruction *
+WorkitemLoops::AddContextRestore
+(llvm::Value *val, llvm::Instruction *alloca, llvm::Instruction *before,
+ bool isAlloca)
+{
+  assert (val != NULL);
+  IRBuilder<> builder(alloca);
+  if (before == NULL)
+    {
+      /* No explicit location given: restore right in front of the value,
+         which is only possible if the value is an instruction. */
+      assert (isa<Instruction>(val) && "Unknown context restore location!");
+      before = cast<Instruction>(val);
+    }
+  builder.SetInsertPoint(before);
+
+  /* The id loads are taken from the region that contains the insertion
+     point. */
+  ParallelRegion *region = RegionOfBlock(before->getParent());
+  assert ("Adding context restore outside any region produces illegal code." &&
+          region != NULL);
+
+  /* Linearized element index:
+     idz * _local_size_x * _local_size_y + idy * _local_size_x + idx */
+  llvm::Value *linear_index = region->LocalIDXLoad();
+  if (maxDim > 1)
+    linear_index = builder.CreateAdd(linear_index,
+                                     builder.CreateMul(region->LocalIDYLoad(),
+                                                       lsizeX) );
+  if (maxDim > 2)
+    linear_index = builder.CreateAdd(linear_index,
+                                     builder.CreateMul(region->LocalIDZLoad(),
+                                         builder.CreateMul(lsizeY, lsizeX)) );
+
+  std::vector<llvm::Value *> gepArgs;
+  gepArgs.push_back(linear_index);
+
+  /* cast<> instead of dyn_cast<>: the result is used unconditionally, so a
+     non-instruction result should be caught by the cast's own assertion
+     rather than silently turning into a NULL pointer. */
+  llvm::Instruction *gep =
+    cast<Instruction>(builder.CreateGEP(alloca, gepArgs));
+
+  if (isAlloca) {
+    /* In case the context saved instruction was an alloca, we created a
+       context array with pointed-to elements, and now want to return a pointer
+       to the elements to emulate the original alloca. */
+    return gep;
+  }
+  return builder.CreateLoad(gep);
+}
+
+/**
+ * Returns the context array (alloca) for the given instruction, creating
+ * it on the first request.
+ *
+ * Arrays are cached in contextArrays under the key
+ * ".<name>.pocl_context", where <name> is the instruction's own name or,
+ * for unnamed instructions, a running integer remembered in
+ * tempInstructionIds.
+ *
+ * A new array is allocated at the first insertion point of the entry block
+ * of the function containing the instruction. Its element count is the
+ * product of lsizeX/lsizeY/lsizeZ for the dimensions in use (maxDim).
+ */
+llvm::Instruction *
+WorkitemLoops::GetContextArray(llvm::Instruction *instruction)
+{
+  std::ostringstream var;
+  var << ".";
+
+  if (!instruction->getName().empty())
+    {
+      var << instruction->getName().str();
+    }
+  else
+    {
+      /* Unnamed temp instructions need a generated name for the
+         context array. Create one using a running integer. */
+      if (tempInstructionIds.find(instruction) == tempInstructionIds.end())
+        tempInstructionIds[instruction] = tempInstructionIndex++;
+      var << tempInstructionIds[instruction];
+    }
+
+  var << ".pocl_context";
+  std::string varName = var.str();
+
+  StrInstructionMap::iterator existing = contextArrays.find(varName);
+  if (existing != contextArrays.end())
+    return existing->second;
+
+  IRBuilder<> builder(instruction->getParent()->getParent()->getEntryBlock().getFirstInsertionPt());
+
+  llvm::Type *elementType;
+  if (isa<AllocaInst>(instruction))
+    {
+      /* If the variable to be context saved was itself an alloca,
+         create one big alloca that stores the data of all the
+         work-items and directly return pointers to that array.
+         This enables moving all the allocas to the entry node without
+         breaking the parallel loop.
+         Otherwise we would rely on a dynamic alloca to allocate
+         unique stack space to all the work-items when its wiloop
+         iteration is executed. */
+      /* NOTE(review): only the pointee type is read here; an array-size
+         operand on the original alloca, if present, is not considered. */
+      elementType =
+        cast<AllocaInst>(instruction)->getType()->getElementType();
+    }
+  else
+    {
+      elementType = instruction->getType();
+    }
+
+  /* The element count is computed at run time from the local size
+     values of the dimensions in use. */
+  llvm::Value *wgsize = lsizeX;
+  if (maxDim > 1) wgsize = builder.CreateMul(wgsize, lsizeY);
+  if (maxDim > 2) wgsize = builder.CreateMul(wgsize, lsizeZ);
+  llvm::Instruction *alloca =
+    builder.CreateAlloca(elementType, wgsize, varName);
+
+  contextArrays[varName] = alloca;
+  return alloca;
+}
+
+
+/**
+ * Adds context save/restore code for the value produced by the
+ * given instruction.
+ *
+ * The value is stored to its context array right after its definition
+ * (see AddContextSave) and every user located in a block that belongs to
+ * a parallel region is rewritten to use a freshly restored copy
+ * (see AddContextRestore).
+ *
+ * TODO: add only one restore per variable per region.
+ * TODO: add only one load of the id variables per region.
+ * Could be done by having a context restore BB in the beginning of the
+ * region and a context save BB at the end.
+ * TODO: ignore work group variables completely (the iteration variables)
+ * The LLVM should optimize these away but it would improve
+ * the readability of the output during debugging.
+ * TODO: rematerialize some values such as extended values of global
+ * variables (especially global id which is computed from local id) or kernel
+ * argument values instead of allocating stack space for them
+ */
+void
+WorkitemLoops::AddContextSaveRestore
+(llvm::Instruction *instruction) {
+
+ /* Allocate the context data array for the variable. */
+ llvm::Instruction *alloca = GetContextArray(instruction);
+ /* theStore is NULL when the instruction is an alloca, as AddContextSave
+ emits no store in that case. */
+ llvm::Instruction *theStore = AddContextSave(instruction, alloca);
+
+ InstructionVec uses;
+ /* Restore the produced variable before each use to ensure the correct context
+ copy is used.
+
+ We could add the restore only to other regions outside the
+ variable defining region and use the original variable in the defining
+ region due to the SSA virtual registers being unique. However,
+ alloca variables can be redefined also in the same region, thus we
+ need to ensure the correct alloca context position is written, not
+ the original unreplicated one. These variables can be generated by
+ volatile variables, private arrays, and due to the PHIs to allocas
+ pass.
+ */
+
+ /* Find out the uses to fix first as fixing them invalidates
+ the iterator. */
+ /* NOTE(review): a user that shows up several times in the use list is
+ pushed once per occurrence, so it would get one restore per occurrence
+ below - confirm whether that can happen and whether it matters. */
+ for (Instruction::use_iterator ui = instruction->use_begin(),
+ ue = instruction->use_end();
+ ui != ue; ++ui)
+ {
+ Instruction *user;
+ if ((user = dyn_cast<Instruction> (*ui)) == NULL) continue;
+ /* The context save store itself must keep using the original value. */
+ if (user == theStore) continue;
+ uses.push_back(user);
+ }
+
+ for (InstructionVec::iterator i = uses.begin(); i != uses.end(); ++i)
+ {
+ Instruction *user = *i;
+ /* By default the restore code goes right in front of the user. */
+ Instruction *contextRestoreLocation = user;
+ /* If the user is in a block that doesn't belong to a region,
+ the variable itself must be a "work group variable", that is,
+ not dependent on the work item. Most likely an iteration
+ variable of a for loop with a barrier. */
+ if (RegionOfBlock(user->getParent()) == NULL) continue;
+
+ PHINode* phi = dyn_cast<PHINode>(user);
+ if (phi != NULL)
+ {
+ /* In case of PHI nodes, we cannot just insert the context
+ restore code before it in the same basic block because it is
+ assumed there are no non-phi Instructions before PHIs which
+ the context restore code constitutes to. Add the context
+ restore to the incomingBB instead.
+
+ There can be values in the PHINode that are incoming
+ from another region even though the decision BB is within the region.
+ For those values we need to add the context restore code in the
+ incoming BB (which is known to be inside the region due to the
+ assumption of not having to touch PHI nodes in PRentry BBs).
+ */
+
+ /* PHINodes at region entries are broken down earlier. */
+ assert ("Cannot add context restore for a PHI node at the region entry!" &&
+ RegionOfBlock(phi->getParent())->entryBB() != phi->getParent());
+#ifdef DEBUG_WORK_ITEM_LOOPS
+ std::cerr << "### adding context restore code before PHI" << std::endl;
+ user->dump();
+ std::cerr << "### in BB:" << std::endl;
+ user->getParent()->dump();
+#endif
+ /* Look up the incoming block through which the value reaches the PHI.
+ NOTE(review): the loop does not stop at the first match, so if the
+ value is incoming from several blocks only the last one is used. */
+ BasicBlock *incomingBB = NULL;
+ for (unsigned incoming = 0; incoming < phi->getNumIncomingValues();
+ ++incoming)
+ {
+ Value *val = phi->getIncomingValue(incoming);
+ BasicBlock *bb = phi->getIncomingBlock(incoming);
+ if (val == instruction) incomingBB = bb;
+ }
+ assert (incomingBB != NULL);
+ /* Restore at the end of the incoming block, before its terminator. */
+ contextRestoreLocation = incomingBB->getTerminator();
+ }
+ /* For a context-saved alloca the returned value is the address of the
+ context array element, otherwise it is a load of that element. */
+ llvm::Value *loadedValue =
+ AddContextRestore
+ (user, alloca, contextRestoreLocation, isa<AllocaInst>(instruction));
+ user->replaceUsesOfWith(instruction, loadedValue);
+#ifdef DEBUG_WORK_ITEM_LOOPS
+ std::cerr << "### done, the user was converted to:" << std::endl;
+ user->dump();
+#endif
+ }
+}
+
+/*
+ * Tells whether the given instruction must be left out of context saving.
+ * That is the case exactly for loads whose pointer operand is one of the
+ * local id variables (localIdX, localIdY, localIdZ).
+ */
+bool
+WorkitemLoops::ShouldNotBeContextSaved(llvm::Instruction *instr)
+{
+  /*
+    _local_id loads should not be replicated as it leads to
+    problems in conditional branch case where the header node
+    of the region is shared across the branches and thus the
+    header node's ID loads might get context saved which leads
+    to egg-chicken problems.
+  */
+  llvm::LoadInst *idLoad = dyn_cast<llvm::LoadInst>(instr);
+  if (idLoad == NULL)
+    return false;
+
+  llvm::Value *source = idLoad->getPointerOperand();
+  return source == localIdZ || source == localIdY || source == localIdX;
+}
+
+/*
+ * Creates a new basic block "pregion.for.inc" that increments the variable
+ * pointed to by localIdVar by one, and splices it between `after` and the
+ * first successor of `after`. Returns the new block.
+ */
+llvm::BasicBlock *
+WorkitemLoops::AppendIncBlock
+(llvm::BasicBlock* after, llvm::Value *localIdVar)
+{
+  llvm::LLVMContext &ctx = after->getContext();
+
+  llvm::BasicBlock *successor = after->getTerminator()->getSuccessor(0);
+  assert (successor != NULL);
+
+  llvm::BasicBlock *incBlock =
+    BasicBlock::Create(ctx, "pregion.for.inc", after->getParent());
+
+  /* Redirect the edge after -> successor to go through the new block. */
+  after->getTerminator()->replaceUsesOfWith(successor, incBlock);
+
+  /* Fill the new block: load, add one, store back, jump on. */
+  IRBuilder<> builder(incBlock);
+  llvm::Value *current = builder.CreateLoad(localIdVar);
+  llvm::Value *one = ConstantInt::get(IntegerType::get(ctx, size_t_width), 1);
+  builder.CreateStore(builder.CreateAdd(current, one), localIdVar);
+
+  builder.CreateBr(successor);
+
+  return incBlock;
+}
diff --git a/src/llvmopencl/WorkitemLoops.h b/src/llvmopencl/WorkitemLoops.h
new file mode 100644
index 0000000..aac4cfa
--- /dev/null
+++ b/src/llvmopencl/WorkitemLoops.h
@@ -0,0 +1,112 @@
+// Header for WorkitemLoops function pass.
+//
+// Copyright (c) 2012 Pekka Jääskeläinen / TUT
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_WORKITEM_LOOPS_H
+#define _POCL_WORKITEM_LOOPS_H
+
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <map>
+#include <vector>
+#include "WorkitemHandler.h"
+#include "ParallelRegion.h"
+
+#define MAX_DIMENSIONS 3u
+
+namespace llvm {
+ class PostDominatorTree;
+}
+
+namespace pocl {
+ class Workgroup;
+
+ // Function pass, derived from pocl::WorkitemHandler, that wraps parallel
+ // regions in work-item loops and adds context save/restore code for values
+ // used across regions.
+ // NOTE(review): this class uses std::set and std::string while the header
+ // itself only includes <map> and <vector> from the standard library -
+ // confirm the other headers arrive transitively.
+ class WorkitemLoops : public pocl::WorkitemHandler {
+
+ public:
+ // Pass identification.
+ static char ID;
+
+ WorkitemLoops() : pocl::WorkitemHandler(ID) {}
+
+ virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+ virtual bool runOnFunction(llvm::Function &F);
+
+ private:
+
+ typedef std::vector<llvm::BasicBlock *> BasicBlockVector;
+ typedef std::set<llvm::Instruction* > InstructionIndex;
+ typedef std::vector<llvm::Instruction* > InstructionVec;
+ typedef std::map<std::string, llvm::Instruction*> StrInstructionMap;
+
+ InstructionIndex workGroupVariables;
+
+ llvm::DominatorTree *DT;
+ llvm::LoopInfo *LI;
+ llvm::PostDominatorTree *PDT;
+
+ ParallelRegion::ParallelRegionVector *original_parallel_regions;
+
+ // Context arrays created so far, keyed by their generated name
+ // (".<name>.pocl_context"); filled by GetContextArray().
+ StrInstructionMap contextArrays;
+
+ virtual bool ProcessFunction(llvm::Function &F);
+
+ // Context-saves the values defined in the region that are used from
+ // another region.
+ void FixMultiRegionVariables(ParallelRegion *region);
+ // Adds the save after the definition and a restore for each user that is
+ // located inside a parallel region.
+ void AddContextSaveRestore(llvm::Instruction *instruction);
+
+ // Stores the value into its context array element; returns the store, or
+ // NULL when the instruction is an alloca.
+ llvm::Instruction *AddContextSave(llvm::Instruction *instruction, llvm::Instruction *alloca);
+ // Addresses the context array element in front of 'before' (or in front
+ // of 'val' when 'before' is NULL). Returns the element address when
+ // isAlloca is set, otherwise a load of the element.
+ llvm::Instruction *AddContextRestore
+ (llvm::Value *val, llvm::Instruction *alloca,
+ llvm::Instruction *before=NULL,
+ bool isAlloca=false);
+ // Returns the context array for the instruction, creating it if needed.
+ llvm::Instruction *GetContextArray(llvm::Instruction *val);
+
+ std::pair<llvm::BasicBlock *, llvm::BasicBlock *>
+ CreateLoopAround
+ (ParallelRegion &region, llvm::BasicBlock *entryBB, llvm::BasicBlock *exitBB,
+ bool peeledFirst, llvm::Value *localIdVar, size_t LocalSizeForDim,
+ bool addIncBlock=true, llvm::Instruction *lsizeDim=NULL);
+ void FindKernelDim(llvm::Function &F);
+
+ // Inserts a "pregion.for.inc" block after 'after' that increments the
+ // variable pointed to by localIdVar by one; returns the new block.
+ llvm::BasicBlock *
+ AppendIncBlock
+ (llvm::BasicBlock* after,
+ llvm::Value *localIdVar);
+
+ ParallelRegion* RegionOfBlock(llvm::BasicBlock *bb);
+
+ // True for loads of the local id variables, which are never context saved.
+ bool ShouldNotBeContextSaved(llvm::Instruction *instr);
+
+ // Running integer ids handed out to unnamed instructions when a name for
+ // their context array is generated (see GetContextArray).
+ std::map<llvm::Instruction*, unsigned> tempInstructionIds;
+ size_t tempInstructionIndex;
+ // An alloca in the kernel which stores the first iteration to execute
+ // in the inner (dimension 0) loop. This is set to 1 in a peeled iteration
+ // to skip the 0, 0, 0 iteration in the loops.
+ llvm::Value *localIdXFirstVar;
+
+ // Number of dimensions taken into account when linearizing the context
+ // array index and when sizing the context arrays.
+ unsigned int maxDim;
+ // Per-dimension size values used as multipliers in the context array
+ // index computation and as the context array element count.
+ llvm::Instruction *lsizeX, *lsizeY, *lsizeZ;
+ };
+}
+
+#endif
diff --git a/src/llvmopencl/WorkitemReplication.cc b/src/llvmopencl/WorkitemReplication.cc
new file mode 100644
index 0000000..b6ea3cd
--- /dev/null
+++ b/src/llvmopencl/WorkitemReplication.cc
@@ -0,0 +1,308 @@
+// LLVM function pass to replicate the kernel body for all work items
+// in a work group.
+//
+// Copyright (c) 2011-2012 Carlos Sánchez de La Lama / URJC and
+// Pekka Jääskeläinen / TUT
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#define DEBUG_TYPE "workitem"
+
+#include "WorkitemReplication.h"
+#include "Workgroup.h"
+#include "Barrier.h"
+#include "Kernel.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "config.h"
+#ifdef LLVM_3_1
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/ValueSymbolTable.h"
+#elif defined LLVM_3_2
+#include "llvm/IRBuilder.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/ValueSymbolTable.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#endif
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "WorkitemHandlerChooser.h"
+
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <vector>
+
+//#define DEBUG_BB_MERGING
+//#define DUMP_RESULT_CFG
+//#define DEBUG_PR_REPLICATION
+
+#ifdef DUMP_RESULT_CFG
+#include "llvm/Analysis/CFGPrinter.h"
+#endif
+
+using namespace llvm;
+using namespace pocl;
+
+// Pass statistics; both counters are updated in ProcessFunction() while it
+// looks for values that are used outside of their defining region.
+STATISTIC(ContextValues, "Number of SSA values which have to be context-saved");
+STATISTIC(ContextSize, "Context size per workitem in bytes");
+
+// Registers the pass under the command line name "workitemrepl".
+namespace {
+ static
+ RegisterPass<WorkitemReplication> X("workitemrepl", "Workitem replication pass");
+}
+
+// Pass identification.
+char WorkitemReplication::ID = 0;
+
+// Declares the analyses this pass depends on: the dominator tree, loop info
+// and the work-item handler chooser.
+void
+WorkitemReplication::getAnalysisUsage(AnalysisUsage &AU) const
+{
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<LoopInfo>();
+
+// TODO - removed due to compilation error
+// NOTE(review): ProcessFunction() still calls getAnalysis<TargetData>() /
+// getAnalysis<DataLayout>() although the matching requirement is disabled
+// here - confirm that this combination works as intended.
+#if 0
+#ifdef LLVM_3_1
+ AU.addRequired<TargetData>();
+#else
+ AU.addRequired<DataLayout>();
+#endif
+#endif
+ AU.addRequired<pocl::WorkitemHandlerChooser>();
+}
+
+// Pass entry point. Does nothing (returns false) unless F is a kernel to
+// process and the handler chooser selected full replication. Otherwise
+// replicates the work-items and repairs undominated variable uses,
+// returning whether anything was changed.
+bool
+WorkitemReplication::runOnFunction(Function &F)
+{
+  // The chooser is only queried for kernels that are to be processed.
+  const bool handledHere =
+    Workgroup::isKernelToProcess(F) &&
+    getAnalysis<pocl::WorkitemHandlerChooser>().chosenHandler() ==
+      pocl::WorkitemHandlerChooser::POCL_WIH_FULL_REPLICATION;
+  if (!handledHere)
+    return false;
+
+  DT = &getAnalysis<DominatorTree>();
+  LI = &getAnalysis<LoopInfo>();
+
+  bool changed = ProcessFunction(F);
+#ifdef DUMP_RESULT_CFG
+  FunctionPass* cfgPrinter = createCFGPrinterPass();
+  cfgPrinter->runOnFunction(F);
+#endif
+
+  // The fix-up always runs; its result is merged into the return value.
+  if (fixUndominatedVariableUses(DT, F))
+    changed = true;
+  return changed;
+}
+
+// Replicates every parallel region of the kernel once per work-item of the
+// work-group (LocalSizeX * LocalSizeY * LocalSizeZ copies including the
+// original), chains the copies after each other and inserts a prologue for
+// each copy. Always returns true.
+bool
+WorkitemReplication::ProcessFunction(Function &F)
+{
+ Module *M = F.getParent();
+
+// F.viewCFG();
+
+ Kernel *K = cast<Kernel> (&F);
+ Initialize(K);
+
+ // Allocate space for workitem reference maps. Workitem 0 does
+ // not need it.
+ unsigned workitem_count = LocalSizeZ * LocalSizeY * LocalSizeX;
+
+ // Remember the blocks without a barrier; they get the ".wi_0_0_0" suffix
+ // at the end of this function.
+ BasicBlockVector original_bbs;
+ for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
+ if (!Barrier::hasBarrier(i))
+ original_bbs.push_back(i);
+ }
+
+ ParallelRegion::ParallelRegionVector* original_parallel_regions =
+ K->getParallelRegions(LI);
+
+ // parallel_regions[w] holds the regions of work-item w, indexed by the
+ // linearized id; slot 0 holds the original regions.
+ // NOTE(review): slot 0 is written unconditionally, which assumes
+ // workitem_count is at least 1 - confirm the local sizes cannot be zero.
+ std::vector<SmallVector<ParallelRegion *, 8> > parallel_regions(workitem_count);
+
+ parallel_regions[0] = *original_parallel_regions;
+
+ /* Enable to get region identification printouts */
+#if 0
+ for (ParallelRegion::ParallelRegionVector::iterator
+ i = original_parallel_regions->begin(),
+ e = original_parallel_regions->end();
+ i != e; ++i)
+ {
+ ParallelRegion *region = (*i);
+ region->InjectRegionPrintF();
+ region->InjectVariablePrintouts();
+ }
+#endif
+
+ // Measure the required context (variables alive in more than one region).
+ // NOTE(review): getAnalysisUsage() has the matching addRequired<> call
+ // disabled (#if 0) - confirm this getAnalysis<> call is valid.
+#ifdef LLVM_3_1
+ TargetData &TD = getAnalysis<TargetData>();
+#else
+ DataLayout &TD = getAnalysis<DataLayout>();
+#endif
+
+ // Statistics only: count each instruction that has a user outside of its
+ // defining region once, and add up the alloc size of its type.
+ for (SmallVector<ParallelRegion *, 8>::iterator
+ i = original_parallel_regions->begin(), e = original_parallel_regions->end();
+ i != e; ++i) {
+ ParallelRegion *pr = (*i);
+
+ for (ParallelRegion::iterator i2 = pr->begin(), e2 = pr->end();
+ i2 != e2; ++i2) {
+ BasicBlock *bb = (*i2);
+
+ for (BasicBlock::iterator i3 = bb->begin(), e3 = bb->end();
+ i3 != e3; ++i3) {
+ for (Value::use_iterator i4 = i3->use_begin(), e4 = i3->use_end();
+ i4 != e4; ++i4) {
+ // Instructions can only be used by instructions.
+ Instruction *user = cast<Instruction> (*i4);
+
+ if (find (pr->begin(), pr->end(), user->getParent()) ==
+ pr->end()) {
+ // User is not in the defining region.
+ ++ContextValues;
+ ContextSize += TD.getTypeAllocSize(i3->getType());
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ // Then replicate the ParallelRegions.
+ // One value map per replicated work-item; work-item w uses map w - 1.
+ // NOTE(review): the array size is workitem_count - 1 in unsigned
+ // arithmetic, which again assumes workitem_count is not zero.
+ ValueToValueMapTy *const reference_map = new ValueToValueMapTy[workitem_count - 1];
+ for (int z = 0; z < LocalSizeZ; ++z) {
+ for (int y = 0; y < LocalSizeY; ++y) {
+ for (int x = 0; x < LocalSizeX ; ++x) {
+
+ // Linearized work-item id.
+ int index =
+ (LocalSizeY * LocalSizeX * z + LocalSizeX * y + x);
+
+ // Work-item 0 keeps the original regions.
+ if (index == 0)
+ continue;
+
+ for (SmallVector<ParallelRegion *, 8>::iterator
+ i = original_parallel_regions->begin(),
+ e = original_parallel_regions->end();
+ i != e; ++i) {
+ ParallelRegion *original = (*i);
+ ParallelRegion *replicated =
+ original->replicate
+ (reference_map[index - 1],
+ (".wi_" + Twine(x) + "_" + Twine(y) + "_" + Twine(z)));
+ if (AddWIMetadata)
+ replicated->AddIDMetadata(M->getContext(), x, y, z);
+ parallel_regions[index].push_back(replicated);
+#ifdef DEBUG_PR_REPLICATION
+ std::cerr << "### new replica:" << std::endl;
+ replicated->dump();
+#endif
+ }
+ }
+ }
+ }
+ // The original regions get the id metadata of work-item (0, 0, 0).
+ if (AddWIMetadata) {
+ for (SmallVector<ParallelRegion *, 8>::iterator
+ i = original_parallel_regions->begin(),
+ e = original_parallel_regions->end();
+ i != e; ++i) {
+ ParallelRegion *original = (*i);
+ original->AddIDMetadata(M->getContext(), 0, 0, 0);
+ }
+ }
+
+ // Remap each replica with its work-item's value map, chain it after the
+ // same region of the previous work-item, and insert the prologue for
+ // every region (including those of work-item 0).
+ for (int z = 0; z < LocalSizeZ; ++z) {
+ for (int y = 0; y < LocalSizeY; ++y) {
+ for (int x = 0; x < LocalSizeX ; ++x) {
+
+ int index =
+ (LocalSizeY * LocalSizeX * z + LocalSizeX * y + x);
+
+ for (unsigned i = 0, e = parallel_regions[index].size(); i != e; ++i) {
+ ParallelRegion *region = parallel_regions[index][i];
+ if (index != 0) {
+ region->remap(reference_map[index - 1]);
+ region->chainAfter(parallel_regions[index - 1][i]);
+ region->purge();
+ }
+ region->insertPrologue(x, y, z);
+ }
+ }
+ }
+ }
+
+ // Try to merge all workitem first block of each region
+ // together (for PHI predecessor correctness).
+ // The work-items are visited from the last one down to the first.
+ for (int z = LocalSizeZ - 1; z >= 0; --z) {
+ for (int y = LocalSizeY - 1; y >= 0; --y) {
+ for (int x = LocalSizeX - 1; x >= 0; --x) {
+
+ int index =
+ (LocalSizeY * LocalSizeX * z + LocalSizeX * y + x);
+
+ if (index == 0)
+ continue;
+
+ for (unsigned i = 0, e = parallel_regions[index].size(); i != e; ++i) {
+ ParallelRegion *region = parallel_regions[index][i];
+ BasicBlock *entry = region->entryBB();
+
+ assert (entry != NULL);
+ BasicBlock *pred = entry->getUniquePredecessor();
+ assert (pred != NULL && "No unique predecessor.");
+#ifdef DEBUG_BB_MERGING
+ std::cerr << "### pred before merge into predecessor " << std::endl;
+ pred->dump();
+ std::cerr << "### entry before merge into predecessor " << std::endl;
+ entry->dump();
+#endif
+ movePhiNodes(entry, pred);
+ }
+ }
+ }
+ }
+
+ // Add the suffixes to original (wi_0_0_0) basic blocks.
+ for (BasicBlockVector::iterator i = original_bbs.begin(),
+ e = original_bbs.end();
+ i != e; ++i)
+ (*i)->setName((*i)->getName() + ".wi_0_0_0");
+
+ // Initialize local size variables (done at the end to avoid unnecessary
+ // replication).
+ K->addLocalSizeInitCode(LocalSizeX, LocalSizeY, LocalSizeZ);
+
+ delete [] reference_map;
+
+// F.viewCFG();
+
+ return true;
+}
+
diff --git a/src/llvmopencl/WorkitemReplication.h b/src/llvmopencl/WorkitemReplication.h
new file mode 100644
index 0000000..fb5d9d4
--- /dev/null
+++ b/src/llvmopencl/WorkitemReplication.h
@@ -0,0 +1,62 @@
+// Header for WorkitemReplication function pass.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos and
+// 2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_WORKITEM_REPLICATION_H
+#define _POCL_WORKITEM_REPLICATION_H
+
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <map>
+#include <vector>
+#include "WorkitemHandler.h"
+
+namespace pocl {
+ class Workgroup;
+
+ // Function pass, derived from pocl::WorkitemHandler, that replicates the
+ // parallel regions of a kernel for every work-item of the work-group.
+ // NOTE(review): BasicBlockSet uses std::set while the header itself only
+ // includes <map> and <vector> from the standard library - confirm <set>
+ // arrives transitively.
+ class WorkitemReplication : public pocl::WorkitemHandler {
+
+ public:
+ // Pass identification.
+ static char ID;
+
+ WorkitemReplication() : pocl::WorkitemHandler(ID) {}
+
+ virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+ virtual bool runOnFunction(llvm::Function &F);
+
+
+ private:
+
+ // Analysis results fetched in runOnFunction() before the function is
+ // processed.
+ llvm::DominatorTree *DT;
+ llvm::LoopInfo *LI;
+
+ typedef std::set<llvm::BasicBlock *> BasicBlockSet;
+ typedef std::vector<llvm::BasicBlock *> BasicBlockVector;
+ typedef std::map<llvm::Value *, llvm::Value *> ValueValueMap;
+
+ // Performs the actual replication; called from runOnFunction().
+ virtual bool ProcessFunction(llvm::Function &F);
+ };
+}
+
+#endif
diff --git a/src/llvmopencl/config.h b/src/llvmopencl/config.h
new file mode 100644
index 0000000..1f1ed9d
--- /dev/null
+++ b/src/llvmopencl/config.h
@@ -0,0 +1 @@
+// Empty on purpose. Satisfies includes from other files.
diff --git a/src/llvmopencl/pocl.h b/src/llvmopencl/pocl.h
new file mode 100644
index 0000000..ae6a66d
--- /dev/null
+++ b/src/llvmopencl/pocl.h
@@ -0,0 +1,49 @@
+/* pocl.h - global pocl declarations.
+
+ Copyright (c) 2011 Universidad Rey Juan Carlos
+ 2011-2014 Pekka Jääskeläinen / Tampere University of Technology
+ Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+/**
+ * @file pocl.h
+ *
+ * The declarations in this file are those used by both the libpocl CL
+ * implementation and the kernel compiler. Any others should be moved to
+ * pocl_cl.h of lib/CL or under the kernel compiler dir.
+ * @todo Check if there are extra declarations here that could be moved.
+ */
+#ifndef POCL_H
+#define POCL_H
+
+/*
+ * During pocl kernel compiler transformations we use the fixed address
+ * space ids of clang's -ffake-address-space-map to mark the different
+ * address spaces to keep the processing target-independent. These
+ * are converted to the target's address space map (if any), in a final
+ * kernel compiler pass.
+ */
+#define POCL_ADDRESS_SPACE_PRIVATE 0
+#define POCL_ADDRESS_SPACE_GLOBAL 1
+#define POCL_ADDRESS_SPACE_LOCAL 2
+#define POCL_ADDRESS_SPACE_CONSTANT 3
+
+#endif /* POCL_H */
diff --git a/src/runtime/CMakeLists.txt b/src/runtime/CMakeLists.txt
new file mode 100644
index 0000000..f3d34ab
--- /dev/null
+++ b/src/runtime/CMakeLists.txt
@@ -0,0 +1,59 @@
+# If building for ARM target host then set appropriate clang target
+# Needs to match what's used when using clang to build the kernel
+# See compiler.cpp
+if (HAWKING_BUILD)
+ set(HOST_TARGET -target spir-unknown-unknown-unknown)
+endif()
+
+# If Shamrock build, then we use the builtins.lib built in ../builtins
+if (SHAMROCK_BUILD)
+# Run embed.py to turn ../builtins/builtins.lib into stdlib.c.bc.embed.h.
+add_custom_command(
+ OUTPUT stdlib.c.bc.embed.h
+ COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/embed.py
+ ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc.embed.h
+ ${CMAKE_CURRENT_BINARY_DIR}/../builtins/builtins.lib
+ DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/embed.py
+ ${CMAKE_CURRENT_BINARY_DIR}/../builtins/builtins.lib)
+
+add_custom_target(generate_stdlib_c DEPENDS
+ ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc.embed.h)
+# otherwise, this stdlib.c is still being used (but is empty)
+else (SHAMROCK_BUILD)
+# Clang invocation used below to compile stdlib.c as OpenCL C to LLVM bitcode.
+ set(CUSTOM_COMMAND
+ ${CLANG_EXECUTABLE} -c -emit-llvm -x cl -O2 ${HOST_TARGET} -nostdinc -fno-builtin)
+
+# Compile stdlib.c into stdlib.c.bc; the generated stdlib_impl.h is a
+# dependency, so it is produced first.
+add_custom_command(
+ OUTPUT stdlib.c.bc
+ COMMAND ${CUSTOM_COMMAND}
+ -I${OCL_BUILTINS_DIR}/include
+ ${CMAKE_CURRENT_SOURCE_DIR}/stdlib.c
+ -I${CMAKE_CURRENT_BINARY_DIR}
+ -o ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc
+ DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/stdlib.c
+ ${CMAKE_CURRENT_BINARY_DIR}/stdlib_impl.h)
+
+# Run embed.py to turn stdlib.c.bc into stdlib.c.bc.embed.h.
+add_custom_command(
+ OUTPUT stdlib.c.bc.embed.h
+ COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/embed.py
+ ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc.embed.h
+ ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc
+ DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/embed.py
+ ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc)
+
+add_custom_target(generate_stdlib_c DEPENDS
+ ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc.embed.h)
+
+# Run builtins.py on builtins.def to generate the four *_def.h / *_impl.h
+# headers into the binary directory.
+add_custom_command(
+ OUTPUT builtins_def.h stdlib_def.h builtins_impl.h stdlib_impl.h
+ COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/builtins.py
+ ${CMAKE_CURRENT_SOURCE_DIR}/builtins.def
+ ${CMAKE_CURRENT_BINARY_DIR}
+ DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/builtins.py
+ ${CMAKE_CURRENT_SOURCE_DIR}/builtins.def)
+
+add_custom_target(generate_builtins DEPENDS
+ ${CMAKE_CURRENT_BINARY_DIR}/builtins_def.h
+ ${CMAKE_CURRENT_BINARY_DIR}/builtins_impl.h
+ ${CMAKE_CURRENT_BINARY_DIR}/stdlib_def.h
+ ${CMAKE_CURRENT_BINARY_DIR}/stdlib_impl.h)
+endif(SHAMROCK_BUILD)
diff --git a/src/runtime/builtins.def b/src/runtime/builtins.def
new file mode 100644
index 0000000..b94807b
--- /dev/null
+++ b/src/runtime/builtins.def
@@ -0,0 +1,301 @@
+def vecf : float2 float3 float4 float8 float16
+def veci : int2 int3 int4 int8 int16
+
+def vec : $vecf $veci
+def gentype : float $vecf
+
+// gentype acos(gentype)
+// REPL is defined in src/core/cpu/builtins.cpp
+//native float acos float : x:float
+ //return std::acos(x);
+//end
+
+//native $type acos $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::acos(x[i]);
+//end
+
+// gentype acosh(gentype)
+//native float acosh float : x:float
+ //return boost::math::acosh(x);
+//end
+
+//native $type acosh $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = boost::math::acosh(x[i]);
+//end
+
+// gentype acospi(gentype)
+//func float acospi float : x:float
+ //return acos(x) / M_PI;
+//end
+
+//native $type acospi $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::acos(x[i]) / M_PI;
+//end
+
+// gentype asin (gentype)
+//native float asin float : x:float
+ //return std::asin(x);
+//end
+
+//native $type asin $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::asin(x[i]);
+//end
+
+// gentype asinh (gentype)
+//native float asinh float : x:float
+ //return boost::math::asinh(x);
+//end
+
+//native $type asinh $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = boost::math::asinh(x[i]);
+//end
+
+// gentype asinpi (gentype x)
+//func float asinpi float : x:float
+ //return asin(x) / M_PI;
+//end
+
+//native $type asinpi $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::asin(x[i]) / M_PI;
+//end
+
+// gentype atan (gentype y_over_x)
+//native float atan float : y_over_x:float
+ //return std::atan(y_over_x);
+//end
+
+//native $type atan $vecf : y_over_x:$type
+ //REPL($vecdim)
+ //result[i] = std::atan(y_over_x[i]);
+//end
+
+// gentype atan2 (gentype y, gentype x)
+//func float atan2 float : x:float y:float
+ //return atan(y / x);
+//end
+
+//native $type atan2 $vecf : x:$type y:$type
+ //REPL($vecdim)
+ //result[i] = std::atan(y[i] / x[i]);
+//end
+
+// gentype atanh (gentype)
+//native float atanh float : x:float
+ //return boost::math::atanh(x);
+//end
+
+//native $type atanh $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = boost::math::atanh(x[i]);
+//end
+
+// gentype atanpi (gentype x)
+//func float atanpi float : x:float
+ //return atan(x) / M_PI;
+//end
+
+//native $type atanpi $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::atan(x[i]) / M_PI;
+//end
+
+// gentype atan2pi (gentype y, gentype x)
+//func float atan2pi float : x:float y:float
+ //return atan2(y, x) / M_PI;
+//end
+//
+//native $type atan2pi $vecf : x:$type y:$type
+ //REPL($vecdim)
+ //result[i] = std::atan(y[i] / x[i]) / M_PI;
+//end
+
+// gentype cbrt (gentype)
+//native float cbrt float : x:float
+ //return boost::math::cbrt(x);
+//end
+//
+//native $type cbrt $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = boost::math::cbrt(x[i]);
+//end
+
+// gentype ceil (gentype)
+//native float ceil float : x:float
+ //return std::ceil(x);
+//end
+//
+//native $type ceil $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::ceil(x[i]);
+//end
+
+// gentype copysign (gentype x, gentype y)
+//func $type copysign $gentype : x:$type y:$type
+ //return (
+ //(x < 0.0f & y > 0.0f) |
+ //(x > 0.0f & y < 0.0f)
+ //? -x : x);
+//end
+
+//gentype cos (gentype)
+//native float cos float : x:float
+ //return std::cos(x);
+//end
+
+//native $type cos $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::cos(x[i]);
+//end
+
+// gentype cosh (gentype)
+//native float cosh float : x:float
+ //return std::cosh(x);
+//end
+
+//native $type cosh $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::cosh(x[i]);
+//end
+
+// gentype cospi (gentype x)
+//func $type cospi $gentype : x:$type
+ //return cos(x * (float)M_PI);
+//end
+
+// TODO: gentype erfc (gentype)
+// TODO: gentype erf (gentype)
+
+// gentype exp(gentype x)
+//native float exp float : x:float
+ //return std::exp(x);
+//end
+//
+//native $type exp $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::exp(x[i]);
+//end
+//
+// gentype exp2(gentype x)
+//native float exp2 float : x:float
+ //return exp2f(x);
+//end
+//
+//native $type exp2 $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = exp2f(x[i]);
+//end
+//
+//// gentype exp10(gentype x)
+//native float exp10 float : x:float
+ //return exp10f(x);
+//end
+//
+//native $type exp10 $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = exp10f(x[i]);
+//end
+//
+//// gentype expm1(gentype x)
+//func $type expm1 $gentype : x:$type
+ //return exp(x) - 1.0f;
+//end
+//
+//// gentype fdim(x, y)
+//func $type fdim $gentype : x:$type y:$type
+ //return (x > y ? x - y : 0.0f);
+//end
+//
+// gentype floor(gentype x) (TODO: SSE fast path : float->int->float)
+//native float floor float : x:float
+ //return std::floor(x);
+//end
+//
+//native $type floor $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::floor(x[i]);
+//end
+//
+//// gentype fma(a, b, c) : a*b + c (TODO)
+//func $type fma $gentype : a:$type b:$type c:$type
+ //return (a * b) + c;
+//end
+//
+//// gentype trunc(x)
+//native float trunc float : x:float
+ //return boost::math::trunc(x);
+//end
+//
+//native $type trunc $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = boost::math::trunc(x[i]);
+//end
+//
+//// gentype fmod(x, y)
+//func $type fmod $gentype : x:$type y:$type
+ //return x - y * trunc(x / y);
+//end
+//
+// gentype fract(gentype x, gentype *iptr)
+//func $type fract $gentype : x:$type iptr:*$type
+ //*iptr = floor(x);
+ //return fmin(x - *iptr, 0x1.fffffep-1f);
+//end
+
+// gentype frexp(gentype x, intn *exp)
+//native float frexp float : x:float exp:*int
+ //return std::frexp(x, exp);
+//end
+//
+//native $type frexp $vecf : x:$type exp:*int$vecdim
+ //REPL($vecdim)
+ //result[i] = std::frexp(x[i], &exp[i]);
+//end
+//
+//// gentype sqrt(gentype x)
+//native float sqrt float : x:float
+ //return std::sqrt(x);
+//end
+//
+//native double sqrt double : x:double
+ //return std::sqrt(x);
+//end
+//
+//native double log double : x:double
+ //return std::log(x);
+//end
+//
+//native $type sqrt $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::sqrt(x[i]);
+//end
+//
+//// gentype hypot(gentype x, gentype y)
+//func $type hypot $gentype : x:$type y:$type
+ //return sqrt(x*x + y*y);
+//end
+
+// intn ilogb(gentype x)
+//native int ilogb float : x:float
+ //return ilogb(x);
+//end
+
+//native int$vecdim ilogb $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = ilogb(x[i]);
+//end
+
+// gentype ldexp(gentype x, intn n)
+//native float ldexp float : x:float n:int
+ //return std::ldexp(x, n);
+//end
+
+//native $type ldexp $vecf : x:$type n:int$vecdim
+ //REPL($vecdim)
+ //result[i] = std::ldexp(x[i], n[i]);
+//end
diff --git a/src/runtime/builtins.py b/src/runtime/builtins.py
new file mode 100755
index 0000000..909fee8
--- /dev/null
+++ b/src/runtime/builtins.py
@@ -0,0 +1,380 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+# Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the copyright holder nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+# builtins.py <def> <outdir>
+
+import sys
+
+# Describes one builtin read from the .def file: its name, whether it is
+# 'native', its arguments, the concrete types it is generated for, its return
+# type and its body text. write() renders it as source text for one type.
+class Function:
+ # A single argument: its name and its (possibly templated) type string.
+ class Arg:
+ def __init__(self, name, t):
+ self.name = name
+ self.t = t
+
+ KIND_BUILTINS_IMPL = 0 # static function in builtins.cpp
+ KIND_BUILTINS_DEF = 1 # if (name == '__cpu_$name') return (void *)&name;
+ KIND_STDLIB_IMPL = 2 # OpenCL C function in stdlib.c
+ KIND_STDLIB_DEF = 3 # Header in stdlib.h
+ KIND_STDLIB_STUB = 4 # OpenCL C stub in stdlib.c: calls __cpu_$name
+ KIND_STDLIB_STUB_DEF = 5 # __cpu_$name declared in stdlib.c
+
+ def __init__(self, name, native):
+ self.name = name
+ self.native = native
+
+ self.args = [] # Array <Arg>
+ self.types = [] # Array <str>
+ self.return_type = ''
+ self.body = ''
+
+ def set_return_type(self, ty):
+ self.return_type = ty
+
+ # Appends text to the function body; nothing is inserted between pieces.
+ def append_body(self, body):
+ self.body += body
+
+ def add_arg(self, name, ty):
+ self.args.append(self.Arg(name, ty))
+
+ def add_type(self, ty):
+ self.types.append(ty)
+
+ # Builds '<return type>_<name>', followed (when there are arguments) by '_'
+ # and the concatenated argument types with every '*' written as 'p'.
+ def mangled_name(self, current_type):
+ return_type = self.process_type_name(current_type, self.return_type)
+
+ rs = return_type + '_' + self.name
+ first = True
+
+ for a in self.args:
+ if first:
+ rs += '_'
+ first = False
+
+ arg_type = self.process_type_name(current_type, a.t)
+ rs += arg_type.replace('*', 'p')
+
+ return rs
+
+ # Expands '$vecdim' and '$type' inside type_name for the given current_type.
+ # The vector dimension is the trailing one or two digits of current_type, or
+ # '1' when current_type does not end in a digit.
+ def process_type_name(self, current_type, type_name):
+ # Current vector dimension
+ vecdim = '1'
+
+ if current_type[-1].isdigit():
+ if current_type[-2].isdigit():
+ vecdim = current_type[-2:]
+ else:
+ vecdim = current_type[-1]
+
+ # $vecdim expansion
+ return type_name.replace('$vecdim', vecdim).replace('$type', current_type)
+
+ # Renders the comma-separated parameter list for current_type. When
+ # handle_first_arg is true, vector types are reduced to a pointer to their
+ # element type and a leading 'result' parameter is added if the return type
+ # is a vector.
+ def arg_list(self, current_type, handle_first_arg):
+ rs = ''
+ first = True
+ append_arg = None
+
+ # We may need a first "result" arg
+ if handle_first_arg:
+ return_type = self.process_type_name(current_type, self.return_type)
+
+ if return_type[-1].isdigit():
+ # Return is a vector
+ append_arg = self.Arg('result', return_type)
+
+ if append_arg:
+ args = [append_arg] + self.args
+ else:
+ args = self.args
+
+ for arg in args:
+ # Resolve type
+ arg_type = self.process_type_name(current_type, arg.t)
+
+ # A leading '*' in the .def type marks a pointer argument
+ if arg_type[0] == '*':
+ arg_ptr = True
+ arg_type = arg_type[1:]
+ else:
+ arg_ptr = False
+
+ # We need to pass vector arguments as pointers
+ arg_vector = False
+ if handle_first_arg:
+ arg_vector = arg_type[-1].isdigit()
+ arg_type = arg_type.rstrip('0123456789')
+
+ # Build the string
+ if not first:
+ rs += ', '
+ first = False
+
+ rs += arg_type + ' '
+
+ if arg_vector or arg_ptr:
+ rs += '*'
+
+ rs += arg.name
+
+ return rs
+
+ # Returns the source text generated for current_type, in the form selected
+ # by kind (one of the KIND_* constants above).
+ def write(self, current_type, kind):
+ # Template:
+ # (static) $ret_type $name($args) {
+ # $body
+ # }
+ rs = ''
+
+ if kind == self.KIND_BUILTINS_IMPL:
+ rs = 'static '
+ elif kind == self.KIND_BUILTINS_DEF:
+ # Lookup entry only: no signature or body is emitted for this kind
+ rs += ' else if (name == "__cpu_' + self.mangled_name(current_type) + '")\n'
+ rs += ' return (void *)&' + self.mangled_name(current_type) + ';\n'
+ return rs
+
+ # Calculate return type
+ return_type = self.process_type_name(current_type, self.return_type)
+
+ if (kind == self.KIND_BUILTINS_IMPL or kind == self.KIND_STDLIB_STUB_DEF) \
+ and return_type[-1].isdigit():
+ return_type = 'void' # We'll use a 'result' argument
+
+ rs += return_type + ' '
+
+ # Append mangled name if needed
+ if kind == self.KIND_BUILTINS_IMPL:
+ rs += self.mangled_name(current_type)
+ elif kind == self.KIND_STDLIB_STUB_DEF:
+ rs += '__cpu_' + self.mangled_name(current_type)
+ else:
+ # No need to mangle the name, but add OVERLOAD
+ rs += '_CLC_OVERLOAD ' + self.name
+
+ # Print function args
+ rs += '('
+ rs += self.arg_list(current_type, kind == self.KIND_BUILTINS_IMPL or \
+ kind == self.KIND_STDLIB_STUB_DEF)
+ rs += ')'
+
+ # If only a declaration, end it
+ if kind == self.KIND_STDLIB_DEF or kind == self.KIND_STDLIB_STUB_DEF:
+ rs += ';\n'
+ return rs
+
+ # Add the body
+ rs += '\n{\n'
+
+ if kind == self.KIND_STDLIB_STUB:
+ # Special body : call __cpu_$name
+ return_is_vector = return_type[-1].isdigit()
+ if return_is_vector:
+ # Need to create a temporary
+ rs += ' ' + return_type + ' result;\n'
+ rs += '\n'
+
+ # Call the cpu stub
+ rs += ' '
+ if not return_is_vector:
+ rs += 'return '
+
+ rs += '__cpu_' + self.mangled_name(current_type) + '('
+
+ # Pass the result if needed
+ first = True
+ if return_is_vector:
+ rs += '(' + return_type.rstrip('0123456789') + ' *)&result'
+ first = False
+
+ # Append the args
+ for arg in self.args:
+ # Resolve type
+ arg_type = self.process_type_name(current_type, arg.t)
+
+ arg_ptr = False
+ if arg_type[0] == '*':
+ arg_type = arg_type[1:]
+ arg_ptr = True
+
+ arg_vector = arg_type[-1].isdigit()
+
+ if not first:
+ rs += ', '
+ first = False
+
+ # We need to pass vector arguments as pointers
+ if arg_vector:
+ rs += '(' + arg_type.rstrip('0123456789') + ' *)'
+ if not arg_ptr:
+ rs += '&'
+
+ rs += arg.name
+
+ # End the call
+ rs += ');\n'
+
+ if return_is_vector:
+ rs += '\n return result;\n'
+
+ rs += '}\n\n'
+ else:
+ # Simply copy the body
+ vecdim = '1'
+
+ # Same vector-dimension rule as in process_type_name
+ if current_type[-1].isdigit():
+ if current_type[-2].isdigit():
+ vecdim = current_type[-2:]
+ else:
+ vecdim = current_type[-1]
+
+ rs += self.body.replace('$type', current_type) \
+ .replace('$vecdim', vecdim)
+ rs += '\n}\n\n'
+
+ return rs
+
+class Generator:
+ builtins_impl_file = 'builtins_impl.h' # static functions
+ builtins_def_file = 'builtins_def.h' # if () in getBuiltin
+ stdlib_impl_file = 'stdlib_impl.h' # stdlib.c functions
+ stdlib_def_file = 'stdlib_def.h' # stdlib.h definitions
+
+ def __init__(self, out_path):
+ self.out_path = out_path
+
+ # Buffers
+ self.builtins_impl_buffer = ''
+ self.builtins_def_buffer = ''
+ self.stdlib_impl_buffer = ''
+ self.stdlib_def_buffer = ''
+
+ def add_function(self, function):
+ for t in function.types:
+ if function.native:
+ self.stdlib_impl_buffer += function.write(t, function.KIND_STDLIB_STUB_DEF)
+ self.stdlib_impl_buffer += function.write(t, function.KIND_STDLIB_STUB)
+ self.stdlib_def_buffer += function.write(t, function.KIND_STDLIB_DEF)
+ self.builtins_impl_buffer += function.write(t, function.KIND_BUILTINS_IMPL)
+ self.builtins_def_buffer += function.write(t, function.KIND_BUILTINS_DEF)
+ else:
+ self.stdlib_def_buffer += function.write(t, function.KIND_STDLIB_DEF)
+ self.stdlib_impl_buffer += function.write(t, function.KIND_STDLIB_IMPL)
+
+ def write(self):
+ of = open(self.out_path + '/' + self.stdlib_def_file, 'w')
+ of.write(self.stdlib_def_buffer)
+ of.close()
+
+ of = open(self.out_path + '/' + self.stdlib_impl_file, 'w')
+ of.write(self.stdlib_impl_buffer)
+ of.close()
+
+ of = open(self.out_path + '/' + self.builtins_def_file, 'w')
+ of.write(self.builtins_def_buffer)
+ of.close()
+
+ of = open(self.out_path + '/' + self.builtins_impl_file, 'w')
+ of.write(self.builtins_impl_buffer)
+ of.close()
+
+class Parser:
+ def __init__(self, generator, def_file_name):
+ self.generator = generator
+ self.def_file_name = def_file_name
+
+ self.defs = {}
+
+ def replace_variable(self, token):
+ result = []
+
+ if token[0] == '$':
+ for tok in self.defs[token[1:]]:
+ result.extend(self.replace_variable(tok))
+ else:
+ result.append(token)
+
+ return result
+
+ def parse(self):
+ def_file = open(self.def_file_name, 'rb')
+ current_function = None
+
+ for line in def_file:
+ if current_function:
+ # End if we encounter an end
+ if line.startswith('end'):
+ self.generator.add_function(current_function)
+ current_function = None
+ else:
+ # Add a line to the body
+ current_function.append_body(line)
+ else:
+ line = line.strip()
+ tokens = line.split(' ')
+ tok = tokens[0]
+
+ if tok == 'def':
+ # A definition : def <variable> : [values]
+ name = tokens[1]
+ values = []
+
+ for token in tokens[3:]:
+ values.extend(self.replace_variable(token))
+
+ self.defs[name] = values
+ elif tok == 'func' or tok == 'native':
+ # Function : func|native <ret_type> <name> [types] : [args]
+ current_function = Function(tokens[2], \
+ tokens[0] == 'native')
+
+ current_function.set_return_type(tokens[1])
+
+ # Explore the types and args
+ in_types = True
+
+ for token in tokens[3:]:
+ if token == ':':
+ in_types = False
+ elif in_types:
+ for ty in self.replace_variable(token):
+ current_function.add_type(ty)
+ else:
+ # Parameters
+ parts = token.split(':')
+ current_function.add_arg(parts[0], parts[1])
+
+ def_file.close()
+
+if __name__ == '__main__':
+ def_file = sys.argv[1]
+ out_dir = sys.argv[2]
+
+ gen = Generator(out_dir)
+ parser = Parser(gen, def_file)
+
+ parser.parse()
+ gen.write()
diff --git a/src/runtime/embed.py b/src/runtime/embed.py
new file mode 100755
index 0000000..e3aca9d
--- /dev/null
+++ b/src/runtime/embed.py
@@ -0,0 +1,76 @@
+#!/usr/bin/python
+# #!/usr/local/bin/python2.6-2.6.4
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the copyright holder nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# embed.py <outfile> <filenames..>
+# <filenames> => <outfile>
+
+import sys
+
+outfile = open(sys.argv[1], 'w')
+name = sys.argv[1].split('/')[-1].replace('.embed.h', '').replace('.', '_')
+
+data = ''
+
+for i in xrange(len(sys.argv) - 1):
+ infile = open(sys.argv[i + 1], 'rb')
+ data += infile.read()
+
+# Header
+outfile.write('#ifndef __%s__\n' % name.upper())
+outfile.write('#define __%s__\n' % name.upper())
+outfile.write('\n')
+outfile.write('const char embed_%s[] =\n' % name)
+
+# Write it in chunks of 80 chars :
+# | "\x00..." (4+1+1 + 4*chars ==> chars = 18)
+index = 0
+
+for c in data:
+ if index == 0:
+ outfile.write(' "')
+
+ outfile.write('\\x%s' % ('%x' % ord(c)).rjust(2, '0'))
+ index += 1
+
+ if index == 18:
+ index = 0
+ outfile.write('"\n')
+
+# We may need to terminate a line
+if index != 0:
+ outfile.write('";\n')
+else:
+ outfile.write(';\n') # Alone on its line, poor semicolon
+
+# Footer
+outfile.write('\n')
+outfile.write('#endif\n')
+
+infile.close()
+outfile.close()
diff --git a/src/runtime/stdlib.c b/src/runtime/stdlib.c
new file mode 100644
index 0000000..9b115df
--- /dev/null
+++ b/src/runtime/stdlib.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+int debug(const char *format, ...);
+
+/* WARNING: Due to some device-specific things in stdlib.h, the bitcode stdlib
+ * must only be used by CPUDevice, as it's targeted to the host CPU at Clover's
+ * compilation! */
+
+/*
+ * Built-in functions generated by src/runtime/builtins.py
+ */
+
+#include <clc.h>
+#include <stdlib_impl.h>