aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGil Pitney <gil.pitney@linaro.org>2014-10-28 18:00:42 -0700
committerGil Pitney <gil.pitney@linaro.org>2014-10-28 18:00:42 -0700
commit61b2c94d9e64758e55730be6a3fc9006c171db85 (patch)
treef564f09ebf93ba293dfa225bd374df6f1f37aa01
Initial Commit: Based on TI OpenCL v0.8, originally based on clover.shamrock_v0.8
This is a continuation of the clover OpenCL project: http://people.freedesktop.org/~steckdenis/clover based on the contributions from Texas Instruments for Keystone II DSP device: git.ti.com/opencl and adding contributions from Linaro for ARM CPU-only support. See README.txt for more info, and build instructions. Signed-off-by: Gil Pitney <gil.pitney@linaro.org>
-rw-r--r--.gitignore17
-rw-r--r--CMakeLists.txt194
-rw-r--r--CREDITS7
-rw-r--r--Doxyfile1692
-rw-r--r--README.txt129
-rw-r--r--TODO29
-rwxr-xr-xbin/init_global_shared_membin0 -> 839220 bytes
-rwxr-xr-xbin/oclenv26
-rw-r--r--clocl/.gitignore1
-rw-r--r--clocl/CMakeLists.txt14
-rw-r--r--clocl/Makefile100
-rw-r--r--clocl/compiler.cpp270
-rw-r--r--clocl/compiler.h126
-rw-r--r--clocl/file_manip.cpp46
-rw-r--r--clocl/file_manip.h12
-rw-r--r--clocl/main.cpp396
-rw-r--r--clocl/options.cpp223
-rw-r--r--clocl/options.h24
-rw-r--r--clocl/program.cpp189
-rw-r--r--cmake/CMakeARMToolChain.txt53
-rw-r--r--cmake/modules/FindCheck.cmake57
-rw-r--r--cmake/modules/FindClang.cmake73
-rw-r--r--cmake/modules/FindLLVM.cmake168
-rw-r--r--cmem/Makefile49
-rw-r--r--cmem/cmem.c668
-rw-r--r--cmem/cmem.h92
-rw-r--r--cmem/cmemcfg.h55
-rwxr-xr-xcmem/load.sh11
-rwxr-xr-xcmem/unload.sh4
-rw-r--r--doc/Makefile15
-rw-r--r--doc/opencl-1.1.pdfbin0 -> 2932434 bytes
-rw-r--r--doc/opencl-cplusplus-1.1.pdfbin0 -> 882024 bytes
-rw-r--r--doc/opencl_readme.pdfbin0 -> 134649 bytes
-rw-r--r--doc/opencl_readme.tex668
-rw-r--r--doc/tiStk2cRgb.pdfbin0 -> 30002 bytes
-rw-r--r--include/CL/cl.h998
-rw-r--r--include/CL/cl.hpp4014
-rw-r--r--include/CL/cl_d3d10.h126
-rw-r--r--include/CL/cl_ext.h228
-rw-r--r--include/CL/cl_gl.h155
-rw-r--r--include/CL/cl_gl_ext.h69
-rw-r--r--include/CL/cl_platform.h1198
-rw-r--r--include/CL/opencl.h54
-rw-r--r--include/clc.h1939
-rw-r--r--include/cpu.h262
-rw-r--r--include/dsp.h490
-rw-r--r--init/Makefile25
-rw-r--r--init/init.cmd83
-rw-r--r--init/main.c485
-rw-r--r--init_global_shared_mem/Makefile29
-rw-r--r--init_global_shared_mem/README1
-rw-r--r--init_global_shared_mem/init_global_shared_mem.c73
-rw-r--r--opencl-manifest.docxbin0 -> 47893 bytes
-rwxr-xr-xopencl-manifest.pdfbin0 -> 199086 bytes
-rw-r--r--readme_shannon.txt369
-rw-r--r--scripts/20-c6678.rules6
-rw-r--r--scripts/c6678_udev.sh12
-rwxr-xr-xscripts/install.sh77
-rwxr-xr-xscripts/uninstall.sh43
-rw-r--r--src/.gitignore3
-rw-r--r--src/CMakeLists.txt241
-rw-r--r--src/api/api_command.cpp130
-rw-r--r--src/api/api_context.cpp149
-rw-r--r--src/api/api_device.cpp78
-rw-r--r--src/api/api_enqueue.cpp823
-rw-r--r--src/api/api_event.cpp190
-rw-r--r--src/api/api_flush.cpp57
-rw-r--r--src/api/api_gl.cpp118
-rw-r--r--src/api/api_kernel.cpp219
-rw-r--r--src/api/api_memory.cpp418
-rw-r--r--src/api/api_platform.cpp89
-rw-r--r--src/api/api_profiling.cpp50
-rw-r--r--src/api/api_program.cpp307
-rw-r--r--src/api/api_sampler.cpp109
-rw-r--r--src/builtins/CMakeLists.txt33
-rw-r--r--src/builtins/Makefile24
-rw-r--r--src/builtins/README.txt13
-rw-r--r--src/builtins/abs.cl33
-rw-r--r--src/builtins/abs_diff.cl72
-rw-r--r--src/builtins/add_sat.cl37
-rw-r--r--src/builtins/all.cl43
-rw-r--r--src/builtins/any.cl43
-rw-r--r--src/builtins/atomics.cl.broken558
-rw-r--r--src/builtins/bitselect.cl92
-rw-r--r--src/builtins/clamp.cl43
-rw-r--r--src/builtins/clz.cl37
-rw-r--r--src/builtins/convert.cl36122
-rw-r--r--src/builtins/cross.cl59
-rw-r--r--src/builtins/degrees.cl41
-rw-r--r--src/builtins/dot.cl41
-rw-r--r--src/builtins/fract.cl93
-rw-r--r--src/builtins/frexp.cl76
-rw-r--r--src/builtins/hadd.cl44
-rw-r--r--src/builtins/length.cl109
-rw-r--r--src/builtins/lgamma_r.cl80
-rw-r--r--src/builtins/mad_sat.cl37
-rw-r--r--src/builtins/math.cl151
-rw-r--r--src/builtins/max.cl46
-rw-r--r--src/builtins/misc.cl36
-rw-r--r--src/builtins/mix.cl42
-rw-r--r--src/builtins/modf.cl81
-rw-r--r--src/builtins/mul_hi.cl102
-rw-r--r--src/builtins/relationals.cl64
-rw-r--r--src/builtins/remquo.cl127
-rw-r--r--src/builtins/rotate.cl58
-rw-r--r--src/builtins/select.cl53
-rw-r--r--src/builtins/shuffle.cl215
-rw-r--r--src/builtins/sign.cl43
-rw-r--r--src/builtins/sincos.cl128
-rw-r--r--src/builtins/smoothstep.cl77
-rw-r--r--src/builtins/step.cl43
-rw-r--r--src/builtins/sub_sat.cl37
-rw-r--r--src/builtins/upsample.cl56
-rw-r--r--src/builtins/vload.cl127
-rw-r--r--src/core/commandqueue.cpp1018
-rw-r--r--src/core/commandqueue.h494
-rw-r--r--src/core/compiler.cpp342
-rw-r--r--src/core/compiler.h138
-rw-r--r--src/core/config.h9
-rw-r--r--src/core/config.h.cmake9
-rw-r--r--src/core/context.cpp236
-rw-r--r--src/core/context.h104
-rw-r--r--src/core/cpu/buffer.cpp128
-rw-r--r--src/core/cpu/buffer.h77
-rw-r--r--src/core/cpu/builtins.cpp503
-rw-r--r--src/core/cpu/builtins.h144
-rw-r--r--src/core/cpu/device.cpp675
-rw-r--r--src/core/cpu/device.h113
-rw-r--r--src/core/cpu/kernel.cpp734
-rw-r--r--src/core/cpu/kernel.h325
-rw-r--r--src/core/cpu/program.cpp174
-rw-r--r--src/core/cpu/program.h102
-rw-r--r--src/core/cpu/sampler.cpp769
-rw-r--r--src/core/cpu/worker.cpp274
-rw-r--r--src/core/cpu/worker.h45
-rw-r--r--src/core/deviceinterface.h352
-rw-r--r--src/core/dsp/buffer.cpp149
-rw-r--r--src/core/dsp/buffer.h61
-rw-r--r--src/core/dsp/cmem.cpp271
-rw-r--r--src/core/dsp/cmem.h64
-rw-r--r--src/core/dsp/core_scheduler.h62
-rw-r--r--src/core/dsp/database.h112
-rw-r--r--src/core/dsp/device.cpp1135
-rw-r--r--src/core/dsp/device.h151
-rw-r--r--src/core/dsp/driver.cpp34
-rw-r--r--src/core/dsp/driver.h100
-rw-r--r--src/core/dsp/driver_hawking.cpp451
-rw-r--r--src/core/dsp/driver_shannon.cpp313
-rw-r--r--src/core/dsp/dspheap.h200
-rw-r--r--src/core/dsp/dspmem.h59
-rw-r--r--src/core/dsp/genfile_cache.cpp94
-rw-r--r--src/core/dsp/genfile_cache.h101
-rw-r--r--src/core/dsp/kernel.cpp718
-rw-r--r--src/core/dsp/kernel.h119
-rw-r--r--src/core/dsp/mailbox.h114
-rw-r--r--src/core/dsp/memmap.h120
-rw-r--r--src/core/dsp/message.h115
-rw-r--r--src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.c200
-rw-r--r--src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.h53
-rw-r--r--src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_elf32.h160
-rw-r--r--src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.c1101
-rw-r--r--src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.h30
-rw-r--r--src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.cpp825
-rw-r--r--src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.h101
-rw-r--r--src/core/dsp/ocl_load/CMakeLists.txt26
-rw-r--r--src/core/dsp/ocl_load/DLOAD/ArrayList.c122
-rw-r--r--src/core/dsp/ocl_load/DLOAD/ArrayList.h92
-rw-r--r--src/core/dsp/ocl_load/DLOAD/Queue.h194
-rw-r--r--src/core/dsp/ocl_load/DLOAD/Stack.h155
-rw-r--r--src/core/dsp/ocl_load/DLOAD/dload.c3534
-rw-r--r--src/core/dsp/ocl_load/DLOAD/dload.h334
-rw-r--r--src/core/dsp/ocl_load/DLOAD/dload_endian.c151
-rw-r--r--src/core/dsp/ocl_load/DLOAD/dload_endian.h58
-rw-r--r--src/core/dsp/ocl_load/DLOAD/elf32.c652
-rw-r--r--src/core/dsp/ocl_load/DLOAD/elf32.h756
-rw-r--r--src/core/dsp/ocl_load/DLOAD/relocate.h64
-rw-r--r--src/core/dsp/ocl_load/DLOAD/symtab.h72
-rw-r--r--src/core/dsp/ocl_load/DLOAD/util.h89
-rw-r--r--src/core/dsp/ocl_load/DLOAD/version.h63
-rw-r--r--src/core/dsp/ocl_load/DLOAD/virtual_targets.h90
-rw-r--r--src/core/dsp/ocl_load/DLOAD_API/api_version_change.log33
-rw-r--r--src/core/dsp/ocl_load/DLOAD_API/dload_api.h700
-rw-r--r--src/core/dsp/ocl_load/DLOAD_SYM/symtab.c417
-rw-r--r--src/core/dsp/ocl_load/README8
-rw-r--r--src/core/dsp/ocl_load/Stack.h182
-rw-r--r--src/core/dsp/ocl_load/ocl_load.c139
-rw-r--r--src/core/dsp/program.cpp633
-rw-r--r--src/core/dsp/program.h92
-rw-r--r--src/core/dsp/shmem.cpp539
-rw-r--r--src/core/dsp/shmem.h134
-rw-r--r--src/core/dsp/source_cache.h114
-rw-r--r--src/core/dsp/u_concurrent_map.h137
-rw-r--r--src/core/dsp/u_concurrent_stack.h124
-rw-r--r--src/core/dsp/u_lockable.h109
-rw-r--r--src/core/dsp/u_locks_pthread.h137
-rw-r--r--src/core/dsp/utils.h85
-rw-r--r--src/core/dsp/wga.cpp464
-rw-r--r--src/core/dsp/wga.h72
-rw-r--r--src/core/dsp/worker.cpp519
-rw-r--r--src/core/events.cpp1519
-rw-r--r--src/core/events.h718
-rw-r--r--src/core/icd.cpp145
-rw-r--r--src/core/icd.h44
-rw-r--r--src/core/kernel.cpp637
-rw-r--r--src/core/kernel.h326
-rw-r--r--src/core/memobject.cpp960
-rw-r--r--src/core/memobject.h302
-rw-r--r--src/core/object.cpp115
-rw-r--r--src/core/object.h133
-rw-r--r--src/core/platform.cpp227
-rw-r--r--src/core/platform.h65
-rw-r--r--src/core/program.cpp846
-rw-r--r--src/core/program.h250
-rw-r--r--src/core/propertylist.h119
-rw-r--r--src/core/sampler.cpp247
-rw-r--r--src/core/sampler.h115
-rw-r--r--src/core/util.cpp68
-rw-r--r--src/core/util.h41
-rw-r--r--src/llvmopencl/AllocasToEntry.cc74
-rw-r--r--src/llvmopencl/AllocasToEntry.h49
-rw-r--r--src/llvmopencl/Barrier.h121
-rw-r--r--src/llvmopencl/BarrierBlock.cc73
-rw-r--r--src/llvmopencl/BarrierBlock.h44
-rw-r--r--src/llvmopencl/BarrierTailReplication.cc421
-rw-r--r--src/llvmopencl/BarrierTailReplication.h85
-rw-r--r--src/llvmopencl/BreakConstantGEPs.cpp326
-rw-r--r--src/llvmopencl/BreakConstantGEPs.h57
-rw-r--r--src/llvmopencl/CanonicalizeBarriers.cc214
-rw-r--r--src/llvmopencl/CanonicalizeBarriers.h56
-rw-r--r--src/llvmopencl/Flatten.cc158
-rw-r--r--src/llvmopencl/Flatten.h51
-rw-r--r--src/llvmopencl/GenerateHeader.cc336
-rw-r--r--src/llvmopencl/ImplicitLoopBarriers.cc178
-rw-r--r--src/llvmopencl/ImplicitLoopBarriers.h44
-rw-r--r--src/llvmopencl/IsolateRegions.cc175
-rw-r--r--src/llvmopencl/IsolateRegions.h44
-rw-r--r--src/llvmopencl/Kernel.cc297
-rw-r--r--src/llvmopencl/Kernel.h54
-rw-r--r--src/llvmopencl/LLVMUtils.cc90
-rw-r--r--src/llvmopencl/LLVMUtils.h38
-rw-r--r--src/llvmopencl/LoopBarriers.cc194
-rw-r--r--src/llvmopencl/LoopBarriers.h47
-rw-r--r--src/llvmopencl/Makefile.am53
-rw-r--r--src/llvmopencl/Makefile.in822
-rw-r--r--src/llvmopencl/PHIsToAllocas.cc144
-rw-r--r--src/llvmopencl/PHIsToAllocas.h56
-rw-r--r--src/llvmopencl/ParallelRegion.cc809
-rw-r--r--src/llvmopencl/ParallelRegion.h127
-rw-r--r--src/llvmopencl/TargetAddressSpaces.cc220
-rw-r--r--src/llvmopencl/TargetAddressSpaces.h54
-rw-r--r--src/llvmopencl/VariableUniformityAnalysis.cc382
-rw-r--r--src/llvmopencl/VariableUniformityAnalysis.h70
-rw-r--r--src/llvmopencl/WIVectorize.cc3252
-rw-r--r--src/llvmopencl/WorkItemAliasAnalysis.cc119
-rw-r--r--src/llvmopencl/WorkItemAliasAnalysis.h75
-rw-r--r--src/llvmopencl/Workgroup.cc619
-rw-r--r--src/llvmopencl/Workgroup.h48
-rw-r--r--src/llvmopencl/WorkitemHandler.cc278
-rw-r--r--src/llvmopencl/WorkitemHandler.h73
-rw-r--r--src/llvmopencl/WorkitemHandlerChooser.cc111
-rw-r--r--src/llvmopencl/WorkitemHandlerChooser.h52
-rw-r--r--src/llvmopencl/WorkitemLoops.cc1061
-rw-r--r--src/llvmopencl/WorkitemLoops.h112
-rw-r--r--src/llvmopencl/WorkitemReplication.cc308
-rw-r--r--src/llvmopencl/WorkitemReplication.h62
-rw-r--r--src/llvmopencl/config.h1
-rw-r--r--src/llvmopencl/pocl.h49
-rw-r--r--src/runtime/CMakeLists.txt59
-rw-r--r--src/runtime/builtins.def301
-rwxr-xr-xsrc/runtime/builtins.py380
-rwxr-xr-xsrc/runtime/embed.py76
-rw-r--r--src/runtime/stdlib.c40
-rw-r--r--tests/CMakeLists.txt30
-rw-r--r--tests/basic_test_failures.lst412
-rw-r--r--tests/test_builtins.cpp419
-rw-r--r--tests/test_builtins.h44
-rw-r--r--tests/test_commandqueue.cpp1027
-rw-r--r--tests/test_commandqueue.h44
-rw-r--r--tests/test_context.cpp263
-rw-r--r--tests/test_context.h44
-rw-r--r--tests/test_device.cpp169
-rw-r--r--tests/test_device.h44
-rw-r--r--tests/test_kernel.cpp321
-rw-r--r--tests/test_kernel.h44
-rw-r--r--tests/test_mem.cpp346
-rw-r--r--tests/test_mem.h44
-rw-r--r--tests/test_platform.cpp117
-rw-r--r--tests/test_platform.h44
-rw-r--r--tests/test_program.cpp247
-rw-r--r--tests/test_program.h44
-rw-r--r--tests/tests.c81
-rw-r--r--tests/vector_args.bcbin0 -> 1704 bytes
-rw-r--r--tests/vector_args.cl38
-rw-r--r--tests/vector_args.spir.ll45
-rw-r--r--tests/vector_args.x86.ll45
-rw-r--r--util/CMakeLists.txt16
-rw-r--r--util/decode_error.cpp84
-rw-r--r--util/ocl_util.h40
-rw-r--r--util/read_binary.cpp46
-rw-r--r--util/report_timing.cpp90
300 files changed, 107313 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..15476a7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,17 @@
+*~
+*.syms
+CMakeCache.txt
+CMakeFiles/
+build/
+cmake_install.cmake
+/.settings/
+/.kdev4/
+/nbproject/private/
+*.obj
+*.out
+*.map
+*.hex
+tags
+build.log
+*.embed.h
+init/main.asm
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..b272c54
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,194 @@
+# Usage: cmake <project_src_dir> <optional_defines>*
+# where <optional_defines> are:
+# -DPROJECT=shamrock | shannon | hawking
+# -DLLVM_CONFIG_EXECUTABLE=<path to private llvm-config version>
+# -DBUILD_TESTS=ON
+# Note PROJECT=shamrock is default.
+cmake_minimum_required(VERSION 2.6)
+
+# Project Options:
+OPTION(BUILD_TESTS "Set to ON to build minimal OpenCL tests" ON)
+OPTION(BUILD_SHARED_LIBS "Set to OFF to build static libraries" ON)
+
+# For external builds, paths to dependent libraries/packages are assumed to
+# be specified by environment variables.
+# For internal development, set up a default path for dependent libs/pkgs
+if ("$ENV{DEFAULT_DEV_INSTALL_DIR}" STREQUAL "")
+ set(DEFAULT_DEV_INSTALL_DIR /opt/ti)
+else()
+ set(DEFAULT_DEV_INSTALL_DIR $ENV{DEFAULT_DEV_INSTALL_DIR})
+endif()
+
+if ("${PROJECT}" MATCHES "shannon")  # quoted so an unset PROJECT still yields a well-formed condition
+ project(opencl-dspc8681)
+ SET(SHANNON_BUILD on)
+ MESSAGE(STATUS "Build Target Is TI SHANNON")
+ set(SDK "${DEFAULT_DEV_INSTALL_DIR}/desktop-linux-sdk_01_00_00_07")
+elseif ("${PROJECT}" MATCHES "hawking")
+ project(opencl-66ak2h)
+ SET(HAWKING_BUILD on)
+ MESSAGE(STATUS "Build Target Is TI HAWKING")
+else()  # PROJECT unset or any other value: default shamrock build
+ project(shamrock)
+ SET(SHAMROCK_BUILD on)
+ MESSAGE(STATUS "Project is shamrock")
+endif()
+
+SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/modules)
+
+# project version
+SET(${PROJECT_NAME}_MAJOR_VERSION 0)
+SET(${PROJECT_NAME}_MINOR_VERSION 8)
+SET(${PROJECT_NAME}_PATCH_LEVEL 2)
+
+# OpenCL platform version
+SET(${PROJECT_NAME}_VERSION ${${PROJECT_NAME}_MAJOR_VERSION}.${${PROJECT_NAME}_MINOR_VERSION}.${${PROJECT_NAME}_PATCH_LEVEL})
+
+# Set SONAME for OpenCL library
+SET(${PROJECT_NAME}_SOVERSION ${${PROJECT_NAME}_MAJOR_VERSION})
+
+if (NOT SHAMROCK_BUILD)
+SET(CMAKE_INSTALL_PREFIX $ENV{HOME}/${PROJECT_NAME}-${${PROJECT_NAME}_VERSION})
+endif (NOT SHAMROCK_BUILD)
+# Pending install changes
+#if ("$ENV{TI_OCL_INSTALL_PREFIX}" STREQUAL "")
+# set(CMAKE_INSTALL_PREFIX $ENV{HOME}/${PROJECT_NAME}-${${PROJECT_NAME}_VERSION})
+#else()
+# set(CMAKE_INSTALL_PREFIX $ENV{TI_OCL_INSTALL_PREFIX}/${PROJECT_NAME}-${${PROJECT_NAME}_VERSION})
+#endif()
+
+MESSAGE(STATUS "Install path is ${CMAKE_INSTALL_PREFIX}")
+
+if (HAWKING_CROSS_COMPILE)
+ set(CMAKE_SKIP_RPATH TRUE)
+endif()
+
+if (SHAMROCK_BUILD)
+ ADD_DEFINITIONS("-DSHAMROCK_BUILD")
+endif()
+#ADD_DEFINITIONS("-DLOCK")
+
+SET(CMAKE_VERBOSE_MAKEFILE TRUE)
+
+# Find_Package() for LLVM/CLANG defined in CMAKE_MODULE_PATH
+# Do not use system installed versions
+Find_Package(LLVM REQUIRED)
+Find_Package(Clang REQUIRED)
+
+# OpenCL requires boost headers. If boost is installed to some directory
+# other than /usr/include then define BOOST_INCLUDEDIR as below:
+#set (BOOST_INCLUDEDIR path-to-boost-headers)
+#include_directories(${BOOST_INCLUDEDIR})
+
+SET (INCLUDE_INSTALL_DIR "include" CACHE PATH
+ "The directory the headers are installed in")
+
+# Set up install permissions for files, directories and binaries.
+# !!! Note that these variables are used by the child cmake files so these
+# variables must be set before the add_subdirectory() commands below
+# File permissions (664)
+set(OCL_FPERMS PERMISSIONS
+ OWNER_READ OWNER_WRITE
+ GROUP_READ GROUP_WRITE
+ WORLD_READ)
+
+# Directory permissions (775). Files within the directories are set as above.
+set(OCL_DPERMS FILE_${OCL_FPERMS}
+ DIRECTORY_PERMISSIONS
+ OWNER_READ OWNER_WRITE OWNER_EXECUTE
+ GROUP_READ GROUP_WRITE GROUP_EXECUTE
+ WORLD_READ WORLD_EXECUTE)
+
+# Binary permissions (775)
+set(OCL_BPERMS PERMISSIONS
+ OWNER_READ OWNER_WRITE OWNER_EXECUTE
+ GROUP_READ GROUP_WRITE GROUP_EXECUTE
+ WORLD_READ WORLD_EXECUTE)
+
+
+if (NOT OCL_EXAMPLES_DIR)
+set(OCL_EXAMPLES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../opencl_examples)
+endif()
+
+if (NOT OCL_MONITOR_DIR)
+set(OCL_MONITOR_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../opencl_monitor)
+endif()
+
+if (NOT OCL_BUILTINS_DIR)
+set(OCL_BUILTINS_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+endif()
+
+if (SHAMROCK_BUILD)
+set(CLC_BUILTINS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/builtins)
+endif()
+
+add_subdirectory(src)
+add_subdirectory(util)
+if (NOT SHAMROCK_BUILD)
+add_subdirectory(clocl)
+add_subdirectory(${OCL_EXAMPLES_DIR} ${CMAKE_CURRENT_BINARY_DIR}/examples)
+add_subdirectory(${OCL_MONITOR_DIR} ${CMAKE_CURRENT_BINARY_DIR}/monitor)
+add_subdirectory(${OCL_BUILTINS_DIR}/lib/dsp)
+endif()
+
+if (NOT HAWKING_CROSS_COMPILE)
+IF (BUILD_TESTS)
+ ENABLE_TESTING()
+ Find_Package(Check REQUIRED)
+ add_subdirectory(tests)
+ENDIF (BUILD_TESTS)
+endif()
+
+# install OCL builtin headers in opencl-headers package installation directory.
+if (SHAMROCK_BUILD)
+install(DIRECTORY include DESTINATION /usr ${OCL_DPERMS})
+install(DIRECTORY include/CL DESTINATION /usr/include ${OCL_DPERMS})
+install(FILES include/clc.h DESTINATION /usr/include/CL )
+install(FILES include/cpu.h DESTINATION /usr/include/CL )
+endif (SHAMROCK_BUILD)
+
+# Following is TI product specific:
+if (NOT SHAMROCK_BUILD)
+# Create the top level directories with the desired install permissions
+install(DIRECTORY DESTINATION . ${OCL_DPERMS})
+install(DIRECTORY DESTINATION bin ${OCL_DPERMS})
+install(DIRECTORY DESTINATION lib ${OCL_DPERMS})
+
+install(FILES opencl-manifest.pdf DESTINATION doc ${OCL_FPERMS})
+install(DIRECTORY include DESTINATION . ${OCL_DPERMS})
+
+install(FILES doc/opencl-1.1.pdf DESTINATION doc ${OCL_FPERMS})
+install(FILES doc/opencl-cplusplus-1.1.pdf DESTINATION doc ${OCL_FPERMS})
+
+if (SHANNON_BUILD)
+ install(DIRECTORY DESTINATION init ${OCL_DPERMS})
+ install(DIRECTORY scripts DESTINATION . ${OCL_DPERMS})
+ install(DIRECTORY cmem DESTINATION . ${OCL_DPERMS})
+ install(FILES init/init_dspc8681.out DESTINATION lib ${OCL_FPERMS})
+ install(FILES init/init_dspc8682.out DESTINATION lib ${OCL_FPERMS})
+ install(PROGRAMS bin/init_global_shared_mem DESTINATION bin ${OCL_BPERMS})
+ install(PROGRAMS bin/oclenv DESTINATION bin ${OCL_BPERMS})
+ install(FILES readme_shannon.txt DESTINATION . RENAME readme.txt ${OCL_FPERMS})
+elseif(HAWKING_BUILD)
+ install(DIRECTORY DESTINATION bin/arm ${OCL_DPERMS})
+ install(DIRECTORY DESTINATION bin/x86 ${OCL_DPERMS})
+ install(FILES doc/opencl_readme.pdf DESTINATION doc ${OCL_FPERMS})
+endif()
+
+set (CPACK_GENERATOR "DEB")
+set (CPACK_DEBIAN_PACKAGE_MAINTAINER "TI")
+set (CPACK_DEBIAN_PACKAGE_NAME ${PROJECT_NAME})
+set (CPACK_DEBIAN_PACKAGE_ARCHITECTURE "all")
+
+set (CPACK_PACKAGE_VERSION_MAJOR ${${PROJECT_NAME}_MAJOR_VERSION})
+set (CPACK_PACKAGE_VERSION_MINOR ${${PROJECT_NAME}_MINOR_VERSION})
+set (CPACK_PACKAGE_VERSION_PATCH ${${PROJECT_NAME}_PATCH_LEVEL})
+set (CPACK_DEBIAN_PACKAGE_DEPENDS "mesa-common-dev (>=8.0.4-0), binutils-dev (>=2.22-6), libsqlite3-dev (>=3.7.9-2), libffi6 (>=3.0.11~rc1-5), zlib1g (>=1:1.2.3.4)")
+
+#Set where dpkg will put the install
+set (CPACK_INSTALL_PREFIX ${DEFAULT_DEV_INSTALL_DIR}/opencl/${PROJECT_NAME}-${${PROJECT_NAME}_VERSION})
+set (CPACK_SET_DESTDIR TRUE)
+
+
+include(CPack)
+endif(NOT SHAMROCK_BUILD)
diff --git a/CREDITS b/CREDITS
new file mode 100644
index 0000000..c027de3
--- /dev/null
+++ b/CREDITS
@@ -0,0 +1,7 @@
+Denis Steckelmacher
+TI OpenCL team
+Tom Gall
+Gil Pitney
+Show Liu
+Jia Jia
+
diff --git a/Doxyfile b/Doxyfile
new file mode 100644
index 0000000..0649e53
--- /dev/null
+++ b/Doxyfile
@@ -0,0 +1,1692 @@
+# Doxyfile 1.7.3
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+# TAG = value [value, ...]
+# For lists items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+
+PROJECT_NAME = Clover
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER = Git
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF = "OpenCL 1.1 software implementation"
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+#PROJECT_LOGO = ./doc/logo.png
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = ./doc/
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF = "The $name class" \
+ "The $name widget" \
+ "The $name file" \
+ is \
+ provides \
+ specifies \
+ contains \
+ represents \
+ a \
+ an \
+ the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 8
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this
+# tag. The format is ext=language, where ext is a file extension, and language
+# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C,
+# C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions
+# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING =
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT = YES
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols
+
+SYMBOL_CACHE_SIZE = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL = YES
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE = YES
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES = NO
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to
+# do proper type resolution of all parameters of a function it will reject a
+# match between the prototype and the implementation of a member function even
+# if there is only one candidate or it is obvious which candidate to choose
+# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen
+# will still accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page. This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = YES
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT = ./src \
+ ./doc
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS = *.c \
+ *.cpp \
+ *.h \
+ *.dox
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE = ./src/runtime
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output. If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERNS (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER = YES
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = YES
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION = YES
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code. Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX = YES
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header.
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the stylesheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8, the value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP = YES
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS = YES
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP = YES
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE = org.freedesktop.clover
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">
+# Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's
+# filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">
+# Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX = NO
+
+# This tag can be used to set the number of enum values (range [0,1..20])
+# that doxygen will group on one line in the generated HTML documentation.
+# Note that a value of 0 will completely suppress the enum values from
+# appearing in the overview section.
+
+ENUM_VALUES_PER_LINE = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW = NO
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want formulas to look prettier in the HTML
+# output. When enabled you also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the
+# mathjax.org site, so you can quickly see the result without installing
+# MathJax, but it is strongly recommended to install a local copy of MathJax
+# before deployment.
+
+MATHJAX_RELPATH = http://www.mathjax.org/mathjax
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to setup
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader. This is useful
+# if you want to understand what is going on. On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that
+# overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT = YES
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS = 0
+
+# By default doxygen will write a font called Helvetica to the output
+# directory and reference it in all dot files that doxygen generates.
+# When you want a differently looking font you can specify the font name
+# using DOT_FONTNAME. You need to make sure dot is able to find the font,
+# which can be done by putting it in a standard location or by setting the
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
+# containing the font.
+
+DOT_FONTNAME = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE = 10
+
+# By default doxygen will tell dot to use the output directory to look for the
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a
+# different font using DOT_FONTNAME you can set the path where dot
+# can find it using this tag.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct group dependencies.
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are png, jpg, gif or svg.
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lie further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS = YES
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP = YES
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..160fb95
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,129 @@
+Shamrock: an OpenCL implementation based on clover
+
+This is a continuation of the clover OpenCL project:
+ http://people.freedesktop.org/~steckdenis/clover
+
+based on the contributions from Texas Instruments for Keystone II DSP device:
+ git.ti.com/opencl
+
+and adding contributions from Linaro for ARM CPU-only support.
+
+Prereqs
+=======
+The following packages need to be installed on your system prior to build:
+
+gcc 4.8 (for building llvm)
+cmake
+check
+libboost-all-dev
+libtinfo-dev
+mesa-common-dev
+python 2.6 or later, but earlier than 3.0
+
+BUILD
+=====
+
+LLVM Configuration:
+-------------------
+
+This was tested using LLVM 3.5.0 stable release from:
+http://llvm.org/releases/download.html
+
+Note: LLVM must be configured and built with certain options to link with shamrock for
+ARM.
+
+The following creates a release build for ARM, with LLVM installed
+into /opt/llvm:
+
+% CC=gcc CXX=g++ ./configure --prefix=/opt/llvm --enable-jit --enable-targets=arm --enable-optimized --enable-assertions --with-float=hard --with-abi=aapcs-vfp
+% make -j4 REQUIRES_RTTI=1
+% sudo make -j4 install
+
+See: http://llvm.org/releases/3.5.0/docs/HowToBuildOnARM.html for updates.
+
+Shamrock Build:
+---------------
+
+Current Branch: Khronos_conformance
+
+Usage: cmake <project_src_dir> <optional_defines>*
+ where <optional_defines>* are:
+ -DPROJECT=shamrock | shannon | hawking
+ -DLLVM_CONFIG_EXECUTABLE=<path to private llvm-config version>
+Note PROJECT=shamrock is default.
+
+The best way to compile is to use an out of src build, eg for a Debug build,
+and custom LLVM:
+
+% mkdir shamrock_build
+% cd shamrock_build
+% cmake -DLLVM_CONFIG_EXECUTABLE=/opt/llvm/bin/llvm-config -DCMAKE_BUILD_TYPE=Debug <path_to>/shamrock
+% make
+% sudo make install
+
+If your Clang is installed to a different location than LLVM,
+then define CLANG_INCLUDE_DIR and CLANG_LIB_DIR on the cmake cmd line:
+
+ -DCLANG_INCLUDE_DIR=/opt/clang/include -DCLANG_LIB_DIR=/opt/clang/lib
+
+
+SANITY TESTS
+============
+
+The build commands above will build some simple sanity tests.
+
+% cd shamrock_build
+% make test
+
+Latest Results:
+---------------
+
+shamrock_build> make test
+Running tests...
+/usr/bin/ctest --force-new-ctest-process
+Test project /home/gpitney/shamrock_build
+ Start 1: platform
+1/8 Test #1: platform ......................... Passed 0.11 sec
+ Start 2: device
+2/8 Test #2: device ........................... Passed 0.01 sec
+ Start 3: context
+3/8 Test #3: context .......................... Passed 0.01 sec
+ Start 4: commandqueue
+4/8 Test #4: commandqueue ..................... Passed 1.03 sec
+ Start 5: mem
+5/8 Test #5: mem .............................. Passed 0.01 sec
+ Start 6: kernel
+6/8 Test #6: kernel ...........................***Failed 0.90 sec
+ Start 7: program
+7/8 Test #7: program .......................... Passed 2.17 sec
+ Start 8: builtins
+8/8 Test #8: builtins ......................... Passed 1.53 sec
+
+88% tests passed, 1 tests failed out of 8
+
+PIGLIT TESTS
+============
+
+To run the PIGLIT OpenCL tests, build only the OpenCL piglit binaries:
+
+% cd piglit
+% cmake -DPIGLIT_BUILD_CL_TESTS=ON -DPIGLIT_BUILD_GL_TESTS=OFF \
+ -DPIGLIT_USE_WAFFLE=OFF -DPIGLIT_USE_GLUT=OFF
+% export PIGLIT_CL_VERSION=11
+% make
+
+To run the OpenCL tests (results are written to results/all_cl/main):
+
+% piglit run tests/all_cl results/all_cl
+
+DEBUGGING OpenCL Kernels:
+=========================
+
+1. printf: A builtin function named "debug" maps to the printf symbol in the getBuiltin()
+ callback function, allowing printf from OpenCL kernels. Alternatively, this mechanism
+ can be used to define arbitrary functions to be called back from kernels.
+
+2. gdb: Using the above getBuiltin() mechanism, a breakpoint can be placed in a callback
+ function at kernel exit; stepping back into the kernel via gdb then allows
+ debugging of the kernel code (assembly level stepping).
+
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..84384e5
--- /dev/null
+++ b/TODO
@@ -0,0 +1,29 @@
+Things To Do:
+
+
+Features:
+=========
+
+1. Merge latest TI OpenCL DSP Device support
+
+2. Update to OpenCL v 1.2
+
+Issues:
+=======
+
+1. Khronos basic test failures (both on ARM and x86_64), documented here:
+
+tests/basic_test_failures.lst
+
+2. Testing Full DSP Device support.
+
+Requires extra TI DSP compiler, DSP side files and builtin library which are not
+currently released.
+
+The build therefore defaults to SHAMROCK build type, for CPU Device only, and uses a CPU only
+builtins library. The clc.h, therefore, has not been tested with DSP Device builds.
+
+3. Sanity Test, one kernel test failure
+
+See README.txt for latest results.
+
diff --git a/bin/init_global_shared_mem b/bin/init_global_shared_mem
new file mode 100755
index 0000000..956cdc6
--- /dev/null
+++ b/bin/init_global_shared_mem
Binary files differ
diff --git a/bin/oclenv b/bin/oclenv
new file mode 100755
index 0000000..6ffa039
--- /dev/null
+++ b/bin/oclenv
@@ -0,0 +1,26 @@
+#!/bin/tcsh
+
+# Print a summary of the environment relevant to the OpenCL runtime:
+# host identity, selected environment variables, the cmem kernel module
+# and matching PCI devices. Purely informational; nothing is modified.
+
+# Host identification (node name, kernel release, processor type).
+echo -n "\nMachine Name: "
+uname -n
+
+echo -n "Linux Release: "
+uname -r
+
+echo -n "Processor Type: "
+uname -p
+
+# Environment variables; printenv prints nothing if the variable is unset.
+echo -n "TI_OCL_INSTALL: "
+printenv TI_OCL_INSTALL
+
+echo -n "\nLD_LIBRARY_PATH: "
+printenv LD_LIBRARY_PATH
+
+echo "\nPATH = "
+printenv PATH
+
+# List loaded kernel modules whose lsmod line mentions "cmem".
+echo "\nCmem module: "
+lsmod | grep cmem
+
+# Verbose listing of PCI devices with device ID b005 (any vendor).
+# NOTE(review): presumably the TI DSP card -- confirm the ID.
+echo "\nPCI Devices: "
+lspci -v -d:b005
+
+
diff --git a/clocl/.gitignore b/clocl/.gitignore
new file mode 100644
index 0000000..5761abc
--- /dev/null
+++ b/clocl/.gitignore
@@ -0,0 +1 @@
+*.o
diff --git a/clocl/CMakeLists.txt b/clocl/CMakeLists.txt
new file mode 100644
index 0000000..e5eec38
--- /dev/null
+++ b/clocl/CMakeLists.txt
@@ -0,0 +1,14 @@
+# x86 build of clocl. The actual build is delegated to the Makefile in this
+# directory (run with make -j4). CROSS_TARGET is set to "cross" here so that
+# the ARM command below passes the Makefile's "cross" goal.
+if (SHANNON_BUILD OR HAWKING_CROSS_COMPILE)
+ add_custom_command(OUTPUT x86/clocl COMMAND make -j4
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+ add_custom_target(x86_clocl DEPENDS x86/clocl)
+ set(CROSS_TARGET cross)
+ install(PROGRAMS x86/clocl DESTINATION bin/x86 ${OCL_BPERMS})
+endif()
+
+# ARM build of clocl. ${CROSS_TARGET} is empty unless the block above ran,
+# in which case make is invoked with the "cross" goal and TARGET=arm.
+if (HAWKING_BUILD)
+ add_custom_command(OUTPUT arm/clocl COMMAND make -j4 ${CROSS_TARGET} TARGET=arm
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+ add_custom_target(arm_clocl DEPENDS arm/clocl)
+ install(PROGRAMS arm/clocl DESTINATION bin/arm ${OCL_BPERMS})
+endif()
diff --git a/clocl/Makefile b/clocl/Makefile
new file mode 100644
index 0000000..4ffccb7
--- /dev/null
+++ b/clocl/Makefile
@@ -0,0 +1,100 @@
+# If not specified, pick a default location for dependent llvm libraries
+LLVM_VERSION = 33
+
+# Root under which the per-architecture LLVM installs are looked up.
+ifeq ($(DEFAULT_DEV_INSTALL_DIR),)
+ DEFAULT_DEV_INSTALL_DIR = /opt/ti
+endif
+
+# ARM and x86 LLVM install prefixes; each may be overridden by the caller.
+ifeq ($(ARM_LLVM_DIR),)
+ ARM_LLVM_DIR = $(DEFAULT_DEV_INSTALL_DIR)/llvm$(LLVM_VERSION)-install-arm
+endif
+
+ifeq ($(X86_LLVM_DIR),)
+ X86_LLVM_DIR = $(DEFAULT_DEV_INSTALL_DIR)/llvm$(LLVM_VERSION)-install-x86
+endif
+
+# Clang libraries linked into clocl.
+CLANG_LIBS = -lclangFrontendTool
+CLANG_LIBS += -lclangFrontend
+CLANG_LIBS += -lclangDriver
+CLANG_LIBS += -lclangSerialization
+CLANG_LIBS += -lclangCodeGen
+CLANG_LIBS += -lclangParse
+CLANG_LIBS += -lclangSema
+CLANG_LIBS += -lclangEdit
+CLANG_LIBS += -lclangAnalysis
+CLANG_LIBS += -lclangAST
+CLANG_LIBS += -lclangLex
+CLANG_LIBS += -lclangBasic
+
+
+EXE = clocl
+
+# Select the native target from `uname -m`: a machine name containing "86"
+# selects x86 (built with g++ -m32), one containing "arm" selects arm.
+# NOTE(review): any other machine name leaves TARGET/LLVM_DIR unset here.
+UNAME_M :=$(shell uname -m)
+ifneq (,$(findstring 86, $(UNAME_M)))
+ BUILD_PROCESSOR := x86
+ TARGET := x86
+ LLVM_DIR := $(X86_LLVM_DIR)
+ CXX := g++ -m32
+else ifneq (,$(findstring arm, $(UNAME_M)))
+ BUILD_PROCESSOR := arm
+ TARGET := arm
+ LLVM_DIR := $(ARM_LLVM_DIR)
+ CXX := g++
+endif
+
+# Compiler/linker flags are obtained from llvm-config of the selected install.
+LLVM_CONFIG_EXECUTABLE = $(LLVM_DIR)/bin/llvm-config
+LLVM_CXXFLAGS = `${LLVM_CONFIG_EXECUTABLE} --cxxflags `
+LLVM_LDFLAGS = -L $(LLVM_DIR)/lib -lpthread -lrt -ldl -lm
+LLVM_LIBS = `${LLVM_CONFIG_EXECUTABLE} --libs ${TARGET} asmparser bitwriter tablegen mcjit debuginfo interpreter irreader jit linker instrumentation ipo mcdisassembler`
+
+# If Cross Compiling for ARM override some make variables
+# (target-specific overrides, active only when the "cross" goal is built).
+cross: override CXX=arm-linux-gnueabihf-g++
+cross: override TARGET=arm
+cross: override LLVM_CONFIG_EXECUTABLE=$(ARM_LLVM_DIR)/bin/llvm-config-host
+cross: override LLVM_LDFLAGS=-L $(ARM_LLVM_DIR)/lib -lpthread -lrt -ldl -lm
+# Need to explicitly add /usr/include when cross compiling to pick up
+# dependent 3rd party non-system headers
+cross: override HOST_USR_INCLUDE=-I/usr/include
+
+
+# Source directories outside clocl/ that contribute objects: WGADIR and
+# POCLDIR are searched by the pattern rules below in addition to this
+# directory.
+WGADIR = ../src/core/dsp
+POCLDIR = ../src/llvmopencl
+OBJS = AllocasToEntry.o BarrierBlock.o BarrierTailReplication.o \
+ BreakConstantGEPs.o CanonicalizeBarriers.o Flatten.o \
+ GenerateHeader.o ImplicitLoopBarriers.o IsolateRegions.o \
+ Kernel.o LLVMUtils.o LoopBarriers.o ParallelRegion.o \
+ PHIsToAllocas.o TargetAddressSpaces.o \
+ VariableUniformityAnalysis.o WIVectorize.o Workgroup.o \
+ WorkItemAliasAnalysis.o WorkitemHandler.o \
+ WorkitemHandlerChooser.o WorkitemLoops.o WorkitemReplication.o\
+ main.o compiler.o wga.o program.o file_manip.o options.o
+
+# Place every object under the per-target output directory (x86/ or arm/).
+OBJS := $(patsubst %.o, $(TARGET)/%.o, $(OBJS))
+
+CXXFLAGS = ${LLVM_CXXFLAGS} -I${WGADIR} -I${POCLDIR} \
+ ${HOST_USR_INCLUDE} -O3 -fexceptions
+LIBS = ${CLANG_LIBS} ${LLVM_LIBS}
+LDFLAGS = ${LLVM_LDFLAGS}
+
+# Link step (default goal). The executable is written to $(TARGET)/clocl,
+# not to a file named after the goal itself.
+$(EXE): ${OBJS}
+ $(CXX) $^ $(LIBS) $(LDFLAGS) -o $(TARGET)/$@
+
+# "cross" only carries the target-specific variable overrides; it builds $(EXE).
+cross: $(EXE)
+
+# Compile rules, one per source location/extension. $(TARGET)/ is an
+# order-only prerequisite so the output directory exists before compiling.
+$(TARGET)/%.o: %.cpp | $(TARGET)/
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+$(TARGET)/%.o: ${WGADIR}/%.cpp | $(TARGET)/
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+$(TARGET)/%.o: ${POCLDIR}/%.cpp | $(TARGET)/
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+$(TARGET)/%.o: ${POCLDIR}/%.cc | $(TARGET)/
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+$(TARGET)/:
+ mkdir -p $@
+
+# Remove the contents of both output directories regardless of TARGET.
+clean:
+ rm -f x86/* arm/*
diff --git a/clocl/compiler.cpp b/clocl/compiler.cpp
new file mode 100644
index 0000000..90ff0ae
--- /dev/null
+++ b/clocl/compiler.cpp
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file compiler.cpp
+ * \brief Compiler wrapper around Clang
+ */
+
+#include "compiler.h"
+#include "options.h"
+
+#include <cstring>
+#include <string>
+#include <sstream>
+#include <iostream>
+#include <clang/Frontend/CompilerInvocation.h>
+#include <clang/Frontend/TextDiagnosticPrinter.h>
+#include <clang/Frontend/LangStandard.h>
+#include <clang/Basic/Diagnostic.h>
+#include <clang/CodeGen/CodeGenAction.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Support/Host.h>
+#include <llvm/Support/MemoryBuffer.h> // ASW
+#include <llvm/IR/Module.h>
+#include <llvm/IR/LLVMContext.h>
+
+std::string get_ocl_dsp();
+
+/**
+ * Start with no module, optimization enabled, no diagnostic printer, and
+ * the log stream bound to the internal log string.
+ */
+Compiler::Compiler()
+    : p_module(0),
+      p_optimize(true),
+      p_log_stream(p_log),
+      p_log_printer(0)
+{
+}
+
+/** Destructor; performs no explicit cleanup of its own. */
+Compiler::~Compiler()
+{
+}
+
+/**
+ * Compile OpenCL C source into an LLVM module.
+ *
+ * Configures the embedded Clang instance (spir target, OpenCL 1.2 language
+ * defaults, no standard include paths), applies the options recognised in
+ * \p options, and runs an EmitLLVMOnly action on a copy of \p source that is
+ * registered under \p filename.
+ *
+ * Recognised options: "-I <path>", "-D <name[=value]>" (argument given as a
+ * separate token), -cl-single-precision-constant, -cl-opt-disable,
+ * -cl-mad-enable, -cl-unsafe-math-optimizations, -cl-finite-math-only,
+ * -cl-fast-relaxed-math, -w and -Werror. Other tokens are ignored.
+ *
+ * \param options  whitespace-separated compiler options
+ * \param source   buffer holding the program text; its contents are copied
+ * \param filename name under which the source is presented to Clang
+ * \return true on success (module() is then valid), false otherwise
+ */
+bool Compiler::compile(const std::string &options,
+                       llvm::MemoryBuffer *source,
+                       std::string filename)
+{
+    /* Set options */
+    p_options = options;
+
+    clang::CodeGenOptions &codegen_opts = p_compiler.getCodeGenOpts();
+    clang::DiagnosticOptions &diag_opts = p_compiler.getDiagnosticOpts();
+    clang::FrontendOptions &frontend_opts = p_compiler.getFrontendOpts();
+    clang::HeaderSearchOptions &header_opts = p_compiler.getHeaderSearchOpts();
+    clang::LangOptions &lang_opts = p_compiler.getLangOpts();
+    clang::TargetOptions &target_opts = p_compiler.getTargetOpts();
+    clang::PreprocessorOptions &prep_opts = p_compiler.getPreprocessorOpts();
+    clang::CompilerInvocation &invocation = p_compiler.getInvocation();
+
+    // Set codegen options
+    codegen_opts.setDebugInfo(clang::CodeGenOptions::NoDebugInfo);
+    codegen_opts.AsmVerbose = true;
+
+    // level 3 is too much for the pocl transformations.
+    codegen_opts.OptimizationLevel = 2;
+
+    // Set diagnostic options
+    diag_opts.Pedantic = true;
+    diag_opts.ShowColumn = true;
+    diag_opts.ShowLocation = true;
+    diag_opts.ShowCarets = false;
+    diag_opts.ShowFixits = true;
+    diag_opts.ShowColors = false;
+    diag_opts.ErrorLimit = 19;
+    diag_opts.MessageLength = 0;
+
+    // Set frontend options
+    frontend_opts.ProgramAction = clang::frontend::EmitLLVMOnly;
+    frontend_opts.DisableFree = true;
+
+    // Set header search options
+    header_opts.Verbose = false;
+    header_opts.UseBuiltinIncludes = false;
+    header_opts.UseStandardSystemIncludes = false;
+    header_opts.UseStandardCXXIncludes = false;
+
+    // Set preprocessor options
+    prep_opts.RetainRemappedFileBuffers = true;
+    if (!opt_builtin)
+    {
+        // Implicitly include the OpenCL C headers unless building builtins.
+        prep_opts.Includes.push_back("clc.h");
+        prep_opts.Includes.push_back("dsp.h");
+    }
+
+    // Set lang options
+    lang_opts.NoBuiltin = true;
+    lang_opts.OpenCL = true;
+    lang_opts.CPlusPlus = false;
+
+    // Set target options
+    // For 6X, use the 'spir' target as it implements opencl specs
+    target_opts.Triple = "spir-unknown-unknown-unknown";
+
+    // Currently, llp6x does not handle fused multiply and add
+    // llvm intrinsics (llvm.fmuladd.*). Disable generating these
+    // intrinsics using clang -ffp-contract=off option
+    codegen_opts.setFPContractMode(clang::CodeGenOptions::FPC_Off);
+
+    // Parse the user options
+    std::istringstream options_stream(options);
+    std::string token;
+    bool Werror = false, inI = false, inD = false;
+
+    /*-------------------------------------------------------------------------
+    * Add OpenCL C header path as a default location for searching for headers
+    *------------------------------------------------------------------------*/
+    header_opts.AddPath(get_ocl_dsp(), clang::frontend::Angled, false, false);
+
+    while (options_stream >> token)
+    {
+        if (inI)
+        {
+            // token is an include path
+            header_opts.AddPath(token, clang::frontend::Angled, false, false);
+            inI = false;
+            continue;
+        }
+
+        if (inD)
+        {
+            // token is name or name=value. The flag must be cleared and the
+            // token consumed here; otherwise every following option would
+            // also be registered as a macro definition.
+            prep_opts.addMacroDef(token);
+            inD = false;
+            continue;
+        }
+
+        if (token == "-I")
+        {
+            inI = true;
+        }
+        else if (token == "-D")
+        {
+            inD = true;
+        }
+        else if (token == "-cl-single-precision-constant")
+        {
+            lang_opts.SinglePrecisionConstants = true;
+        }
+        else if (token == "-cl-opt-disable")
+        {
+            p_optimize = false;
+            codegen_opts.OptimizationLevel = 0;
+        }
+        else if (token == "-cl-mad-enable")
+        {
+            codegen_opts.LessPreciseFPMAD = true;
+        }
+        else if (token == "-cl-unsafe-math-optimizations")
+        {
+            codegen_opts.UnsafeFPMath = true;
+        }
+        else if (token == "-cl-finite-math-only")
+        {
+            codegen_opts.NoInfsFPMath = true;
+            codegen_opts.NoNaNsFPMath = true;
+        }
+        else if (token == "-cl-fast-relaxed-math")
+        {
+            codegen_opts.UnsafeFPMath = true;
+            codegen_opts.NoInfsFPMath = true;
+            codegen_opts.NoNaNsFPMath = true;
+            lang_opts.FastRelaxedMath = true;
+        }
+        else if (token == "-w")
+        {
+            diag_opts.IgnoreWarnings = true;
+        }
+        else if (token == "-Werror")
+        {
+            Werror = true;
+        }
+    }
+
+    // Set invocation options
+    invocation.setLangDefaults(lang_opts, clang::IK_OpenCL,
+                               clang::LangStandard::lang_opencl12);
+
+    // Create the diagnostics engine
+    p_log_printer = new clang::TextDiagnosticPrinter(p_log_stream, &diag_opts);
+    p_compiler.createDiagnostics(p_log_printer);
+
+    if (!p_compiler.hasDiagnostics())
+        return false;
+
+    p_compiler.getDiagnostics().setWarningsAsErrors(Werror);
+
+    // Feed the compiler with source
+    frontend_opts.Inputs.push_back(
+        clang::FrontendInputFile(filename.c_str(), clang::IK_OpenCL));
+
+    // getMemBuffer() references srcc without copying, so srcc must stay
+    // alive until ExecuteAction() below has finished.
+    std::string srcc = source->getBuffer();
+    const llvm::StringRef s_data(srcc);
+    const llvm::StringRef s_name("<source>");
+    llvm::MemoryBuffer *buffer =
+        llvm::MemoryBuffer::getMemBuffer(s_data, s_name);
+
+    prep_opts.addRemappedFile(filename.c_str(), buffer);
+
+    // Compile
+    llvm::OwningPtr<clang::CodeGenAction> act(
+        new clang::EmitLLVMOnlyAction(&llvm::getGlobalContext())
+    );
+
+    const bool succeeded = p_compiler.ExecuteAction(*act);
+
+    // Push buffered diagnostics into p_log before anyone reads log(),
+    // including the failure report just below.
+    p_log_stream.flush();
+
+    if (!succeeded)
+    {
+        // DEBUG
+        std::cout << log() << std::endl;
+        return false;
+    }
+
+    p_module = act->takeModule();
+
+    // uncomment to debug the llvm IR
+    // p_module->dump();
+
+    return true;
+}
+
+/** Accumulated compilation log (diagnostics plus anything appended). */
+const std::string &Compiler::log() const
+{
+    const std::string &current_log = p_log;
+    return current_log;
+}
+
+/** Option string recorded by the most recent call to compile(). */
+const std::string &Compiler::options() const
+{
+    const std::string &recorded = p_options;
+    return recorded;
+}
+
+/** True unless compile() saw -cl-opt-disable among its options. */
+bool Compiler::optimize() const
+{
+    const bool enabled = p_optimize;
+    return enabled;
+}
+
+/** Module produced by compile(); 0 until a compilation has succeeded. */
+llvm::Module *Compiler::module() const
+{
+    llvm::Module *const produced = p_module;
+    return produced;
+}
+
+/** Append \p log verbatim to the end of the internal log string. */
+void Compiler::appendLog(const std::string &log)
+{
+    p_log.append(log);
+}
diff --git a/clocl/compiler.h b/clocl/compiler.h
new file mode 100644
index 0000000..2195cc2
--- /dev/null
+++ b/clocl/compiler.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file compiler.h
+ * \brief Compiler wrapped around Clang
+ */
+
+#ifndef __COMPILER_H__
+#define __COMPILER_H__
+
+#include <string>
+
+#include <clang/Frontend/CompilerInstance.h>
+#include <llvm/Support/raw_ostream.h>
+
+namespace llvm
+{
+ class MemoryBuffer;
+ class Module;
+}
+
+namespace clang
+{
+ class TextDiagnosticPrinter;
+}
+
+
+/**
+ * \brief Compiler using Clang
+ *
+ * This class builds a Clang instance, runs it and then retains compilation logs
+ * and produced data.
+ */
+class Compiler
+{
+ public:
+ /**
+ * \brief Constructor
+ */
+ Compiler();
+ ~Compiler();
+
+ /**
+ * \brief Compile \p source to produce a LLVM module
+ * \param options options given to the compiler, described in the OpenCL spec
+ * \param source source to be compiled
+ * \param filename name under which \p source is registered with Clang as
+ * the input file
+ * \return true if the compilation is successful, false otherwise
+ * \sa module()
+ * \sa log()
+ */
+ bool compile(const std::string &options, llvm::MemoryBuffer *source,
+ std::string filename);
+
+ /**
+ * \brief Compilation log
+ * \note \c appendLog() can also be used to append custom info at the end
+ * of the log, for instance to keep compilation and linking logs
+ * in the same place
+ * \return log
+ */
+ const std::string &log() const;
+
+ /**
+ * \brief Options given at \c compile()
+ * \return options used during compilation
+ */
+ const std::string &options() const;
+
+ /**
+ * \brief Optimization enabled
+ * \return false if -cl-opt-disable was given in the options, true otherwise
+ */
+ bool optimize() const;
+
+ /**
+ * \brief LLVM module generated
+ * \return LLVM module generated by the compilation, 0 if an error occurred
+ */
+ llvm::Module *module() const;
+
+ /**
+ * \brief Append a string to the log
+ *
+ * This function can be used to append linking or code-gen logs to the
+ * internal compilation log kept by this class
+ *
+ * \param log log to be appended
+ */
+ void appendLog(const std::string &log);
+
+ private:
+ clang::CompilerInstance p_compiler; // Clang instance configured and run by compile()
+ llvm::Module *p_module; // result of the last successful compile(), else 0
+ bool p_optimize; // cleared when -cl-opt-disable is seen
+
+ std::string p_log, p_options; // log text; options passed to compile()
+ llvm::raw_string_ostream p_log_stream; // stream writing into p_log
+ clang::TextDiagnosticPrinter *p_log_printer; // diagnostic client created in compile()
+};
+
+#endif
diff --git a/clocl/file_manip.cpp b/clocl/file_manip.cpp
new file mode 100644
index 0000000..5a51f16
--- /dev/null
+++ b/clocl/file_manip.cpp
@@ -0,0 +1,46 @@
+#include "file_manip.h"
+#include <unistd.h>
+
+/** Report whether \p path exists, as determined by access(2) with F_OK. */
+bool fs_exists(std::string path)
+{
+    const int rc = access(path.c_str(), F_OK);
+    return rc == 0;
+}
+
+/**
+ * Return the part of \p path after the last '/', or \p path itself when it
+ * contains no '/'. A path ending in '/' yields an empty string.
+ */
+std::string fs_filename(std::string path)
+{
+    // Keep the index in the string's own unsigned type so the comparison
+    // against npos does not rely on a narrowing conversion to int.
+    const std::string::size_type slash = path.rfind('/');
+    if (slash == std::string::npos) return path;
+    return path.substr(slash + 1);
+}
+
+/**
+ * Return the file name of \p path (see fs_filename) with everything from
+ * its last '.' onwards removed; names without a '.' are returned whole.
+ */
+std::string fs_stem(std::string path)
+{
+    path = fs_filename(path);
+    // size_type avoids narrowing rfind()'s result before the npos check.
+    const std::string::size_type dot = path.rfind('.');
+    if (dot == std::string::npos) return path;
+    return path.substr(0, dot);
+}
+
+/**
+ * Return the suffix of \p path starting at its last '.', including the dot,
+ * or an empty string when \p path contains no '.'.
+ *
+ * NOTE(review): the search covers the whole path, so a '.' in a directory
+ * component is matched when the file name itself has none -- confirm that
+ * callers expect this.
+ */
+std::string fs_ext(std::string path)
+{
+    // size_type avoids narrowing rfind()'s result before the npos check.
+    const std::string::size_type dot = path.rfind('.');
+    if (dot == std::string::npos) return "";
+    return path.substr(dot);
+}
+
+/**
+ * Return the directory part of \p path up to and including its last '/',
+ * or an empty string when \p path contains no '/'.
+ */
+std::string fs_path(std::string path)
+{
+    // size_type avoids narrowing rfind()'s result before the npos check.
+    const std::string::size_type slash = path.rfind('/');
+    if (slash == std::string::npos) return "";
+    return path.substr(0, slash + 1);
+}
+
+/**
+ * Return \p path with the extension of its file name replaced by \p ext.
+ *
+ * \p ext may be given with or without a leading '.'. When the file name has
+ * no extension, \p ext is appended, so the result never equals the input
+ * path; callers derive output file names from a source file name and must
+ * not end up overwriting that source.
+ */
+std::string fs_replace_extension(std::string path, std::string ext)
+{
+    const std::string::size_type slash = path.rfind('/');
+    const std::string::size_type name_begin =
+        (slash == std::string::npos) ? 0 : slash + 1;
+
+    // Only a '.' inside the file name marks an extension; dots in directory
+    // components are left alone.
+    const std::string::size_type dot = path.rfind('.');
+    if (dot != std::string::npos && dot >= name_begin) path.erase(dot);
+
+    // Guard the ext[0] access against an empty extension.
+    if (!ext.empty() && ext[0] == '.') return path + ext;
+    return path + "." + ext;
+}
diff --git a/clocl/file_manip.h b/clocl/file_manip.h
new file mode 100644
index 0000000..7084239
--- /dev/null
+++ b/clocl/file_manip.h
@@ -0,0 +1,12 @@
+#ifndef _FILE_MANIP_H_
+#define _FILE_MANIP_H_
+#include <string>
+
+bool fs_exists (std::string path);
+std::string fs_filename (std::string path);
+std::string fs_stem (std::string path);
+std::string fs_ext (std::string path);
+std::string fs_path (std::string path);
+std::string fs_replace_extension(std::string path, std::string ext);
+
+#endif // _FILE_MANIP_H_
diff --git a/clocl/main.cpp b/clocl/main.cpp
new file mode 100644
index 0000000..05f0925
--- /dev/null
+++ b/clocl/main.cpp
@@ -0,0 +1,396 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <iostream>
+#include <iomanip>
+#include <string>
+#include <fstream>
+#include <sstream>
+#include <cstdlib>
+#include <sys/stat.h>
+
+#include <llvm/Bitcode/ReaderWriter.h>
+#include <llvm/PassManager.h>
+#include <llvm/Analysis/Passes.h>
+#include <llvm/Analysis/Verifier.h>
+#include <llvm/Transforms/Scalar.h>
+#include <llvm/Transforms/IPO.h>
+#include <llvm/Transforms/Utils/UnifyFunctionExitNodes.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/ADT/StringRef.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/Support/Casting.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Linker.h>
+#include <llvm/IR/Metadata.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/Support/InstIterator.h>
+
+#include "compiler.h"
+#include "wga.h"
+#include "file_manip.h"
+#include "options.h"
+
+#include <WorkitemHandlerChooser.h>
+#include <BreakConstantGEPs.h>
+#include <Flatten.h>
+#include <PHIsToAllocas.h>
+#include <IsolateRegions.h>
+#include <VariableUniformityAnalysis.h>
+#include <ImplicitLoopBarriers.h>
+#include <LoopBarriers.h>
+#include <BarrierTailReplication.h>
+#include <CanonicalizeBarriers.h>
+#include <WorkItemAliasAnalysis.h>
+#include <WorkitemReplication.h>
+#include <WorkitemLoops.h>
+#include <AllocasToEntry.h>
+#include <Workgroup.h>
+#include <TargetAddressSpaces.h>
+
+using namespace std;
+using llvm::Module;
+
+bool prepend_headers(string filename, string& source);
+bool run_clang (string filename, string source, Compiler &compiler,
+ Module **module, string& binary);
+bool llvm_xforms (Module *module, bool optimize);
+bool cl6x (string& filename, string &binary_str);
+
+int run_cl6x (string filename, string *llvm_bitcode, string options);
+void write_binary (string filename, const char *buf, int size);
+void write_text (string filename);
+
+
+/**
+ * Write \p module as LLVM bitcode to "<name>.bc" in the current directory,
+ * where <name> is the file-name part of \p filename with its extension
+ * replaced. A failure to open the output file is reported on stdout and
+ * nothing is written.
+ */
+void write_bitcode(string filename, Module* module)
+{
+    const string bc_file(fs_filename(fs_replace_extension(filename, ".bc")));
+    string err_info;
+
+    llvm::raw_fd_ostream file_ostream(bc_file.c_str(), err_info);
+    if (!err_info.empty())
+    {
+        // raw_fd_ostream reports open failures through err_info only.
+        cout << "Unable to open " << bc_file << ": " << err_info << endl;
+        return;
+    }
+
+    llvm::WriteBitcodeToFile(module, file_ostream);
+    file_ostream.flush();
+}
+
+/******************************************************************************
+* main
+******************************************************************************/
+/**
+ * Driver: parse options, then for the first OpenCL C file run the front end,
+ * the LLVM transformations, write the bitcode, invoke the cl6x step and
+ * optionally emit the text header. Exits with -1 if any stage fails.
+ */
+int main(int argc, char *argv[])
+{
+    process_options(argc, argv);
+
+    // Nothing to do without at least one OpenCL C input file.
+    if (files_clc.empty()) return 0;
+
+    string   filename = files_clc[0];
+    string   source;    // OpenCL C program source
+    string   binary;    // Untransformed LLVM bitcode (still per workitem)
+    Module  *module;    // module, evolves during transformation
+    Compiler compiler;
+
+    // The stages run strictly in order; the first failure stops the chain.
+    const bool built = prepend_headers(filename, source)
+                    && run_clang(filename, source, compiler, &module, binary)
+                    && llvm_xforms(module, compiler.optimize());
+    if (!built) exit(-1);
+
+    write_bitcode(filename, module);
+
+    if (!cl6x(filename, binary)) exit(-1);
+
+    if (opt_txt) write_text(filename);
+
+    return 0;
+}
+
+
+
+/******************************************************************************
+* prepend_headers
+******************************************************************************/
+/**
+ * Load the contents of \p filename into \p source.
+ *
+ * Despite the name, nothing is prepended here: the file is read verbatim.
+ * Returns false (after printing a message) when the file does not exist.
+ */
+bool prepend_headers(string filename, string& source)
+{
+    if (!fs_exists(filename))
+    {
+        cout << "File " << filename << " doesn't exist" << endl;
+        return false;
+    }
+
+    // Slurp the whole file through a string stream.
+    ifstream input(filename.c_str());
+    stringstream contents;
+    contents << input.rdbuf();
+
+    source = contents.str();
+    return true;
+}
+
+/******************************************************************************
+* run_clang
+******************************************************************************/
+/**
+ * Compile \p source with \p compiler using the global cl_options.
+ *
+ * On success, *\p module receives the compiler's module and \p binary
+ * receives that module serialised as LLVM bitcode.
+ *
+ * \return true on success, false if compilation failed
+ */
+bool run_clang(string filename, string source, Compiler &compiler,
+               Module **module, string& binary)
+{
+    using llvm::MemoryBuffer;
+    using llvm::StringRef;
+
+    const StringRef s_data(source);
+    const StringRef s_name("<source>");
+
+    MemoryBuffer *buffer = MemoryBuffer::getMemBuffer(s_data, s_name);
+
+    if (opt_verbose) cout << "clang options: " << cl_options << endl;
+
+    // Compiler::compile() copies the buffer contents and does not keep the
+    // pointer, so the wrapper is released here on every path.
+    const bool compiled = compiler.compile(cl_options, buffer, filename);
+    delete buffer;
+
+    if (!compiled)
+        return false;
+
+    *module = compiler.module();
+
+    llvm::raw_string_ostream str_ostream(binary);
+    llvm::WriteBitcodeToFile(*module, str_ostream);
+    str_ostream.flush();
+
+    return true;
+}
+
+/******************************************************************************
+* llvm_xforms
+******************************************************************************/
+/**
+ * Run the LLVM pass pipeline over \p module in place.
+ *
+ * The pipeline internalises everything except the kernels listed in the
+ * "opencl.kernels" metadata (skipped in lib mode), applies the pocl barrier
+ * transformations when the module calls barrier(), runs the optimisation
+ * passes when \p optimize is set, and finally the work-group aggregation
+ * pass (skipped for builtins).
+ *
+ * \return always true
+ */
+bool llvm_xforms(Module *module, bool optimize)
+{
+    // Get list of kernels to strip other unused functions
+    vector<string> api_s;      // owns the kernel names
+    vector<const char *> api;  // views into api_s, built once it is complete
+
+    llvm::NamedMDNode *kern_meta = module->getNamedMetadata("opencl.kernels");
+
+    for (unsigned int i=0; kern_meta && i < kern_meta->getNumOperands(); ++i)
+    {
+        llvm::MDNode *node = kern_meta->getOperand(i);
+        llvm::Value *value = node->getOperand(0);
+        if (!llvm::isa<llvm::Function>(value)) continue;
+
+        llvm::Function *f = llvm::cast<llvm::Function>(value);
+        api_s.push_back(f->getName().str());
+    }
+
+    // Take the c_str() pointers only after api_s has stopped growing, so
+    // they refer to the stored strings and stay valid for the whole function.
+    api.reserve(api_s.size());
+    for (vector<string>::size_type i = 0; i < api_s.size(); ++i)
+        api.push_back(api_s[i].c_str());
+
+    // determine if module has barrier() function calls
+    bool hasBarrier = false;
+    llvm::CallInst* call;
+    for (Module::iterator F = module->begin(),
+                          EF = module->end(); !hasBarrier && F != EF; ++F)
+        for (llvm::inst_iterator I = inst_begin(*F),
+                                 E = inst_end(*F); I != E; ++I)
+        {
+            if (!(call = llvm::dyn_cast<llvm::CallInst>(&*I))) continue;
+            if (!call->getCalledFunction()) continue;
+            string name(call->getCalledFunction()->getName());
+            if (name == "barrier")
+            {
+                hasBarrier = true;
+                break;
+            }
+        }
+
+    // Optimize code
+    llvm::PassManager manager;
+
+    // Common passes (primary goal : remove unused stdlib functions)
+    manager.add(llvm::createTypeBasedAliasAnalysisPass());
+    manager.add(llvm::createBasicAliasAnalysisPass());
+
+    /*-------------------------------------------------------------------------
+    * Do not run this for lib mode as it will result in all functions
+    * being removed if main not found.
+    *------------------------------------------------------------------------*/
+    if (!opt_lib) manager.add(llvm::createInternalizePass(api));
+
+    manager.add(llvm::createIPSCCPPass());
+    manager.add(llvm::createGlobalOptimizerPass());
+    manager.add(llvm::createConstantMergePass());
+    manager.add(llvm::createAlwaysInlinerPass());
+
+    // pocl barrier transformation
+    if (hasBarrier)
+    {
+        manager.add(new llvm::DominatorTree());
+        manager.add(new pocl::WorkitemHandlerChooser());
+        manager.add(new BreakConstantGEPs()); // from pocl
+        // add(new GenerateHeader()); // no need
+        manager.add(new pocl::Flatten());
+        manager.add(llvm::createAlwaysInlinerPass());
+        manager.add(llvm::createGlobalDCEPass());
+        manager.add(llvm::createCFGSimplificationPass());
+        manager.add(llvm::createLoopSimplifyPass());
+        manager.add(new pocl::PHIsToAllocas());
+        manager.add(llvm::createRegionInfoPass());
+        manager.add(new pocl::IsolateRegions());
+        manager.add(new pocl::VariableUniformityAnalysis()); // TODO
+        manager.add(new pocl::ImplicitLoopBarriers());
+        manager.add(new pocl::LoopBarriers());
+        manager.add(new pocl::BarrierTailReplication());
+        manager.add(new pocl::CanonicalizeBarriers());
+        manager.add(new pocl::IsolateRegions());
+        manager.add(new pocl::WorkItemAliasAnalysis());
+        // add(new pocl::WorkitemReplication()); // no need
+        manager.add(new pocl::WorkitemLoops());
+        manager.add(new pocl::AllocasToEntry());
+        // add(new pocl::Workgroup()); // no need
+        manager.add(new pocl::TargetAddressSpaces());
+    }
+
+    if (optimize)
+    {
+        /*---------------------------------------------------------------------
+        * Inspired by code from "The LLVM Compiler Infrastructure"
+        *--------------------------------------------------------------------*/
+        manager.add(llvm::createDeadArgEliminationPass());
+        manager.add(llvm::createInstructionCombiningPass());
+        manager.add(llvm::createFunctionInliningPass());
+        manager.add(llvm::createPruneEHPass());             // Remove dead EH info.
+        manager.add(llvm::createGlobalOptimizerPass());
+        manager.add(llvm::createGlobalDCEPass());           // Remove dead functions.
+        manager.add(llvm::createArgumentPromotionPass());
+        manager.add(llvm::createInstructionCombiningPass());
+        manager.add(llvm::createJumpThreadingPass());
+
+        //ASW TODO maybe turn off re: pete. might gen bad xlator input
+        //manager.add(llvm::createScalarReplAggregatesPass());
+
+        manager.add(llvm::createFunctionAttrsPass());       // Add nocapture.
+        manager.add(llvm::createGlobalsModRefPass());       // IP alias analysis.
+        manager.add(llvm::createLICMPass());                // Hoist loop invariants.
+        manager.add(llvm::createGVNPass());                 // Remove redundancies.
+        manager.add(llvm::createMemCpyOptPass());           // Remove dead memcpys.
+        manager.add(llvm::createDeadStoreEliminationPass());
+        manager.add(llvm::createInstructionCombiningPass());
+        manager.add(llvm::createJumpThreadingPass());
+        manager.add(llvm::createCFGSimplificationPass());
+    }
+
+    /*-------------------------------------------------------------------------
+    * Builtins will not have workitem functions and do not need wga
+    *------------------------------------------------------------------------*/
+    if (!opt_builtin)
+    {
+        manager.add(llvm::createUnifyFunctionExitNodesPass());
+        manager.add(llvm::createTIOpenclWorkGroupAggregationPass(hasBarrier));
+
+        /*---------------------------------------------------------------------
+        * Borrow the pocl alloca hoister for the TI simplistic WGA pass as well
+        *--------------------------------------------------------------------*/
+        if (!hasBarrier)
+            manager.add(new pocl::AllocasToEntry());
+    }
+
+    manager.add(llvm::createGlobalDCEPass());
+    manager.run(*module);
+
+    return true;
+}
+
+
+/******************************************************************************
+* cl6x
+******************************************************************************/
+/**
+ * Run the cl6x step on the bitcode derived from \p filename and, unless
+ * opt_keep is set, delete the intermediate files afterwards.
+ *
+ * \param filename   OpenCL C source file name
+ * \param binary_str bitcode string handed on to run_cl6x()
+ * \return always true (the result of run_cl6x() is not inspected)
+ */
+bool cl6x(string& filename, string &binary_str)
+{
+    const string bc_file_full(fs_replace_extension(filename, ".bc"));
+    const string bc_file(fs_filename(bc_file_full));
+
+    run_cl6x(bc_file, &binary_str, files_other);
+
+    if (opt_keep) return true;
+
+    /*-------------------------------------------------------------------------
+    * Clean up temporary files. Every name is held in a named string so the
+    * pointer passed to unlink() never refers to a destroyed temporary.
+    *------------------------------------------------------------------------*/
+    // NOTE(review): write_bitcode() creates the .bc under its bare file name
+    // (current directory) while this removes the path-qualified name --
+    // confirm which is intended when the source lives in another directory.
+    unlink(bc_file_full.c_str());
+
+    if (!opt_lib)
+    {
+        const string obj_file(fs_replace_extension(bc_file_full, ".obj"));
+        unlink(obj_file.c_str());
+    }
+
+    const string stem(fs_stem(bc_file_full));
+
+    const string bitasm_asm(stem + "_bc.asm");
+    unlink(bitasm_asm.c_str());
+
+    const string bitasm_obj(stem + "_bc.obj");
+    unlink(bitasm_obj.c_str());
+
+    return true;
+}
+
+/******************************************************************************
+* write_text
+******************************************************************************/
+/**
+ * Emit "<stem>.dsp_h", a C header that embeds the bytes of "<stem>.out" as
+ * a char array plus a length variable. Both files are addressed by the bare
+ * file name of \p filename, i.e. relative to the current directory.
+ */
+void write_text(string filename)
+{
+    filename = fs_filename(filename);
+    const string outfile(fs_replace_extension(filename, ".out"));
+    const string hfile(fs_replace_extension(filename, ".dsp_h"));
+    const string symbol(fs_stem(filename));
+
+    // Read the whole binary image into memory.
+    ifstream image(outfile.c_str());
+    stringstream bufss;
+    bufss << image.rdbuf();
+    string buf(bufss.str());
+
+    ofstream header(hfile.c_str(), ios::out);
+
+    // The length is written before switching the stream to hex.
+    header << "unsigned int " << symbol
+           << "_dsp_bin_len = " << buf.length() << ";"
+           << endl;
+
+    header << "char " << symbol << "_dsp_bin[] = { ";
+
+    // hex, fill and case are sticky; only the width must be set per value.
+    header << hex << setfill('0') << nouppercase;
+
+    int val = buf[0] & 0xff;
+    header << "0x" << setw(2) << val << endl;
+
+    for (string::size_type i = 1; i < buf.length(); ++i)
+    {
+        val = buf[i] & 0xff;
+        header << ", 0x" << setw(2) << val;
+        if (i % 13 == 0) header << endl;   // wrap the initialiser list
+    }
+
+    header << endl << "};" << endl;
+    header.close();
+}
+
diff --git a/clocl/options.cpp b/clocl/options.cpp
new file mode 100644
index 0000000..a6d9a87
--- /dev/null
+++ b/clocl/options.cpp
@@ -0,0 +1,223 @@
+#include <stdio.h> /* for printf */
+#include <stdlib.h> /* for exit */
+#include <getopt.h>
+
+#include <string>
+#include <vector>
+#include <iostream>
+#include <iterator>
+#include <algorithm>
+
+#include "file_manip.h"
+
+using std::cout;
+using std::endl;
+using std::string;
+using std::vector;
+using std::ostream_iterator;
+
+/*-----------------------------------------------------------------------------
+* Option flags filled in by process_options().  The first seven are written
+* by getopt_long_only() through the 'flag' pointers in the long_options
+* table, so when set they hold the table's 'val' character (non-zero), not 1.
+*----------------------------------------------------------------------------*/
+int opt_help = 0;
+int opt_verbose = 0;
+int opt_keep = 0;
+int opt_debug = 0;    // also set to 1 by -g and by -cl-opt-disable
+int opt_lib = 0;
+int opt_txt = 0;
+int opt_w = 0;        // set to 1 by -w
+int opt_Werror = 0;   // set to 1 by -Werror
+int opt_builtin = 0;
+
+/*-----------------------------------------------------------------------------
+* Strings and file lists accumulated by process_options()
+*----------------------------------------------------------------------------*/
+string cl_options;          // OpenCL build options; cl_incdef is appended last
+string cl_incdef;           // -I<dir> and -D<name> options
+string opts_other;          // options that were not recognized
+vector<string> files_clc;   // arguments with a .clc or .cl extension
+vector<string> files_c;     // arguments with a .c extension
+string files_other;         // all other file arguments, space separated
+
+/******************************************************************************
+* void print_options()
+*
+* Dumps the parsed command line to stdout: the clocl flags that are on, the
+* accumulated OpenCL options, the CL C file that will be compiled (the first
+* entry of files_clc), the link files, and everything that is being ignored
+* (unrecognized options, additional CL C files and all .c files).
+******************************************************************************/
+void print_options()
+{
+    cout << endl;
+
+    if (opt_keep) printf ("Option keep : on\n");
+    if (opt_debug) printf ("Option debug : on\n");
+    if (opt_lib) printf ("Option lib : on\n");
+    if (opt_txt) printf ("Option txt : on\n");
+    if (opt_w) printf ("Option w : on\n");
+    if (opt_Werror) printf ("Option Werror : on\n");
+    //if (opt_builtin) printf ("Option builtin: on\n");
+
+    cout << endl;
+
+    /*-------------------------------------------------------------------------
+    * files_clc is empty when no .cl/.clc file was given (e.g. "clocl -v");
+    * indexing element 0 in that case is undefined behavior, so print an
+    * empty name instead.
+    *------------------------------------------------------------------------*/
+    const string first_clc(files_clc.empty() ? string() : files_clc[0]);
+
+    cout << "CL C Options : " << cl_options << endl;
+    cout << "Incls/Defines : " << cl_incdef << endl;
+    cout << "CL C file : " << first_clc << endl;
+    cout << "Link Files : " << files_other << endl;
+
+    cout << endl;
+
+    cout << "Ignored Opts : " << opts_other << endl;
+    cout << "Ignored Files : ";
+
+    /*-------------------------------------------------------------------------
+    * Only the first CL C file is used; any further ones are ignored
+    *------------------------------------------------------------------------*/
+    if (files_clc.size() > 1)
+        std::copy(files_clc.begin()+1, files_clc.end(),
+                  ostream_iterator<string>(cout, " "));
+
+    std::copy(files_c.begin(), files_c.end(),
+              ostream_iterator<string>(cout, " "));
+    cout << endl << endl;
+}
+
+/******************************************************************************
+* void print_help()
+*
+* Writes the usage text to stdout, one table entry per line, and then
+* terminates the process with exit(-1).  Does not return.
+******************************************************************************/
+void print_help()
+{
+    static const char *const help_lines[] =
+    {
+        "",
+        "Usage: clocl [options] <OpenCL C file> [<link files>]",
+        "",
+        "Options passed to clocl are either options to control",
+        "clocl behavior or they are documented OpenCL 1.1 build",
+        "options.",
+        "",
+        "The clocl behavior options are: ",
+        " -h, --help : Print this help screen",
+        " -v, --verbose : Print verbose messages",
+        " -k, --keep : Do not delete temp compilation files",
+        " -g, --debug : Generate debug symbols",
+        " -t, --txt : Generate object in header form",
+        " -l, --lib : Do not link. Stop after compilation.",
+        "",
+        "The OpenCL 1.1 build options. Refer to 1.1 spec for desc:",
+        " -D<name>",
+        " -D<name>=<val>",
+        " -I<dir>",
+        " -w",
+        " -Werror",
+        " -cl-single-precision-constant",
+        " -cl-denorms-are-zero",
+        " -cl-opt-disable",
+        " -cl-mad-enable",
+        " -cl-no-signed-zeros",
+        " -cl-unsafe-math-optimizations",
+        " -cl-finite-math-only",
+        " -cl-fast-relaxed-math",
+        " -cl-std=<val>",
+        ""
+    };
+
+    const size_t line_count = sizeof(help_lines) / sizeof(help_lines[0]);
+
+    for (size_t line = 0; line < line_count; ++line)
+        cout << help_lines[line] << endl;
+
+    exit(-1);
+}
+
+/******************************************************************************
+* void process_options(int argc, char **argv)
+*
+* Parses the command line with getopt_long_only() and fills in the opt_*
+* flags, cl_options, cl_incdef, opts_other, files_clc, files_c and
+* files_other.  After parsing it prints the options if --verbose was given,
+* prints the help (which exits) if --help was given, and finally appends
+* cl_incdef to cl_options.
+******************************************************************************/
+void process_options(int argc, char **argv)
+{
+ int c;
+ int digit_optind = 0; // not used anywhere in this function
+
+ while (1)
+ {
+ static struct option long_options[] = {
+
+ /*-----------------------------------------------------------------
+ * clocl options
+ * A non-null flag pointer makes getopt store 'val' in that
+ * variable and return 0.
+ *----------------------------------------------------------------*/
+ {"help", no_argument, &opt_help, 'h' },
+ {"verbose", no_argument, &opt_verbose, 'v' },
+ {"keep", no_argument, &opt_keep, 'k' },
+ {"debug", no_argument, &opt_debug, 'g' },
+ {"lib", no_argument, &opt_lib, 'l' },
+ {"txt", no_argument, &opt_txt, 't' },
+ {"builtin", no_argument, &opt_builtin, 'b' },
+
+ /*-----------------------------------------------------------------
+ * opencl 1.1 options
+ * These also make getopt return 0 (null flag, val 0); they are
+ * told apart by name in "case 0" below.
+ *----------------------------------------------------------------*/
+ {"Werror", no_argument, 0, 0 },
+ {"cl-std", required_argument, 0, 0 },
+ {"cl-single-precision-constant", no_argument, 0, 0 },
+ {"cl-denorms-are-zero", no_argument, 0, 0 },
+ {"cl-opt-disable", no_argument, 0, 0 },
+ {"cl-mad-enable", no_argument, 0, 0 },
+ {"cl-no-signed-zeros", no_argument, 0, 0 },
+ {"cl-unsafe-math-optimizations", no_argument, 0, 0 },
+ {"cl-finite-math-only", no_argument, 0, 0 },
+ {"cl-fast-relaxed-math", no_argument, 0, 0 },
+ {0, 0, 0, 0 }
+ };
+
+ // Index of the argv element getopt is about to look at; used below to
+ // recover the original text of the argument.
+ int this_option_optind = optind ? optind : 1;
+ int option_index = 0;
+
+ opterr = 0; // prevent getopt from printing warnings
+
+ // The leading '-' in the option string makes getopt return each
+ // non-option argument in order as option code 1.
+ c = getopt_long_only(argc, argv, "-gwI:D:", long_options,
+ &option_index);
+ if (c == -1) break;
+
+ switch (c)
+ {
+ // A long option was matched
+ case 0:
+ {
+ string name(long_options[option_index].name);
+
+ // clocl's own options: the flag variable is already set
+ if (name == "help" || name == "verbose" ||
+ name == "keep" || name == "debug" ||
+ name == "lib" || name == "txt" ||
+ name == "builtin") break;
+
+ // Rebuilt from optarg rather than copied from argv
+ if (name == "cl-std")
+ {
+ cl_options += " -cl-std=";
+ cl_options += optarg;
+ break;
+ }
+
+ // No break after these two: the option text is still appended to
+ // cl_options below.
+ if (name == "Werror") opt_Werror = 1; // continues below
+ if (name == "cl-opt-disable") opt_debug = 1; // continues below
+
+ cl_options += " ";
+ cl_options += argv[this_option_optind];
+ break;
+ }
+
+ // A non-option argument: classify the file by its extension
+ case 1:
+ {
+ string fname(argv[this_option_optind]);
+ string ext(fs_ext(fname));
+
+ if (ext == ".clc") files_clc.push_back(fname);
+ else if (ext == ".cl") files_clc.push_back(fname);
+ else if (ext == ".c") files_c.push_back(fname);
+ else { files_other += fname; files_other += " "; }
+
+ break;
+ }
+
+ case 'g': opt_debug = 1; break;
+ case 'w': opt_w = 1; cl_options += " -w"; break;
+
+ // -D<arg> / -I<arg>: re-assembled as " -D<arg>" / " -I<arg>"
+ case 'D':
+ case 'I':
+ cl_incdef += " -";
+ cl_incdef += c;
+ cl_incdef += optarg;
+ break;
+
+ // Option getopt could not parse: remember it as ignored
+ case '?':
+ opts_other += " ";
+ opts_other += argv[this_option_optind];
+ break;
+
+ default:
+ opts_other += " -";
+ opts_other += c;
+ break;
+ }
+ }
+
+ // Printed before cl_incdef is merged into cl_options below
+ if (opt_verbose) print_options();
+ if (opt_help) print_help();
+
+ cl_options += " ";
+ cl_options += cl_incdef;
+}
diff --git a/clocl/options.h b/clocl/options.h
new file mode 100644
index 0000000..086e49c
--- /dev/null
+++ b/clocl/options.h
@@ -0,0 +1,24 @@
+#ifndef _OPTIONS_H_
+#define _OPTIONS_H_
+
+/*-----------------------------------------------------------------------------
+* Declarations of the command line state defined in options.cpp.
+* <vector> is included here because the declarations below use std::vector;
+* without it the header only compiles if the includer happens to have pulled
+* in <vector> first.
+*----------------------------------------------------------------------------*/
+#include <string>
+#include <vector>
+
+/*-----------------------------------------------------------------------------
+* Option flags: zero when off, non-zero when the option was given
+*----------------------------------------------------------------------------*/
+extern int opt_help;
+extern int opt_verbose;
+extern int opt_keep;
+extern int opt_debug;
+extern int opt_lib;
+extern int opt_txt;
+extern int opt_w;
+extern int opt_Werror;
+extern int opt_builtin;
+
+/*-----------------------------------------------------------------------------
+* Option strings and input files collected by process_options()
+*----------------------------------------------------------------------------*/
+extern std::string cl_options;
+extern std::string cl_incdef;
+extern std::vector<std::string> files_clc;
+extern std::vector<std::string> files_c;
+extern std::string files_other;
+
+/*-----------------------------------------------------------------------------
+* Parse argv and fill in all of the variables declared above
+*----------------------------------------------------------------------------*/
+void process_options(int argc, char **argv);
+
+#endif //_OPTIONS_H_
diff --git a/clocl/program.cpp b/clocl/program.cpp
new file mode 100644
index 0000000..0674bbe
--- /dev/null
+++ b/clocl/program.cpp
@@ -0,0 +1,189 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <llvm/PassManager.h>
+#include <llvm/Analysis/Passes.h>
+#include <llvm/Analysis/Verifier.h>
+#include <llvm/Transforms/Scalar.h>
+#include <llvm/Transforms/IPO.h>
+#include <llvm/Transforms/Utils/UnifyFunctionExitNodes.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+#include "wga.h"
+#include "file_manip.h"
+#include "options.h"
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <elf.h>
+
+using namespace std;
+
+/******************************************************************************
+* Find the C6000 CGT installation
+*
+* Returns the value of the TI_OCL_CGT_INSTALL environment variable.  If the
+* variable is not set, a message is printed to stdout and the process is
+* terminated with abort().
+******************************************************************************/
+char *get_cgt_install()
+{
+    char *cgt_dir = getenv("TI_OCL_CGT_INSTALL");
+
+    if (cgt_dir) return cgt_dir;
+
+    std::cout << "The environment variable TI_OCL_CGT_INSTALL must be set to a "
+              << std::endl
+              << "directory path where the C6000 compiler tools are installed. "
+              << std::endl;
+
+    abort();
+}
+
+/******************************************************************************
+* Find the OpenCL installation
+*
+* Returns the value of the TI_OCL_INSTALL environment variable.  If the
+* variable is not set, a message is printed to stdout and the process is
+* terminated with abort().
+******************************************************************************/
+char *get_ocl_install()
+{
+    char *ocl_dir = getenv("TI_OCL_INSTALL");
+
+    if (ocl_dir) return ocl_dir;
+
+    std::cout << "The environment variable TI_OCL_INSTALL must be set to a "
+              << std::endl
+              << "directory path where the TI OpenCL product is installed. "
+              << std::endl;
+
+    abort();
+}
+
+/******************************************************************************
+* Find the directory holding the OpenCL DSP files
+*
+* Returns "/usr/share/ti/opencl/dsp" when that path exists and is a
+* directory; otherwise returns "<TI_OCL_INSTALL>/dsp" (get_ocl_install()
+* aborts if that environment variable is not set).
+******************************************************************************/
+std::string get_ocl_dsp()
+{
+    const char *stdpath = "/usr/share/ti/opencl/dsp";
+
+    /*-------------------------------------------------------------------------
+    * stat() leaves 'st' untouched when it fails (e.g. the path does not
+    * exist), so st_mode may only be inspected after a successful call.
+    *------------------------------------------------------------------------*/
+    struct stat st;
+    if (stat(stdpath, &st) == 0 && S_ISDIR(st.st_mode)) return stdpath;
+
+    return string(get_ocl_install()) + "/dsp";
+}
+
+/******************************************************************************
+* run_cl6x
+*
+* Builds a cl6x command line for the bitcode file 'filename' and runs it with
+* system().  With --lib the command only compiles; otherwise it also links
+* to "<filename with .out extension>" and, unless debugging, strips the
+* result with strip6x.
+*
+* filename     : bitcode file passed to cl6x via --bc_file=
+* llvm_bitcode : if not NULL, its bytes are written to "<stem>_bc.asm" as a
+*                ".llvmir" section and that file is added to the command
+* addl_files   : extra files placed at the end of the link command
+* returns      : true (1) on every path; the exit statuses of the spawned
+*                tools are not propagated to the caller.
+******************************************************************************/
+int run_cl6x(string filename, string *llvm_bitcode, string addl_files)
+{
+    string command("cl6x --f -q --abi=eabi --use_g3 -mv6600 -mt -mo ");
+
+    if (opt_keep) command += "-mw -k --z ";
+
+    command += "--disable:sploop ";
+
+    if (opt_debug) command += "-g -o0 ";
+    else           command += "-o3 ";
+
+    const char *cgt_install = get_cgt_install();
+
+    command += "-I"; command += cgt_install; command += "/include ";
+    command += "-I"; command += cgt_install; command += "/lib ";
+    command += "-I"; command += get_ocl_dsp().c_str(); command += " ";
+
+    command += "--bc_file="; command += filename; command += " ";
+
+    /*-------------------------------------------------------------------------
+    * Encode LLVM bitcode as bytes in the .llvmir section of the .asm file
+    *------------------------------------------------------------------------*/
+    if (llvm_bitcode != NULL)
+    {
+        string bitasm_name(fs_stem(filename));
+        bitasm_name += "_bc.asm";
+
+        ofstream outasmfile(bitasm_name.c_str(), ios::out);
+        outasmfile << "\t.sect \".llvmir\"\n" << "\t.retain";
+
+        // Ten values per .byte directive
+        int nbytes = llvm_bitcode->size();
+        for (int i = 0; i < nbytes; i++)
+        {
+            if (i % 10 == 0)
+                outasmfile << "\n\t.byte " << (int) llvm_bitcode->at(i);
+            else
+                outasmfile << ", " << (int) llvm_bitcode->at(i);
+        }
+        outasmfile.close();
+
+        command += bitasm_name; command += " ";
+    }
+
+    /*-------------------------------------------------------------------------
+    * Library mode: compile only, no link and no strip
+    *------------------------------------------------------------------------*/
+    if (opt_lib)
+    {
+        if (opt_verbose) cout << command << endl;
+        int status = system(command.c_str());
+        (void) status;
+        return true;
+    }
+
+    string outfile(fs_replace_extension(filename, ".out"));
+
+    command += "-z ";
+    command += "-o ";
+    command += outfile;
+    command += " ";
+
+    if (opt_keep)
+    {
+        command += "-m ";
+        command += fs_replace_extension(filename, ".map");
+        command += " ";
+    }
+
+    /*-------------------------------------------------------------------------
+    * Any libraries or object files need to go last to resolve references
+    *------------------------------------------------------------------------*/
+    command += addl_files;
+    command += " -ldsp.syms ";
+
+    if (opt_verbose) cout << command << endl;
+    int status = system(command.c_str());
+
+    if (!opt_debug)
+    {
+        string strip_command("strip6x ");
+        strip_command += outfile;
+        if (opt_verbose) cout << strip_command << endl;
+        status = system(strip_command.c_str());
+    }
+    (void) status;
+
+    /*-------------------------------------------------------------------------
+    * Flowing off the end of a non-void function is undefined behavior;
+    * return the same value as the library-mode path above.
+    *------------------------------------------------------------------------*/
+    return true;
+}
diff --git a/cmake/CMakeARMToolChain.txt b/cmake/CMakeARMToolChain.txt
new file mode 100644
index 0000000..9d6847b
--- /dev/null
+++ b/cmake/CMakeARMToolChain.txt
@@ -0,0 +1,53 @@
+# CMake toolchain file for cross compiling to ARM Linux with the
+# arm-linux-gnueabihf GCC toolchain.
+#
+# Environment variables read:
+#   DEFAULT_DEV_INSTALL_DIR - root for dependent packages (default /opt/ti)
+#   LINUX_DEVKIT_ROOT       - target sysroot providing ARM headers/libraries
+
+# this one is important
+SET(CMAKE_SYSTEM_NAME Linux)
+# this one not so much
+SET(CMAKE_SYSTEM_VERSION 1)
+
+SET(CMAKE_SYSTEM_PROCESSOR ARM)
+# Tells the rest of the build that this is a cross compile
+SET(HAWKING_CROSS_COMPILE ON)
+
+# specify the cross compiler
+SET(CMAKE_C_COMPILER arm-linux-gnueabihf-gcc)
+SET(CMAKE_CXX_COMPILER arm-linux-gnueabihf-g++)
+
+# For external builds, paths to dependent libraries/packages are assumed to
+# be specified by environment variables.
+# For internal development builds setup a default path for dependent libs/pkgs
+if ("$ENV{DEFAULT_DEV_INSTALL_DIR}" STREQUAL "")
+ set(DEFAULT_DEV_INSTALL_DIR /opt/ti)
+else()
+ set(DEFAULT_DEV_INSTALL_DIR $ENV{DEFAULT_DEV_INSTALL_DIR})
+endif()
+
+# Point to the development kit file system where we should get ARM
+# libraries and headers
+if ("$ENV{LINUX_DEVKIT_ROOT}" STREQUAL "")
+ SET(LINUX_DEVKIT_ROOT ${DEFAULT_DEV_INSTALL_DIR}/mcsdk_linux_3_00_04_18/linux-devkit/arago-2013.12/sysroots/cortexa15hf-vfp-neon-3.8-oe-linux-gnueabi)
+ MESSAGE(STATUS "Environment variable LINUX_DEVKIT_ROOT not set. "
+ "Assuming the linux devkit filesystem is installed "
+ "at ${LINUX_DEVKIT_ROOT}")
+else()
+ SET(LINUX_DEVKIT_ROOT $ENV{LINUX_DEVKIT_ROOT})
+endif()
+
+
+# Roots that find_library()/find_path() are re-rooted under (see the
+# *_MODE_* settings below)
+SET(CMAKE_FIND_ROOT_PATH ${LINUX_DEVKIT_ROOT}/usr/include
+ ${LINUX_DEVKIT_ROOT}/usr/lib )
+MESSAGE(STATUS "Devkit installation at " ${LINUX_DEVKIT_ROOT})
+
+# search for programs in the build host directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+# for libraries and headers in the target directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH true)
+
+# The OpenCL package has dependencies on some non-system headers (BOOST, GL)
+# The cross compiler will not include the host machine's /usr/include
+# directory in its default search so we need to explicitly add it to
+# include_directories(). HOWEVER, CMAKE will ignore attempts to
+# add '/usr/include' to include_directories(). Hence the '../' in the
+# path specified here.
+# NOTE: Ensure this appears in the include list after target system headers
+# (search for references to CMAKE_FIND_ROOT_PATH)
+set(HOST_USR_INCLUDE_PATH /usr/include/../include)
diff --git a/cmake/modules/FindCheck.cmake b/cmake/modules/FindCheck.cmake
new file mode 100644
index 0000000..d7a5bcd
--- /dev/null
+++ b/cmake/modules/FindCheck.cmake
@@ -0,0 +1,57 @@
+# - Try to find the CHECK libraries
+# Once done this will define
+#
+# Note: This module is originally found in opensync project
+#
+# CHECK_FOUND - system has check
+# CHECK_INCLUDE_DIRS - the check include directory
+# CHECK_LIBRARIES - check library
+#
+# Optional input:
+# CHECK_MIN_VERSION - if set, passed to pkg-config as "check>=<version>"
+#
+# Copyright (c) 2007 Daniel Gollub <gollub@b1-systems.de>
+# Copyright (c) 2007-2009 Bjoern Ricks <bjoern.ricks@gmail.com>
+#
+# Redistribution and use is allowed according to the terms of the New
+# BSD license.
+# For details see the accompanying COPYING-CMAKE-SCRIPTS file.
+
+
+INCLUDE( FindPkgConfig )
+
+# Forward REQUIRED from find_package(Check REQUIRED) to the pkg-config lookup
+IF ( Check_FIND_REQUIRED )
+ SET( _pkgconfig_REQUIRED "REQUIRED" )
+ELSE( Check_FIND_REQUIRED )
+ SET( _pkgconfig_REQUIRED "" )
+ENDIF ( Check_FIND_REQUIRED )
+
+# First choice: ask pkg-config, with a minimum version if one was requested
+IF ( CHECK_MIN_VERSION )
+ PKG_SEARCH_MODULE( CHECK ${_pkgconfig_REQUIRED} check>=${CHECK_MIN_VERSION} )
+ELSE ( CHECK_MIN_VERSION )
+ PKG_SEARCH_MODULE( CHECK ${_pkgconfig_REQUIRED} check )
+ENDIF ( CHECK_MIN_VERSION )
+
+# Look for CHECK include dir and libraries
+# (manual fallback, taken only when pkg-config itself is not available)
+IF( NOT CHECK_FOUND AND NOT PKG_CONFIG_FOUND )
+
+ FIND_PATH( CHECK_INCLUDE_DIRS check.h )
+
+ FIND_LIBRARY( CHECK_LIBRARIES NAMES check )
+
+ IF ( CHECK_INCLUDE_DIRS AND CHECK_LIBRARIES )
+ SET( CHECK_FOUND 1 )
+ IF ( NOT Check_FIND_QUIETLY )
+ MESSAGE ( STATUS "Found CHECK: ${CHECK_LIBRARIES}" )
+ ENDIF ( NOT Check_FIND_QUIETLY )
+ ELSE ( CHECK_INCLUDE_DIRS AND CHECK_LIBRARIES )
+ IF ( Check_FIND_REQUIRED )
+ MESSAGE( FATAL_ERROR "Could NOT find CHECK" )
+ ELSE ( Check_FIND_REQUIRED )
+ IF ( NOT Check_FIND_QUIETLY )
+ MESSAGE( STATUS "Could NOT find CHECK" )
+ ENDIF ( NOT Check_FIND_QUIETLY )
+ ENDIF ( Check_FIND_REQUIRED )
+ ENDIF ( CHECK_INCLUDE_DIRS AND CHECK_LIBRARIES )
+ENDIF( NOT CHECK_FOUND AND NOT PKG_CONFIG_FOUND )
+
+# Hide advanced variables from CMake GUIs
+MARK_AS_ADVANCED( CHECK_INCLUDE_DIRS CHECK_LIBRARIES )
+
diff --git a/cmake/modules/FindClang.cmake b/cmake/modules/FindClang.cmake
new file mode 100644
index 0000000..b4ca3fc
--- /dev/null
+++ b/cmake/modules/FindClang.cmake
@@ -0,0 +1,73 @@
+# Detect CLANG
+#
+# Requires LLVM_INCLUDE_DIR and LLVM_LIB_DIR to be set already (see
+# FindLLVM.cmake). Sets:
+# CLANG_LIBS - full paths of the clang libraries that were found
+# CLANG_INCLUDE_DIRS - LLVM_INCLUDE_DIR appended to any existing value
+# CLANG_EXECUTABLE - the clang binary to run
+# CLANG_FOUND - TRUE when CLANG_EXECUTABLE is set
+if (NOT LLVM_INCLUDE_DIR OR NOT LLVM_LIB_DIR)
+ message(FATAL_ERROR "No LLVM, Clang support requires LLVM")
+else (NOT LLVM_INCLUDE_DIR OR NOT LLVM_LIB_DIR)
+
+# Look for library <_libname_> in LLVM_LIB_DIR only and, if present, append
+# its path to CLANG_LIBS. A missing library is skipped without any message.
+MACRO(FIND_AND_ADD_CLANG_LIB _libname_)
+find_library(CLANG_${_libname_}_LIB NAMES ${_libname_} HINTS ${LLVM_LIB_DIR} NO_DEFAULT_PATH NO_CMAKE_FIND_ROOT_PATH)
+if (CLANG_${_libname_}_LIB)
+ set(CLANG_LIBS ${CLANG_LIBS} ${CLANG_${_libname_}_LIB})
+endif(CLANG_${_libname_}_LIB)
+ENDMACRO(FIND_AND_ADD_CLANG_LIB)
+
+message(STATUS "LLVM_LIB_DIR: " ${LLVM_LIB_DIR})
+set(CLANG_INCLUDE_DIRS ${CLANG_INCLUDE_DIRS} ${LLVM_INCLUDE_DIR})
+
+
+# The order below becomes the order of CLANG_LIBS
+FIND_AND_ADD_CLANG_LIB(clangFrontendTool)
+FIND_AND_ADD_CLANG_LIB(clangFrontend)
+FIND_AND_ADD_CLANG_LIB(clangDriver)
+FIND_AND_ADD_CLANG_LIB(clangSerialization)
+FIND_AND_ADD_CLANG_LIB(clangCodeGen)
+FIND_AND_ADD_CLANG_LIB(clangParse)
+FIND_AND_ADD_CLANG_LIB(clangSema)
+FIND_AND_ADD_CLANG_LIB(clangEdit)
+FIND_AND_ADD_CLANG_LIB(clangAnalysis)
+#FIND_AND_ADD_CLANG_LIB(clangIndex) Removed in 3.1
+#FIND_AND_ADD_CLANG_LIB(clangRewrite) Now clangRewriteCore,clangRewriteFrontend
+FIND_AND_ADD_CLANG_LIB(clangAST)
+FIND_AND_ADD_CLANG_LIB(clangLex)
+FIND_AND_ADD_CLANG_LIB(clangBasic)
+
+MESSAGE(STATUS "Clang libs: " ${CLANG_LIBS})
+
+# Shamrock builds take clang from LLVM_BIN_DIR when it is set, otherwise the
+# bare name "clang" is used
+if (SHAMROCK_BUILD)
+if (LLVM_BIN_DIR)
+ set(CLANG_EXECUTABLE ${LLVM_BIN_DIR}/clang)
+else (LLVM_BIN_DIR)
+ set(CLANG_EXECUTABLE clang)
+endif(LLVM_BIN_DIR)
+
+else(SHAMROCK_BUILD)
+
+# We build using a specific version of llvm/clang so point to it instead
+# of using any installed version (see FindLLVM.cmake)
+if (HAWKING_CROSS_COMPILE OR SHANNON_BUILD)
+ set (CLANG_INSTALL_DIR ${X86_LLVM_DIR})
+else()
+ set (CLANG_INSTALL_DIR ${ARM_LLVM_DIR})
+endif()
+
+# NO_DEFAULT_PATH: only ${CLANG_INSTALL_DIR}/bin is searched
+find_program(CLANG_EXECUTABLE
+ NAMES clang
+ PATHS
+ ${CLANG_INSTALL_DIR}/bin
+ NO_DEFAULT_PATH
+)
+endif(SHAMROCK_BUILD)
+
+
+if(CLANG_EXECUTABLE)
+ set(CLANG_FOUND TRUE)
+endif(CLANG_EXECUTABLE)
+
+if(CLANG_FOUND)
+ message(STATUS "Found Clang: ${CLANG_EXECUTABLE}")
+else(CLANG_FOUND)
+ if(CLANG_FIND_REQUIRED)
+ message(FATAL_ERROR "Could NOT find Clang")
+ endif(CLANG_FIND_REQUIRED)
+endif(CLANG_FOUND)
+
+
+endif (NOT LLVM_INCLUDE_DIR OR NOT LLVM_LIB_DIR)
diff --git a/cmake/modules/FindLLVM.cmake b/cmake/modules/FindLLVM.cmake
new file mode 100644
index 0000000..18a44c9
--- /dev/null
+++ b/cmake/modules/FindLLVM.cmake
@@ -0,0 +1,168 @@
+# Detect LLVM and set various variables to link against the different
+# components of LLVM
+#
+# NOTE: This is a modified version of the module originally found in the
+# OpenGTL project at www.opengtl.org
+#
+# LLVM_BIN_DIR : directory with LLVM binaries
+# LLVM_LIB_DIR : directory with LLVM library
+# LLVM_INCLUDE_DIR : directory with LLVM include
+#
+# LLVM_COMPILE_FLAGS : compile flags to build a program using LLVM headers
+# LLVM_LDFLAGS : ldflags needed to link
+# LLVM_LIBS_CORE : ldflags needed to link against a LLVM core library
+#
+# Build-kind variables consulted: SHAMROCK_BUILD, HAWKING_BUILD,
+# SHANNON_BUILD, HAWKING_CROSS_COMPILE.
+# Environment variables consulted: ARM_LLVM_DIR, X86_LLVM_DIR.
+
+if (LLVM_INCLUDE_DIR)
+  set(LLVM_FOUND TRUE)
+else (LLVM_INCLUDE_DIR)
+
+# Set up variables to point to an LLVM installation for the desired target
+# (x86 or arm). For now, on Shannon builds, we build 32 vs 64 x86 based on the
+# machine we're building on. Future work is to specify a 32/64-bit x86 build
+# independent on the machine we're building on.
+
+# Use uname to get build platform. The paths to the target LLVM builds have
+# arm/x86/x86_64 in the name so munge to match our convention
+EXEC_PROGRAM(uname ARGS -m OUTPUT_VARIABLE BUILD_PROCESSOR)
+STRING(REGEX MATCH "^arm" IS_ARM_HOST ${BUILD_PROCESSOR})
+if(IS_ARM_HOST)
+  set(LLVM_HOST_PROCESSOR arm)
+endif()
+
+STRING(REGEX MATCH "^i686" IS_X86_HOST ${BUILD_PROCESSOR})
+if(IS_X86_HOST)
+  set(LLVM_HOST_PROCESSOR x86)
+endif()
+
+STRING(REGEX MATCH "^x86_64" IS_X86_64_HOST ${BUILD_PROCESSOR})
+if(IS_X86_64_HOST)
+  # For Hawking just use the x86 version when running clang
+  if (HAWKING_BUILD)
+    set(LLVM_HOST_PROCESSOR x86)
+  else()
+    set(LLVM_HOST_PROCESSOR x86_64)
+  endif()
+endif()
+
+# Version of LLVM we are currently based off of
+set(LLVM_VERSION 350)
+
+if (NOT SHAMROCK_BUILD)
+# Set up llvm paths, using environment variables if defined
+if ("$ENV{ARM_LLVM_DIR}" STREQUAL "")
+  set(ARM_LLVM_DIR ${DEFAULT_DEV_INSTALL_DIR}/llvm${LLVM_VERSION}-install-arm)
+  MESSAGE(STATUS "Environment variable ARM_LLVM_DIR not set. "
+    "Assuming that the OpenCL ARM LLVM installation is at ${ARM_LLVM_DIR}")
+else()
+  set (ARM_LLVM_DIR $ENV{ARM_LLVM_DIR})
+endif()
+
+# X86_LLVM_DIR is consumed below under SHANNON_BUILD, so it has to be set up
+# for that build kind too. The previous test only looked at a variable named
+# SHANNON; it is still honoured for backward compatibility.
+if (HAWKING_CROSS_COMPILE OR SHANNON OR SHANNON_BUILD)
+if ("$ENV{X86_LLVM_DIR}" STREQUAL "")
+  set (X86_LLVM_DIR ${DEFAULT_DEV_INSTALL_DIR}/llvm${LLVM_VERSION}-install-${LLVM_HOST_PROCESSOR})
+  MESSAGE(STATUS "Environment variable X86_LLVM_DIR not set. "
+    "Assuming that the OpenCL x86 LLVM installation is at ${X86_LLVM_DIR}")
+else()
+  set (X86_LLVM_DIR $ENV{X86_LLVM_DIR})
+endif()
+endif()
+
+# Set llvm path to appropriate target llvm install
+if (HAWKING_BUILD)
+  set (LLVM_INSTALL_DIR ${ARM_LLVM_DIR})
+elseif(SHANNON_BUILD)
+  set (LLVM_INSTALL_DIR ${X86_LLVM_DIR})
+endif()
+message(STATUS "LLVM installation is in ${LLVM_INSTALL_DIR}")
+endif(NOT SHAMROCK_BUILD)
+
+# Find appropriate llvm-config executable
+if (HAWKING_CROSS_COMPILE)
+  set(LLVM_CONFIG_NAME llvm-config-host)
+else()
+  set(LLVM_CONFIG_NAME llvm-config)
+endif()
+
+find_program(LLVM_CONFIG_EXECUTABLE
+  NAMES ${LLVM_CONFIG_NAME}
+  PATHS
+  ${LLVM_INSTALL_DIR}/bin
+  ${LLVM_CONFIG_PATH}
+  /usr/bin
+  /usr/local/bin
+  /opt/local/bin
+)
+
+# Everything below runs llvm-config, so stop with a clear message if it was
+# not found instead of failing later on its missing output.
+if (NOT LLVM_CONFIG_EXECUTABLE)
+  message(FATAL_ERROR "Could NOT find ${LLVM_CONFIG_NAME}; set LLVM_CONFIG_PATH to the directory that contains it")
+endif()
+
+# Sanity check to ensure we're pointing at an LLVM version we think we are
+exec_program(${LLVM_CONFIG_EXECUTABLE} ARGS --version OUTPUT_VARIABLE REPORTED_LLVM_VERSION )
+
+# "3.5.0" -> "350" so it can be compared with LLVM_VERSION above
+STRING(REPLACE "." "" REPORTED_LLVM_VERSION "${REPORTED_LLVM_VERSION}")
+if(NOT "${REPORTED_LLVM_VERSION}" STREQUAL "${LLVM_VERSION}")
+  message(FATAL_ERROR "ERROR!: llvm-config reports a different version than what is expected (${REPORTED_LLVM_VERSION} != ${LLVM_VERSION})")
+endif()
+
+# Macro to build up list of llvm libraries
+# Runs "llvm-config --libs <_libname_>", moves every entry ending in ".o"
+# into OBJECT_VAR and leaves the remaining output in LIB_VAR.
+MACRO(FIND_LLVM_LIBS LLVM_CONFIG_EXECUTABLE _libname_ LIB_VAR OBJECT_VAR)
+  exec_program( ${LLVM_CONFIG_EXECUTABLE} ARGS --libs ${_libname_} OUTPUT_VARIABLE ${LIB_VAR} )
+  STRING(REGEX MATCHALL "[^ ]*[.]o[ $]" ${OBJECT_VAR} ${${LIB_VAR}})
+  SEPARATE_ARGUMENTS(${OBJECT_VAR})
+  STRING(REGEX REPLACE "[^ ]*[.]o[ $]" "" ${LIB_VAR} ${${LIB_VAR}})
+ENDMACRO(FIND_LLVM_LIBS)
+
+
+# Set up LLVM paths
+# NOTE: from here on LLVM_VERSION holds the string reported by llvm-config
+# rather than the dot-less number set above.
+exec_program(${LLVM_CONFIG_EXECUTABLE} ARGS --version OUTPUT_VARIABLE LLVM_VERSION )
+exec_program(${LLVM_CONFIG_EXECUTABLE} ARGS --bindir OUTPUT_VARIABLE LLVM_BIN_DIR )
+exec_program(${LLVM_CONFIG_EXECUTABLE} ARGS --libdir OUTPUT_VARIABLE LLVM_LIB_DIR )
+exec_program(${LLVM_CONFIG_EXECUTABLE} ARGS --includedir OUTPUT_VARIABLE LLVM_INCLUDE_DIR )
+
+MESSAGE(STATUS "LLVM VERSION " ${LLVM_VERSION})
+MESSAGE(STATUS "LLVM BIN DIR " ${LLVM_BIN_DIR})
+MESSAGE(STATUS "LLVM LIB DIR " ${LLVM_LIB_DIR})
+MESSAGE(STATUS "LLVM INCLUDE_DIR DIR " ${LLVM_INCLUDE_DIR})
+
+# Set up compile/link flags
+exec_program(${LLVM_CONFIG_EXECUTABLE} ARGS --ldflags OUTPUT_VARIABLE LLVM_LDFLAGS )
+STRING(REPLACE " -lz" "" LLVM_LDFLAGS ${LLVM_LDFLAGS})
+
+
+exec_program(${LLVM_CONFIG_EXECUTABLE} ARGS --cxxflags OUTPUT_VARIABLE LLVM_COMPILE_FLAGS )
+
+# Drop flags reported by llvm-config that this project does not build with
+STRING(REPLACE " -fno-rtti" "" LLVM_COMPILE_FLAGS ${LLVM_COMPILE_FLAGS})
+STRING(REPLACE " -fno-exceptions" "" LLVM_COMPILE_FLAGS ${LLVM_COMPILE_FLAGS})
+STRING(REPLACE " -Wcast-qual" "" LLVM_COMPILE_FLAGS ${LLVM_COMPILE_FLAGS})
+
+# Do a case insensitive check for "Debug" build, then remove "-O3" if so:
+SET( temp ${CMAKE_BUILD_TYPE})
+STRING(TOLOWER "${temp}" temp_lower)
+if( temp_lower STREQUAL "debug" )
+  STRING(REPLACE " -O3" "" LLVM_COMPILE_FLAGS ${LLVM_COMPILE_FLAGS})
+ENDIF(temp_lower STREQUAL "debug" )
+
+MESSAGE(STATUS "LLVM CXX flags: " ${LLVM_COMPILE_FLAGS})
+MESSAGE(STATUS "LLVM LD flags: " ${LLVM_LDFLAGS})
+
+
+# Generate list of LLVM libraries to link against
+# (the second test previously read SHARMROCK_BUILD, a misspelling that no
+# other part of the build sets, so Shamrock builds never selected a target)
+if (SHANNON_BUILD)
+  set (LLVM_LIB_TARGET X86)
+elseif(HAWKING_BUILD OR SHAMROCK_BUILD)
+  set (LLVM_LIB_TARGET ARM)
+endif()
+
+exec_program(${LLVM_CONFIG_EXECUTABLE} ARGS --libs ${LLVM_LIB_TARGET} asmparser native bitwriter tablegen jit mcjit debuginfo interpreter linker irreader instrumentation ipo mcdisassembler option objcarcopts profiledata OUTPUT_VARIABLE LLVM_LIBS_CORE )
+MESSAGE(STATUS "LLVM core libs: " ${LLVM_LIBS_CORE})
+
+if(LLVM_INCLUDE_DIR)
+  set(LLVM_FOUND TRUE)
+endif(LLVM_INCLUDE_DIR)
+
+if(LLVM_FOUND)
+  message(STATUS "Found LLVM: ${LLVM_INCLUDE_DIR}")
+else(LLVM_FOUND)
+  if(LLVM_FIND_REQUIRED)
+    message(FATAL_ERROR "Could NOT find LLVM")
+  endif(LLVM_FIND_REQUIRED)
+endif(LLVM_FOUND)
+
+endif (LLVM_INCLUDE_DIR)
diff --git a/cmem/Makefile b/cmem/Makefile
new file mode 100644
index 0000000..f08b611
--- /dev/null
+++ b/cmem/Makefile
@@ -0,0 +1,49 @@
+#*
+#*
+#* Copyright (C) 2012-2014 Texas Instruments Incorporated - http://www.ti.com/
+#*
+#*
+#* Redistribution and use in source and binary forms, with or without
+#* modification, are permitted provided that the following conditions
+#* are met:
+#*
+#* Redistributions of source code must retain the above copyright
+#* notice, this list of conditions and the following disclaimer.
+#*
+#* Redistributions in binary form must reproduce the above copyright
+#* notice, this list of conditions and the following disclaimer in the
+#* documentation and/or other materials provided with the
+#* distribution.
+#*
+#* Neither the name of Texas Instruments Incorporated nor the names of
+#* its contributors may be used to endorse or promote products derived
+#* from this software without specific prior written permission.
+#*
+#*
+#*
+#* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#*
+
+# Out-of-tree kernel module build: cmem_dev.ko is built from cmem.c against
+# the build tree of the currently running kernel.
+CFILES := cmem.c
+obj-m := cmem_dev.o
+
+KVERSION := $(shell uname -r)
+cmem_dev-objs := $(CFILES:.c=.o)
+
+# Reuse KVERSION instead of running "uname -r" a second time
+KDIR := /lib/modules/$(KVERSION)/build
+PWD := $(shell pwd)
+
+# Neither target produces a file of its own name; without .PHONY a stray
+# file called "all" or "clean" would stop the recipe from running.
+.PHONY: all clean
+
+all:
+	$(MAKE) -C $(KDIR) M=$(PWD) modules
+clean:
+	$(MAKE) -C $(KDIR) M=$(PWD) clean
diff --git a/cmem/cmem.c b/cmem/cmem.c
new file mode 100644
index 0000000..65c2e70
--- /dev/null
+++ b/cmem/cmem.c
@@ -0,0 +1,668 @@
+/*
+ *
+ * Copyright (C) 2012-2014 Texas Instruments Incorporated - http://www.ti.com/
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * Neither the name of Texas Instruments Incorporated nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+*/
+
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/err.h>
+
+#include <linux/fcntl.h>
+#include <linux/poll.h>
+#include <linux/cdev.h>
+
+#include <linux/dmaengine.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+
+#include "cmemcfg.h"
+
+#ifdef CMEM_CFG_USE_DMA_COHERENT_ALLOC
+#define PCI_ALLOC
+#endif
+#ifdef PCI_ALLOC /* TODO: Need to remove dependance on PCI */
+#include <linux/pci.h>
+#endif
+
+#include "cmem.h"
+
+//#define DEBUG_ON
+static dev_t cmem_dev_id; /* device number obtained from alloc_chrdev_region() in cmem_init() */
+static struct cdev cmem_cdev; /* character device bound to cmem_fops in cmem_init() */
+static struct class *cmem_class; /* class created in cmem_init(), destroyed in cmem_cleanup() */
+static int cmem_major; /* MAJOR(cmem_dev_id) */
+static int cmem_minor; /* set to 0 by cmem_init() */
+
+struct device *cmem_dev; /* result of device_create(); target of the dev_info()/dev_err() calls */
+cmem_host_buf_info_t *cmem_pers_host_buf_info; /* list of persistent (type == 0) buffers; NULL while none exists */
+cmem_host_buf_info_t *cmem_dyn_host_buf_info; /* list of dynamic (type != 0) buffers; NULL while none exists */
+
+
+static spinlock_t l_lock; /* initialised in cmem_init(); NOTE(review): never acquired anywhere in this file */
+static wait_queue_head_t l_read_wait; /* initialised in cmem_init(); NOTE(review): nothing in this file waits on or wakes it */
+
+#ifdef PCI_ALLOC /* TODO : temporarily keeping it here to get DMA buffer associated with device driver */
+struct pci_dev *ti667x_pci_dev[1];
+#define TI667X_PCI_VENDOR_ID 0x104c /* TI */
+#define TI667X_PCI_DEVICE_ID 0xb005 /* C6678 */
+#define TI667X_PCIE_DRVNAME "ti6678_pcie_ep"
+
+/**
+ * ti667x_ep_find_device() - Locate the first TI667X device that is not a bridge
+ *
+ * Iterates over the PCI devices carrying the TI vendor/device ID. A device
+ * whose class reads "PCI bridge" is taken to be a TI667X acting as root
+ * complex (the RC driver sets that class during enumeration) and is skipped.
+ * The first remaining device is stored in ti667x_pci_dev[0].
+ *
+ * Note: the class test must follow any change in how the RC driver sets (or
+ * does not set) the class.
+ *
+ * Return: 0 when a device was stored, -1 when no suitable device exists (in
+ * which case ti667x_pci_dev[0] is NULL).
+ */
+static int ti667x_ep_find_device(void)
+{
+    struct pci_dev *candidate;
+
+    ti667x_pci_dev[0] = NULL;
+
+    candidate = pci_get_device(TI667X_PCI_VENDOR_ID, TI667X_PCI_DEVICE_ID, NULL);
+    if (candidate == NULL) {
+        pr_info(TI667X_PCIE_DRVNAME ": No TI PCI device found @0x%p\n", candidate);
+        return -1;
+    }
+
+    pr_info(TI667X_PCIE_DRVNAME ": Found TI667x PCIe EP @0x%p\n", candidate);
+
+    /* Step past every bridge-class match until an endpoint shows up. */
+    for (;;) {
+        if ((candidate->class >> 8) != PCI_CLASS_BRIDGE_PCI)
+            break;
+
+        pr_warning(TI667X_PCIE_DRVNAME ": skipping TI667x PCIe RC...\n");
+        candidate = pci_get_device(TI667X_PCI_VENDOR_ID, TI667X_PCI_DEVICE_ID, candidate);
+        if (candidate == NULL) {
+            pr_info(TI667X_PCIE_DRVNAME ": No non bridge TI PCI device found @0x%p\n", candidate);
+            return (-1);
+        }
+    }
+
+    ti667x_pci_dev[0] = candidate;
+    return 0;
+}
+#else
+
+typedef struct _reserved_mem_area_t {
+ uint64_t start_addr; /* first address of the area */
+ size_t size; /* length of the area in bytes */
+} reserved_mem_area_t;
+
+static reserved_mem_area_t persistent_mem_area; /* filled from RESERVED_CONSISTENT_MEM_* in cmem_init() */
+static reserved_mem_area_t dynamic_mem_area; /* filled from RESERVED_DYNAMIC_MEM_* in cmem_init() */
+
+uint64_t host_buf_alloc_ptr; /* next address handed out from persistent_mem_area; reset to start_addr on free */
+uint64_t host_dyn_buf_alloc_ptr; /* next address handed out from dynamic_mem_area; reset to start_addr on free */
+#endif
+
+/**
+* cmem_ioctl() - Application interface for cmem module
+* @filp: file the request was issued on; not referenced in the body
+* @cmd: CMEM_IOCTL_ALLOC_HOST_BUFFERS, CMEM_IOCTL_GET_HOST_BUF_INFO or
+* CMEM_IOCTL_FREE_HOST_BUFFERS; any other value yields -1
+* @arg: address of the request structure, cast to cmem_ioctl_t * for ALLOC
+* and to cmem_host_buf_info_t * for GET and FREE
+*
+* ALLOC, type == 0 (persistent): the first request creates the
+* cmem_pers_host_buf_info list, obtains one buffer per requested entry (via
+* dma_alloc_coherent() when PCI_ALLOC is defined, otherwise by advancing
+* host_buf_alloc_ptr through persistent_mem_area) and records the entries.
+* Once entries are recorded, later requests allocate nothing: they are
+* answered from the recorded entries, provided they ask for no more buffers
+* and no larger lengths than were recorded.
+*
+* ALLOC, type != 0 (dynamic): every request obtains new buffers and appends
+* them to cmem_dyn_host_buf_info, enlarging the list when it already exists.
+*
+* GET: copies the persistent list header into the structure at @arg and then
+* memcpy()s CMEM_MAX_BUF_PER_ALLOC entries (see the inline note).
+*
+* FREE: releases all persistent (type == 0) or all dynamic (type != 0)
+* buffers together with their list; when PCI_ALLOC is not defined it rewinds
+* the matching allocation pointer to the start of its area.
+*
+* Return: 0 on success, -1 on any failure or unknown @cmd.
+*
+* NOTE(review) - points to confirm before relying on this function:
+* - @arg is dereferenced as an ordinary pointer; no copy_from_user() or
+*   copy_to_user() call appears in this file.
+* - num_buffers is never compared with CMEM_MAX_BUF_PER_ALLOC, the size of
+*   buf_info[] in cmem_ioctl_host_buf_info_t.
+* - the global lists and allocation pointers are read and written without
+*   taking l_lock or any other lock.
+* - the "return (-1)" statements inside the allocation loops do not release
+*   what earlier iterations obtained. In the persistent case the list is
+*   left allocated with num_buffers still 0, so a later, larger request
+*   would reuse a buf_info array sized for the failed one.
+*/
+long cmem_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ int ret = 0 ;
+
+ switch (cmd) {
+ case CMEM_IOCTL_ALLOC_HOST_BUFFERS:
+ {
+ int i;
+
+ cmem_ioctl_host_buf_info_t *host_buf_info = &(((cmem_ioctl_t *) arg)->host_buf_info); /* request is read and answered in place through arg */
+ if(host_buf_info->type == 0) {
+ /* Consistent buffer allocation */
+
+#ifndef PCI_ALLOC
+ /* All previous allocations removed */
+ host_buf_alloc_ptr = (persistent_mem_area.start_addr);
+#endif
+ if(!cmem_pers_host_buf_info) /* no list yet: create header and entry array */
+ {
+ cmem_pers_host_buf_info = kmalloc(sizeof(cmem_host_buf_info_t), GFP_KERNEL);
+ if(cmem_pers_host_buf_info == NULL)
+ {
+ dev_err(cmem_dev, "kmalloc 1 failed ");
+ return(-1);
+ }
+ memset(cmem_pers_host_buf_info, 0, sizeof(cmem_host_buf_info_t));
+ cmem_pers_host_buf_info->buf_info
+ = kmalloc((host_buf_info->num_buffers*sizeof(cmem_host_buf_entry_t)), GFP_KERNEL); /* entry array sized by the request's num_buffers */
+ if(cmem_pers_host_buf_info->buf_info == NULL)
+ {
+ dev_err(cmem_dev, "kmalloc 2 failed ");
+ goto err_kmalloc2;
+ }
+ memset(cmem_pers_host_buf_info->buf_info, 0,
+ (host_buf_info->num_buffers*sizeof(cmem_host_buf_entry_t)));
+ }
+ /* If already allocated , just return the already allocated addresses */
+ if(cmem_pers_host_buf_info->num_buffers != 0)
+ {
+ if(host_buf_info->num_buffers > cmem_pers_host_buf_info->num_buffers)
+ {
+ dev_err(cmem_dev, "Failed number of buffer exceed from previous allocation; Previous %d, current %x\n", cmem_pers_host_buf_info->num_buffers, host_buf_info->num_buffers);
+ return(-1);
+ }
+ for(i =0; i < host_buf_info->num_buffers; i++) {
+ if( host_buf_info->buf_info[i].length > cmem_pers_host_buf_info->buf_info[i].length)
+ {
+ dev_err(cmem_dev, "Failed length mismatch with previous allocation %d\n", i);
+ return(-1);
+ }
+ host_buf_info->buf_info[i].virtAddr = cmem_pers_host_buf_info->buf_info[i].virtAddr;
+ host_buf_info->buf_info[i].dmaAddr = cmem_pers_host_buf_info->buf_info[i].dmaAddr;
+ dev_info(cmem_dev,
+ "Returning previously allocated Persistent Host memory in Pcie space %d: Base Address: 0x%llx: Virtual Address 0x%p : Size 0x%x \n",
+ i, host_buf_info->buf_info[i].dmaAddr,
+ host_buf_info->buf_info[i].virtAddr,
+ (unsigned int)host_buf_info->buf_info[i].length);
+ }
+ break; /* leaves the switch with ret == 0; nothing new was allocated */
+ }
+ /* Do New allocation */
+ for(i =0; i < host_buf_info->num_buffers; i++) {
+#ifdef PCI_ALLOC
+ host_buf_info->buf_info[i].virtAddr = (uint8_t *)dma_alloc_coherent(&ti667x_pci_dev[0]->dev, host_buf_info->buf_info[i].length, (dma_addr_t *)&host_buf_info->buf_info[i].dmaAddr, GFP_KERNEL);
+#else
+ host_buf_info->buf_info[i].virtAddr = 0;
+ if((host_buf_alloc_ptr+host_buf_info->buf_info[i].length) >
+ (persistent_mem_area.start_addr + persistent_mem_area.size)) {
+ host_buf_info->buf_info[i].dmaAddr = 0; /* request does not fit: 0 marks the failure tested below */
+ } else {
+ host_buf_info->buf_info[i].dmaAddr = host_buf_alloc_ptr;
+ }
+ host_buf_alloc_ptr += host_buf_info->buf_info[i].length; /* advanced even when the request did not fit */
+#endif
+ if(host_buf_info->buf_info[i].dmaAddr == 0 ) {
+#ifdef PCI_ALLOC
+ dev_err(&ti667x_pci_dev[0]->dev, "Failed allocation of Persistent Host memory %d\n", i);
+#else
+ dev_err(cmem_dev, "Failed allocation of Persistent Host memory %d\n", i);
+#endif
+ return (-1); /* NOTE(review): buffers 0..i-1 stay allocated and num_buffers is still 0 */
+ } else {
+ dev_info(cmem_dev,
+ "Allocated Persistent Host memory in Pcie space %d: Base Address: 0x%llx: Virtual Address 0x%p : Size 0x%x \n",
+ i, host_buf_info->buf_info[i].dmaAddr,
+ host_buf_info->buf_info[i].virtAddr,
+ (unsigned int)host_buf_info->buf_info[i].length);
+ }
+ cmem_pers_host_buf_info->buf_info[i] = host_buf_info->buf_info[i]; /* keep a driver-side copy of the entry */
+ }
+
+ cmem_pers_host_buf_info->num_buffers = host_buf_info->num_buffers; /* only set once every buffer succeeded */
+ } else {
+ /* Dynamic buffer allocation */
+ if(cmem_dyn_host_buf_info) /* list exists: grow the entry array by num_buffers */
+ {
+ cmem_host_buf_entry_t *tmp_buf_infoP;
+
+ tmp_buf_infoP
+ = kmalloc(((cmem_dyn_host_buf_info->num_buffers+host_buf_info->num_buffers)
+ *sizeof(cmem_host_buf_entry_t)), GFP_KERNEL);
+ if(tmp_buf_infoP == NULL)
+ {
+ dev_err(cmem_dev, "kmalloc 5 failed ");
+ return(-1);
+ }
+ memset(tmp_buf_infoP, 0,
+ ((cmem_dyn_host_buf_info->num_buffers+host_buf_info->num_buffers)
+ *sizeof(cmem_host_buf_entry_t)));
+ /* copy and free old buffer list */
+ memcpy(tmp_buf_infoP, cmem_dyn_host_buf_info->buf_info,
+ cmem_dyn_host_buf_info->num_buffers*sizeof(cmem_host_buf_entry_t));
+ kfree(cmem_dyn_host_buf_info->buf_info);
+ cmem_dyn_host_buf_info->buf_info = tmp_buf_infoP;
+
+ }
+ else
+ {
+ cmem_dyn_host_buf_info = kmalloc(sizeof(cmem_host_buf_info_t), GFP_KERNEL);
+ if(cmem_dyn_host_buf_info == NULL)
+ {
+ dev_err(cmem_dev, "kmalloc 3 failed ");
+ return(-1);
+ }
+
+ memset(cmem_dyn_host_buf_info, 0, sizeof(cmem_host_buf_info_t));
+ cmem_dyn_host_buf_info->buf_info
+ = kmalloc((host_buf_info->num_buffers*sizeof(cmem_host_buf_entry_t)), GFP_KERNEL);
+ if(cmem_dyn_host_buf_info->buf_info == NULL)
+ {
+ dev_err(cmem_dev, "kmalloc 4 failed ");
+ goto err_kmalloc4;
+ }
+ memset(cmem_dyn_host_buf_info->buf_info, 0,
+ (host_buf_info->num_buffers*sizeof(cmem_host_buf_entry_t)));
+ }
+ for(i =0; i < host_buf_info->num_buffers; i++)
+ {
+#ifdef PCI_ALLOC
+ host_buf_info->buf_info[i].virtAddr = (uint8_t *)dma_alloc_coherent(&ti667x_pci_dev[0]->dev, host_buf_info->buf_info[i].length, (dma_addr_t *)&host_buf_info->buf_info[i].dmaAddr, GFP_KERNEL);
+#else
+ host_buf_info->buf_info[i].virtAddr = 0;
+ if((host_dyn_buf_alloc_ptr+host_buf_info->buf_info[i].length) >
+ (dynamic_mem_area.start_addr + dynamic_mem_area.size)) {
+ host_buf_info->buf_info[i].dmaAddr = 0; /* request does not fit: 0 marks the failure tested below */
+ } else {
+ host_buf_info->buf_info[i].dmaAddr = host_dyn_buf_alloc_ptr;
+ }
+
+ host_dyn_buf_alloc_ptr += host_buf_info->buf_info[i].length; /* advanced even when the request did not fit */
+
+#endif
+ if(host_buf_info->buf_info[i].dmaAddr == 0 ) {
+#ifdef PCI_ALLOC
+ dev_err(&ti667x_pci_dev[0]->dev, "Failed allocation of Dynamic Host memory %d\n", i);
+#else
+ dev_err(cmem_dev, "Failed allocation of Dynamic Host memory %d\n", i);
+
+#endif
+ return (-1); /* entries appended by earlier iterations remain in the list */
+ } else {
+ dev_info(cmem_dev,
+ "Allocated Host memory in Pcie space %d: Base Address: 0x%llx: Virtual Address 0x%p : Size 0x%x \n",
+ i, host_buf_info->buf_info[i].dmaAddr,
+ host_buf_info->buf_info[i].virtAddr,
+ (unsigned int)host_buf_info->buf_info[i].length);
+ }
+
+ /* Keep a local copy of this in driver */
+ cmem_dyn_host_buf_info->buf_info[cmem_dyn_host_buf_info->num_buffers] = host_buf_info->buf_info[i];
+ dev_info(cmem_dev, " Copied buffer %d, Base address: 0x%llx: Size 0x%x \n",
+ cmem_dyn_host_buf_info->num_buffers, cmem_dyn_host_buf_info->buf_info[cmem_dyn_host_buf_info->num_buffers].dmaAddr,
+ cmem_dyn_host_buf_info->buf_info[cmem_dyn_host_buf_info->num_buffers].length);
+ cmem_dyn_host_buf_info->num_buffers++; /* the list grows one entry per successful buffer */
+ }
+ }
+ }
+ break;
+
+ case CMEM_IOCTL_GET_HOST_BUF_INFO:
+ {
+ cmem_host_buf_info_t *host_buf_info
+ = (cmem_host_buf_info_t *) arg;
+ if(cmem_pers_host_buf_info)
+ {
+ int i=0;
+ /* TODO: copy the buffer info */
+ *(host_buf_info) = *cmem_pers_host_buf_info; /* struct copy: this also overwrites the caller's buf_info pointer with the driver's */
+ memcpy(host_buf_info->buf_info, &cmem_pers_host_buf_info->buf_info[i],
+ CMEM_MAX_BUF_PER_ALLOC*sizeof(cmem_host_buf_entry_t)); /* NOTE(review): after the struct copy, source and destination are the same array, and the length ignores num_buffers - confirm intent */
+ }
+ }
+ break;
+
+ case CMEM_IOCTL_FREE_HOST_BUFFERS:
+ {
+ /* TODO: Currently free clears off all buffers :
+ May need to provide dynamic alloc and free later*/
+ cmem_host_buf_info_t *host_buf_info = (cmem_host_buf_info_t *) arg; /* only the type field is read */
+ if(host_buf_info->type == 0) {
+#ifdef PCI_ALLOC
+ int i;
+
+ if(cmem_pers_host_buf_info) {
+ for(i =0; i < cmem_pers_host_buf_info->num_buffers; i++) {
+ if(cmem_pers_host_buf_info->buf_info[i].dmaAddr != 0) {
+ dma_free_coherent(&ti667x_pci_dev[0]->dev,
+ cmem_pers_host_buf_info->buf_info[i].length,
+ cmem_pers_host_buf_info->buf_info[i].virtAddr,
+ cmem_pers_host_buf_info->buf_info[i].dmaAddr);
+ dev_info(&ti667x_pci_dev[0]->dev,
+ "Freed Host memory %d: Base Address: 0x%llx Size : 0x%x\n",
+ i, cmem_pers_host_buf_info->buf_info[i].dmaAddr,
+ cmem_pers_host_buf_info->buf_info[i].length);
+ }
+ }
+ }
+#else
+ host_buf_alloc_ptr = (persistent_mem_area.start_addr);
+#endif
+
+ if(cmem_pers_host_buf_info) {
+ if(cmem_pers_host_buf_info->buf_info)
+ kfree(cmem_pers_host_buf_info->buf_info);
+ kfree(cmem_pers_host_buf_info);
+ cmem_pers_host_buf_info = NULL; /* the next persistent ALLOC starts from scratch */
+ }
+ dev_info(cmem_dev,
+ "Freed all Persistent memory allocation \n");
+
+ } else {
+#ifdef PCI_ALLOC
+ int i;
+ if(cmem_dyn_host_buf_info) {
+ for(i =0; i < cmem_dyn_host_buf_info->num_buffers; i++) {
+ if(cmem_dyn_host_buf_info->buf_info[i].dmaAddr != 0) {
+ dma_free_coherent(&ti667x_pci_dev[0]->dev,
+ cmem_dyn_host_buf_info->buf_info[i].length,
+ cmem_dyn_host_buf_info->buf_info[i].virtAddr,
+ cmem_dyn_host_buf_info->buf_info[i].dmaAddr);
+ dev_info(&ti667x_pci_dev[0]->dev,
+ "Freed Host memory %d: Base Address: 0x%llx Size : 0x%x\n",
+ i, cmem_dyn_host_buf_info->buf_info[i].dmaAddr,
+ cmem_dyn_host_buf_info->buf_info[i].length);
+ }
+ }
+ }
+#else
+ host_dyn_buf_alloc_ptr = (dynamic_mem_area.start_addr);
+#endif
+ if(cmem_dyn_host_buf_info)
+ {
+ if(cmem_dyn_host_buf_info->buf_info)
+ kfree(cmem_dyn_host_buf_info->buf_info);
+ kfree(cmem_dyn_host_buf_info);
+ cmem_dyn_host_buf_info = NULL; /* the next dynamic ALLOC starts a new list */
+ }
+ dev_info(cmem_dev,
+ "Freed all Dynamic memory allocation \n");
+
+ }
+ }
+ break;
+
+ default:
+ ret = -1;
+ break;
+ }
+ return ret;
+err_kmalloc2: /* entry array for the persistent list could not be allocated: drop the header */
+ kfree(cmem_pers_host_buf_info);
+ cmem_pers_host_buf_info = NULL;
+ return(-1);
+err_kmalloc4: /* entry array for the dynamic list could not be allocated: drop the header */
+ kfree(cmem_dyn_host_buf_info);
+ cmem_dyn_host_buf_info = NULL;
+ return(-1);
+
+
+}
+
+/**
+ * cmem_mmap() - Map a physical address range into the calling process
+ * @filp: open file of the cmem device; not used
+ * @vma: user virtual memory area to populate
+ *
+ * The page offset given to mmap() (vma->vm_pgoff) is used directly as the
+ * first page frame number, and the whole length of @vma is mapped with the
+ * protection bits already present in vma->vm_page_prot. The mapping is not
+ * switched to non-cached.
+ *
+ * NOTE(review): the requested range is not checked against the buffers
+ * handed out by cmem_ioctl(), so the offset chosen by the caller decides
+ * which page frames get mapped - confirm this is acceptable for every user
+ * that can open the device node.
+ *
+ * Return: the result of remap_pfn_range() (0 on success).
+ */
+int cmem_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+    unsigned long sz = vma->vm_end - vma->vm_start;
+    /* Byte address of the first frame; only needed for the log line. */
+    unsigned long long addr = (unsigned long long)vma->vm_pgoff << PAGE_SHIFT;
+
+    dev_info(cmem_dev, "Mapping %#lx bytes from address %#llx\n",
+             sz, addr);
+
+    return remap_pfn_range(vma, vma->vm_start,
+                           vma->vm_pgoff,
+                           sz, vma->vm_page_prot);
+}
+
+
+/* poll() handler: always returns an empty event mask. */
+unsigned int cmem_poll(struct file *filp, poll_table *wait)
+{
+    unsigned int mask = 0;
+
+    return mask;
+}
+
+/**
+* cmem_fops - file operations exposed by the cmem character device
+*
+* Only ioctl, mmap and poll are provided; every other operation is left
+* unset.
+*/
+static const struct file_operations cmem_fops = {
+    .owner          = THIS_MODULE,
+    .unlocked_ioctl = cmem_ioctl,
+    .mmap           = cmem_mmap,
+    .poll           = cmem_poll,
+};
+#if 0
+/**
+* cmem_err_cleanup() - Undo the registrations made by cmem_init()
+* @ti667_dev_temp_num: not used
+*
+* Currently compiled out. Destroys the device node and class, removes the
+* character device and releases the device number; with PCI_ALLOC it also
+* drops the reference held on the PCI device.
+*/
+static void cmem_err_cleanup(int ti667_dev_temp_num)
+{
+    const int minor = 0;
+
+    device_destroy(cmem_class, MKDEV(cmem_major, minor));
+    class_destroy(cmem_class);
+    cdev_del(&cmem_cdev);
+    unregister_chrdev_region(cmem_dev_id, 1);
+
+    /* PCI_ALLOC is defined without a value, so it must be tested with
+     * #ifdef; "#if PCI_ALLOC" would not preprocess once this code is enabled.
+     */
+#ifdef PCI_ALLOC /* TODO : Remove once make driver independant of PCI */
+    pci_dev_put(ti667x_pci_dev[minor]);
+#endif
+}
+#endif
+/**
+* cmem_init() - Initialize DMA Buffers device
+*
+* Reserves a character device number, creates the device class, registers
+* the character device with cmem_fops and creates the device node. Then it
+* either looks up the TI667X PCI device (PCI_ALLOC) or loads the reserved
+* memory areas from cmemcfg.h.
+*
+* Every failure unwinds exactly the steps that had already succeeded.
+*
+* Return: 0 on success, a negative errno value otherwise.
+*/
+static int __init cmem_init(void)
+{
+    int ret;
+
+    ret = alloc_chrdev_region(&cmem_dev_id, 0, 1, CMEM_DRVNAME);
+    if (ret) {
+        pr_err(CMEM_DRVNAME ": could not allocate the character driver\n");
+        return ret;
+    }
+
+    cmem_major = MAJOR(cmem_dev_id);
+    cmem_minor = 0;
+
+    /* class_create() reports failure with an ERR_PTR() value, never NULL. */
+    cmem_class = class_create(THIS_MODULE, CMEM_DRVNAME);
+    if (IS_ERR(cmem_class)) {
+        ret = PTR_ERR(cmem_class);
+        pr_err(CMEM_DRVNAME ": Failed to add device to sys fs\n");
+        /* The region is released once, at the label. */
+        goto err_class_create;
+    }
+
+    /* cdev_init() already stores &cmem_fops in cmem_cdev.ops. */
+    cdev_init(&cmem_cdev, &cmem_fops);
+    cmem_cdev.owner = THIS_MODULE;
+
+    ret = cdev_add(&cmem_cdev, MKDEV(cmem_major, cmem_minor), 1);
+    if (ret) {
+        pr_err(CMEM_DRVNAME ": Failed creation of node\n");
+        goto err_dev_add;
+    }
+
+    pr_info(CMEM_DRVNAME ": Major %d Minor %d assigned\n",
+            cmem_major, cmem_minor);
+
+    /* device_create() also reports failure with an ERR_PTR() value. */
+    cmem_dev = device_create(cmem_class, NULL, MKDEV(cmem_major, cmem_minor),
+                             NULL, CMEM_MODFILE);
+    if (IS_ERR(cmem_dev)) {
+        ret = PTR_ERR(cmem_dev);
+        pr_info(CMEM_DRVNAME ": Error creating device \n");
+        goto err_dev_create;
+    }
+
+    dev_info(cmem_dev, "Added device to the sys file system\n");
+    cmem_pers_host_buf_info = NULL;
+    cmem_dyn_host_buf_info = NULL;
+
+#ifdef PCI_ALLOC
+    if (0 != ti667x_ep_find_device()) {
+        pr_err(TI667X_PCIE_DRVNAME ": Unable to find PCI device\n");
+        ret = -ENODEV;
+        goto err_device_destroy;
+    }
+#else
+    persistent_mem_area.size = RESERVED_CONSISTENT_MEM_SIZE;
+    persistent_mem_area.start_addr = RESERVED_CONSISTENT_MEM_START_ADDR;
+
+    if (persistent_mem_area.start_addr == 0) {
+        pr_info(CMEM_DRVNAME "Request mem region failed: persistent \n");
+        ret = -EINVAL;
+        goto err_device_destroy;
+    }
+    pr_info(CMEM_DRVNAME "Memory start Addr : %llu Size: %lu \n",
+            persistent_mem_area.start_addr, (unsigned long)persistent_mem_area.size);
+
+    dynamic_mem_area.size = RESERVED_DYNAMIC_MEM_SIZE;
+    dynamic_mem_area.start_addr = RESERVED_DYNAMIC_MEM_START_ADDR;
+
+    if (dynamic_mem_area.start_addr == 0) {
+        pr_info(CMEM_DRVNAME "Request mem region failed: dynamic \n");
+        ret = -EINVAL;
+        goto err_device_destroy;
+    }
+    pr_info(CMEM_DRVNAME "Dynamic Memory start Addr : %llu Size: %lu\n", dynamic_mem_area.start_addr,
+            (unsigned long)dynamic_mem_area.size);
+
+    host_buf_alloc_ptr = persistent_mem_area.start_addr;
+    host_dyn_buf_alloc_ptr = dynamic_mem_area.start_addr;
+#endif
+
+    spin_lock_init(&l_lock);
+    init_waitqueue_head(&l_read_wait);
+    return 0;
+
+    /* Unwind in reverse order of setup; each label undoes one step. */
+err_device_destroy:
+    device_destroy(cmem_class, MKDEV(cmem_major, cmem_minor));
+err_dev_create:
+    cdev_del(&cmem_cdev);
+err_dev_add:
+    class_destroy(cmem_class);
+err_class_create:
+    unregister_chrdev_region(cmem_dev_id, 1);
+
+    return ret;
+}
+module_init(cmem_init);
+
+#ifdef PCI_ALLOC
+/* Return every DMA buffer recorded in @list (non-zero dmaAddr) to the PCI device. */
+static void cmem_release_coherent(cmem_host_buf_info_t *list)
+{
+    int n;
+
+    for (n = 0; n < list->num_buffers; n++) {
+        cmem_host_buf_entry_t *ent = &list->buf_info[n];
+
+        if (ent->dmaAddr == 0)
+            continue;
+
+        dma_free_coherent(&ti667x_pci_dev[0]->dev, ent->length,
+                          ent->virtAddr, ent->dmaAddr);
+        dev_info(&ti667x_pci_dev[0]->dev,
+                 "Freed Host memory %d: Base Address: 0x%x Size : 0x%x\n",
+                 n, (unsigned int)ent->dmaAddr, (unsigned int)ent->length);
+    }
+}
+#endif
+
+/**
+* cmem_cleanup() - Release all buffers and unregister the device on unload
+*
+* Frees the persistent and the dynamic buffer lists (including their DMA
+* buffers when PCI_ALLOC is defined), then removes the device node, class,
+* character device and device number, in that order.
+*/
+static void __exit cmem_cleanup(void)
+{
+    cmem_host_buf_info_t *pers = cmem_pers_host_buf_info;
+    cmem_host_buf_info_t *dyn = cmem_dyn_host_buf_info;
+
+    if (pers) {
+#ifdef PCI_ALLOC
+        cmem_release_coherent(pers);
+#else
+        host_buf_alloc_ptr = persistent_mem_area.start_addr;
+#endif
+        /* kfree(NULL) is a no-op, so buf_info needs no separate test. */
+        kfree(pers->buf_info);
+        kfree(pers);
+    }
+
+    if (dyn) {
+#ifdef PCI_ALLOC
+        cmem_release_coherent(dyn);
+#else
+        host_dyn_buf_alloc_ptr = dynamic_mem_area.start_addr;
+#endif
+        kfree(dyn->buf_info);
+        kfree(dyn);
+    }
+
+    /* Free memory reserved */
+    device_destroy(cmem_class, MKDEV(cmem_major, 0));
+    class_destroy(cmem_class);
+    cdev_del(&cmem_cdev);
+    unregister_chrdev_region(cmem_dev_id, 1);
+
+#ifdef PCI_ALLOC /* TODO : Remove once make driver independant of PCI */
+    pci_dev_put(ti667x_pci_dev[0]);
+    pr_info(TI667X_PCIE_DRVNAME ": Finished put device \n");
+#endif
+    pr_info(CMEM_DRVNAME "Module removed \n");
+}
+module_exit(cmem_cleanup);
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/cmem/cmem.h b/cmem/cmem.h
new file mode 100644
index 0000000..921362c
--- /dev/null
+++ b/cmem/cmem.h
@@ -0,0 +1,92 @@
+/*
+ *
+ * Copyright (C) 2012-2014 Texas Instruments Incorporated - http://www.ti.com/
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * Neither the name of Texas Instruments Incorporated nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+*/
+
+
+#ifndef __CMEM_H__
+#define __CMEM_H__
+
+#define CMEM_DRVNAME "cmem" /* driver name: chrdev region, class and log prefix in cmem.c */
+#define CMEM_MODFILE "cmem" /* device node name passed to device_create() in cmem.c */
+
+#define CMEM_DRIVER_SIGNATURE "/dev/"CMEM_MODFILE /* expands to "/dev/cmem" */
+
+#ifdef __KERNEL__
+
+#endif /* __KERNEL__ */
+/* Maximum number of buffers allocated per API call*/
+#define CMEM_MAX_BUF_PER_ALLOC 64
+/**
+* NOTE(review): no ti816x_bar_info type is declared in this header; the
+* description below looks like a leftover from another driver - confirm and
+* remove.
+*
+* ti816x_bar_info - PCI Base Address Register information
+* @num: BAR register index - 0 to 5
+* @addr: For 'SET' operations, contains ti816x internal address to translate
+* this BAR access to. For 'GET' operations, contains the (host) physical
+* address assigned to this BAR.
+* @size: Size allocated for this BAR (only used for GET operation)
+*/
+/* Basic information about host buffer accessible by DSP through PCIE */
+typedef struct _cmem_host_buf_entry_t {
+ uint64_t dmaAddr; /* PCIe address */
+ uint8_t *virtAddr; /* Host Virtual address */
+ uint32_t length; /* Length of host buffer */
+} cmem_host_buf_entry_t;
+
+
+/* List of Buffers: header plus a pointer to a separately allocated entry array */
+typedef struct _cmem_host_buf_info_t {
+ unsigned int num_buffers; /* Number of host buffers */
+ unsigned int type; /* memory type 0; Persistent; 1; Dynamic */
+ cmem_host_buf_entry_t *buf_info;
+} cmem_host_buf_info_t;
+
+/* List of Buffers as exchanged through ioctl: same header, entries embedded as a fixed-size array */
+typedef struct _cmem_ioctl_host_buf_info_t {
+ unsigned int num_buffers; /* Number of host buffers */
+ unsigned int type; /* memory type 0; Persistent; 1; Dynamic */
+ cmem_host_buf_entry_t buf_info[CMEM_MAX_BUF_PER_ALLOC];
+} cmem_ioctl_host_buf_info_t;
+
+/** Parameters used for calling IOCTL */
+typedef struct _cmem_ioctl_t {
+ cmem_ioctl_host_buf_info_t host_buf_info;
+} cmem_ioctl_t;
+
+/* IOCTLs defined for the application as well as driver */
+/* NOTE(review): the size encoded here is sizeof(unsigned int), while cmem.c casts the argument to cmem_ioctl_t * or cmem_host_buf_info_t * - confirm before changing, the request numbers are ABI */
+#define CMEM_IOCTL_ALLOC_HOST_BUFFERS _IOWR('P', 1, unsigned int)
+#define CMEM_IOCTL_GET_HOST_BUF_INFO _IOWR('P', 2, unsigned int)
+#define CMEM_IOCTL_FREE_HOST_BUFFERS _IOWR('P', 3, unsigned int)
+
+#endif
diff --git a/cmem/cmemcfg.h b/cmem/cmemcfg.h
new file mode 100644
index 0000000..e1ad1da
--- /dev/null
+++ b/cmem/cmemcfg.h
@@ -0,0 +1,55 @@
+/*
+ *
+ * Copyright (C) 2012-2014 Texas Instruments Incorporated - http://www.ti.com/
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * Neither the name of Texas Instruments Incorporated nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+*/
+
+#ifndef __CMEM_CFG_H__
+#define __CMEM_CFG_H__
+
+#define CMEM_CFG_USE_DMA_COHERENT_ALLOC /* when defined, cmem.c defines PCI_ALLOC and allocates with dma_alloc_coherent(); the reserved-area macros below are then only used by the #else path */
+
+
+#define CMEM_CFG_RESERVED_MEM_START_ADDRESS 0x1df800000 /* start of the whole reserved region (above 4 GiB, needs a 64-bit type) */
+#define CMEM_CFG_RESERVED_MEM_SIZE 0x20800000 /* 520 MiB in total; the region ends at 0x200000000 */
+
+#define RESERVED_CONSISTENT_MEM_START_ADDR CMEM_CFG_RESERVED_MEM_START_ADDRESS /* persistent area sits at the start of the region */
+#define RESERVED_CONSISTENT_MEM_SIZE 0x800000l /* 8 MiB; the trailing character is the letter 'l' (long suffix), not the digit 1 */
+
+#define RESERVED_DYNAMIC_MEM_START_ADDR (RESERVED_CONSISTENT_MEM_START_ADDR+RESERVED_CONSISTENT_MEM_SIZE) /* = 0x1e0000000, directly after the persistent area */
+#define RESERVED_DYNAMIC_MEM_SIZE (CMEM_CFG_RESERVED_MEM_SIZE-RESERVED_CONSISTENT_MEM_SIZE) /* = 0x20000000 (512 MiB), the rest of the region */
+
+
+
+#endif /* __CMEM_CFG_H__ */
diff --git a/cmem/load.sh b/cmem/load.sh
new file mode 100755
index 0000000..41efc58
--- /dev/null
+++ b/cmem/load.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+insmod cmem_dev.ko
+
+filelist=$(ls /dev | grep cmem)
+for filename in $filelist
+do
+ chmod 666 "/dev/$filename"
+done
+
+
diff --git a/cmem/unload.sh b/cmem/unload.sh
new file mode 100755
index 0000000..20cb1ab
--- /dev/null
+++ b/cmem/unload.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+rmmod cmem_dev.ko # remove the module that load.sh inserted
+
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 0000000..8785fd5
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,15 @@
+# Builds the OpenCL readme as PDF (pdflatex) and MediaWiki markup (pandoc).
+
+# Remove intermediate files if there is an error
+.DELETE_ON_ERROR:
+
+# "all" and "clean" name actions, not files; without this a stray file called
+# "all" or "clean" would stop them from running.
+.PHONY: all clean
+
+%.wiki : %.tex
+	pandoc -f latex -t mediawiki $< -o $@
+
+# pdflatex runs twice so that references written on the first pass are
+# resolved on the second.
+# http://stackoverflow.com/questions/12343428/latex-reference-and-makefile
+%.pdf %.aux %.idx: %.tex
+	pdflatex -file-line-error -interaction=batchmode $<
+	pdflatex -file-line-error -interaction=batchmode $<
+
+all: opencl_readme.pdf opencl_readme.wiki
+
+# Removes LaTeX by-products only; the generated .pdf and .wiki are kept.
+clean:
+	rm -f *.aux *.log *.toc *.out
diff --git a/doc/opencl-1.1.pdf b/doc/opencl-1.1.pdf
new file mode 100644
index 0000000..7dd4333
--- /dev/null
+++ b/doc/opencl-1.1.pdf
Binary files differ
diff --git a/doc/opencl-cplusplus-1.1.pdf b/doc/opencl-cplusplus-1.1.pdf
new file mode 100644
index 0000000..603b90c
--- /dev/null
+++ b/doc/opencl-cplusplus-1.1.pdf
Binary files differ
diff --git a/doc/opencl_readme.pdf b/doc/opencl_readme.pdf
new file mode 100644
index 0000000..2f39404
--- /dev/null
+++ b/doc/opencl_readme.pdf
Binary files differ
diff --git a/doc/opencl_readme.tex b/doc/opencl_readme.tex
new file mode 100644
index 0000000..9dec4bd
--- /dev/null
+++ b/doc/opencl_readme.tex
@@ -0,0 +1,668 @@
+%-------------------------------------------------------------------------
+% Preamble
+%-------------------------------------------------------------------------
+\documentclass[10pt]{article}
+\usepackage{palatino}
+\usepackage{listings}
+\usepackage{color}
+\usepackage{textcomp}
+%\usepackage{underscore}
+\usepackage{hyperref}
+\usepackage{amsfonts, amsmath, amssymb}
+\usepackage[margin=1.25in]{geometry}
+\usepackage{lipsum}
+\usepackage{graphicx}
+\usepackage{fancyhdr}
+\usepackage{parskip}
+\usepackage{courier}
+\setlength{\parskip}{12pt}
+
+\pagestyle{fancy}
+\setlength\headheight{32pt}
+
+\fancyhead{}
+\renewcommand{\headrulewidth}{0pt}
+\rhead{\it OpenCL 1.1 Product Version 0.8.2 Readme \\ \color{lightgray}{\rule{6in}{2pt}}}
+
+%\setlength\parindent{0pt}
+
+%-------------------------------------------------------------------------
+% Format the footer for each page
+%-------------------------------------------------------------------------
+\renewcommand{\footrulewidth}{1pt}
+\fancyfoot{}
+\fancyfoot[L]{}
+\fancyfoot[R]{\small{\thepage}}
+
+\definecolor{listinggray}{gray}{0.9}
+\definecolor{lbcolor}{rgb}{0.9,0.9,0.9}
+\definecolor{lightgray}{rgb}{0.7,0.7,0.7}
+
+\lstset{
+ language=c,
+ keywordstyle=\bfseries\ttfamily\color[rgb]{0,0,1},
+ identifierstyle=\ttfamily,
+ commentstyle=\color[rgb]{0.133,0.545,0.133},
+ stringstyle=\ttfamily\color[rgb]{0.627,0.126,0.941},
+ showstringspaces=false,
+ basicstyle=\footnotesize,
+ numberstyle=\tiny,
+ numbers=left,
+ stepnumber=1,
+ numbersep=8pt,
+ tabsize=2,
+ breaklines=true,
+ prebreak = \raisebox{0ex}[0ex][0ex]{\ensuremath{\hookleftarrow}},
+ breakatwhitespace=false,
+ aboveskip={1.5\baselineskip},
+ columns=fixed,
+ upquote=true,
+ extendedchars=true,
+ frame=single,
+ % backgroundcolor=\color{lbcolor},
+}
+
+\linespread{1.1}
+
+\renewcommand{\familydefault}{\sfdefault}
+\usepackage{helvet}
+
+\begin{document}
+
+
+%-------------------------------------------------------------------------
+% Title
+%-------------------------------------------------------------------------
+\begin{titlepage}
+
+\vspace {50mm}
+{\Large \bfseries OpenCL\textsuperscript{\texttrademark} 1.1 for 66AK2H}\\
+\rule{6in}{2pt}
+
+\begin{center}
+\vspace {15 mm}
+{ \Large \bfseries Product Version 0.8.1 Readme \\[3mm] }
+{ \large Publication Date: \today \\ }
+\end{center}
+
+\vfill
+
+\includegraphics[scale=0.4]{tiStk2cRgb.pdf}\\
+\vspace {2mm}
+\rule{6in}{2pt}
+{\small
+* Product is based on a published Khronos Specification, and is expected to
+pass the Khronos Conformance Testing Process. Current conformance status can
+be found at www.khronos.org/conformance.
+}
+\end{titlepage}
+
+\newpage
+
+%-------------------------------------------------------------------------
+% TOC
+%-------------------------------------------------------------------------
+\tableofcontents
+
+\newpage
+
+%-------------------------------------------------------------------------
+% Intro
+%-------------------------------------------------------------------------
+\section{Product Description}
+
+This product is an OpenCL 1.1 implementation. The OpenCL specification
+defines a platform model with a host and compute devices. For this
+implementation the host is a 4-core ARM Cortex-A15 running Linux. There are two
+compute devices:
+
+\begin{enumerate}
+\item The collection of 8 Texas Instruments' C66x DSP cores is
+exposed as one virtual accelerator compute device, and
+\item The collection of 4 ARM
+Cortex-A15 cores is exposed as a virtual CPU device. (note: The CPU device currently only
+supports native kernels)
+\end{enumerate}
+
+This OpenCL implementation is typically installed as part of the TI MCSDK-HPC product and environment variables and dependencies are handled automatically as part of the MCSDK-HPC installation. An installation section is still documented here for your knowledge and troubleshooting. See section \ref{installation} Installation and Dependencies as needed.
+
+\section{OpenCL Documentation}
+
+The OpenCL 1.1 specification and the 1.1 C++ bindings specification from
+Khronos are included in the \verb!$(TI_OCL_INSTALL)/doc! sub-directory of this
+installation.
+
+Additional OpenCL resources can be found on the web. Some useful links are provided
+below.
+
+\begin{itemize}
+\item The OpenCL 1.1 on-line manual pages can be found at:
+ \url{http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml}
+
+\item The Khronos OpenCL resources page also has links to good OpenCL
+reference material: \url{http://www.khronos.org/opencl/resources}
+\end{itemize}
+
+
+
+
+\section{TI OpenCL Extensions}
+
+This OpenCL implementation from Texas Instruments has been extended with a set
+of features beyond the OpenCL 1.1 specification. These features were added in
+order to better support the execution of code on the C66 DSP, to enable
+existing DSP libraries, and to better map to the 66AK2H hardware platform.
+
+\begin{enumerate}
+
+\item This OpenCL implementation supports the ability to call standard C code from OpenCL C kernels. This includes calling functions from existing C66 DSP libraries, such as the dsplib or mathlib. For examples of this capability please refer to the ccode example in section \ref{ccode} for an example of calling a C function you define, or the dsplib\_fft example in section \ref{dsplibfft} for an example calling a function in a library.
+
+\item Additionally the called standard C code can contain OpenMP pragmas. When using this feature the OpenCL C kernel containing the call to an OpenMP enabled C function must be submitted as a task (not an NDRangeKernel) and it must be submitted to an in-order OpenCL command queue (i.e., not defined with the \verb!CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE! flag). In this scenario, OpenCL will dispatch the kernel to one compute unit of the DSP accelerator and the OpenMP runtime will manage distribution of tasks across the compute units. Please see examples vecadd\_openmp in section \ref{vecaddopenmp}, vecadd\_openmp\_t in section \ref{vecaddopenmpt} or openmpbench\_C\_v3 in section \ref{openmpbench}.
+
+\item A printf capability has been added for kernels running on the DSP. OpenCL
+ C kernels or standard C code called from an OpenCL C kernel can call
+ printf. The string resulting from the printf will be transmitted to the host ARM and
+ displayed using a printf on the ARM side. This feature can be used to
+ assist in debug of your OpenCL kernels. Note that there is a performance
+ penalty in using printf from the DSPs, so it is not a feature that should
+ be used when evaluating DSP performance. This feature is not the OpenCL 1.2
+ printf which contains additional formatting codes for printing vector
+ types.
+
+\item A TI extension for allocating buffers out of MSMC memory has been added. Other
+ than the location on the DSP where the buffer will reside, this MSMC
+ defined buffer will act as a standard global buffer in all other ways. Example:
+
+\begin{verbatim}
+Buffer bufMsmc(context, CL_MEM_READ_ONLY|CL_MEM_USE_MSMC_TI, size);
+Buffer bufDdr (context, CL_MEM_READ_ONLY, size);
+\end{verbatim}
+
+The matmpy example in section \ref{matmpy} illustrates the use of MSMC
+buffers. The platform example in section \ref{platform} will query the DSP
+device and report the amount of MSMC memory available for OpenCL use.
+
+MSMC stands for Multicore Shared Memory Controller and contains an
+on-chip memory shared across all ARM and DSP cores on the chip.
+
+\item The OpenCL C compiler for the C66 DSP supports the C66x standard C compiler set of intrinsic functions, with the exception of those intrinsics that accept or result in a 40 bit value. Please refer to the C6000 Compiler User's Guide for a list of these intrinsic functions.
+
+\item Additionally these non standard OpenCL C built-in functions are supported:
+
+\begin{verbatim}
+uint32_t __core_num (void);
+uint32_t __clock (void);
+uint64_t __clock64 (void);
+void __cycle_delay (uint64_t cyclesToDelay);
+void __mfence (void);
+\end{verbatim}
+
+ \verb!__core_num! returns [0-7] depending on which DSP core executes the
+ function.
+
+ \verb!__clock! returns a 32-bit time-stamp value; subtracting two values returned
+ by \verb!__clock! gives the number of elapsed DSP cycles between the two.
+ This equates to the C66 device's TSCL register.
+
+ \verb!__clock64! returns a 64-bit time-stamp value similar to \verb!__clock! but with a
+ wider range to avoid potential overflow of a 32 bit counter.
+ This equates to the C66 device's TSCH:TSCL register pair.
+
+ \verb!__cycle_delay! takes a specified number of cycles to delay and will busy
+ loop for that many cycles (approximately) before returning.
+
+ \verb!__mfence! is a memory fence for the C66x DSP. Under typical OpenCL use,
+ this will not be needed. However, when incorporating EDMA usage into
+ OpenCL C kernels, it may be needed.
+
+ Note: In standard C for C66 a uint32\_t is an unsigned int and a uint64\_t is an
+ unsigned long long. In OpenCL C for C66 a uint32\_t is an unsigned int and
+ a uint64\_t is an unsigned long.
+
+\item This OpenCL implementation also supports direct access to the 66AK2H
+EDMA system from the DSP and OpenCL C kernels. A wide range of EDMA
+constructs are supported. These include 1D to 1D, 1D to 2D, 2D to 1D, and chained
+transfers. The edmamgr example in section \ref{edmamgr} shows an example
+usage of this feature.
+
+\item An extended memory feature is also supported in this implementation of
+OpenCL. The C66 DSP is a 32-bit architecture and has a limit of 2GB of DDR
+that it can access at any given time. The 66AK2H platforms can support up to
+8GB of DDR3. To enable usage of DDRs greater than 2GB, this OpenCL
+implementation can use a hardware mapping feature to move windows over the 8GB
+DDR into the 32-bit DSP address space. Movement of these windows will occur
+at kernel start boundaries so two sequential kernels dispatched to the DSP
+device may actually operate on different 2GB areas within the 8GB DDR. The
+windows are not moved within a kernel. As a result of this feature, large
+buffers may be created and subsequently populated on the ARM side. However, a
+dispatched kernel may not access the entire buffer in one dispatch. Any given
+OpenCL Kernel will be limited to a total of 2GB of DDR access. The example
+vecadd\_mpax in section \ref{vecaddmpax} illustrates a process of defining a
+large buffer and then defining sub-buffers within the larger buffer and
+dispatching multiple OpenCL kernels to the DSP on these sub-buffers,
+cumulatively resulting in the entire large buffer being processed.
+
+\end{enumerate}
+
+
+\newpage
+\section{Limitations of this Beta OpenCL Implementation}
+This is a Beta version of this product. It is not a complete OpenCL
+implementation and has not successfully completed the Khronos conformance
+tests yet. Specifically, the following features are missing:
+
+\begin{itemize}
+\item This OpenCL implementation is not designed to allow multiple OpenCL enabled Linux processes
+to concurrently execute. Attempting to execute two OpenCL applications
+concurrently can leave the DSPs in an unknown state and may require a reboot.
+This will be resolved in a future OpenCL release.
+
+\item Dispatching OpenCL C kernels to the CPU device with clEnqueueTask or
+clEnqueueNDRangeKernel is not currently supported. The CPU is still exposed
+in the platform because you can submit native code to the CPU using
+clEnqueueNativeKernel. This is illustrated in the ooo and ooo\_map examples (section \ref{ooo}) in
+this installation. All dispatch models are supported for the DSP (accelerator) device.
+
+\item The OpenCL API functions are complete for the DSP device, with the exception of
+argument passing to kernels. The implementation today will support at least
+19 arguments and as many as 29 arguments to kernels. The limit is variable
+based on the size of the arguments. Additionally, structures and vector types are
+not supported as arguments yet. Buffers of vector types are supported.
+
+\item Not all of the OpenCL C built-in functions are available yet. Missing are:
+
+\begin{itemize}
+ \item the convert\_type built-ins where saturation or explicit rounding
+ modes are specified from section 6.2.3 in the OpenCL 1.1 spec. The
+ convert\_type built-ins without saturation or explicit rounding are supported.
+ \item the atomic built-ins from section 6.11.11 in the OpenCL 1.1 spec
+ \item the shuffle and shuffle2 built-ins from section 6.11.12 in the OpenCL 1.1 spec.
+ \item the following math built-ins from section 6.11.2 in the OpenCL 1.1 spec:
+ cbrt, copysign, erfc, erf, expm1, fdim, fma, fmax, fmin, fract, frexp,
+ hypot, ilogb, ldexp, lgamma, lgamma\_r, log1p, logb, maxmag, minmag,
+ modf, nan, pown, powr, remainder, remquo, rint, rootn, round,
+ sincos, tgamma, trunc
+\end{itemize}
+
+\item The half (16bit) floating point format is not supported.
+
+\item Support for images and samplers is optional for non GPU devices per the
+OpenCL 1.1 spec and they are not supported by the DSP device in this
+implementation.
+
+\item The device info query for frequency on the CPU device is not accurate. It
+currently returns a fixed value of 1.0 GHz.
+
+\item The OpenCL C built-in functions are not yet optimized for the C66 DSP
+device.
+
+\item The C66x DSP compiler used to compile OpenCL C kernels is not yet optimized for wide vector types. The resultant code can be inefficient and the compile time
+can be long on some code samples using 8 or 16 wide vectors. It is recommended that vector widths over 4 (2 for doubles and longs) be avoided in this release.
+
+\end{itemize}
+
+\section{Notes}
+On-line compilations of OpenCL C code are cached on the system. If you
+ run an OpenCL application that on-line compiles some OpenCL C code,
+ the resultant binaries are cached on the system and the next time
+ you run the OpenCL application, the compilation step is skipped and
+ the cached binaries are used. The caching only uses the OpenCL C
+ code and the compile options as a hash, so an example where the
+ OpenCL C code is calling a C function in a linked object file or
+ library and the object file or library is modified will result in an
+ execution of the OpenCL C linked against the older version of the
+ object. In this case you will need to clear the OpenCL C compile
+ cache, which can be accomplished with the command
+
+\verb!rm -f /tmp/opencl*!.
+
+Additionally you can disable this caching behavior by setting the
+\verb!TI_OCL_CACHE_KERNELS_OFF! environment variable. See the Environment
+Variables section \ref{envvar} for more details.
+
+\newpage
+\section{Examples}
+There are several OpenCL examples shipped with the product. They are located
+in the 'examples' directory within the OpenCL package.
+
+The examples can be cross-compiled in an X86 development environment, or
+compiled native on the ARM A15, depending on the availability of native g++ or
+cross-compiled arm-linux-gnueabihf-g++ tool sets.
+
+The example makefiles are setup to cross-compile by default and
+assume an ARM cross-compile environment has been installed. If the cross compiler is not installed, execute the following command to install it:
+
+\texttt{sudo apt-get install g++-4.6-arm-linux-gnueabihf}
+
+The environment variables defined in the Installation section
+\ref{installation} are required for
+correct operation of all examples below.
+
+\subsection{Building and Running the Examples}
+From your X86 system: All the examples can be built at one time by invoking
+'make' from the 'examples' directory.
+
+Individual examples can be built by navigating to the desired directory and also
+issuing \texttt{make [cross]}.
+
+Once the example is built and the resulting executable is available on the
+Hawking EVM file system it can be invoked from within a Hawking EVM xterm
+or console window.
+
+
+\subsection{Platform Example}\label{platform}
+The platform example uses the OpenCL C++ bindings to discover key platform and
+device information from the OpenCL implementation and print it to the screen.
+
+
+\subsection{Simple Example}
+This example simply illustrates the minimum steps needed to dispatch a kernel
+to a DSP device and read a buffer of data back.
+
+
+\subsection{Mandelbrot/Mandelbrot\_native Examples}\label{mandelbrot}
+The Mandelbrot example is an OpenCL demo that uses OpenCL to generate the
+pixels of a Mandelbrot set image. This example also uses the C++ OpenCL
+binding. The OpenCL kernels are repeatedly called generating images that are
+zoomed in from the previous image. This repeats until the zoom factor reaches
+1E15.
+
+This example illustrates several key OpenCL features:
+\begin{itemize}
+\item OpenCL queues tied to potentially multiple DSP devices and a
+ dispatch structure that allows the DSPs to cooperatively generate pixel
+ data,
+\item The event wait feature of OpenCL,
+\item The division of one time setup of OpenCL to the repetitive
+ en-queuing of kernels, and
+\item The ease in which kernels can be shifted from one device type to another.
+\end{itemize}
+
+
+The 'mandelbrot\_native' example is a non-OpenCL native implementation (no
+dispatch to the DSPs) that can be used for comparison purposes. It uses OpenMP
+for dispatch to each ARM core.
+
+Note: The display of the resulting Mandelbrot images is disabled when this example is cross compiled. It relies on libsdl capability which is not currently supported on the default Linux file-system included in the MCSDK.
+
+
+\subsection{Ccode Example}\label{ccode}
+This example illustrates the TI extension to OpenCL that allows OpenCL C code
+to call standard C code that has been compiled off-line into an object file or
+static library. This mechanism can be used to allow optimized C or C callable
+assembly routines to be called from OpenCL C code. It can also be used to
+essentially dispatch a standard C function, by wrapping it with an OpenCL C
+wrapper. Calling C++ routines from OpenCL C is not yet supported. You should
+also ensure that the standard C function and the call tree resulting from the
+standard C function do not allocate device memory, change the cache structure,
+or use any resources already being used by the OpenCL runtime.
+
+
+\subsection{Matmpy Example}\label{matmpy}
+This example performs a 1K x 1K matrix multiply using both OpenCL and a
+native ARM OpenMP implementation (GCC libgomp). The output is the execution
+time for each approach ( OpenCL dispatch to the DSP vs. OpenMP dispatching to the 4 ARM A15s ).
+
+This example illustrates the use of local buffers, global buffers mapped to MSMC using the TI MSMC extension and
+async\_workgroup\_strided\_copy for loading a global buffer into the local
+buffer. The async\_workgroup\_strided\_copy built-in function currently just
+performs a memcpy and therefore the data movement and compute do not overlap, but the
+structure of the code is illustrative of how this will work in a later release where the async\_workgroup\_strided\_copy will use the device's EDMA capability to offload the data movement from the DSP device.
+
+\subsection{Offline Example}
+This example performs a vector addition by pre-compiling an OpenCL kernel into a
+device executable file. The OpenCL program reads the file containing the
+pre-compiled kernel in and uses it directly. If you use offline compilation
+to generate a .out file containing the OpenCL C program and you subsequently
+move the executable, you will either need to move the .out as well or the
+OpenCL application will need to specify a non relative path to the .out
+file.
+
+\subsection{Offline\_embed Example}
+Similar to the offline example, but instead of compiling the kernel into a
+binary file, an embeddable header file is created which can be compiled
+directly into the host application.
+
+
+\subsection{Vecadd\_openmp Example}\label{vecaddopenmp}
+This is an OpenCL + OpenMP example. OpenCL program is running on the host,
+managing data transfers, and dispatching an OpenCL wrapper kernel to the
+device. The OpenCL wrapper kernel will use the ccode mode (see ccode example)
+to call the C function that has been compiled with OpenMP options (--omp).
+To facilitate OpenMP mode, the OpenCL wrapper kernel needs to be dispatched as
+an OpenCL Task to an In-Order OpenCL Queue.
+
+
+\subsection{Vecadd\_openmp\_t Example}\label{vecaddopenmpt}
+This is another OpenCL + OpenMP example, similar to vecadd\_openmp. The main
+difference w.r.t vecadd\_openmp is that this example uses OpenMP tasks within
+the OpenMP parallel region to distribute computation across the DSP cores.
+
+
+\subsection{Openmpbench\_c\_v3 Example}\label{openmpbench}
+This OpenCL + OpenMP example is derived from EPCC OpenMP microbenchmarks,
+v3.0. The syncbench test was modified to dispatch using OpenCL.
+
+
+\subsection{Vecadd Example}
+The same functionality as the vecadd\_openmp example, but expressed fully as an
+OpenCL application without OpenMP. Included for comparison purposes.
+
+
+\subsection{Vecadd\_mpax Example}\label{vecaddmpax}
+The same functionality as the vecadd example, but with extended buffers.
+The example iteratively traverses smaller chunks (sub-buffers) of large buffers.
+During each iteration, the smaller chunks are mapped/unmapped for read/write.
+The sub-buffers are then passed to the kernels for processing. This example
+could also be converted to use a pipelined scheme where different iterations of
+CPU computation and device computation are overlapped.
+
+NOTE: The size of the buffers in the example (determined by the variable
+ 'NumElements') is dependent on the available CMEM block size.
+ Currently this example is configured to use buffer sizes for memory
+ configurations that can support approximately 1.5 GB total buffer size. The example
+ can be modified to use more (or less) based on the platform memory
+ configuration.
+
+\subsection{Vecadd\_mpax\_openmp Example}\label{vecaddmpaxopenmp}
+Similar to the vecadd\_mpax example, but uses OpenMP to perform the parallelization
+and the computation. This example also illustrates that printf() can be used
+in OpenMP C code for debugging purposes.
+
+\subsection{Dsplib\_fft Example}\label{dsplibfft}
+An example to compute FFT's using a routine from the dsplib library. This
+illustrates calling a standard C library function from an OpenCL kernel.
+
+\subsection{Ooo and Ooo\_map Examples}\label{ooo}
+This Application illustrates several features of OpenCL.
+\begin{itemize}
+\item Using a combination of In-Order and Out-Of-Order queues
+\item Using native kernels on the CPU
+\item Using events to manage dependencies among the tasks to be executed.
+A JPEG in this directory illustrates the dependence graph being enforced in the
+application using OpenCL events.
+\end{itemize}
+
+The Ooo\_Map version additionally illustrates the use of OpenCL map and unmap
+operations for accessing shared memory between a host and a device. The
+Map/Unmap protocol can be used instead of read/write protocol on shared memory
+platforms.
+
+\subsection{Null Example}
+This application is intended to report the time overhead that OpenCL requires
+to submit and dispatch a kernel. A null(empty) kernel is created and
+dispatched so that the OpenCL profiling times queried from the OpenCL events reflect only the
+OpenCL overhead
+necessary to submit and execute the kernel on the device. This overhead is for the roundtrip for a single kernel dispatch. In practice, when multiple tasks are being enqueued, this overhead is pipelined with execution and can approach zero.
+
+\subsection{Edmamgr Example}\label{edmamgr}
+This application illustrates how to use the edmamgr api to asynchronously move
+data around the DSP memory hierarchy from OpenCL C kernels. The edmamgr.h header file in this directory enumerates the APIs available from the edmamgr package.
+
+\newpage
+\section{Environment Variables}\label{envvar}
+
+\begin{tabular}{l p{11cm} }
+\textbf{TI\_OCL\_INSTALL} &
+Must be set to the top level directory, where your OpenCL installation resides.\\
+\\
+\textbf{TI\_OCL\_CGT\_INSTALL} &
+Must be set to the top level directory, where your C66 DSP compiler tools installation resides.\\
+\\
+\textbf{TI\_OCL\_KEEP\_FILES} &
+When OpenCL C kernels are compiled for DSPs, they are
+compiled to a binary .out file in the /tmp sub-directory.
+They are then subsequently available for download to the
+DSPs for running. The process of compiling generates
+several intermediate files for each source file.
+OpenCL typically removes these temporary files. However,
+it can sometimes be useful to inspect these files. This
+environment variable can be set to instruct the runtime to
+leave the temporary files in /tmp. This can be useful to
+inspect the assembly file associated with the out file, to
+see how well your code was optimized.\\
+\\
+\textbf{TI\_OCL\_DEBUG\_KERNEL} &
+The TI IDE and debugger, Code Composer Studio (CCS) is not
+required for running OpenCL applications on the 66AK2H,
+but if you do have CCS installed and an emulator
+connected, you can set this environment
+variable to enable assembly statement level debug of your
+kernel. When set, this environment variable will instruct
+the OpenCL runtime to pause before dispatch of a kernel.
+While paused the runtime will display data to the user
+indicating that a kernel dispatch is pending. It will
+instruct the user to connect to the board through an
+emulator and will display the appropriate break-point
+address to use for the start of the kernel code. Debug
+capability has not been a focus for this beta release and
+will definitely improve in later releases. Setting up the
+emulator and CCS is outside the scope of this Readme. If
+you do have those products, consult the documentation
+specific to those products.\\
+\\
+\textbf{TI\_OCL\_CACHE\_KERNELS\_OFF} &
+This prevents the caching of OpenCL C Kernel
+compiles. This can be useful if you have OpenCL C
+kernels that call standard C from object code and you
+are actively modifying the standard C code. The
+standard C code is not seen by the caching algorithm
+and if you are actively modifying the C code, you
+may not pick up your modifications if a cached
+version of the OpenCL C kernel is being used.\\
+\end{tabular}
+
+
+\newpage
+\section{Installation and Dependencies}\label{installation}
+
+This version of OpenCL is dependent on other packages for proper
+ operation. If OpenCL was installed as part of the TI MCSDK-HPC product, it is likely that
+ these dependencies are resolved already. However, they are listed here for
+ your knowledge and as a first step in any troubleshooting.
+
+\subsection{CMEM module}
+ This module allows for contiguous memory allocation of physical memory on
+ the device.
+
+ After executing the Linux command \texttt{lsmod} you should see a cmemk module
+ listed. If it is not listed, then it will need to be installed for proper
+ OpenCL operation. Use the Linux modprobe command to install the cmem module.
+ The actual parameters used in the command will differ depending on your
+ DDR3 memory space partition. The following command and parameter set will
+ likely get you started, however.
+
+\begin{verbatim}
+sudo modprobe cmemk phys_start=0x823000000 phys_end=0x880000000
+ pools=1x1560281088 phys_start_1=0x0c040000 phys_end_1=0x0c500000
+ allowOverlap=1
+\end{verbatim}
+
+ Additionally the /dev/cmem device should have appropriate permissions.
+ Execute the command \verb!ls -l /dev/cmem! and ensure that read and write
+ permission for user, group and other are set. If not execute the command
+
+\texttt{sudo chmod 666 /dev/cmem }
+
+ It is also important that a minimum of 1.5GB be reserved from Linux for
+ proper cmem and OpenCL operation. Setting the uboot environment variable
+ mem\_reserve to at least 1536M will ensure this requirement.
+
+
+\subsection{MPM}
+The MPM package contains a client and server that allows the DSPs on the device
+to be loaded and run from Linux user space. To ensure that MPM is active,
+execute the Linux command
+\texttt{ps -ef | grep mpm}
+and you should see an instance
+of mpmsrv running. If you do not see that, then you can start the service by
+running:
+
+\texttt{/usr/sbin/mpmsrv}.
+
+If /usr/sbin/mpmsrv does not exist on your file-system refer to the MCSDK-HPC
+setup for installation.
+
+\subsection{MPM-TRANSPORT}
+The MPM-TRANSPORT package allows the A15 to read/write the shared memory from Linux user
+space. For proper operation, the devices \texttt{/dev/dsp*} need
+read/write permission for the user. To ensure that the permissions are
+correct, execute the Linux command \texttt{ls -l /dev/dsp*} and ensure that read/write permissions
+are set for user, group and other. If the permissions are not correct, you
+can set them with the command:
+
+\texttt{sudo chmod 666 /dev/dsp*}
+
+\subsection{Required Environment Variables}
+
+The OpenCL package depends on a few environment variables for proper
+execution.
+
+\textbf {Note:} Use \texttt{export} (bash) or \texttt{setenv} (csh) to set these as environment variables.
+
+\begin{tabular}{l p{13cm} }
+\textbf{TI\_OCL\_INSTALL} & the location of the OpenCL package relative to the ARM A15\\
+\textbf{TI\_OCL\_CGT\_INSTALL} & the location of the ARM hosted C66 DSP compiler tools\\
+\textbf{LD\_LIBRARY\_PATH} & \verb!$TI_OCL_INSTALL/lib!\\
+\textbf{PATH} & \verb!$TI_OCL_INSTALL/bin/arm:$TI_OCL_CGT_INSTALL/bin!\\
+\end{tabular}
+
+Additionally if you are cross compiling OpenCL applications for the ARM target from an x86 Ubuntu
+machine, on that machine you will also need:
+
+\begin{tabular}{l p{13cm} }
+\textbf{TI\_OCL\_INSTALL} & location of the OpenCL package relative to the X86\\
+\textbf{TI\_OCL\_CGT\_INSTALL} & location of the x86 hosted C66 DSP compiler tools\\
+\textbf{PATH} & \verb!$TI_OCL_INSTALL/bin/x86:$TI_OCL_CGT_INSTALL/bin!\\
+\textbf{TARGET\_ROOTDIR} & location of Hawking EVM file system mount point on
+the x86 system\\
+\end{tabular}
+
+ For platforms where native builds are possible, the cross compiling setup
+ is not required, but is still supported for faster compilation.
+
+ The script hawking\_env.sh is included in the examples directory of this
+ installation. It illustrates the definition of these environment variables for
+ both the ARM runtime side and a potential X86 cross compiling side.
+
+\subsection{Additional package dependencies}
+
+ There are a few other packages that OpenCL depends on. These can be
+ installed using apt-get on the machine on which you will compile (either
+ cross or native) the OpenCL application.
+
+ \verb!sudo apt-get install binutils-dev mesa-common-dev libboost1.46-dev!\\
+ \verb! libsqlite3-dev libffi6 zlib1g!
+
+ For platforms that support Simple DirectMedia Layer the package
+ libsdl1.2-dev can also be installed in order to see the mandelbrot display for
+ the mandelbrot example. See section \ref{mandelbrot} for information on the
+ mandelbrot example.
+
+ \verb!sudo apt-get install libsdl1.2-dev!
+
+ Additionally, this OpenCL implementation requires the TI C6000 Compiler
+ product for proper execution. It can be obtained from
+ \url{https://www-a.ti.com/downloads/sds_support/TICodegenerationTools/hpcc6xcgt.htm}.
+
+\end{document}
diff --git a/doc/tiStk2cRgb.pdf b/doc/tiStk2cRgb.pdf
new file mode 100644
index 0000000..8087da7
--- /dev/null
+++ b/doc/tiStk2cRgb.pdf
Binary files differ
diff --git a/include/CL/cl.h b/include/CL/cl.h
new file mode 100644
index 0000000..ff5a0d5
--- /dev/null
+++ b/include/CL/cl.h
@@ -0,0 +1,998 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11985 $ on $Date: 2010-07-15 11:16:06 -0700 (Thu, 15 Jul 2010) $ */
+
+#ifndef __OPENCL_CL_H
+#define __OPENCL_CL_H
+
+#ifdef __APPLE__
+#include <OpenCL/cl_platform.h>
+#else
+#include <CL/cl_platform.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************/
+
+typedef struct _cl_platform_id * cl_platform_id;
+typedef struct _cl_device_id * cl_device_id;
+typedef struct _cl_context * cl_context;
+typedef struct _cl_command_queue * cl_command_queue;
+typedef struct _cl_mem * cl_mem;
+typedef struct _cl_program * cl_program;
+typedef struct _cl_kernel * cl_kernel;
+typedef struct _cl_event * cl_event;
+typedef struct _cl_sampler * cl_sampler;
+
+typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
+typedef cl_ulong cl_bitfield;
+typedef cl_bitfield cl_device_type;
+typedef cl_uint cl_platform_info;
+typedef cl_uint cl_device_info;
+typedef cl_bitfield cl_device_fp_config;
+typedef cl_uint cl_device_mem_cache_type;
+typedef cl_uint cl_device_local_mem_type;
+typedef cl_bitfield cl_device_exec_capabilities;
+typedef cl_bitfield cl_command_queue_properties;
+
+typedef intptr_t cl_context_properties;
+typedef cl_uint cl_context_info;
+typedef cl_uint cl_command_queue_info;
+typedef cl_uint cl_channel_order;
+typedef cl_uint cl_channel_type;
+typedef cl_bitfield cl_mem_flags;
+typedef cl_uint cl_mem_object_type;
+typedef cl_uint cl_mem_info;
+typedef cl_uint cl_image_info;
+typedef cl_uint cl_buffer_create_type;
+typedef cl_uint cl_addressing_mode;
+typedef cl_uint cl_filter_mode;
+typedef cl_uint cl_sampler_info;
+typedef cl_bitfield cl_map_flags;
+typedef cl_uint cl_program_info;
+typedef cl_uint cl_program_build_info;
+typedef cl_int cl_build_status;
+typedef cl_uint cl_kernel_info;
+typedef cl_uint cl_kernel_work_group_info;
+typedef cl_uint cl_event_info;
+typedef cl_uint cl_command_type;
+typedef cl_uint cl_profiling_info;
+
+typedef struct _cl_image_format { /* Image format descriptor: pairs a channel order with a channel data type. */
+ cl_channel_order image_channel_order; /* expected to hold a cl_channel_order constant from this header (CL_R ... CL_RGBx) */
+ cl_channel_type image_channel_data_type; /* expected to hold a cl_channel_type constant from this header (CL_SNORM_INT8 ... CL_FLOAT) */
+} cl_image_format;
+
+
+typedef struct _cl_buffer_region { /* Describes a region as an origin plus a size. NOTE(review): presumably the buffer_create_info payload for CL_BUFFER_CREATE_TYPE_REGION -- confirm against the OpenCL 1.1 spec. */
+ size_t origin; /* start of the region; units are not stated in this header (presumably bytes -- TODO confirm) */
+ size_t size; /* extent of the region, presumably in the same units as origin -- TODO confirm */
+} cl_buffer_region;
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_SUCCESS 0
+#define CL_DEVICE_NOT_FOUND -1
+#define CL_DEVICE_NOT_AVAILABLE -2
+#define CL_COMPILER_NOT_AVAILABLE -3
+#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4
+#define CL_OUT_OF_RESOURCES -5
+#define CL_OUT_OF_HOST_MEMORY -6
+#define CL_PROFILING_INFO_NOT_AVAILABLE -7
+#define CL_MEM_COPY_OVERLAP -8
+#define CL_IMAGE_FORMAT_MISMATCH -9
+#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10
+#define CL_BUILD_PROGRAM_FAILURE -11
+#define CL_MAP_FAILURE -12
+#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13
+#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
+
+#define CL_INVALID_VALUE -30
+#define CL_INVALID_DEVICE_TYPE -31
+#define CL_INVALID_PLATFORM -32
+#define CL_INVALID_DEVICE -33
+#define CL_INVALID_CONTEXT -34
+#define CL_INVALID_QUEUE_PROPERTIES -35
+#define CL_INVALID_COMMAND_QUEUE -36
+#define CL_INVALID_HOST_PTR -37
+#define CL_INVALID_MEM_OBJECT -38
+#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39
+#define CL_INVALID_IMAGE_SIZE -40
+#define CL_INVALID_SAMPLER -41
+#define CL_INVALID_BINARY -42
+#define CL_INVALID_BUILD_OPTIONS -43
+#define CL_INVALID_PROGRAM -44
+#define CL_INVALID_PROGRAM_EXECUTABLE -45
+#define CL_INVALID_KERNEL_NAME -46
+#define CL_INVALID_KERNEL_DEFINITION -47
+#define CL_INVALID_KERNEL -48
+#define CL_INVALID_ARG_INDEX -49
+#define CL_INVALID_ARG_VALUE -50
+#define CL_INVALID_ARG_SIZE -51
+#define CL_INVALID_KERNEL_ARGS -52
+#define CL_INVALID_WORK_DIMENSION -53
+#define CL_INVALID_WORK_GROUP_SIZE -54
+#define CL_INVALID_WORK_ITEM_SIZE -55
+#define CL_INVALID_GLOBAL_OFFSET -56
+#define CL_INVALID_EVENT_WAIT_LIST -57
+#define CL_INVALID_EVENT -58
+#define CL_INVALID_OPERATION -59
+#define CL_INVALID_GL_OBJECT -60
+#define CL_INVALID_BUFFER_SIZE -61
+#define CL_INVALID_MIP_LEVEL -62
+#define CL_INVALID_GLOBAL_WORK_SIZE -63
+#define CL_INVALID_PROPERTY -64
+
+/* OpenCL Version */
+#define CL_VERSION_1_0 1
+#define CL_VERSION_1_1 1
+
+/* cl_bool */
+#define CL_FALSE 0
+#define CL_TRUE 1
+
+/* cl_platform_info */
+#define CL_PLATFORM_PROFILE 0x0900
+#define CL_PLATFORM_VERSION 0x0901
+#define CL_PLATFORM_NAME 0x0902
+#define CL_PLATFORM_VENDOR 0x0903
+#define CL_PLATFORM_EXTENSIONS 0x0904
+
+/* cl_device_type - bitfield */
+#define CL_DEVICE_TYPE_DEFAULT (1 << 0)
+#define CL_DEVICE_TYPE_CPU (1 << 1)
+#define CL_DEVICE_TYPE_GPU (1 << 2)
+#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3)
+#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF
+
+/* cl_device_info */
+#define CL_DEVICE_TYPE 0x1000
+#define CL_DEVICE_VENDOR_ID 0x1001
+#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002
+#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004
+#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B
+#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C
+#define CL_DEVICE_ADDRESS_BITS 0x100D
+#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E
+#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010
+#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011
+#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012
+#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013
+#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014
+#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015
+#define CL_DEVICE_IMAGE_SUPPORT 0x1016
+#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017
+#define CL_DEVICE_MAX_SAMPLERS 0x1018
+#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019
+#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A
+#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B
+#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C
+#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D
+#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E
+#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F
+#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020
+#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021
+#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022
+#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023
+#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024
+#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025
+#define CL_DEVICE_ENDIAN_LITTLE 0x1026
+#define CL_DEVICE_AVAILABLE 0x1027
+#define CL_DEVICE_COMPILER_AVAILABLE 0x1028
+#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029
+#define CL_DEVICE_QUEUE_PROPERTIES 0x102A
+#define CL_DEVICE_NAME 0x102B
+#define CL_DEVICE_VENDOR 0x102C
+#define CL_DRIVER_VERSION 0x102D
+#define CL_DEVICE_PROFILE 0x102E
+#define CL_DEVICE_VERSION 0x102F
+#define CL_DEVICE_EXTENSIONS 0x1030
+#define CL_DEVICE_PLATFORM 0x1031
+#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
+/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034
+#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C
+#define CL_DEVICE_OPENCL_C_VERSION 0x103D
+
+/* cl_device_fp_config - bitfield */
+#define CL_FP_DENORM (1 << 0)
+#define CL_FP_INF_NAN (1 << 1)
+#define CL_FP_ROUND_TO_NEAREST (1 << 2)
+#define CL_FP_ROUND_TO_ZERO (1 << 3)
+#define CL_FP_ROUND_TO_INF (1 << 4)
+#define CL_FP_FMA (1 << 5)
+#define CL_FP_SOFT_FLOAT (1 << 6)
+
+/* cl_device_mem_cache_type */
+#define CL_NONE 0x0
+#define CL_READ_ONLY_CACHE 0x1
+#define CL_READ_WRITE_CACHE 0x2
+
+/* cl_device_local_mem_type */
+#define CL_LOCAL 0x1
+#define CL_GLOBAL 0x2
+
+/* cl_device_exec_capabilities - bitfield */
+#define CL_EXEC_KERNEL (1 << 0)
+#define CL_EXEC_NATIVE_KERNEL (1 << 1)
+
+/* cl_command_queue_properties - bitfield */
+#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0)
+#define CL_QUEUE_PROFILING_ENABLE (1 << 1)
+
+/* cl_context_info */
+#define CL_CONTEXT_REFERENCE_COUNT 0x1080
+#define CL_CONTEXT_DEVICES 0x1081
+#define CL_CONTEXT_PROPERTIES 0x1082
+#define CL_CONTEXT_NUM_DEVICES 0x1083
+
+/* cl_context_info + cl_context_properties */
+#define CL_CONTEXT_PLATFORM 0x1084
+
+/* cl_command_queue_info */
+#define CL_QUEUE_CONTEXT 0x1090
+#define CL_QUEUE_DEVICE 0x1091
+#define CL_QUEUE_REFERENCE_COUNT 0x1092
+#define CL_QUEUE_PROPERTIES 0x1093
+
+/* cl_mem_flags - bitfield */
+#define CL_MEM_READ_WRITE (1 << 0)
+#define CL_MEM_WRITE_ONLY (1 << 1)
+#define CL_MEM_READ_ONLY (1 << 2)
+#define CL_MEM_USE_HOST_PTR (1 << 3)
+#define CL_MEM_ALLOC_HOST_PTR (1 << 4)
+#define CL_MEM_COPY_HOST_PTR (1 << 5)
+
+/* cl_channel_order */
+#define CL_R 0x10B0
+#define CL_A 0x10B1
+#define CL_RG 0x10B2
+#define CL_RA 0x10B3
+#define CL_RGB 0x10B4
+#define CL_RGBA 0x10B5
+#define CL_BGRA 0x10B6
+#define CL_ARGB 0x10B7
+#define CL_INTENSITY 0x10B8
+#define CL_LUMINANCE 0x10B9
+#define CL_Rx 0x10BA
+#define CL_RGx 0x10BB
+#define CL_RGBx 0x10BC
+
+/* cl_channel_type */
+#define CL_SNORM_INT8 0x10D0
+#define CL_SNORM_INT16 0x10D1
+#define CL_UNORM_INT8 0x10D2
+#define CL_UNORM_INT16 0x10D3
+#define CL_UNORM_SHORT_565 0x10D4
+#define CL_UNORM_SHORT_555 0x10D5
+#define CL_UNORM_INT_101010 0x10D6
+#define CL_SIGNED_INT8 0x10D7
+#define CL_SIGNED_INT16 0x10D8
+#define CL_SIGNED_INT32 0x10D9
+#define CL_UNSIGNED_INT8 0x10DA
+#define CL_UNSIGNED_INT16 0x10DB
+#define CL_UNSIGNED_INT32 0x10DC
+#define CL_HALF_FLOAT 0x10DD
+#define CL_FLOAT 0x10DE
+
+/* cl_mem_object_type */
+#define CL_MEM_OBJECT_BUFFER 0x10F0
+#define CL_MEM_OBJECT_IMAGE2D 0x10F1
+#define CL_MEM_OBJECT_IMAGE3D 0x10F2
+
+/* cl_mem_info */
+#define CL_MEM_TYPE 0x1100
+#define CL_MEM_FLAGS 0x1101
+#define CL_MEM_SIZE 0x1102
+#define CL_MEM_HOST_PTR 0x1103
+#define CL_MEM_MAP_COUNT 0x1104
+#define CL_MEM_REFERENCE_COUNT 0x1105
+#define CL_MEM_CONTEXT 0x1106
+#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107
+#define CL_MEM_OFFSET 0x1108
+
+/* cl_image_info */
+#define CL_IMAGE_FORMAT 0x1110
+#define CL_IMAGE_ELEMENT_SIZE 0x1111
+#define CL_IMAGE_ROW_PITCH 0x1112
+#define CL_IMAGE_SLICE_PITCH 0x1113
+#define CL_IMAGE_WIDTH 0x1114
+#define CL_IMAGE_HEIGHT 0x1115
+#define CL_IMAGE_DEPTH 0x1116
+
+/* cl_addressing_mode */
+#define CL_ADDRESS_NONE 0x1130
+#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131
+#define CL_ADDRESS_CLAMP 0x1132
+#define CL_ADDRESS_REPEAT 0x1133
+#define CL_ADDRESS_MIRRORED_REPEAT 0x1134
+
+/* cl_filter_mode */
+#define CL_FILTER_NEAREST 0x1140
+#define CL_FILTER_LINEAR 0x1141
+
+/* cl_sampler_info */
+#define CL_SAMPLER_REFERENCE_COUNT 0x1150
+#define CL_SAMPLER_CONTEXT 0x1151
+#define CL_SAMPLER_NORMALIZED_COORDS 0x1152
+#define CL_SAMPLER_ADDRESSING_MODE 0x1153
+#define CL_SAMPLER_FILTER_MODE 0x1154
+
+/* cl_map_flags - bitfield */
+#define CL_MAP_READ (1 << 0)
+#define CL_MAP_WRITE (1 << 1)
+
+/* cl_program_info */
+#define CL_PROGRAM_REFERENCE_COUNT 0x1160
+#define CL_PROGRAM_CONTEXT 0x1161
+#define CL_PROGRAM_NUM_DEVICES 0x1162
+#define CL_PROGRAM_DEVICES 0x1163
+#define CL_PROGRAM_SOURCE 0x1164
+#define CL_PROGRAM_BINARY_SIZES 0x1165
+#define CL_PROGRAM_BINARIES 0x1166
+
+/* cl_program_build_info */
+#define CL_PROGRAM_BUILD_STATUS 0x1181
+#define CL_PROGRAM_BUILD_OPTIONS 0x1182
+#define CL_PROGRAM_BUILD_LOG 0x1183
+
+/* cl_build_status */
+#define CL_BUILD_SUCCESS 0
+#define CL_BUILD_NONE -1
+#define CL_BUILD_ERROR -2
+#define CL_BUILD_IN_PROGRESS -3
+
+/* cl_kernel_info */
+#define CL_KERNEL_FUNCTION_NAME 0x1190
+#define CL_KERNEL_NUM_ARGS 0x1191
+#define CL_KERNEL_REFERENCE_COUNT 0x1192
+#define CL_KERNEL_CONTEXT 0x1193
+#define CL_KERNEL_PROGRAM 0x1194
+
+/* cl_kernel_work_group_info */
+#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0
+#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1
+#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2
+#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
+#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4
+
+/* cl_event_info */
+#define CL_EVENT_COMMAND_QUEUE 0x11D0
+#define CL_EVENT_COMMAND_TYPE 0x11D1
+#define CL_EVENT_REFERENCE_COUNT 0x11D2
+#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3
+#define CL_EVENT_CONTEXT 0x11D4
+
+/* cl_command_type */
+#define CL_COMMAND_NDRANGE_KERNEL 0x11F0
+#define CL_COMMAND_TASK 0x11F1
+#define CL_COMMAND_NATIVE_KERNEL 0x11F2
+#define CL_COMMAND_READ_BUFFER 0x11F3
+#define CL_COMMAND_WRITE_BUFFER 0x11F4
+#define CL_COMMAND_COPY_BUFFER 0x11F5
+#define CL_COMMAND_READ_IMAGE 0x11F6
+#define CL_COMMAND_WRITE_IMAGE 0x11F7
+#define CL_COMMAND_COPY_IMAGE 0x11F8
+#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9
+#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA
+#define CL_COMMAND_MAP_BUFFER 0x11FB
+#define CL_COMMAND_MAP_IMAGE 0x11FC
+#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD
+#define CL_COMMAND_MARKER 0x11FE
+#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF
+#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200
+#define CL_COMMAND_READ_BUFFER_RECT 0x1201
+#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202
+#define CL_COMMAND_COPY_BUFFER_RECT 0x1203
+#define CL_COMMAND_USER 0x1204
+
+/* command execution status */
+#define CL_COMPLETE 0x0
+#define CL_RUNNING 0x1
+#define CL_SUBMITTED 0x2
+#define CL_QUEUED 0x3
+
+/* cl_buffer_create_type */
+#define CL_BUFFER_CREATE_TYPE_REGION 0x1220
+
+/* cl_profiling_info */
+#define CL_PROFILING_COMMAND_QUEUED 0x1280
+#define CL_PROFILING_COMMAND_SUBMIT 0x1281
+#define CL_PROFILING_COMMAND_START 0x1282
+#define CL_PROFILING_COMMAND_END 0x1283
+
+/********************************************************************************************************/
+
+/* Platform API */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformIDs(cl_uint /* num_entries */,
+ cl_platform_id * /* platforms */,
+ cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformInfo(cl_platform_id /* platform */,
+ cl_platform_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Device APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDs(cl_platform_id /* platform */,
+ cl_device_type /* device_type */,
+ cl_uint /* num_entries */,
+ cl_device_id * /* devices */,
+ cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceInfo(cl_device_id /* device */,
+ cl_device_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Context APIs */
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContext(const cl_context_properties * /* properties */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* devices */,
+ void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
+ void * /* user_data */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContextFromType(const cl_context_properties * /* properties */,
+ cl_device_type /* device_type */,
+ void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *),
+ void * /* user_data */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetContextInfo(cl_context /* context */,
+ cl_context_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Command Queue APIs */
+extern CL_API_ENTRY cl_command_queue CL_API_CALL
+clCreateCommandQueue(cl_context /* context */,
+ cl_device_id /* device */,
+ cl_command_queue_properties /* properties */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetCommandQueueInfo(cl_command_queue /* command_queue */,
+ cl_command_queue_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+#warning CL_USE_DEPRECATED_OPENCL_1_0_APIS is defined. These APIs are unsupported and untested in OpenCL 1.1!
+/*
+ * WARNING:
+ * This API introduces mutable state into the OpenCL implementation. It has been REMOVED
+ * to better facilitate thread safety. The 1.0 API is not thread safe. It is not tested by the
+ * OpenCL 1.1 conformance test, and consequently may not work or may not work dependably.
+ * It is likely to be non-performant. Use of this API is not advised. Use at your own risk.
+ *
+ * Software developers previously relying on this API are instructed to set the command queue
+ * properties when creating the queue, instead.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetCommandQueueProperty(cl_command_queue /* command_queue */,
+ cl_command_queue_properties /* properties */,
+ cl_bool /* enable */,
+ cl_command_queue_properties * /* old_properties */) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED;
+#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */
+
+/* Memory Object APIs */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBuffer(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ size_t /* size */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateSubBuffer(cl_mem /* buffer */,
+ cl_mem_flags /* flags */,
+ cl_buffer_create_type /* buffer_create_type */,
+ const void * /* buffer_create_info */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImage2D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ const cl_image_format * /* image_format */,
+ size_t /* image_width */,
+ size_t /* image_height */,
+ size_t /* image_row_pitch */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImage3D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ const cl_image_format * /* image_format */,
+ size_t /* image_width */,
+ size_t /* image_height */,
+ size_t /* image_depth */,
+ size_t /* image_row_pitch */,
+ size_t /* image_slice_pitch */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSupportedImageFormats(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_mem_object_type /* image_type */,
+ cl_uint /* num_entries */,
+ cl_image_format * /* image_formats */,
+ cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetMemObjectInfo(cl_mem /* memobj */,
+ cl_mem_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetImageInfo(cl_mem /* image */,
+ cl_image_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetMemObjectDestructorCallback( cl_mem /* memobj */,
+ void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+ void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1;
+
+/* Sampler APIs */
+extern CL_API_ENTRY cl_sampler CL_API_CALL
+clCreateSampler(cl_context /* context */,
+ cl_bool /* normalized_coords */,
+ cl_addressing_mode /* addressing_mode */,
+ cl_filter_mode /* filter_mode */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSamplerInfo(cl_sampler /* sampler */,
+ cl_sampler_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Program Object APIs */
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithSource(cl_context /* context */,
+ cl_uint /* count */,
+ const char ** /* strings */,
+ const size_t * /* lengths */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBinary(cl_context /* context */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const size_t * /* lengths */,
+ const unsigned char ** /* binaries */,
+ cl_int * /* binary_status */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clBuildProgram(cl_program /* program */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* options */,
+ void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+ void * /* user_data */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clUnloadCompiler(void) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramInfo(cl_program /* program */,
+ cl_program_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramBuildInfo(cl_program /* program */,
+ cl_device_id /* device */,
+ cl_program_build_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Kernel Object APIs */
+extern CL_API_ENTRY cl_kernel CL_API_CALL
+clCreateKernel(cl_program /* program */,
+ const char * /* kernel_name */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateKernelsInProgram(cl_program /* program */,
+ cl_uint /* num_kernels */,
+ cl_kernel * /* kernels */,
+ cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArg(cl_kernel /* kernel */,
+ cl_uint /* arg_index */,
+ size_t /* arg_size */,
+ const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelInfo(cl_kernel /* kernel */,
+ cl_kernel_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelWorkGroupInfo(cl_kernel /* kernel */,
+ cl_device_id /* device */,
+ cl_kernel_work_group_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Event Object APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clWaitForEvents(cl_uint /* num_events */,
+ const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventInfo(cl_event /* event */,
+ cl_event_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateUserEvent(cl_context /* context */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetUserEventStatus(cl_event /* event */,
+ cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetEventCallback( cl_event /* event */,
+ cl_int /* command_exec_callback_type */,
+ void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
+ void * /* user_data */) CL_API_SUFFIX__VERSION_1_1;
+
+/* Profiling APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventProfilingInfo(cl_event /* event */,
+ cl_profiling_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Flush and Finish APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Enqueued Commands APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_read */,
+ size_t /* offset */,
+ size_t /* cb */,
+ void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBufferRect(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_read */,
+ const size_t * /* buffer_origin */,
+ const size_t * /* host_origin */,
+ const size_t * /* region */,
+ size_t /* buffer_row_pitch */,
+ size_t /* buffer_slice_pitch */,
+ size_t /* host_row_pitch */,
+ size_t /* host_slice_pitch */,
+ void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_write */,
+ size_t /* offset */,
+ size_t /* cb */,
+ const void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBufferRect(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_write */,
+ const size_t * /* buffer_origin */,
+ const size_t * /* host_origin */,
+ const size_t * /* region */,
+ size_t /* buffer_row_pitch */,
+ size_t /* buffer_slice_pitch */,
+ size_t /* host_row_pitch */,
+ size_t /* host_slice_pitch */,
+ const void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* src_buffer */,
+ cl_mem /* dst_buffer */,
+ size_t /* src_offset */,
+ size_t /* dst_offset */,
+ size_t /* cb */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferRect(cl_command_queue /* command_queue */,
+ cl_mem /* src_buffer */,
+ cl_mem /* dst_buffer */,
+ const size_t * /* src_origin */,
+ const size_t * /* dst_origin */,
+ const size_t * /* region */,
+ size_t /* src_row_pitch */,
+ size_t /* src_slice_pitch */,
+ size_t /* dst_row_pitch */,
+ size_t /* dst_slice_pitch */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ cl_bool /* blocking_read */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ size_t /* row_pitch */,
+ size_t /* slice_pitch */,
+ void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ cl_bool /* blocking_write */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ size_t /* input_row_pitch */,
+ size_t /* input_slice_pitch */,
+ const void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImage(cl_command_queue /* command_queue */,
+ cl_mem /* src_image */,
+ cl_mem /* dst_image */,
+ const size_t * /* src_origin[3] */,
+ const size_t * /* dst_origin[3] */,
+ const size_t * /* region[3] */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* src_image */,
+ cl_mem /* dst_buffer */,
+ const size_t * /* src_origin[3] */,
+ const size_t * /* region[3] */,
+ size_t /* dst_offset */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */,
+ cl_mem /* src_buffer */,
+ cl_mem /* dst_image */,
+ size_t /* src_offset */,
+ const size_t * /* dst_origin[3] */,
+ const size_t * /* region[3] */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_map */,
+ cl_map_flags /* map_flags */,
+ size_t /* offset */,
+ size_t /* cb */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ cl_bool /* blocking_map */,
+ cl_map_flags /* map_flags */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ size_t * /* image_row_pitch */,
+ size_t * /* image_slice_pitch */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueUnmapMemObject(cl_command_queue /* command_queue */,
+ cl_mem /* memobj */,
+ void * /* mapped_ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
+ cl_kernel /* kernel */,
+ cl_uint /* work_dim */,
+ const size_t * /* global_work_offset */,
+ const size_t * /* global_work_size */,
+ const size_t * /* local_work_size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueTask(cl_command_queue /* command_queue */,
+ cl_kernel /* kernel */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNativeKernel(cl_command_queue /* command_queue */,
+ void (CL_CALLBACK *user_func)(void *),
+ void * /* args */,
+ size_t /* cb_args */,
+ cl_uint /* num_mem_objects */,
+ const cl_mem * /* mem_list */,
+ const void ** /* args_mem_loc */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMarker(cl_command_queue /* command_queue */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWaitForEvents(cl_command_queue /* command_queue */,
+ cl_uint /* num_events */,
+ const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueBarrier(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Extension function access
+ *
+ * Returns the extension function address for the given function name,
+ * or NULL if a valid function can not be found. The client must
+ * check to make sure the address is not NULL, before using or
+ * calling the returned function address.
+ */
+extern CL_API_ENTRY void * CL_API_CALL clGetExtensionFunctionAddress(const char * /* func_name */) CL_API_SUFFIX__VERSION_1_0;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_H */
+
diff --git a/include/CL/cl.hpp b/include/CL/cl.hpp
new file mode 100644
index 0000000..deee3f7
--- /dev/null
+++ b/include/CL/cl.hpp
@@ -0,0 +1,4014 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/*! \file
+ *
+ * \brief C++ bindings for OpenCL 1.0 (rev 48) and OpenCL 1.1 (rev 33)
+ * \author Benedict R. Gaster and Laurent Morichetti
+ *
+ * Additions and fixes from Brian Cole, March 3rd 2010.
+ *
+ * \version 1.1
+ * \date June 2010
+ *
+ * Optional extension support
+ *
+ * cl
+ * cl_ext_device_fission
+ * #define USE_CL_DEVICE_FISSION
+ */
+
+/*! \mainpage
+ * \section intro Introduction
+ * For many large applications C++ is the language of choice and so it seems
+ * reasonable to define C++ bindings for OpenCL.
+ *
+ *
+ * The interface is contained within a single C++ header file \em cl.hpp and all
+ * definitions are contained within the namespace \em cl. There is no additional
+ * requirement to include \em cl.h and to use either the C++ or original C
+ * bindings it is enough to simply include \em cl.hpp.
+ *
+ * The bindings themselves are lightweight and correspond closely to the
+ * underlying C API. Using the C++ bindings introduces no additional execution
+ * overhead.
+ *
+ * For detailed documentation on the bindings see:
+ *
+ * The OpenCL C++ Wrapper API 1.1 (revision 04)
+ * http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.1.pdf
+ *
+ * \section example Example
+ *
+ * The following example shows a general use case for the C++
+ * bindings, including support for the optional exception feature and
+ * also the supplied vector and string classes, see following sections for
+ * descriptions of these features.
+ *
+ * \code
+ * #define __CL_ENABLE_EXCEPTIONS
+ *
+ * #if defined(__APPLE__) || defined(__MACOSX)
+ * #include <OpenCL/cl.hpp>
+ * #else
+ * #include <CL/cl.hpp>
+ * #endif
+ * #include <cstdio>
+ * #include <cstdlib>
+ * #include <iostream>
+ *
+ * const char * helloStr = "__kernel void "
+ * "hello(void) "
+ * "{ "
+ * " "
+ * "} ";
+ *
+ * int
+ * main(void)
+ * {
+ * cl_int err = CL_SUCCESS;
+ * try {
+ *
+ * std::vector<cl::Platform> platforms;
+ * cl::Platform::get(&platforms);
+ * if (platforms.size() == 0) {
+ * std::cout << "Platform size 0\n";
+ * return -1;
+ * }
+ *
+ * cl_context_properties properties[] =
+ * { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
+ * cl::Context context(CL_DEVICE_TYPE_CPU, properties);
+ *
+ * std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
+ *
+ * cl::Program::Sources source(1,
+ * std::make_pair(helloStr,strlen(helloStr)));
+ * cl::Program program_ = cl::Program(context, source);
+ * program_.build(devices);
+ *
+ * cl::Kernel kernel(program_, "hello", &err);
+ *
+ * cl::Event event;
+ * cl::CommandQueue queue(context, devices[0], 0, &err);
+ * queue.enqueueNDRangeKernel(
+ * kernel,
+ * cl::NullRange,
+ * cl::NDRange(4,4),
+ * cl::NullRange,
+ * NULL,
+ * &event);
+ *
+ * event.wait();
+ * }
+ * catch (cl::Error err) {
+ * std::cerr
+ * << "ERROR: "
+ * << err.what()
+ * << "("
+ * << err.err()
+ * << ")"
+ * << std::endl;
+ * }
+ *
+ * return EXIT_SUCCESS;
+ * }
+ *
+ * \endcode
+ *
+ */
+#ifndef CL_HPP_
+#define CL_HPP_
+
+#ifdef _WIN32
+#include <windows.h>
+#include <malloc.h>
+#if defined(USE_DX_INTEROP)
+#include <CL/cl_d3d10.h>
+#endif
+#endif // _WIN32
+
+//
+#if defined(USE_CL_DEVICE_FISSION)
+#include <CL/cl_ext.h>
+#endif
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenGL/OpenGL.h>
+#include <OpenCL/opencl.h>
+#else
+#include <GL/gl.h>
+#include <CL/opencl.h>
+#endif // !__APPLE__
+
+#if !defined(CL_CALLBACK)
+#define CL_CALLBACK
+#endif //CL_CALLBACK
+
+#include <utility>
+
+#if !defined(__NO_STD_VECTOR)
+#include <vector>
+#endif
+
+#if !defined(__NO_STD_STRING)
+#include <string>
+#endif
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+#include <exception>
+#endif
+
+#if defined(linux) || defined(__APPLE__) || defined(__MACOSX)
+# include <alloca.h>
+#endif // linux
+
+#include <cstring>
+
+/*! \namespace cl
+ *
+ * \brief The OpenCL C++ bindings are defined within this namespace.
+ *
+ */
+namespace cl {
+
+/* Lazily resolves the extension entry point `name` into the caller's
+ * variable pfn_<name> (of type PFN_<name>) via
+ * clGetExtensionFunctionAddress. A failed lookup is not reported: the
+ * pointer is simply left NULL. */
+#define __INIT_CL_EXT_FCN_PTR(name) \
+    if (pfn_##name == NULL) { \
+        pfn_##name = (PFN_##name) \
+            clGetExtensionFunctionAddress(#name); \
+    }
+
+class Program; // forward declarations: these wrapper classes are named in this header before their definitions appear
+class Device;
+class Context;
+class CommandQueue;
+class Memory;
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+/*! \class Error
+ * \brief Exception class carrying an OpenCL error code and a message.
+ *
+ * Only compiled when __CL_ENABLE_EXCEPTIONS is defined. std::exception comes
+ * from <exception>, which is included at file scope; a standard header must
+ * not be included here, inside namespace cl.
+ */
+class Error : public std::exception
+{
+private:
+    cl_int err_;           //!< error code given to the constructor
+    const char * errStr_;  //!< message pointer; stored as-is, never copied
+public:
+    /*! Create a new CL error exception for a given error code
+     *  and corresponding message.
+     *
+     *  \param err    the error code to report.
+     *  \param errStr optional message; the pointer is kept, so the string
+     *                must outlive the exception object.
+     */
+    Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr)
+    {}
+
+    ~Error() throw() {}
+
+    /*! \brief Get error string associated with exception
+     *
+     * \return The message given at construction, or "empty" when none
+     *         was supplied.
+     */
+    virtual const char * what() const throw ()
+    {
+        if (errStr_ == NULL) {
+            return "empty";
+        }
+        else {
+            return errStr_;
+        }
+    }
+
+    /*! \brief Get error code associated with exception
+     *
+     *  \return The error code.
+     */
+    cl_int err(void) const { return err_; }
+};
+
+#define __ERR_STR(x) #x
+#else
+#define __ERR_STR(x) NULL
+#endif // __CL_ENABLE_EXCEPTIONS
+
+//! \cond DOXYGEN_DETAIL
+// Each macro names the C API call whose failure is being reported. Through
+// __ERR_STR it becomes the stringized call name when exceptions are enabled
+// and NULL otherwise. Define __CL_USER_OVERRIDE_ERROR_STRINGS to supply your
+// own set. A few macro identifiers are misspelled (__ENQEUE_...); they are
+// left unchanged here because code outside this block may refer to them.
+#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
+#define __GET_DEVICE_INFO_ERR __ERR_STR(clGetDeviceInfo)
+#define __GET_PLATFORM_INFO_ERR __ERR_STR(clGetPlatformInfo)
+#define __GET_DEVICE_IDS_ERR __ERR_STR(clGetDeviceIDs)
+#define __GET_PLATFORM_IDS_ERR __ERR_STR(clGetPlatformIDs)
+#define __GET_CONTEXT_INFO_ERR __ERR_STR(clGetContextInfo)
+#define __GET_EVENT_INFO_ERR __ERR_STR(clGetEventInfo)
+#define __GET_EVENT_PROFILE_INFO_ERR __ERR_STR(clGetEventProfileInfo)
+#define __GET_MEM_OBJECT_INFO_ERR __ERR_STR(clGetMemObjectInfo)
+#define __GET_IMAGE_INFO_ERR __ERR_STR(clGetImageInfo)
+#define __GET_SAMPLER_INFO_ERR __ERR_STR(clGetSamplerInfo)
+#define __GET_KERNEL_INFO_ERR __ERR_STR(clGetKernelInfo)
+#define __GET_KERNEL_WORK_GROUP_INFO_ERR __ERR_STR(clGetKernelWorkGroupInfo)
+#define __GET_PROGRAM_INFO_ERR __ERR_STR(clGetProgramInfo)
+#define __GET_PROGRAM_BUILD_INFO_ERR __ERR_STR(clGetProgramBuildInfo)
+#define __GET_COMMAND_QUEUE_INFO_ERR __ERR_STR(clGetCommandQueueInfo)
+
+#define __CREATE_CONTEXT_FROM_TYPE_ERR __ERR_STR(clCreateContextFromType)
+#define __GET_SUPPORTED_IMAGE_FORMATS_ERR __ERR_STR(clGetSupportedImageFormats)
+
+#define __CREATE_BUFFER_ERR __ERR_STR(clCreateBuffer)
+#define __CREATE_SUBBUFFER_ERR __ERR_STR(clCreateSubBuffer)
+#define __CREATE_GL_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer)
+#define __GET_GL_OBJECT_INFO_ERR __ERR_STR(clGetGLObjectInfo)
+#define __CREATE_IMAGE2D_ERR __ERR_STR(clCreateImage2D)
+#define __CREATE_IMAGE3D_ERR __ERR_STR(clCreateImage3D)
+#define __CREATE_SAMPLER_ERR __ERR_STR(clCreateSampler)
+#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback)
+
+#define __CREATE_USER_EVENT_ERR __ERR_STR(clCreateUserEvent)
+#define __SET_USER_EVENT_STATUS_ERR __ERR_STR(clSetUserEventStatus)
+#define __SET_EVENT_CALLBACK_ERR __ERR_STR(clSetEventCallback)
+#define __WAIT_FOR_EVENTS_ERR __ERR_STR(clWaitForEvents)
+
+#define __CREATE_KERNEL_ERR __ERR_STR(clCreateKernel)
+#define __SET_KERNEL_ARGS_ERR __ERR_STR(clSetKernelArg)
+#define __CREATE_PROGRAM_WITH_SOURCE_ERR __ERR_STR(clCreateProgramWithSource)
+#define __CREATE_PROGRAM_WITH_BINARY_ERR __ERR_STR(clCreateProgramWithBinary)
+#define __BUILD_PROGRAM_ERR __ERR_STR(clBuildProgram)
+#define __CREATE_KERNELS_IN_PROGRAM_ERR __ERR_STR(clCreateKernelsInProgram)
+
+#define __CREATE_COMMAND_QUEUE_ERR __ERR_STR(clCreateCommandQueue)
+#define __SET_COMMAND_QUEUE_PROPERTY_ERR __ERR_STR(clSetCommandQueueProperty)
+#define __ENQUEUE_READ_BUFFER_ERR __ERR_STR(clEnqueueReadBuffer)
+#define __ENQUEUE_READ_BUFFER_RECT_ERR __ERR_STR(clEnqueueReadBufferRect)
+#define __ENQUEUE_WRITE_BUFFER_ERR __ERR_STR(clEnqueueWriteBuffer)
+#define __ENQUEUE_WRITE_BUFFER_RECT_ERR __ERR_STR(clEnqueueWriteBufferRect)
+#define __ENQEUE_COPY_BUFFER_ERR __ERR_STR(clEnqueueCopyBuffer)
+#define __ENQEUE_COPY_BUFFER_RECT_ERR __ERR_STR(clEnqueueCopyBufferRect)
+#define __ENQUEUE_READ_IMAGE_ERR __ERR_STR(clEnqueueReadImage)
+#define __ENQUEUE_WRITE_IMAGE_ERR __ERR_STR(clEnqueueWriteImage)
+#define __ENQUEUE_COPY_IMAGE_ERR __ERR_STR(clEnqueueCopyImage)
+#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR __ERR_STR(clEnqueueCopyImageToBuffer)
+#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR __ERR_STR(clEnqueueCopyBufferToImage)
+#define __ENQUEUE_MAP_BUFFER_ERR __ERR_STR(clEnqueueMapBuffer)
+#define __ENQUEUE_MAP_IMAGE_ERR __ERR_STR(clEnqueueMapImage)
+#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR __ERR_STR(clEnqueueUnmapMemObject)
+#define __ENQUEUE_NDRANGE_KERNEL_ERR __ERR_STR(clEnqueueNDRangeKernel)
+#define __ENQUEUE_TASK_ERR __ERR_STR(clEnqueueTask)
+#define __ENQUEUE_NATIVE_KERNEL __ERR_STR(clEnqueueNativeKernel)
+#define __ENQUEUE_MARKER_ERR __ERR_STR(clEnqueueMarker)
+#define __ENQUEUE_WAIT_FOR_EVENTS_ERR __ERR_STR(clEnqueueWaitForEvents)
+#define __ENQUEUE_BARRIER_ERR __ERR_STR(clEnqueueBarrier)
+
+#define __ENQUEUE_ACQUIRE_GL_ERR __ERR_STR(clEnqueueAcquireGLObjects)
+#define __ENQUEUE_RELEASE_GL_ERR __ERR_STR(clEnqueueReleaseGLObjects)
+
+#define __UNLOAD_COMPILER_ERR __ERR_STR(clUnloadCompiler)
+
+#define __FLUSH_ERR __ERR_STR(clFlush)
+#define __FINISH_ERR __ERR_STR(clFinish)
+
+#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevicesEXT)
+#endif // __CL_USER_OVERRIDE_ERROR_STRINGS
+//! \endcond
+
+/*! \class string
+ * \brief Simple string class, that provides a limited subset of std::string
+ * functionality but avoids many of the issues that come with that class.
+ *
+ * Owns a heap-allocated, NUL-terminated copy of its characters. An empty
+ * string holds no buffer at all; c_str() then returns "".
+ */
+class string
+{
+private:
+    ::size_t size_;  //!< number of characters, excluding the terminator
+    char * str_;     //!< owned buffer of size_ + 1 bytes, or NULL when empty
+
+    /*! \brief Make a NUL-terminated heap copy of the first \a len bytes of
+     *  \a src.
+     *
+     *  \return the new buffer, or NULL when \a src is NULL or the allocation
+     *          yielded NULL.
+     */
+    static char * duplicate(const char * src, ::size_t len)
+    {
+        if (src == NULL) {
+            return NULL;
+        }
+        char * copy = new char[len + 1];
+        if (copy != NULL) {
+            memcpy(copy, src, len * sizeof(char));
+            copy[len] = '\0';
+        }
+        return copy;
+    }
+
+public:
+    //! \brief Construct an empty string.
+    string(void) : size_(0), str_(NULL)
+    {
+    }
+
+    /*! \brief Construct from the first \a size bytes of \a str.
+     *
+     *  A NULL \a str yields an empty string.
+     */
+    string(const char * str, ::size_t size) :
+        size_(size),
+        str_(duplicate(str, size))
+    {
+        if (str_ == NULL) {
+            size_ = 0;
+        }
+    }
+
+    /*! \brief Construct from a NUL-terminated C string.
+     *
+     *  A NULL \a str yields an empty string instead of being passed to
+     *  strlen().
+     */
+    string(const char * str) :
+        size_(0),
+        str_(NULL)
+    {
+        if (str != NULL) {
+            size_ = ::strlen(str);
+            str_ = duplicate(str, size_);
+            if (str_ == NULL) {
+                size_ = 0;
+            }
+        }
+    }
+
+    /*! \brief Replace the contents with a copy of \a rhs.
+     *
+     *  The new buffer is allocated before the old one is freed, so the
+     *  previous buffer is neither leaked nor lost if allocation throws.
+     */
+    string& operator=(const string& rhs)
+    {
+        if (this == &rhs) {
+            return *this;
+        }
+
+        char * copy = NULL;
+        ::size_t len = 0;
+        if (rhs.size_ != 0 && rhs.str_ != NULL) {
+            copy = duplicate(rhs.str_, rhs.size_);
+            if (copy != NULL) {
+                len = rhs.size_;
+            }
+        }
+
+        delete[] str_;
+        str_ = copy;
+        size_ = len;
+
+        return *this;
+    }
+
+    //! \brief Copy-construct; members are initialized before assignment runs.
+    string(const string& rhs) :
+        size_(0),
+        str_(NULL)
+    {
+        *this = rhs;
+    }
+
+    ~string()
+    {
+        delete[] str_;
+    }
+
+    ::size_t size(void) const { return size_; }
+    ::size_t length(void) const { return size(); }
+
+    //! \brief NUL-terminated contents; "" when the string is empty.
+    const char * c_str(void) const { return (str_) ? str_ : "";}
+};
+
+#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING) // default: STRING_CLASS is std::string
+#include <string>
+typedef std::string STRING_CLASS;
+#elif !defined(__USE_DEV_STRING) // __NO_STD_STRING: fall back to the cl::string class defined in this header
+typedef cl::string STRING_CLASS;
+#endif // with __USE_DEV_STRING defined, no STRING_CLASS is declared here
+
+#if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR) // default: VECTOR_CLASS is std::vector
+#include <vector>
+#define VECTOR_CLASS std::vector
+#elif !defined(__USE_DEV_VECTOR) // __NO_STD_VECTOR: fall back to the fixed-size cl::vector
+#define VECTOR_CLASS cl::vector
+#endif // with __USE_DEV_VECTOR defined, no VECTOR_CLASS is defined here
+
+#if !defined(__MAX_DEFAULT_VECTOR_SIZE)
+#define __MAX_DEFAULT_VECTOR_SIZE 10
+#endif
+
+/*! \class vector
+ * \brief Fixed-capacity vector that mirrors a subset of std::vector.
+ *
+ * Elements live in an in-object array of N slots. Every slot is a constructed
+ * T for the whole lifetime of the vector, so T must be default-constructible
+ * and copy-assignable. push_back() on a full vector is silently ignored.
+ */
+template <typename T, unsigned int N = __MAX_DEFAULT_VECTOR_SIZE>
+class vector
+{
+private:
+    T data_[N];           //!< storage; must stay the first member
+    unsigned int size_;   //!< number of elements added through push_back()
+
+    // Copy-assign every slot, not only the first size_ ones: callers such as
+    // cl::size_t<N> fill slots through operator[] or the raw T* conversion
+    // without ever calling push_back(), and those values must survive a copy.
+    // Assignment (rather than memcpy) keeps non-trivial element types correct.
+    void copySlots(const vector<T, N>& src)
+    {
+        for (unsigned int i = 0; i < N; i++) {
+            data_[i] = src.data_[i];
+        }
+    }
+
+public:
+    //! \brief Construct an empty vector; all slots are value-initialized.
+    vector() :
+        data_(),
+        size_(0)
+    {}
+
+    ~vector() {}
+
+    //! \brief Number of elements added through push_back().
+    unsigned int size(void) const
+    {
+        return size_;
+    }
+
+    //! \brief Forget all elements; the slots themselves are left untouched.
+    void clear()
+    {
+        size_ = 0;
+    }
+
+    //! \brief Append a copy of \a x; ignored when the vector is full.
+    void push_back (const T& x)
+    {
+        if (size_ < N) {
+            data_[size_] = x;
+            size_++;
+        }
+    }
+
+    /*! \brief Remove the last element, if any.
+     *
+     *  The vacated slot is reset to T() so that whatever the element held is
+     *  released now. The slot is not destroyed explicitly, because the array
+     *  destroys it again when the vector itself goes away.
+     */
+    void pop_back(void)
+    {
+        if (size_ > 0) {
+            size_--;
+            data_[size_] = T();
+        }
+    }
+
+    vector(const vector<T, N>& vec) :
+        data_(),
+        size_(vec.size_)
+    {
+        copySlots(vec);
+    }
+
+    //! \brief Construct with \a size copies of \a val (at most N are kept).
+    vector(unsigned int size, const T& val = T()) :
+        data_(),
+        size_(0)
+    {
+        for (unsigned int i = 0; i < size; i++) {
+            push_back(val);
+        }
+    }
+
+    vector<T, N>& operator=(const vector<T, N>& rhs)
+    {
+        if (this == &rhs) {
+            return *this;
+        }
+
+        size_ = rhs.size_;
+        copySlots(rhs);
+
+        return *this;
+    }
+
+    /*! \brief Equality: same size and byte-identical elements.
+     *
+     *  Uses memcmp, as before, so T is not required to provide operator==.
+     */
+    bool operator==(const vector<T,N> &vec) const
+    {
+        if (size_ != vec.size_) {
+            return false;
+        }
+
+        if (size_ == 0) {
+            return true;
+        }
+
+        return memcmp(&data_[0], &vec.data_[0], size_ * sizeof(T)) == 0;
+    }
+
+    operator T* () { return data_; }
+    operator const T* () const { return data_; }
+
+    bool empty (void) const
+    {
+        return size_ == 0;
+    }
+
+    unsigned int max_size (void) const
+    {
+        return N;
+    }
+
+    //! \brief Capacity in elements (previously reported in bytes).
+    unsigned int capacity () const
+    {
+        return N;
+    }
+
+    //! \brief Unchecked slot access; \a index may address any of the N slots.
+    T& operator[](int index)
+    {
+        return data_[index];
+    }
+
+    T operator[](int index) const
+    {
+        return data_[index];
+    }
+
+    //! \brief Replace the contents with the range [start, end).
+    template<class I>
+    void assign(I start, I end)
+    {
+        clear();
+        while(start < end) {
+            push_back(*start);
+            start++;
+        }
+    }
+
+    /*! \class iterator
+     * \brief Iterator class for vectors
+     *
+     * Refers to the vector it was created from instead of holding a copy of
+     * it, so it is only valid while that vector is alive.
+     */
+    class iterator
+    {
+    private:
+        const vector<T,N> * vec_;  //!< vector being traversed; NULL if unbound
+        int index_;                //!< current slot, or -1
+    public:
+        iterator(void) :
+            vec_(NULL),
+            index_(-1)
+        {
+        }
+
+        ~iterator(void) {}
+
+        //! \brief Iterator at the first element (index -1 when empty).
+        static iterator begin(vector<T,N> &vec)
+        {
+            iterator i;
+
+            if (!vec.empty()) {
+                i.index_ = 0;
+            }
+
+            i.vec_ = &vec;
+            return i;
+        }
+
+        //! \brief Iterator one past the last element (index -1 when empty).
+        static iterator end(vector<T,N> &vec)
+        {
+            iterator i;
+
+            if (!vec.empty()) {
+                i.index_ = vec.size();
+            }
+            i.vec_ = &vec;
+            return i;
+        }
+
+        bool operator==(iterator i)
+        {
+            return ((vec_ == i.vec_) &&
+                    (index_ == i.index_));
+        }
+
+        bool operator!=(iterator i)
+        {
+            return (!(*this==i));
+        }
+
+        iterator& operator++()
+        {
+            index_++;
+            return *this;
+        }
+
+        // The int parameter only marks the postfix form. Its value is 0, so
+        // it must not be used as the step (doing so never advanced).
+        iterator operator++(int)
+        {
+            iterator previous(*this);
+            index_++;
+            return previous;
+        }
+
+        iterator& operator--()
+        {
+            index_--;
+            return *this;
+        }
+
+        iterator operator--(int)
+        {
+            iterator previous(*this);
+            index_--;
+            return previous;
+        }
+
+        //! \brief Copy of the element at the current position.
+        T operator *()
+        {
+            return vec_->operator[](index_);
+        }
+    };
+
+    iterator begin(void)
+    {
+        return iterator::begin(*this);
+    }
+
+    iterator end(void)
+    {
+        return iterator::end(*this);
+    }
+
+    T& front(void)
+    {
+        return data_[0];
+    }
+
+    //! \brief Last element; refers to slot 0 when the vector is empty.
+    T& back(void)
+    {
+        return data_[size_ > 0 ? size_ - 1 : 0];
+    }
+
+    const T& front(void) const
+    {
+        return data_[0];
+    }
+
+    const T& back(void) const
+    {
+        return data_[size_ > 0 ? size_ - 1 : 0];
+    }
+};
+
+/*!
+ * \brief size_t class used to interface between C++ and
+ * OpenCL C calls that require arrays of size_t values, whose
+ * size is known statically.
+ *
+ * Derives from cl::vector< ::size_t, N> and adds no members of its own.
+ */
+template <int N>
+struct size_t : public cl::vector< ::size_t, N> { };
+
+namespace detail {
+
+// Generic GetInfo helper: asks the query functor to write a value of
+// sizeof(T) bytes straight into *param and passes its status through.
+template <typename Functor, typename T>
+struct GetInfoHelper
+{
+    static cl_int get(Functor f, cl_uint name, T* param)
+    {
+        const cl_int status = f(name, sizeof(T), param, NULL);
+        return status;
+    }
+};
+
+// GetInfoHelper specialization for VECTOR_CLASS results: the size is queried
+// first, the data is fetched into a stack buffer, and the buffer is then
+// copied into *param.
+template <typename Func, typename T>
+struct GetInfoHelper<Func, VECTOR_CLASS<T> >
+{
+    static cl_int get(Func f, cl_uint name, VECTOR_CLASS<T>* param)
+    {
+        // First call only reports how many bytes the result occupies.
+        ::size_t bytes;
+        cl_int status = f(name, 0, NULL, &bytes);
+
+        if (status == CL_SUCCESS) {
+            // alloca must stay in this function: the buffer lives only as
+            // long as this stack frame.
+            T* buffer = (T*) alloca(bytes);
+            status = f(name, bytes, buffer, NULL);
+
+            if (status == CL_SUCCESS) {
+                param->assign(buffer, buffer + bytes / sizeof(T));
+            }
+        }
+
+        return status;
+    }
+};
+
+// Specialized for getInfo<CL_PROGRAM_BINARIES>
+//
+// Unlike the other helpers this one does not size the result itself: the
+// query is told there is room for param->size() pointers and writes through
+// the vector's existing elements.
+template <typename Func>
+struct GetInfoHelper<Func, VECTOR_CLASS<char *> >
+{
+    static cl_int
+    get(Func f, cl_uint name, VECTOR_CLASS<char *>* param)
+    {
+        // An empty vector has no element 0 to take the address of.
+        char ** out = param->empty() ? NULL : &(*param)[0];
+
+        // Status codes are signed, matching this function's return type.
+        cl_int err = f(name, param->size() * sizeof(char *), out, NULL);
+        if (err != CL_SUCCESS) {
+            return err;
+        }
+
+        return CL_SUCCESS;
+    }
+};
+
+// GetInfoHelper specialization for STRING_CLASS results: the size is queried
+// first, the characters are fetched into a stack buffer, and the buffer is
+// then assigned to *param.
+template <typename Func>
+struct GetInfoHelper<Func, STRING_CLASS>
+{
+    static cl_int get(Func f, cl_uint name, STRING_CLASS* param)
+    {
+        // First call only reports how many bytes the result occupies.
+        ::size_t bytes;
+        cl_int status = f(name, 0, NULL, &bytes);
+
+        if (status == CL_SUCCESS) {
+            // alloca must stay in this function: the buffer lives only as
+            // long as this stack frame.
+            char* text = (char*) alloca(bytes);
+            status = f(name, bytes, text, NULL);
+
+            if (status == CL_SUCCESS) {
+                *param = text;
+            }
+        }
+
+        return status;
+    }
+};
+
+// Defines a GetInfoHelper specialization for the wrapper class CPP_TYPE.
+// The query writes sizeof(CPP_TYPE) bytes over *param, and on success the
+// handle now held by *param is retained; the status of that retain call is
+// what get() returns. The expansion opens namespace detail itself.
+// The status is held in a cl_int, matching get()'s return type.
+#define __GET_INFO_HELPER_WITH_RETAIN(CPP_TYPE) \
+namespace detail { \
+template <typename Func> \
+struct GetInfoHelper<Func, CPP_TYPE> \
+{ \
+    static cl_int get(Func f, cl_uint name, CPP_TYPE* param) \
+    { \
+        cl_int err = f(name, sizeof(CPP_TYPE), param, NULL); \
+        if (err != CL_SUCCESS) { \
+            return err; \
+        } \
+        \
+        return ReferenceHandler<CPP_TYPE::cl_type>::retain((*param)()); \
+    } \
+}; \
+}
+
+
+// Table of OpenCL 1.0 info queries. Each entry is
+//   F(info enum tag, query constant, C++ type of the result)
+// and the table is expanded with a caller-supplied F.
+//
+// Result types follow the OpenCL specification's query tables:
+//  - CL_DEVICE_ADDRESS_BITS is a cl_uint (it was listed as the 64-bit
+//    cl_bitfield, although the query only yields a cl_uint).
+//  - CL_DEVICE_IMAGE_SUPPORT and CL_SAMPLER_NORMALIZED_COORDS are cl_bool.
+//  - CL_SAMPLER_ADDRESSING_MODE is cl_addressing_mode and
+//    CL_SAMPLER_FILTER_MODE is cl_filter_mode (the three sampler types were
+//    rotated by one entry).
+//  - CL_EVENT_COMMAND_EXECUTION_STATUS is a signed cl_int, since negative
+//    values denote errors.
+#define __PARAM_NAME_INFO_1_0(F) \
+    F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_VERSION, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_NAME, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_VENDOR, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_EXTENSIONS, STRING_CLASS) \
+    \
+    F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \
+    F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, ::size_t) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, VECTOR_CLASS< ::size_t>) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \
+    F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \
+    F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \
+    F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \
+    F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \
+    F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \
+    F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, ::size_t) \
+    F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \
+    F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) \
+    F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \
+    F(cl_device_info, CL_DEVICE_NAME, STRING_CLASS) \
+    F(cl_device_info, CL_DEVICE_VENDOR, STRING_CLASS) \
+    F(cl_device_info, CL_DRIVER_VERSION, STRING_CLASS) \
+    F(cl_device_info, CL_DEVICE_PROFILE, STRING_CLASS) \
+    F(cl_device_info, CL_DEVICE_VERSION, STRING_CLASS) \
+    F(cl_device_info, CL_DEVICE_EXTENSIONS, STRING_CLASS) \
+    \
+    F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \
+    F(cl_context_info, CL_CONTEXT_DEVICES, VECTOR_CLASS<Device>) \
+    F(cl_context_info, CL_CONTEXT_PROPERTIES, VECTOR_CLASS<cl_context_properties>) \
+    \
+    F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \
+    F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \
+    F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \
+    F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_int) \
+    \
+    F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \
+    F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \
+    F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \
+    F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \
+    \
+    F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \
+    F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \
+    F(cl_mem_info, CL_MEM_SIZE, ::size_t) \
+    F(cl_mem_info, CL_MEM_HOST_PTR, void*) \
+    F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \
+    F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \
+    F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \
+    \
+    F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \
+    F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, ::size_t) \
+    F(cl_image_info, CL_IMAGE_ROW_PITCH, ::size_t) \
+    F(cl_image_info, CL_IMAGE_SLICE_PITCH, ::size_t) \
+    F(cl_image_info, CL_IMAGE_WIDTH, ::size_t) \
+    F(cl_image_info, CL_IMAGE_HEIGHT, ::size_t) \
+    F(cl_image_info, CL_IMAGE_DEPTH, ::size_t) \
+    \
+    F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \
+    F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \
+    F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_bool) \
+    F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_addressing_mode) \
+    F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_filter_mode) \
+    \
+    F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \
+    F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \
+    F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \
+    F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS<cl_device_id>) \
+    F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \
+    F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \
+    F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS<char *>) \
+    \
+    F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \
+    F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, STRING_CLASS) \
+    F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, STRING_CLASS) \
+    \
+    F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, STRING_CLASS) \
+    F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \
+    F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \
+    F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \
+    F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \
+    \
+    F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, ::size_t) \
+    F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::size_t<3>) \
+    F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \
+    \
+    F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \
+    F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \
+    F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \
+    F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties)
+
+#if defined(CL_VERSION_1_1) // extra F(info enum tag, query constant, result type) entries, present only when CL_VERSION_1_1 is defined
+#define __PARAM_NAME_INFO_1_1(F) \
+ F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \
+ F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \
+ F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \
+ F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \
+ \
+ F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \
+ F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \
+ \
+ F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, ::size_t) \
+ F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \
+ \
+ F(cl_event_info, CL_EVENT_CONTEXT, cl::Context)
+#endif // CL_VERSION_1_1
+
+#if defined(USE_CL_DEVICE_FISSION) // extra F(info enum tag, query constant, result type) entries, present only when USE_CL_DEVICE_FISSION is defined
+#define __PARAM_NAME_DEVICE_FISSION(F) \
+ F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \
+ F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
+ F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
+ F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \
+ F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS<cl_device_partition_property_ext>)
+#endif // USE_CL_DEVICE_FISSION
+
+template <typename enum_type, cl_int Name>
+struct param_traits {}; // primary template is empty; one specialization per table entry is generated by __DECLARE_PARAM_TRAITS
+
+#define __DECLARE_PARAM_TRAITS(token, param_name, T) \
+struct token; /* tag type in namespace detail standing for the info enum */ \
+template<> \
+struct param_traits<detail:: token,param_name> \
+{ \
+ enum { value = param_name }; \
+ typedef T param_type; /* C++ type used for this query's result */ \
+};
+
+__PARAM_NAME_INFO_1_0(__DECLARE_PARAM_TRAITS);
+#if defined(CL_VERSION_1_1)
+__PARAM_NAME_INFO_1_1(__DECLARE_PARAM_TRAITS);
+#endif // CL_VERSION_1_1
+
+#if defined(USE_CL_DEVICE_FISSION)
+__PARAM_NAME_DEVICE_FISSION(__DECLARE_PARAM_TRAITS);
+#endif // USE_CL_DEVICE_FISSION
+
+#undef __DECLARE_PARAM_TRAITS
+
+// Convenience functions
+
+// Runs the query `name` through functor f, storing the result in *param.
+// The GetInfoHelper specialization matching T decides how the value is
+// fetched.
+template <typename Func, typename T>
+inline cl_int
+getInfo(Func f, cl_uint name, T* param)
+{
+    typedef GetInfoHelper<Func, T> Helper;
+    return Helper::get(f, name, param);
+}
+
+// Binds the leading argument of a query function so that it can be invoked
+// as f(param, size, value, size_ret). Deliberately an aggregate (no
+// constructors): it is created with brace initialization.
+template <typename Func, typename Arg0>
+struct GetInfoFunctor0
+{
+    Func f_;
+    const Arg0& arg0_;
+
+    cl_int operator ()(
+        cl_uint query, ::size_t bytes, void* out, ::size_t* bytes_ret)
+    {
+        return f_(arg0_, query, bytes, out, bytes_ret);
+    }
+};
+
+// Binds the two leading arguments of a query function so that it can be
+// invoked as f(param, size, value, size_ret). Deliberately an aggregate (no
+// constructors): it is created with brace initialization.
+template <typename Func, typename Arg0, typename Arg1>
+struct GetInfoFunctor1
+{
+    Func f_;
+    const Arg0& arg0_;
+    const Arg1& arg1_;
+
+    cl_int operator ()(
+        cl_uint query, ::size_t bytes, void* out, ::size_t* bytes_ret)
+    {
+        return f_(arg0_, arg1_, query, bytes, out, bytes_ret);
+    }
+};
+
+// getInfo for query functions that take one leading argument before the
+// parameter name: arg0 is bound into a GetInfoFunctor0 and the query is then
+// dispatched through the GetInfoHelper matching T.
+template <typename Func, typename Arg0, typename T>
+inline cl_int
+getInfo(Func f, const Arg0& arg0, cl_uint name, T* param)
+{
+    typedef GetInfoFunctor0<Func, Arg0> BoundQuery;
+
+    BoundQuery query = { f, arg0 };
+    return GetInfoHelper<BoundQuery, T>::get(query, name, param);
+}
+
+// getInfo for query functions that take two leading arguments before the
+// parameter name: both are bound into a GetInfoFunctor1 and the query is
+// then dispatched through the GetInfoHelper matching T.
+template <typename Func, typename Arg0, typename Arg1, typename T>
+inline cl_int
+getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param)
+{
+    typedef GetInfoFunctor1<Func, Arg0, Arg1> BoundQuery;
+
+    BoundQuery query = { f, arg0, arg1 };
+    return GetInfoHelper<BoundQuery, T>::get(query, name, param);
+}
+
+template<typename T>
+struct ReferenceHandler // primary template is empty; each handle type supplies static retain()/release() in its own specialization
+{ };
+
+// cl_device_id is not reference counted here, so retain and release are
+// no-ops. They report CL_SUCCESS rather than an error because their status
+// can be handed back to callers: __GET_INFO_HELPER_WITH_RETAIN returns the
+// retain() status as the result of an otherwise successful query.
+template <>
+struct ReferenceHandler<cl_device_id>
+{
+    // cl_device_id does not have retain().
+    static cl_int retain(cl_device_id)
+    { return CL_SUCCESS; }
+    // cl_device_id does not have release().
+    static cl_int release(cl_device_id)
+    { return CL_SUCCESS; }
+};
+
+// cl_platform_id is not reference counted, so retain and release are no-ops.
+// They report CL_SUCCESS rather than an error so that a caller propagating
+// the status does not turn a no-op into a failure.
+template <>
+struct ReferenceHandler<cl_platform_id>
+{
+    // cl_platform_id does not have retain().
+    static cl_int retain(cl_platform_id)
+    { return CL_SUCCESS; }
+    // cl_platform_id does not have release().
+    static cl_int release(cl_platform_id)
+    { return CL_SUCCESS; }
+};
+
+// Reference counting for cl_context.
+template <>
+struct ReferenceHandler<cl_context>
+{
+    static cl_int retain(cl_context handle)
+    {
+        return ::clRetainContext(handle);
+    }
+
+    static cl_int release(cl_context handle)
+    {
+        return ::clReleaseContext(handle);
+    }
+};
+
+// Reference counting for cl_command_queue.
+template <>
+struct ReferenceHandler<cl_command_queue>
+{
+    static cl_int retain(cl_command_queue handle)
+    {
+        return ::clRetainCommandQueue(handle);
+    }
+
+    static cl_int release(cl_command_queue handle)
+    {
+        return ::clReleaseCommandQueue(handle);
+    }
+};
+
+// Reference counting for cl_mem.
+template <>
+struct ReferenceHandler<cl_mem>
+{
+    static cl_int retain(cl_mem handle)
+    {
+        return ::clRetainMemObject(handle);
+    }
+
+    static cl_int release(cl_mem handle)
+    {
+        return ::clReleaseMemObject(handle);
+    }
+};
+
+// Reference counting for cl_sampler.
+template <>
+struct ReferenceHandler<cl_sampler>
+{
+    static cl_int retain(cl_sampler handle)
+    {
+        return ::clRetainSampler(handle);
+    }
+
+    static cl_int release(cl_sampler handle)
+    {
+        return ::clReleaseSampler(handle);
+    }
+};
+
+// Reference counting for cl_program.
+template <>
+struct ReferenceHandler<cl_program>
+{
+    static cl_int retain(cl_program handle)
+    {
+        return ::clRetainProgram(handle);
+    }
+
+    static cl_int release(cl_program handle)
+    {
+        return ::clReleaseProgram(handle);
+    }
+};
+
+// Reference counting for cl_kernel.
+template <>
+struct ReferenceHandler<cl_kernel>
+{
+    static cl_int retain(cl_kernel handle)
+    {
+        return ::clRetainKernel(handle);
+    }
+
+    static cl_int release(cl_kernel handle)
+    {
+        return ::clReleaseKernel(handle);
+    }
+};
+
+// Reference counting for cl_event.
+template <>
+struct ReferenceHandler<cl_event>
+{
+    static cl_int retain(cl_event handle)
+    {
+        return ::clRetainEvent(handle);
+    }
+
+    static cl_int release(cl_event handle)
+    {
+        return ::clReleaseEvent(handle);
+    }
+};
+
+/*! \brief Owning wrapper around an OpenCL handle of type T.
+ *
+ * Copying retains the handle and destruction releases it, both through
+ * ReferenceHandler<T>. A NULL handle is never retained or released. The
+ * status returned by retain()/release() is not checked here.
+ */
+template <typename T>
+class Wrapper
+{
+public:
+    typedef T cl_type;
+
+protected:
+    cl_type object_;  //!< wrapped handle; NULL when the wrapper is empty
+
+public:
+    Wrapper() : object_(NULL) { }
+
+    ~Wrapper()
+    {
+        if (object_ != NULL) { release(); }
+    }
+
+    Wrapper(const Wrapper<cl_type>& rhs)
+    {
+        object_ = rhs.object_;
+        if (object_ != NULL) { retain(); }
+    }
+
+    Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
+    {
+        // Self-assignment must be a no-op: releasing first could drop the
+        // last reference, and the retain that follows would then act on a
+        // handle that has already been destroyed.
+        if (this != &rhs) {
+            if (object_ != NULL) { release(); }
+            object_ = rhs.object_;
+            if (object_ != NULL) { retain(); }
+        }
+        return *this;
+    }
+
+    //! \brief The wrapped handle, by value.
+    cl_type operator ()() const { return object_; }
+
+    //! \brief The wrapped handle, by reference, so that it can be overwritten.
+    cl_type& operator ()() { return object_; }
+
+protected:
+
+    cl_int retain() const
+    {
+        return ReferenceHandler<cl_type>::retain(object_);
+    }
+
+    cl_int release() const
+    {
+        return ReferenceHandler<cl_type>::release(object_);
+    }
+};
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+/*! \brief Turn a failing status into a thrown Error.
+ *
+ *  \param err    status to check.
+ *  \param errStr message passed on to the Error that is thrown.
+ *  \return \a err when it equals CL_SUCCESS; otherwise throws Error.
+ *
+ *  No dynamic exception specification is used: `throw(Error)` is no longer
+ *  valid as of C++17.
+ */
+static inline cl_int errHandler (
+    cl_int err,
+    const char * errStr = NULL)
+{
+    if (err != CL_SUCCESS) {
+        throw Error(err, errStr);
+    }
+    return err;
+}
+#else
+/*! \brief Without exceptions the status is simply passed through.
+ *
+ *  \param err    status to return.
+ *  \param errStr unused in this configuration.
+ *  \return \a err, unchanged.
+ */
+static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
+{
+    (void) errStr; // only used when exceptions are enabled
+    return err;
+}
+#endif // __CL_ENABLE_EXCEPTIONS
+
+} // namespace detail
+//! \endcond
+
+/*! \struct ImageFormat
+ * \brief ImageFormat interface for cl_image_format.
+ *
+ * Adds constructors and assignment to the C struct; no data members are
+ * added.
+ */
+struct ImageFormat : public cl_image_format
+{
+    //! \brief Default constructor; the two fields are not initialized.
+    ImageFormat() {}
+
+    //! \brief Construct from a channel order and a channel data type.
+    ImageFormat(cl_channel_order order, cl_channel_type type)
+    {
+        this->image_channel_order = order;
+        this->image_channel_data_type = type;
+    }
+
+    //! \brief Copy both fields from \a rhs.
+    ImageFormat& operator = (const ImageFormat& rhs)
+    {
+        if (this == &rhs) {
+            return *this;
+        }
+
+        image_channel_data_type = rhs.image_channel_data_type;
+        image_channel_order = rhs.image_channel_order;
+        return *this;
+    }
+};
+
+/*! \class Device
+ * \brief Device interface for cl_device_id.
+ */
+class Device : public detail::Wrapper<cl_device_id>
+{
+public:
+    //! Stores \a device as-is; this constructor does not call retain.
+    Device(cl_device_id device) { object_ = device; }
+
+    Device() : detail::Wrapper<cl_type>() { }
+
+    Device(const Device& device) : detail::Wrapper<cl_type>(device) { }
+
+    Device& operator = (const Device& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Queries \a name via clGetDeviceInfo into \a param.
+     *  \return The status, routed through detail::errHandler.
+     */
+    template <typename T>
+    cl_int getInfo(cl_device_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetDeviceInfo, object_, name, param),
+            __GET_DEVICE_INFO_ERR);
+    }
+
+    /*! \brief Typed query: returns the value for the compile-time \a name.
+     *  \param err optional; receives the status of the query.
+     */
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_device_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_device_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+#if defined(USE_CL_DEVICE_FISSION)
+    /*! \brief Partitions this device via clCreateSubDevicesEXT.
+     *
+     *  \param properties forwarded unchanged to the extension entry point.
+     *  \param devices receives the resulting sub-devices; must not be NULL.
+     *  \return CL_SUCCESS, CL_INVALID_VALUE when \a devices is NULL, or the
+     *          status reported by the extension call.
+     */
+    cl_int createSubDevices(
+        const cl_device_partition_property_ext * properties,
+        VECTOR_CLASS<Device>* devices)
+    {
+        typedef CL_API_ENTRY cl_int
+            ( CL_API_CALL * PFN_clCreateSubDevicesEXT)(
+                cl_device_id /*in_device*/,
+                const cl_device_partition_property_ext * /* properties */,
+                cl_uint /*num_entries*/,
+                cl_device_id * /*out_devices*/,
+                cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+        // Reject a NULL output vector up front instead of dereferencing it below.
+        if (devices == NULL) {
+            return detail::errHandler(CL_INVALID_VALUE, __CREATE_SUB_DEVICES);
+        }
+
+        static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL;
+        __INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT);
+
+        // First call: ask only for the number of sub-devices.
+        cl_uint n = 0;
+        cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_SUB_DEVICES);
+        }
+
+        // Second call: fetch the handles into a stack buffer of that size.
+        cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+        err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_SUB_DEVICES);
+        }
+
+        devices->assign(&ids[0], &ids[n]);
+        return CL_SUCCESS;
+    }
+#endif
+};
+
+/*! \class Platform
+ * \brief Platform interface.
+ */
+class Platform : public detail::Wrapper<cl_platform_id>
+{
+public:
+    static const Platform null();
+
+    //! Stores \a platform as-is; this constructor does not call retain.
+    Platform(cl_platform_id platform) { object_ = platform; }
+
+    Platform() : detail::Wrapper<cl_type>() { }
+
+    Platform(const Platform& platform) : detail::Wrapper<cl_type>(platform) { }
+
+    Platform& operator = (const Platform& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Queries \a name via clGetPlatformInfo into the string \a param.
+     *  \return The status, routed through detail::errHandler.
+     */
+    cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetPlatformInfo, object_, name, param),
+            __GET_PLATFORM_INFO_ERR);
+    }
+
+    /*! \brief Typed query: returns the value for the compile-time \a name.
+     *  \param err optional; receives the status of the query.
+     */
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_platform_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_platform_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    /*! \brief Lists the devices of the given \a type on this platform.
+     *
+     *  \param devices receives the devices; must not be NULL.
+     *  \return CL_SUCCESS, CL_INVALID_VALUE when \a devices is NULL, or the
+     *          status reported by clGetDeviceIDs.
+     */
+    cl_int getDevices(
+        cl_device_type type,
+        VECTOR_CLASS<Device>* devices) const
+    {
+        // Reject a NULL output vector instead of dereferencing it below.
+        if (devices == NULL) {
+            return detail::errHandler(CL_INVALID_VALUE, __GET_DEVICE_IDS_ERR);
+        }
+
+        // First call: count only.
+        cl_uint n = 0;
+        cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+        }
+
+        // Second call: fetch the handles into a stack buffer of that size.
+        cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+        err = ::clGetDeviceIDs(object_, type, n, ids, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+        }
+
+        devices->assign(&ids[0], &ids[n]);
+        return CL_SUCCESS;
+    }
+
+#if defined(USE_DX_INTEROP)
+    /*! \brief Get the list of available D3D10 devices.
+     *
+     *  \param d3d_device_source.
+     *
+     *  \param d3d_object.
+     *
+     *  \param d3d_device_set.
+     *
+     *  \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device
+     *  values returned in devices can be used to identify a specific OpenCL
+     *  device. If \a devices argument is NULL, this argument is ignored.
+     *
+     *  \return One of the following values:
+     *    - CL_SUCCESS if the function is executed successfully.
+     *
+     *  The application can query specific capabilities of the OpenCL device(s)
+     *  returned by cl::getDevices. This can be used by the application to
+     *  determine which device(s) to use.
+     *
+     * \note In the case that exceptions are enabled and a return value
+     * other than CL_SUCCESS is generated, then cl::Error exception is
+     * generated.
+     */
+    cl_int getDevices(
+        cl_d3d10_device_source_khr d3d_device_source,
+        void * d3d_object,
+        cl_d3d10_device_set_khr d3d_device_set,
+        VECTOR_CLASS<Device>* devices) const
+    {
+        typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)(
+            cl_platform_id platform,
+            cl_d3d10_device_source_khr d3d_device_source,
+            void * d3d_object,
+            cl_d3d10_device_set_khr d3d_device_set,
+            cl_uint num_entries,
+            cl_device_id * devices,
+            cl_uint* num_devices);
+
+        static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL;
+        __INIT_CL_EXT_FCN_PTR(clGetDeviceIDsFromD3D10KHR);
+
+        // First call: count only.
+        cl_uint n = 0;
+        cl_int err = pfn_clGetDeviceIDsFromD3D10KHR(
+            object_,
+            d3d_device_source,
+            d3d_object,
+            d3d_device_set,
+            0,
+            NULL,
+            &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+        }
+
+        // Honour the documented contract: a NULL output vector is ignored
+        // rather than dereferenced.
+        if (devices == NULL) {
+            return CL_SUCCESS;
+        }
+
+        // Second call: fetch the handles into a stack buffer of that size.
+        cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+        err = pfn_clGetDeviceIDsFromD3D10KHR(
+            object_,
+            d3d_device_source,
+            d3d_object,
+            d3d_device_set,
+            n,
+            ids,
+            NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+        }
+
+        devices->assign(&ids[0], &ids[n]);
+        return CL_SUCCESS;
+    }
+#endif
+
+    /*! \brief Lists all platforms reported by clGetPlatformIDs.
+     *
+     *  \param platforms receives the platforms; must not be NULL.
+     *  \return CL_SUCCESS, CL_INVALID_VALUE when \a platforms is NULL, or the
+     *          status reported by clGetPlatformIDs.
+     */
+    static cl_int get(
+        VECTOR_CLASS<Platform>* platforms)
+    {
+        // Reject a NULL output vector instead of dereferencing it below.
+        if (platforms == NULL) {
+            return detail::errHandler(CL_INVALID_VALUE, __GET_PLATFORM_IDS_ERR);
+        }
+
+        // First call: count only.
+        cl_uint n = 0;
+        cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+        }
+
+        // Second call: fetch the handles into a stack buffer of that size.
+        cl_platform_id* ids = (cl_platform_id*) alloca(
+            n * sizeof(cl_platform_id));
+        err = ::clGetPlatformIDs(n, ids, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+        }
+
+        platforms->assign(&ids[0], &ids[n]);
+        return CL_SUCCESS;
+    }
+};
+
+/*! \brief Forwards to ::clUnloadCompiler and returns its status code. */
+static inline cl_int
+UnloadCompiler()
+{
+    const cl_int status = ::clUnloadCompiler();
+    return status;
+}
+
+/*! \class Context
+ * \brief Context interface for cl_context.
+ */
+class Context : public detail::Wrapper<cl_context>
+{
+public:
+    /*! \brief Creates a context for an explicit device list via clCreateContext.
+     *
+     *  \param devices devices to include; when empty, a NULL device pointer
+     *         and a count of 0 are passed to the runtime.
+     *  \param properties, notifyFptr, data forwarded unchanged.
+     *  \param err optional; receives the status of the creation call.
+     */
+    Context(
+        const VECTOR_CLASS<Device>& devices,
+        cl_context_properties* properties = NULL,
+        void (CL_CALLBACK * notifyFptr)(
+            const char *,
+            const void *,
+            ::size_t,
+            void *) = NULL,
+        void* data = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        const cl_uint numDevices = (cl_uint) devices.size();
+        // front() on an empty vector is undefined; hand the runtime NULL instead.
+        object_ = ::clCreateContext(
+            properties, numDevices,
+            (numDevices > 0) ? (cl_device_id*) &devices.front() : NULL,
+            notifyFptr, data, &error);
+
+        // NOTE(review): the error label names clCreateContextFromType although
+        // clCreateContext is called; kept because no dedicated label is visible here.
+        detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /*! \brief Creates a context for all devices of \a type via
+     *         clCreateContextFromType.
+     *  \param err optional; receives the status of the creation call.
+     */
+    Context(
+        cl_device_type type,
+        cl_context_properties* properties = NULL,
+        void (CL_CALLBACK * notifyFptr)(
+            const char *,
+            const void *,
+            ::size_t,
+            void *) = NULL,
+        void* data = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateContextFromType(
+            properties, type, notifyFptr, data, &error);
+
+        detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Context() : detail::Wrapper<cl_type>() { }
+
+    Context(const Context& context) : detail::Wrapper<cl_type>(context) { }
+
+    Context& operator = (const Context& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Queries \a name via clGetContextInfo into \a param.
+     *  \return The status, routed through detail::errHandler.
+     */
+    template <typename T>
+    cl_int getInfo(cl_context_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetContextInfo, object_, name, param),
+            __GET_CONTEXT_INFO_ERR);
+    }
+
+    /*! \brief Typed query: returns the value for the compile-time \a name.
+     *  \param err optional; receives the status of the query.
+     */
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_context_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_context_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    /*! \brief Lists the image formats supported for \a flags and \a type.
+     *
+     *  \param formats receives the formats; must not be NULL.
+     *  \return CL_SUCCESS, CL_INVALID_VALUE when \a formats is NULL, or the
+     *          status reported by clGetSupportedImageFormats.
+     */
+    cl_int getSupportedImageFormats(
+        cl_mem_flags flags,
+        cl_mem_object_type type,
+        VECTOR_CLASS<ImageFormat>* formats) const
+    {
+        // Reject a NULL output vector instead of dereferencing it below.
+        if (formats == NULL) {
+            return detail::errHandler(
+                CL_INVALID_VALUE, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
+        }
+
+        // First call: count only.
+        cl_uint numEntries = 0;
+        cl_int err = ::clGetSupportedImageFormats(
+            object_,
+            flags,
+            type,
+            0,
+            NULL,
+            &numEntries);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
+        }
+
+        // Second call: fetch the formats into a stack buffer of that size.
+        ImageFormat* value = (ImageFormat*)
+            alloca(numEntries * sizeof(ImageFormat));
+        err = ::clGetSupportedImageFormats(
+            object_,
+            flags,
+            type,
+            numEntries,
+            (cl_image_format*) value,
+            NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
+        }
+
+        formats->assign(&value[0], &value[numEntries]);
+        return CL_SUCCESS;
+    }
+};
+
+__GET_INFO_HELPER_WITH_RETAIN(cl::Context)
+
+/*! \class Event
+ * \brief Event interface for cl_event.
+ */
+class Event : public detail::Wrapper<cl_event>
+{
+public:
+    Event() : detail::Wrapper<cl_type>() { }
+
+    Event(const Event& event) : detail::Wrapper<cl_type>(event) { }
+
+    Event& operator = (const Event& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Queries \a name via clGetEventInfo into \a param.
+     *  \return The status, routed through detail::errHandler.
+     */
+    template <typename T>
+    cl_int getInfo(cl_event_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetEventInfo, object_, name, param),
+            __GET_EVENT_INFO_ERR);
+    }
+
+    /*! \brief Typed query: returns the value for the compile-time \a name.
+     *  \param err optional; receives the status of the query.
+     */
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_event_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_event_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    /*! \brief Queries \a name via clGetEventProfilingInfo into \a param.
+     *  \return The status, routed through detail::errHandler.
+     */
+    template <typename T>
+    cl_int getProfilingInfo(cl_profiling_info name, T* param) const
+    {
+        return detail::errHandler(detail::getInfo(
+            &::clGetEventProfilingInfo, object_, name, param),
+            __GET_EVENT_PROFILE_INFO_ERR);
+    }
+
+    /*! \brief Typed profiling query for the compile-time \a name.
+     *  \param err optional; receives the status of the query.
+     */
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_profiling_info, name>::param_type
+    getProfilingInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_profiling_info, name>::param_type param;
+        cl_int result = getProfilingInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    //! Waits on this single event via clWaitForEvents.
+    cl_int wait() const
+    {
+        return detail::errHandler(
+            ::clWaitForEvents(1, &object_),
+            __WAIT_FOR_EVENTS_ERR);
+    }
+
+#if defined(CL_VERSION_1_1)
+    /*! \brief Registers \a pfn_notify through clSetEventCallback.
+     *  \param type, user_data forwarded unchanged.
+     */
+    cl_int setCallback(
+        cl_int type,
+        void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *),
+        void * user_data = NULL)
+    {
+        return detail::errHandler(
+            ::clSetEventCallback(
+                object_,
+                type,
+                pfn_notify,
+                user_data),
+            __SET_EVENT_CALLBACK_ERR);
+    }
+#endif
+
+    /*! \brief Waits on every event in \a events via clWaitForEvents.
+     *
+     *  An empty vector is forwarded as (0, NULL) so that front() is never
+     *  evaluated on an empty container; the runtime decides the status.
+     */
+    static cl_int
+    waitForEvents(const VECTOR_CLASS<Event>& events)
+    {
+        return detail::errHandler(
+            ::clWaitForEvents(
+                (cl_uint) events.size(),
+                (events.size() > 0) ? (cl_event*)&events.front() : NULL),
+            __WAIT_FOR_EVENTS_ERR);
+    }
+};
+
+__GET_INFO_HELPER_WITH_RETAIN(cl::Event)
+
+#if defined(CL_VERSION_1_1)
+/*! \class UserEvent
+ * \brief Event whose status is set by the host (cl_event from clCreateUserEvent).
+ */
+class UserEvent : public Event
+{
+public:
+    /*! \brief Creates a user event in \a context.
+     *  \param err optional; receives the status of clCreateUserEvent.
+     */
+    UserEvent(
+        const Context& context,
+        cl_int * err = NULL)
+    {
+        cl_int status;
+        object_ = ::clCreateUserEvent(context(), &status);
+
+        detail::errHandler(status, __CREATE_USER_EVENT_ERR);
+        if (err != NULL) { *err = status; }
+    }
+
+    UserEvent() : Event() { }
+
+    UserEvent(const UserEvent& event) : Event(event) { }
+
+    UserEvent& operator = (const UserEvent& rhs)
+    {
+        if (this == &rhs) { return *this; }
+        Event::operator=(rhs);
+        return *this;
+    }
+
+    //! Sets the execution status through clSetUserEventStatus.
+    cl_int setStatus(cl_int status)
+    {
+        const cl_int rc = ::clSetUserEventStatus(object_,status);
+        return detail::errHandler(rc, __SET_USER_EVENT_STATUS_ERR);
+    }
+};
+#endif
+
+/*! \brief Waits on every event in \a events via clWaitForEvents.
+ *
+ *  An empty vector is forwarded as (0, NULL) so that front() is never
+ *  evaluated on an empty container; the runtime decides the status.
+ *  \return The status, routed through detail::errHandler.
+ */
+inline static cl_int
+WaitForEvents(const VECTOR_CLASS<Event>& events)
+{
+    return detail::errHandler(
+        ::clWaitForEvents(
+            (cl_uint) events.size(),
+            (events.size() > 0) ? (cl_event*)&events.front() : NULL),
+        __WAIT_FOR_EVENTS_ERR);
+}
+
+/*! \class Memory
+ * \brief Common base for wrappers around cl_mem handles.
+ */
+class Memory : public detail::Wrapper<cl_mem>
+{
+public:
+    Memory() : detail::Wrapper<cl_type>() { }
+
+    Memory(const Memory& memory) : detail::Wrapper<cl_type>(memory) { }
+
+    Memory& operator = (const Memory& rhs)
+    {
+        if (this == &rhs) { return *this; }
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! Queries \a name via clGetMemObjectInfo into \a param.
+    template <typename T>
+    cl_int getInfo(cl_mem_info name, T* param) const
+    {
+        const cl_int status =
+            detail::getInfo(&::clGetMemObjectInfo, object_, name, param);
+        return detail::errHandler(status, __GET_MEM_OBJECT_INFO_ERR);
+    }
+
+    //! Typed query for the compile-time \a name; \a err optionally receives the status.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_mem_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typedef typename detail::param_traits<
+            detail::cl_mem_info, name>::param_type value_type;
+        value_type value;
+        const cl_int status = getInfo(name, &value);
+        if (err != NULL) { *err = status; }
+        return value;
+    }
+
+#if defined(CL_VERSION_1_1)
+    //! Registers \a pfn_notify through clSetMemObjectDestructorCallback.
+    cl_int setDestructorCallback(
+        void (CL_CALLBACK * pfn_notify)(cl_mem, void *),
+        void * user_data = NULL)
+    {
+        const cl_int status = ::clSetMemObjectDestructorCallback(
+            object_, pfn_notify, user_data);
+        return detail::errHandler(
+            status, __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR);
+    }
+#endif
+
+};
+
+__GET_INFO_HELPER_WITH_RETAIN(cl::Memory)
+
+/*! \class Buffer
+ * \brief Wrapper for buffer memory objects.
+ */
+class Buffer : public Memory
+{
+public:
+    /*! \brief Creates a buffer of \a size bytes via clCreateBuffer.
+     *  \param err optional; receives the status of the creation call.
+     */
+    Buffer(
+        const Context& context,
+        cl_mem_flags flags,
+        ::size_t size,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int status;
+        object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &status);
+
+        detail::errHandler(status, __CREATE_BUFFER_ERR);
+        if (err != NULL) { *err = status; }
+    }
+
+    Buffer() : Memory() { }
+
+    Buffer(const Buffer& buffer) : Memory(buffer) { }
+
+    Buffer& operator = (const Buffer& rhs)
+    {
+        if (this == &rhs) { return *this; }
+        Memory::operator=(rhs);
+        return *this;
+    }
+
+#if defined(CL_VERSION_1_1)
+    /*! \brief Creates a sub-buffer of this buffer via clCreateSubBuffer.
+     *  \param err optional; receives the status of the creation call.
+     *  \return A Buffer holding whatever handle the call returned.
+     */
+    Buffer createSubBuffer(
+        cl_mem_flags flags,
+        cl_buffer_create_type buffer_create_type,
+        const void * buffer_create_info,
+        cl_int * err = NULL)
+    {
+        Buffer sub;
+        cl_int status;
+        sub.object_ = ::clCreateSubBuffer(
+            object_, flags, buffer_create_type, buffer_create_info, &status);
+
+        detail::errHandler(status, __CREATE_SUBBUFFER_ERR);
+        if (err != NULL) { *err = status; }
+
+        return sub;
+    }
+#endif
+};
+
+#if defined (USE_DX_INTEROP)
+/*! \class BufferD3D10
+ * \brief Buffer created from an ID3D10Buffer through clCreateFromD3D10BufferKHR.
+ */
+class BufferD3D10 : public Buffer
+{
+public:
+    typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)(
+        cl_context context, cl_mem_flags flags, ID3D10Buffer* buffer,
+        cl_int* errcode_ret);
+
+    /*! \brief Wraps \a bufobj as an OpenCL buffer in \a context.
+     *  \param err optional; receives the status of the creation call.
+     */
+    BufferD3D10(
+        const Context& context,
+        cl_mem_flags flags,
+        ID3D10Buffer* bufobj,
+        cl_int * err = NULL)
+    {
+        static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL;
+        __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR);
+
+        cl_int status;
+        object_ = pfn_clCreateFromD3D10BufferKHR(
+            context(), flags, bufobj, &status);
+
+        detail::errHandler(status, __CREATE_GL_BUFFER_ERR);
+        if (err != NULL) { *err = status; }
+    }
+
+    BufferD3D10() : Buffer() { }
+
+    BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { }
+
+    BufferD3D10& operator = (const BufferD3D10& rhs)
+    {
+        if (this == &rhs) { return *this; }
+        Buffer::operator=(rhs);
+        return *this;
+    }
+};
+#endif
+
+/*! \class BufferGL
+ * \brief Buffer created from an OpenGL buffer object.
+ */
+class BufferGL : public Buffer
+{
+public:
+    /*! \brief Wraps GL buffer \a bufobj via clCreateFromGLBuffer.
+     *  \param err optional; receives the status of the creation call.
+     */
+    BufferGL(
+        const Context& context,
+        cl_mem_flags flags,
+        GLuint bufobj,
+        cl_int * err = NULL)
+    {
+        cl_int status;
+        object_ = ::clCreateFromGLBuffer(context(), flags, bufobj, &status);
+
+        detail::errHandler(status, __CREATE_GL_BUFFER_ERR);
+        if (err != NULL) { *err = status; }
+    }
+
+    BufferGL() : Buffer() { }
+
+    BufferGL(const BufferGL& buffer) : Buffer(buffer) { }
+
+    BufferGL& operator = (const BufferGL& rhs)
+    {
+        if (this == &rhs) { return *this; }
+        Buffer::operator=(rhs);
+        return *this;
+    }
+
+    //! Queries the GL object type and name via clGetGLObjectInfo.
+    cl_int getObjectInfo(
+        cl_gl_object_type *type,
+        GLuint * gl_object_name)
+    {
+        const cl_int status =
+            ::clGetGLObjectInfo(object_,type,gl_object_name);
+        return detail::errHandler(status, __GET_GL_OBJECT_INFO_ERR);
+    }
+};
+
+/*! \class BufferRenderGL
+ * \brief Memory buffer interface for GL interop with renderbuffer.
+ */
+class BufferRenderGL : public Buffer
+{
+public:
+    /*! \brief Wraps GL renderbuffer \a bufobj via clCreateFromGLRenderbuffer.
+     *  \param err optional; receives the status of the creation call.
+     */
+    BufferRenderGL(
+        const Context& context,
+        cl_mem_flags flags,
+        GLuint bufobj,
+        cl_int * err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateFromGLRenderbuffer(
+            context(),
+            flags,
+            bufobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    BufferRenderGL() : Buffer() { }
+
+    // Copy constructor. The parameter used to be `const BufferGL&`, which
+    // made this an implicit conversion from an unrelated GL buffer wrapper
+    // instead of a copy constructor for this class.
+    BufferRenderGL(const BufferRenderGL& buffer) : Buffer(buffer) { }
+
+    BufferRenderGL& operator = (const BufferRenderGL& rhs)
+    {
+        if (this != &rhs) {
+            Buffer::operator=(rhs);
+        }
+        return *this;
+    }
+
+    //! Queries the GL object type and name via clGetGLObjectInfo.
+    cl_int getObjectInfo(
+        cl_gl_object_type *type,
+        GLuint * gl_object_name)
+    {
+        return detail::errHandler(
+            ::clGetGLObjectInfo(object_,type,gl_object_name),
+            __GET_GL_OBJECT_INFO_ERR);
+    }
+};
+
+/*! \class Image
+ * \brief Shared base of the image wrappers; only derived classes can
+ *        construct or assign it.
+ */
+class Image : public Memory
+{
+protected:
+    Image() : Memory() { }
+
+    Image(const Image& image) : Memory(image) { }
+
+    Image& operator = (const Image& rhs)
+    {
+        if (this == &rhs) { return *this; }
+        Memory::operator=(rhs);
+        return *this;
+    }
+public:
+    //! Queries \a name via clGetImageInfo into \a param.
+    template <typename T>
+    cl_int getImageInfo(cl_image_info name, T* param) const
+    {
+        const cl_int status =
+            detail::getInfo(&::clGetImageInfo, object_, name, param);
+        return detail::errHandler(status, __GET_IMAGE_INFO_ERR);
+    }
+
+    //! Typed query for the compile-time \a name; \a err optionally receives the status.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_image_info, name>::param_type
+    getImageInfo(cl_int* err = NULL) const
+    {
+        typedef typename detail::param_traits<
+            detail::cl_image_info, name>::param_type value_type;
+        value_type value;
+        const cl_int status = getImageInfo(name, &value);
+        if (err != NULL) { *err = status; }
+        return value;
+    }
+};
+
+/*! \class Image2D
+ * \brief Wrapper for 2D image memory objects.
+ */
+class Image2D : public Image
+{
+public:
+    /*! \brief Creates a 2D image via clCreateImage2D.
+     *  \param err optional; receives the status of the creation call.
+     */
+    Image2D(
+        const Context& context,
+        cl_mem_flags flags,
+        ImageFormat format,
+        ::size_t width,
+        ::size_t height,
+        ::size_t row_pitch = 0,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int status;
+        object_ = ::clCreateImage2D(
+            context(),
+            flags,
+            &format,
+            width,
+            height,
+            row_pitch,
+            host_ptr,
+            &status);
+
+        detail::errHandler(status, __CREATE_IMAGE2D_ERR);
+        if (err != NULL) { *err = status; }
+    }
+
+    Image2D() { }
+
+    Image2D(const Image2D& image2D) : Image(image2D) { }
+
+    Image2D& operator = (const Image2D& rhs)
+    {
+        if (this == &rhs) { return *this; }
+        Image::operator=(rhs);
+        return *this;
+    }
+};
+
+/*! \class Image2DGL
+ * \brief 2D image created from an OpenGL texture.
+ */
+class Image2DGL : public Image2D
+{
+public:
+    /*! \brief Wraps GL texture \a texobj via clCreateFromGLTexture2D.
+     *  \param err optional; receives the status of the creation call.
+     */
+    Image2DGL(
+        const Context& context,
+        cl_mem_flags flags,
+        GLenum target,
+        GLint miplevel,
+        GLuint texobj,
+        cl_int * err = NULL)
+    {
+        cl_int status;
+        object_ = ::clCreateFromGLTexture2D(
+            context(), flags, target, miplevel, texobj, &status);
+
+        detail::errHandler(status, __CREATE_GL_BUFFER_ERR);
+        if (err != NULL) { *err = status; }
+    }
+
+    Image2DGL() : Image2D() { }
+
+    Image2DGL(const Image2DGL& image) : Image2D(image) { }
+
+    Image2DGL& operator = (const Image2DGL& rhs)
+    {
+        if (this == &rhs) { return *this; }
+        Image2D::operator=(rhs);
+        return *this;
+    }
+};
+
+/*! \class Image3D
+ * \brief Wrapper for 3D image memory objects.
+ */
+class Image3D : public Image
+{
+public:
+    /*! \brief Creates a 3D image via clCreateImage3D.
+     *  \param err optional; receives the status of the creation call.
+     */
+    Image3D(
+        const Context& context,
+        cl_mem_flags flags,
+        ImageFormat format,
+        ::size_t width,
+        ::size_t height,
+        ::size_t depth,
+        ::size_t row_pitch = 0,
+        ::size_t slice_pitch = 0,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int status;
+        object_ = ::clCreateImage3D(
+            context(),
+            flags,
+            &format,
+            width,
+            height,
+            depth,
+            row_pitch,
+            slice_pitch,
+            host_ptr,
+            &status);
+
+        detail::errHandler(status, __CREATE_IMAGE3D_ERR);
+        if (err != NULL) { *err = status; }
+    }
+
+    Image3D() { }
+
+    Image3D(const Image3D& image3D) : Image(image3D) { }
+
+    Image3D& operator = (const Image3D& rhs)
+    {
+        if (this == &rhs) { return *this; }
+        Image::operator=(rhs);
+        return *this;
+    }
+};
+
+/*! \class Image3DGL
+ * \brief 3D image created from an OpenGL texture.
+ */
+class Image3DGL : public Image3D
+{
+public:
+    /*! \brief Wraps GL texture \a texobj via clCreateFromGLTexture3D.
+     *  \param err optional; receives the status of the creation call.
+     */
+    Image3DGL(
+        const Context& context,
+        cl_mem_flags flags,
+        GLenum target,
+        GLint miplevel,
+        GLuint texobj,
+        cl_int * err = NULL)
+    {
+        cl_int status;
+        object_ = ::clCreateFromGLTexture3D(
+            context(), flags, target, miplevel, texobj, &status);
+
+        detail::errHandler(status, __CREATE_GL_BUFFER_ERR);
+        if (err != NULL) { *err = status; }
+    }
+
+    Image3DGL() : Image3D() { }
+
+    Image3DGL(const Image3DGL& image) : Image3D(image) { }
+
+    Image3DGL& operator = (const Image3DGL& rhs)
+    {
+        if (this == &rhs) { return *this; }
+        Image3D::operator=(rhs);
+        return *this;
+    }
+};
+
+/*! \class Sampler
+ * \brief Wrapper for cl_sampler handles.
+ */
+class Sampler : public detail::Wrapper<cl_sampler>
+{
+public:
+    Sampler() { }
+
+    /*! \brief Creates a sampler in \a context via clCreateSampler.
+     *  \param err optional; receives the status of the creation call.
+     */
+    Sampler(
+        const Context& context,
+        cl_bool normalized_coords,
+        cl_addressing_mode addressing_mode,
+        cl_filter_mode filter_mode,
+        cl_int* err = NULL)
+    {
+        cl_int status;
+        object_ = ::clCreateSampler(
+            context(), normalized_coords, addressing_mode, filter_mode,
+            &status);
+
+        detail::errHandler(status, __CREATE_SAMPLER_ERR);
+        if (err != NULL) { *err = status; }
+    }
+
+    Sampler(const Sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
+
+    Sampler& operator = (const Sampler& rhs)
+    {
+        if (this == &rhs) { return *this; }
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! Queries \a name via clGetSamplerInfo into \a param.
+    template <typename T>
+    cl_int getInfo(cl_sampler_info name, T* param) const
+    {
+        const cl_int status =
+            detail::getInfo(&::clGetSamplerInfo, object_, name, param);
+        return detail::errHandler(status, __GET_SAMPLER_INFO_ERR);
+    }
+
+    //! Typed query for the compile-time \a name; \a err optionally receives the status.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_sampler_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typedef typename detail::param_traits<
+            detail::cl_sampler_info, name>::param_type value_type;
+        value_type value;
+        const cl_int status = getInfo(name, &value);
+        if (err != NULL) { *err = status; }
+        return value;
+    }
+};
+
+__GET_INFO_HELPER_WITH_RETAIN(cl::Sampler)
+
+class Program;
+class CommandQueue;
+class Kernel;
+
+/*! \class NDRange
+ * \brief Up to three sizes plus the number of dimensions in use.
+ */
+class NDRange
+{
+private:
+    size_t<3> sizes_;
+    cl_uint dimensions_;
+
+public:
+    //! Zero-dimensional range; no size is pushed.
+    NDRange() : dimensions_(0) { }
+
+    //! One-dimensional range.
+    NDRange(::size_t size0) : dimensions_(1)
+    {
+        sizes_.push_back(size0);
+    }
+
+    //! Two-dimensional range.
+    NDRange(::size_t size0, ::size_t size1) : dimensions_(2)
+    {
+        sizes_.push_back(size0);
+        sizes_.push_back(size1);
+    }
+
+    //! Three-dimensional range.
+    NDRange(::size_t size0, ::size_t size1, ::size_t size2) : dimensions_(3)
+    {
+        sizes_.push_back(size0);
+        sizes_.push_back(size1);
+        sizes_.push_back(size2);
+    }
+
+    //! Converts the stored sizes to a pointer to ::size_t.
+    operator const ::size_t*() const
+    {
+        return (const ::size_t*) sizes_;
+    }
+
+    //! Number of dimensions chosen at construction.
+    ::size_t dimensions() const
+    {
+        return dimensions_;
+    }
+};
+
+static const NDRange NullRange;
+
+/*!
+ * \struct LocalSpaceArg
+ * \brief Local address wrapper for use with Kernel::setArg
+ */
+struct LocalSpaceArg
+{
+ // Byte count reported as the argument size by KernelArgumentHandler<LocalSpaceArg>.
+ ::size_t size_;
+};
+
+namespace detail {
+
+/*! \brief Supplies the size/pointer pair handed to clSetKernelArg for a
+ *         value of type \a T: sizeof(T) and the address of the value.
+ */
+template <typename T>
+struct KernelArgumentHandler
+{
+    static ::size_t size(const T&)
+    {
+        return sizeof(T);
+    }
+
+    static T* ptr(T& value)
+    {
+        return &value;
+    }
+};
+
+/*! \brief LocalSpaceArg variant: the size is the stored byte count and the
+ *         pointer is always NULL.
+ */
+template <>
+struct KernelArgumentHandler<LocalSpaceArg>
+{
+    static ::size_t size(const LocalSpaceArg& value)
+    {
+        return value.size_;
+    }
+
+    static void* ptr(LocalSpaceArg&)
+    {
+        return NULL;
+    }
+};
+
+}
+//! \endcond
+
+/*! \brief Builds a LocalSpaceArg whose size_ field is \a size. */
+inline LocalSpaceArg
+__local(::size_t size)
+{
+    LocalSpaceArg arg;
+    arg.size_ = size;
+    return arg;
+}
+
+class KernelFunctor;
+
+/*! \class Kernel
+ * \brief Wrapper for cl_kernel handles.
+ */
+class Kernel : public detail::Wrapper<cl_kernel>
+{
+public:
+    //! Defined out of line, after Program is complete.
+    inline Kernel(const Program& program, const char* name, cl_int* err = NULL);
+
+    Kernel() { }
+
+    Kernel(const Kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
+
+    Kernel& operator = (const Kernel& rhs)
+    {
+        if (this == &rhs) { return *this; }
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! Queries \a name via clGetKernelInfo into \a param.
+    template <typename T>
+    cl_int getInfo(cl_kernel_info name, T* param) const
+    {
+        const cl_int status =
+            detail::getInfo(&::clGetKernelInfo, object_, name, param);
+        return detail::errHandler(status, __GET_KERNEL_INFO_ERR);
+    }
+
+    //! Typed query for the compile-time \a name; \a err optionally receives the status.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_kernel_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typedef typename detail::param_traits<
+            detail::cl_kernel_info, name>::param_type value_type;
+        value_type value;
+        const cl_int status = getInfo(name, &value);
+        if (err != NULL) { *err = status; }
+        return value;
+    }
+
+    //! Queries \a name for \a device via clGetKernelWorkGroupInfo into \a param.
+    template <typename T>
+    cl_int getWorkGroupInfo(
+        const Device& device, cl_kernel_work_group_info name, T* param) const
+    {
+        const cl_int status = detail::getInfo(
+            &::clGetKernelWorkGroupInfo, object_, device(), name, param);
+        return detail::errHandler(status, __GET_KERNEL_WORK_GROUP_INFO_ERR);
+    }
+
+    //! Typed work-group query; \a err optionally receives the status.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_kernel_work_group_info, name>::param_type
+    getWorkGroupInfo(const Device& device, cl_int* err = NULL) const
+    {
+        typedef typename detail::param_traits<
+            detail::cl_kernel_work_group_info, name>::param_type value_type;
+        value_type value;
+        const cl_int status = getWorkGroupInfo(device, name, &value);
+        if (err != NULL) { *err = status; }
+        return value;
+    }
+
+    /*! \brief Sets argument \a index from \a value; size and pointer come
+     *         from detail::KernelArgumentHandler<T>.
+     */
+    template <typename T>
+    cl_int setArg(cl_uint index, T value)
+    {
+        typedef detail::KernelArgumentHandler<T> handler;
+        const cl_int status = ::clSetKernelArg(
+            object_, index, handler::size(value), handler::ptr(value));
+        return detail::errHandler(status, __SET_KERNEL_ARGS_ERR);
+    }
+
+    //! Sets argument \a index from a raw pointer and byte size.
+    cl_int setArg(cl_uint index, ::size_t size, void* argPtr)
+    {
+        const cl_int status = ::clSetKernelArg(object_, index, size, argPtr);
+        return detail::errHandler(status, __SET_KERNEL_ARGS_ERR);
+    }
+
+    //! Declared here; the definition is not in this part of the file.
+    KernelFunctor bind(
+        const CommandQueue& queue,
+        const NDRange& offset,
+        const NDRange& global,
+        const NDRange& local);
+
+    //! Overload without an offset range.
+    KernelFunctor bind(
+        const CommandQueue& queue,
+        const NDRange& global,
+        const NDRange& local);
+};
+
+__GET_INFO_HELPER_WITH_RETAIN(cl::Kernel)
+
+/*! \class Program
+ * \brief Program interface that implements cl_program.
+ */
+class Program : public detail::Wrapper<cl_program>
+{
+public:
+    typedef VECTOR_CLASS<std::pair<const void*, ::size_t> > Binaries;
+    typedef VECTOR_CLASS<std::pair<const char*, ::size_t> > Sources;
+
+    /*! \brief Creates a program from source strings via
+     *         clCreateProgramWithSource.
+     *
+     *  \param sources (pointer, length) pairs, forwarded in order.
+     *  \param err optional; receives the status of the creation call.
+     */
+    Program(
+        const Context& context,
+        const Sources& sources,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const ::size_t n = (::size_t)sources.size();
+        ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t));
+        const char** strings = (const char**) alloca(n * sizeof(const char*));
+
+        // Split the pairs into the two parallel arrays the C API expects.
+        for (::size_t i = 0; i < n; ++i) {
+            strings[i] = sources[(int)i].first;
+            lengths[i] = sources[(int)i].second;
+        }
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)n, strings, lengths, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /*! \brief Creates a program from per-device binaries via
+     *         clCreateProgramWithBinary.
+     *
+     *  \param devices one device per binary.
+     *  \param binaries (pointer, length) pairs, one per device.
+     *  \param binaryStatus optional; on return holds one status per device,
+     *         zero-filled (CL_SUCCESS) before the call for entries the
+     *         runtime does not write.
+     *  \param err optional; receives the status of the creation call, or
+     *         CL_INVALID_BINARY when the two vectors differ in length.
+     *
+     *  On a length mismatch no program is created and the handle stays NULL.
+     */
+    Program(
+        const Context& context,
+        const VECTOR_CLASS<Device>& devices,
+        const Binaries& binaries,
+        VECTOR_CLASS<cl_int>* binaryStatus = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        const ::size_t n = binaries.size();
+        if (n != devices.size()) {
+            // Throws when exceptions are enabled. Otherwise stop here: going
+            // on would let the runtime read devices.size() entries from
+            // arrays that only hold n.
+            detail::errHandler(CL_INVALID_BINARY,
+                "Number of binaries != Number of devices!");
+            if (err != NULL) {
+                *err = CL_INVALID_BINARY;
+            }
+            return;
+        }
+        ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t));
+        const unsigned char** images = (const unsigned char**) alloca(n * sizeof(const void*));
+
+        // The runtime writes one status per device. Collect them in a buffer
+        // of the right size instead of writing through binaryStatus->front(),
+        // which overruns a vector that is empty or shorter than n.
+        cl_int* status = NULL;
+        if (binaryStatus != NULL) {
+            status = (cl_int*) alloca(n * sizeof(cl_int));
+        }
+
+        for (::size_t i = 0; i < n; ++i) {
+            images[i] = (const unsigned char*)binaries[(int)i].first;
+            lengths[i] = binaries[(int)i].second;
+            if (status != NULL) {
+                status[i] = CL_SUCCESS;
+            }
+        }
+
+        object_ = ::clCreateProgramWithBinary(
+            context(), (cl_uint) n,
+            (n > 0) ? (cl_device_id*)&devices.front() : NULL,
+            lengths, images, status, &error);
+
+        if (binaryStatus != NULL) {
+            binaryStatus->assign(&status[0], &status[n]);
+        }
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Program() { }
+
+    Program(const Program& program) : detail::Wrapper<cl_type>(program) { }
+
+    Program& operator = (const Program& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Builds the program for \a devices via clBuildProgram.
+     *
+     *  An empty device vector is forwarded as (0, NULL) so that front() is
+     *  never evaluated on an empty container.
+     */
+    cl_int build(
+        const VECTOR_CLASS<Device>& devices,
+        const char* options = NULL,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+        void* data = NULL) const
+    {
+        return detail::errHandler(
+            ::clBuildProgram(
+                object_,
+                (cl_uint)
+                devices.size(),
+                (devices.size() > 0) ? (cl_device_id*)&devices.front() : NULL,
+                options,
+                notifyFptr,
+                data),
+            __BUILD_PROGRAM_ERR);
+    }
+
+    /*! \brief Queries \a name via clGetProgramInfo into \a param.
+     *  \return The status, routed through detail::errHandler.
+     */
+    template <typename T>
+    cl_int getInfo(cl_program_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetProgramInfo, object_, name, param),
+            __GET_PROGRAM_INFO_ERR);
+    }
+
+    /*! \brief Typed query: returns the value for the compile-time \a name.
+     *  \param err optional; receives the status of the query.
+     */
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_program_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_program_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    /*! \brief Queries build info \a name for \a device via
+     *         clGetProgramBuildInfo into \a param.
+     */
+    template <typename T>
+    cl_int getBuildInfo(
+        const Device& device, cl_program_build_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(
+                &::clGetProgramBuildInfo, object_, device(), name, param),
+            __GET_PROGRAM_BUILD_INFO_ERR);
+    }
+
+    /*! \brief Typed build-info query for the compile-time \a name.
+     *  \param err optional; receives the status of the query.
+     */
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_program_build_info, name>::param_type
+    getBuildInfo(const Device& device, cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_program_build_info, name>::param_type param;
+        cl_int result = getBuildInfo(device, name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    /*! \brief Creates one Kernel per kernel in the program via
+     *         clCreateKernelsInProgram.
+     *
+     *  \param kernels replaced with the created kernels; must not be NULL.
+     *  \return CL_SUCCESS, CL_INVALID_VALUE when \a kernels is NULL, or the
+     *          status reported by clCreateKernelsInProgram.
+     */
+    cl_int createKernels(VECTOR_CLASS<Kernel>* kernels)
+    {
+        // Reject a NULL output vector instead of dereferencing it below.
+        if (kernels == NULL) {
+            return detail::errHandler(
+                CL_INVALID_VALUE, __CREATE_KERNELS_IN_PROGRAM_ERR);
+        }
+
+        // First call: count only.
+        cl_uint numKernels = 0;
+        cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+        }
+
+        // Second call: fetch the raw handles into a stack buffer.
+        cl_kernel* ids = (cl_kernel*) alloca(numKernels * sizeof(cl_kernel));
+        err = ::clCreateKernelsInProgram(object_, numKernels, ids, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+        }
+
+        // Each handle arrives carrying its creation reference. Adopt it in a
+        // temporary Kernel, let push_back take its own reference through the
+        // copy constructor, and let the temporary's destructor drop the
+        // adopted one. Copy-constructing straight from the raw array added a
+        // retain that nothing released, leaking every kernel.
+        kernels->clear();
+        for (cl_uint i = 0; i < numKernels; ++i) {
+            Kernel adopted;
+            adopted() = ids[i];
+            kernels->push_back(adopted);
+        }
+        return CL_SUCCESS;
+    }
+};
+
+__GET_INFO_HELPER_WITH_RETAIN(cl::Program)
+
+/*! \brief Creates the kernel called \a name from \a program via clCreateKernel.
+ *  \param err optional; receives the status of the creation call.
+ */
+inline Kernel::Kernel(const Program& program, const char* name, cl_int* err)
+{
+    cl_int status;
+    object_ = ::clCreateKernel(program(), name, &status);
+
+    detail::errHandler(status, __CREATE_KERNEL_ERR);
+    if (err != NULL) { *err = status; }
+}
+
+/*! \class CommandQueue
+ * \brief CommandQueue interface for cl_command_queue.
+ *
+ * Each enqueue* method forwards to the OpenCL C entry point of the same
+ * name and passes the returned status through detail::errHandler.
+ * Optional event wait lists and memory-object lists that are NULL or empty
+ * are forwarded as a NULL pointer, so front() is never evaluated on an
+ * empty vector.
+ */
+class CommandQueue : public detail::Wrapper<cl_command_queue>
+{
+private:
+    /*! \brief Number of events in an optional wait list (0 when NULL). */
+    static cl_uint waitCount(const VECTOR_CLASS<Event>* events)
+    {
+        return (events != NULL) ? (cl_uint) events->size() : 0;
+    }
+
+    /*! \brief Pointer to the first event of an optional wait list, or NULL
+     *  when the list is absent or empty.
+     */
+    static cl_event* waitList(const VECTOR_CLASS<Event>* events)
+    {
+        return (events != NULL && events->size() > 0)
+            ? (cl_event*) &events->front()
+            : NULL;
+    }
+
+    /*! \brief Number of entries in an optional memory-object list. */
+    static cl_uint memCount(const VECTOR_CLASS<Memory>* mem_objects)
+    {
+        return (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0;
+    }
+
+    /*! \brief Pointer to the first entry of an optional memory-object list,
+     *  or NULL when the list is absent or empty.
+     */
+    static const cl_mem* memList(const VECTOR_CLASS<Memory>* mem_objects)
+    {
+        return (mem_objects != NULL && mem_objects->size() > 0)
+            ? (const cl_mem*) &mem_objects->front()
+            : NULL;
+    }
+
+public:
+    /*! \brief Creates a queue on \a device in \a context via
+     *  clCreateCommandQueue; the status is also stored in \a *err when
+     *  \a err is non-NULL.
+     */
+    CommandQueue(
+        const Context& context,
+        const Device& device,
+        cl_command_queue_properties properties = 0,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateCommandQueue(
+            context(), device(), properties, &error);
+
+        detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    CommandQueue() { }
+
+    CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
+
+    CommandQueue& operator = (const CommandQueue& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Queries \a name through clGetCommandQueueInfo into \a param. */
+    template <typename T>
+    cl_int getInfo(cl_command_queue_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(
+                &::clGetCommandQueueInfo, object_, name, param),
+            __GET_COMMAND_QUEUE_INFO_ERR);
+    }
+
+    /*! \brief Typed variant of getInfo(); the status goes to \a *err when
+     *  \a err is non-NULL and the queried value is returned.
+     */
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_command_queue_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_command_queue_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    /*! \brief Wraps clEnqueueReadBuffer. */
+    cl_int enqueueReadBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        ::size_t offset,
+        ::size_t size,
+        void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueReadBuffer(
+                object_, buffer(), blocking, offset, size,
+                ptr,
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQUEUE_READ_BUFFER_ERR);
+    }
+
+    /*! \brief Wraps clEnqueueWriteBuffer. */
+    cl_int enqueueWriteBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        ::size_t offset,
+        ::size_t size,
+        const void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueWriteBuffer(
+                object_, buffer(), blocking, offset, size,
+                ptr,
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQUEUE_WRITE_BUFFER_ERR);
+    }
+
+    /*! \brief Wraps clEnqueueCopyBuffer. */
+    cl_int enqueueCopyBuffer(
+        const Buffer& src,
+        const Buffer& dst,
+        ::size_t src_offset,
+        ::size_t dst_offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueCopyBuffer(
+                object_, src(), dst(), src_offset, dst_offset, size,
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQEUE_COPY_BUFFER_ERR);
+    }
+
+#if defined(CL_VERSION_1_1)
+    /*! \brief Wraps clEnqueueReadBufferRect. */
+    cl_int enqueueReadBufferRect(
+        const Buffer& buffer,
+        cl_bool blocking,
+        const size_t<3>& buffer_offset,
+        const size_t<3>& host_offset,
+        const size_t<3>& region,
+        ::size_t buffer_row_pitch,
+        ::size_t buffer_slice_pitch,
+        ::size_t host_row_pitch,
+        ::size_t host_slice_pitch,
+        void *ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueReadBufferRect(
+                object_,
+                buffer(),
+                blocking,
+                (const ::size_t *)buffer_offset,
+                (const ::size_t *)host_offset,
+                (const ::size_t *)region,
+                buffer_row_pitch,
+                buffer_slice_pitch,
+                host_row_pitch,
+                host_slice_pitch,
+                ptr,
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQUEUE_READ_BUFFER_RECT_ERR);
+    }
+
+
+    /*! \brief Wraps clEnqueueWriteBufferRect. \a ptr is only read from, so
+     *  it is accepted as const void* like enqueueWriteBuffer.
+     */
+    cl_int enqueueWriteBufferRect(
+        const Buffer& buffer,
+        cl_bool blocking,
+        const size_t<3>& buffer_offset,
+        const size_t<3>& host_offset,
+        const size_t<3>& region,
+        ::size_t buffer_row_pitch,
+        ::size_t buffer_slice_pitch,
+        ::size_t host_row_pitch,
+        ::size_t host_slice_pitch,
+        const void *ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueWriteBufferRect(
+                object_,
+                buffer(),
+                blocking,
+                (const ::size_t *)buffer_offset,
+                (const ::size_t *)host_offset,
+                (const ::size_t *)region,
+                buffer_row_pitch,
+                buffer_slice_pitch,
+                host_row_pitch,
+                host_slice_pitch,
+                ptr,
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQUEUE_WRITE_BUFFER_RECT_ERR);
+    }
+
+    /*! \brief Wraps clEnqueueCopyBufferRect. */
+    cl_int enqueueCopyBufferRect(
+        const Buffer& src,
+        const Buffer& dst,
+        const size_t<3>& src_origin,
+        const size_t<3>& dst_origin,
+        const size_t<3>& region,
+        ::size_t src_row_pitch,
+        ::size_t src_slice_pitch,
+        ::size_t dst_row_pitch,
+        ::size_t dst_slice_pitch,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueCopyBufferRect(
+                object_,
+                src(),
+                dst(),
+                (const ::size_t *)src_origin,
+                (const ::size_t *)dst_origin,
+                (const ::size_t *)region,
+                src_row_pitch,
+                src_slice_pitch,
+                dst_row_pitch,
+                dst_slice_pitch,
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQEUE_COPY_BUFFER_RECT_ERR);
+    }
+#endif
+
+    /*! \brief Wraps clEnqueueReadImage. */
+    cl_int enqueueReadImage(
+        const Image& image,
+        cl_bool blocking,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        ::size_t row_pitch,
+        ::size_t slice_pitch,
+        void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueReadImage(
+                object_, image(), blocking, (const ::size_t *) origin,
+                (const ::size_t *) region, row_pitch, slice_pitch, ptr,
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQUEUE_READ_IMAGE_ERR);
+    }
+
+    /*! \brief Wraps clEnqueueWriteImage. \a ptr is only read from, so it is
+     *  accepted as const void*.
+     */
+    cl_int enqueueWriteImage(
+        const Image& image,
+        cl_bool blocking,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        ::size_t row_pitch,
+        ::size_t slice_pitch,
+        const void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueWriteImage(
+                object_, image(), blocking, (const ::size_t *) origin,
+                (const ::size_t *) region, row_pitch, slice_pitch, ptr,
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQUEUE_WRITE_IMAGE_ERR);
+    }
+
+    /*! \brief Wraps clEnqueueCopyImage. */
+    cl_int enqueueCopyImage(
+        const Image& src,
+        const Image& dst,
+        const size_t<3>& src_origin,
+        const size_t<3>& dst_origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueCopyImage(
+                object_, src(), dst(), (const ::size_t *) src_origin,
+                (const ::size_t *)dst_origin, (const ::size_t *) region,
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQUEUE_COPY_IMAGE_ERR);
+    }
+
+    /*! \brief Wraps clEnqueueCopyImageToBuffer. */
+    cl_int enqueueCopyImageToBuffer(
+        const Image& src,
+        const Buffer& dst,
+        const size_t<3>& src_origin,
+        const size_t<3>& region,
+        ::size_t dst_offset,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueCopyImageToBuffer(
+                object_, src(), dst(), (const ::size_t *) src_origin,
+                (const ::size_t *) region, dst_offset,
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR);
+    }
+
+    /*! \brief Wraps clEnqueueCopyBufferToImage. */
+    cl_int enqueueCopyBufferToImage(
+        const Buffer& src,
+        const Image& dst,
+        ::size_t src_offset,
+        const size_t<3>& dst_origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueCopyBufferToImage(
+                object_, src(), dst(), src_offset,
+                (const ::size_t *) dst_origin, (const ::size_t *) region,
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR);
+    }
+
+    /*! \brief Wraps clEnqueueMapBuffer and returns its result pointer; the
+     *  status is also stored in \a *err when \a err is non-NULL.
+     */
+    void* enqueueMapBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        cl_map_flags flags,
+        ::size_t offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL,
+        cl_int* err = NULL) const
+    {
+        cl_int error;
+        void * result = ::clEnqueueMapBuffer(
+            object_, buffer(), blocking, flags, offset, size,
+            waitCount(events),
+            waitList(events),
+            (cl_event*) event,
+            &error);
+
+        detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+        return result;
+    }
+
+    /*! \brief Wraps clEnqueueMapImage and returns its result pointer; the
+     *  status is also stored in \a *err when \a err is non-NULL.
+     */
+    void* enqueueMapImage(
+        const Image& buffer,
+        cl_bool blocking,
+        cl_map_flags flags,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        ::size_t * row_pitch,
+        ::size_t * slice_pitch,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL,
+        cl_int* err = NULL) const
+    {
+        cl_int error;
+        void * result = ::clEnqueueMapImage(
+            object_, buffer(), blocking, flags,
+            (const ::size_t *) origin, (const ::size_t *) region,
+            row_pitch, slice_pitch,
+            waitCount(events),
+            waitList(events),
+            (cl_event*) event,
+            &error);
+
+        detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+        return result;
+    }
+
+    /*! \brief Wraps clEnqueueUnmapMemObject. */
+    cl_int enqueueUnmapMemObject(
+        const Memory& memory,
+        void* mapped_ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueUnmapMemObject(
+                object_, memory(), mapped_ptr,
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+    }
+
+    /*! \brief Wraps clEnqueueNDRangeKernel. The work dimension is taken
+     *  from \a global; \a offset and \a local are passed as NULL when their
+     *  dimensions() is 0.
+     */
+    cl_int enqueueNDRangeKernel(
+        const Kernel& kernel,
+        const NDRange& offset,
+        const NDRange& global,
+        const NDRange& local,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueNDRangeKernel(
+                object_, kernel(), (cl_uint) global.dimensions(),
+                offset.dimensions() != 0 ? (const ::size_t*) offset : NULL,
+                (const ::size_t*) global,
+                local.dimensions() != 0 ? (const ::size_t*) local : NULL,
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQUEUE_NDRANGE_KERNEL_ERR);
+    }
+
+    /*! \brief Wraps clEnqueueTask. */
+    cl_int enqueueTask(
+        const Kernel& kernel,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueTask(
+                object_, kernel(),
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQUEUE_TASK_ERR);
+    }
+
+    /*! \brief Wraps clEnqueueNativeKernel.
+     *
+     *  The cl_mem handles of \a mem_objects are copied into a temporary
+     *  alloca() array for the duration of the call. \a mem_locs is passed
+     *  as NULL when it is absent or empty.
+     */
+    cl_int enqueueNativeKernel(
+        void (*userFptr)(void *),
+        std::pair<void*, ::size_t> args,
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<const void*>* mem_locs = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0)
+            ? (cl_mem*) alloca(mem_objects->size() * sizeof(cl_mem))
+            : NULL;
+
+        if (mems != NULL) {
+            for (::size_t i = 0; i < mem_objects->size(); i++) {
+                mems[i] = ((*mem_objects)[i])();
+            }
+        }
+
+        // front() on an empty vector is undefined, so guard on size as well.
+        const void ** locs = (mem_locs != NULL && mem_locs->size() > 0)
+            ? (const void **) &mem_locs->front()
+            : NULL;
+
+        return detail::errHandler(
+            ::clEnqueueNativeKernel(
+                object_, userFptr, args.first, args.second,
+                memCount(mem_objects),
+                mems,
+                locs,
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQUEUE_NATIVE_KERNEL);
+    }
+
+    /*! \brief Wraps clEnqueueMarker. */
+    cl_int enqueueMarker(Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueMarker(object_, (cl_event*) event),
+            __ENQUEUE_MARKER_ERR);
+    }
+
+    /*! \brief Wraps clEnqueueWaitForEvents. An empty \a events vector is
+     *  forwarded as (0, NULL) instead of dereferencing front().
+     */
+    cl_int enqueueWaitForEvents(const VECTOR_CLASS<Event>& events) const
+    {
+        return detail::errHandler(
+            ::clEnqueueWaitForEvents(
+                object_,
+                (cl_uint) events.size(),
+                (events.size() > 0) ? (const cl_event*) &events.front() : NULL),
+            __ENQUEUE_WAIT_FOR_EVENTS_ERR);
+    }
+
+    /*! \brief Wraps clEnqueueAcquireGLObjects. */
+    cl_int enqueueAcquireGLObjects(
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueAcquireGLObjects(
+                object_,
+                memCount(mem_objects),
+                memList(mem_objects),
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQUEUE_ACQUIRE_GL_ERR);
+    }
+
+    /*! \brief Wraps clEnqueueReleaseGLObjects. */
+    cl_int enqueueReleaseGLObjects(
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        return detail::errHandler(
+            ::clEnqueueReleaseGLObjects(
+                object_,
+                memCount(mem_objects),
+                memList(mem_objects),
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQUEUE_RELEASE_GL_ERR);
+    }
+
+#if defined (USE_DX_INTEROP)
+typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)(
+    cl_command_queue command_queue, cl_uint num_objects,
+    const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list, cl_event* event);
+typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
+    cl_command_queue command_queue, cl_uint num_objects,
+    const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list, cl_event* event);
+
+    /*! \brief Calls the clEnqueueAcquireD3D10ObjectsKHR extension function
+     *  through a pointer set up by __INIT_CL_EXT_FCN_PTR.
+     */
+    cl_int enqueueAcquireD3D10Objects(
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL;
+        __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR);
+
+        return detail::errHandler(
+            pfn_clEnqueueAcquireD3D10ObjectsKHR(
+                object_,
+                memCount(mem_objects),
+                memList(mem_objects),
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQUEUE_ACQUIRE_GL_ERR);
+    }
+
+    /*! \brief Calls the clEnqueueReleaseD3D10ObjectsKHR extension function
+     *  through a pointer set up by __INIT_CL_EXT_FCN_PTR.
+     */
+    cl_int enqueueReleaseD3D10Objects(
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL;
+        __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR);
+
+        return detail::errHandler(
+            pfn_clEnqueueReleaseD3D10ObjectsKHR(
+                object_,
+                memCount(mem_objects),
+                memList(mem_objects),
+                waitCount(events),
+                waitList(events),
+                (cl_event*) event),
+            __ENQUEUE_RELEASE_GL_ERR);
+    }
+#endif
+
+    /*! \brief Wraps clEnqueueBarrier. */
+    cl_int enqueueBarrier() const
+    {
+        return detail::errHandler(
+            ::clEnqueueBarrier(object_),
+            __ENQUEUE_BARRIER_ERR);
+    }
+
+    /*! \brief Wraps clFlush. */
+    cl_int flush() const
+    {
+        return detail::errHandler(::clFlush(object_), __FLUSH_ERR);
+    }
+
+    /*! \brief Wraps clFinish. */
+    cl_int finish() const
+    {
+        return detail::errHandler(::clFinish(object_), __FINISH_ERR);
+    }
+};
+
+__GET_INFO_HELPER_WITH_RETAIN(cl::CommandQueue)
+
+/*! \class KernelFunctor
+ * \brief Kernel functor interface
+ *
+ * Bundles a Kernel with the CommandQueue and the offset/global/local
+ * NDRanges used to launch it. Each operator() overload binds its
+ * arguments a1..aN to kernel argument indices 0..N-1, enqueues the kernel
+ * and returns the Event of that enqueue; the enqueue status is kept and
+ * can be read back with getError().
+ *
+ * \note Currently only functors of zero to fifteen arguments are supported.
+ * It is straightforward to add more and a more general solution, similar to
+ * Boost.Lambda could be followed if required in the future.
+ */
+class KernelFunctor
+{
+private:
+    Kernel kernel_;
+    CommandQueue queue_;
+    NDRange offset_;
+    NDRange global_;
+    NDRange local_;
+
+    cl_int err_;    // status of the most recent enqueue
+public:
+    /*! \brief Default constructor. err_ starts as CL_SUCCESS so that
+     *  getError() never reads an indeterminate value.
+     */
+    KernelFunctor() : err_(CL_SUCCESS) { }
+
+    /*! \brief Binds \a kernel to \a queue and the given launch ranges. */
+    KernelFunctor(
+        const Kernel& kernel,
+        const CommandQueue& queue,
+        const NDRange& offset,
+        const NDRange& global,
+        const NDRange& local) :
+            kernel_(kernel),
+            queue_(queue),
+            offset_(offset),
+            global_(global),
+            local_(local),
+            err_(CL_SUCCESS)
+    {}
+
+    KernelFunctor& operator=(const KernelFunctor& rhs);
+
+    KernelFunctor(const KernelFunctor& rhs);
+
+    /*! \brief Status recorded by the most recent operator() call. */
+    cl_int getError() const { return err_; }
+
+    inline Event operator()(const VECTOR_CLASS<Event>* events = NULL);
+
+    template<typename A1>
+    inline Event operator()(
+        const A1& a1,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5, class A6>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4,
+             class A5, class A6, class A7>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const A7& a7,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5,
+             class A6, class A7, class A8>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const A7& a7,
+        const A8& a8,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5,
+             class A6, class A7, class A8, class A9>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const A7& a7,
+        const A8& a8,
+        const A9& a9,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5,
+             class A6, class A7, class A8, class A9, class A10>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const A7& a7,
+        const A8& a8,
+        const A9& a9,
+        const A10& a10,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5,
+             class A6, class A7, class A8, class A9, class A10,
+             class A11>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const A7& a7,
+        const A8& a8,
+        const A9& a9,
+        const A10& a10,
+        const A11& a11,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5,
+             class A6, class A7, class A8, class A9, class A10,
+             class A11, class A12>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const A7& a7,
+        const A8& a8,
+        const A9& a9,
+        const A10& a10,
+        const A11& a11,
+        const A12& a12,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5,
+             class A6, class A7, class A8, class A9, class A10,
+             class A11, class A12, class A13>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const A7& a7,
+        const A8& a8,
+        const A9& a9,
+        const A10& a10,
+        const A11& a11,
+        const A12& a12,
+        const A13& a13,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5,
+             class A6, class A7, class A8, class A9, class A10,
+             class A11, class A12, class A13, class A14>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const A7& a7,
+        const A8& a8,
+        const A9& a9,
+        const A10& a10,
+        const A11& a11,
+        const A12& a12,
+        const A13& a13,
+        const A14& a14,
+        const VECTOR_CLASS<Event>* events = NULL);
+
+    template<class A1, class A2, class A3, class A4, class A5,
+             class A6, class A7, class A8, class A9, class A10,
+             class A11, class A12, class A13, class A14, class A15>
+    inline Event operator()(
+        const A1& a1,
+        const A2& a2,
+        const A3& a3,
+        const A4& a4,
+        const A5& a5,
+        const A6& a6,
+        const A7& a7,
+        const A8& a8,
+        const A9& a9,
+        const A10& a10,
+        const A11& a11,
+        const A12& a12,
+        const A13& a13,
+        const A14& a14,
+        const A15& a15,
+        const VECTOR_CLASS<Event>* events = NULL);
+};
+
+/*! \brief Returns a KernelFunctor that launches this kernel on \a queue
+ *  with the given \a offset, \a global and \a local ranges.
+ */
+inline KernelFunctor Kernel::bind(
+    const CommandQueue& queue,
+    const NDRange& offset,
+    const NDRange& global,
+    const NDRange& local)
+{
+    return KernelFunctor(
+        *this,
+        queue,
+        offset,
+        global,
+        local);
+}
+
+/*! \brief Returns a KernelFunctor that launches this kernel on \a queue
+ *  with NullRange as the offset and the given \a global and \a local
+ *  ranges.
+ */
+inline KernelFunctor Kernel::bind(
+    const CommandQueue& queue,
+    const NDRange& global,
+    const NDRange& local)
+{
+    return KernelFunctor(
+        *this,
+        queue,
+        NullRange,
+        global,
+        local);
+}
+
+/*! \brief Copy-assigns every member of \a rhs, including the stored
+ *  error status, so getError() on the target matches the source.
+ *  Self-assignment is a no-op.
+ */
+inline KernelFunctor& KernelFunctor::operator=(const KernelFunctor& rhs)
+{
+    if (this == &rhs) {
+        return *this;
+    }
+
+    kernel_ = rhs.kernel_;
+    queue_ = rhs.queue_;
+    offset_ = rhs.offset_;
+    global_ = rhs.global_;
+    local_ = rhs.local_;
+    err_ = rhs.err_;    // previously left stale after assignment
+
+    return *this;
+}
+
+/*! \brief Copy constructor. Copies every member of \a rhs, including the
+ *  stored error status; without that, getError() on a copy (for example
+ *  the value returned by Kernel::bind) would read an uninitialized err_.
+ */
+inline KernelFunctor::KernelFunctor(const KernelFunctor& rhs) :
+    kernel_(rhs.kernel_),
+    queue_(rhs.queue_),
+    offset_(rhs.offset_),
+    global_(rhs.global_),
+    local_(rhs.local_),
+    err_(rhs.err_)
+{
+}
+
+/*! \brief Enqueues the bound kernel without setting any arguments.
+ *
+ *  \param events optional wait list, forwarded to enqueueNDRangeKernel
+ *         (it was previously accepted but ignored).
+ *  \return the Event of the enqueue; its status is stored for getError().
+ */
+Event KernelFunctor::operator()(const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_, offset_, global_, local_, events, &event);
+
+    return event;
+}
+
+/*! \brief Sets kernel argument 0 and enqueues the bound kernel.
+ *
+ *  The setArg return value is not checked here.
+ *  \param events optional wait list, forwarded to enqueueNDRangeKernel
+ *         (it was previously accepted but ignored).
+ *  \return the Event of the enqueue; its status is stored for getError().
+ */
+template<typename A1>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+
+    kernel_.setArg(0,a1);
+
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_, offset_, global_, local_, events, &event);
+
+    return event;
+}
+
+/*! \brief Sets kernel arguments 0..1 and enqueues the bound kernel.
+ *
+ *  The setArg return values are not checked here.
+ *  \param events optional wait list, forwarded to enqueueNDRangeKernel
+ *         (it was previously accepted but ignored).
+ *  \return the Event of the enqueue; its status is stored for getError().
+ */
+template<typename A1, typename A2>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_, offset_, global_, local_, events, &event);
+
+    return event;
+}
+
+/*! \brief Sets kernel arguments 0..2 and enqueues the bound kernel.
+ *
+ *  The setArg return values are not checked here.
+ *  \param events optional wait list, forwarded to enqueueNDRangeKernel
+ *         (it was previously accepted but ignored).
+ *  \return the Event of the enqueue; its status is stored for getError().
+ */
+template<typename A1, typename A2, typename A3>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_, offset_, global_, local_, events, &event);
+
+    return event;
+}
+
+/*! \brief Sets kernel arguments 0..3 and enqueues the bound kernel.
+ *
+ *  The setArg return values are not checked here.
+ *  \param events optional wait list, forwarded to enqueueNDRangeKernel
+ *         (it was previously accepted but ignored).
+ *  \return the Event of the enqueue; its status is stored for getError().
+ */
+template<typename A1, typename A2, typename A3, typename A4>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_, offset_, global_, local_, events, &event);
+
+    return event;
+}
+
+/*! \brief Sets kernel arguments 0..4 and enqueues the bound kernel.
+ *
+ *  The setArg return values are not checked here.
+ *  \param events optional wait list, forwarded to enqueueNDRangeKernel
+ *         (it was previously accepted but ignored).
+ *  \return the Event of the enqueue; its status is stored for getError().
+ */
+template<typename A1, typename A2, typename A3, typename A4, typename A5>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_, offset_, global_, local_, events, &event);
+
+    return event;
+}
+
+/*! \brief Sets kernel arguments 0..5 and enqueues the bound kernel.
+ *
+ *  The setArg return values are not checked here.
+ *  \param events optional wait list, forwarded to enqueueNDRangeKernel
+ *         (it was previously accepted but ignored).
+ *  \return the Event of the enqueue; its status is stored for getError().
+ */
+template<typename A1, typename A2, typename A3, typename A4, typename A5,
+         typename A6>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_, offset_, global_, local_, events, &event);
+
+    return event;
+}
+
+/*! \brief Sets kernel arguments 0..6 and enqueues the bound kernel.
+ *
+ *  The setArg return values are not checked here.
+ *  \param events optional wait list, forwarded to enqueueNDRangeKernel
+ *         (it was previously accepted but ignored).
+ *  \return the Event of the enqueue; its status is stored for getError().
+ */
+template<typename A1, typename A2, typename A3, typename A4,
+         typename A5, typename A6, typename A7>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const A7& a7,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    kernel_.setArg(6,a7);
+
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_, offset_, global_, local_, events, &event);
+
+    return event;
+}
+
+/*! \brief Sets kernel arguments 0..7 and enqueues the bound kernel.
+ *
+ *  The setArg return values are not checked here.
+ *  \param events optional wait list, forwarded to enqueueNDRangeKernel
+ *         (it was previously accepted but ignored).
+ *  \return the Event of the enqueue; its status is stored for getError().
+ */
+template<typename A1, typename A2, typename A3, typename A4, typename A5,
+         typename A6, typename A7, typename A8>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const A7& a7,
+    const A8& a8,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    kernel_.setArg(6,a7);
+    kernel_.setArg(7,a8);
+
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_, offset_, global_, local_, events, &event);
+
+    return event;
+}
+
+/*! \brief Sets kernel arguments 0..8 and enqueues the bound kernel.
+ *
+ *  The setArg return values are not checked here.
+ *  \param events optional wait list, forwarded to enqueueNDRangeKernel
+ *         (it was previously accepted but ignored).
+ *  \return the Event of the enqueue; its status is stored for getError().
+ */
+template<typename A1, typename A2, typename A3, typename A4, typename A5,
+         typename A6, typename A7, typename A8, typename A9>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const A7& a7,
+    const A8& a8,
+    const A9& a9,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    kernel_.setArg(6,a7);
+    kernel_.setArg(7,a8);
+    kernel_.setArg(8,a9);
+
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_, offset_, global_, local_, events, &event);
+
+    return event;
+}
+
+/*! \brief Sets kernel arguments 0..9 and enqueues the bound kernel.
+ *
+ *  The setArg return values are not checked here.
+ *  \param events optional wait list, forwarded to enqueueNDRangeKernel
+ *         (it was previously accepted but ignored).
+ *  \return the Event of the enqueue; its status is stored for getError().
+ */
+template<typename A1, typename A2, typename A3, typename A4, typename A5,
+         typename A6, typename A7, typename A8, typename A9, typename A10>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const A7& a7,
+    const A8& a8,
+    const A9& a9,
+    const A10& a10,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    kernel_.setArg(6,a7);
+    kernel_.setArg(7,a8);
+    kernel_.setArg(8,a9);
+    kernel_.setArg(9,a10);
+
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_, offset_, global_, local_, events, &event);
+
+    return event;
+}
+
+/*! \brief Sets kernel arguments 0..10 and enqueues the bound kernel.
+ *
+ *  The setArg return values are not checked here.
+ *  \param events optional wait list, forwarded to enqueueNDRangeKernel
+ *         (it was previously accepted but ignored).
+ *  \return the Event of the enqueue; its status is stored for getError().
+ */
+template<class A1, class A2, class A3, class A4, class A5,
+         class A6, class A7, class A8, class A9, class A10,
+         class A11>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const A7& a7,
+    const A8& a8,
+    const A9& a9,
+    const A10& a10,
+    const A11& a11,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    kernel_.setArg(6,a7);
+    kernel_.setArg(7,a8);
+    kernel_.setArg(8,a9);
+    kernel_.setArg(9,a10);
+    kernel_.setArg(10,a11);
+
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_, offset_, global_, local_, events, &event);
+
+    return event;
+}
+
+/*! \brief Sets kernel arguments 0..11 and enqueues the bound kernel.
+ *
+ *  The setArg return values are not checked here.
+ *  \param events optional wait list, forwarded to enqueueNDRangeKernel
+ *         (it was previously accepted but ignored).
+ *  \return the Event of the enqueue; its status is stored for getError().
+ */
+template<class A1, class A2, class A3, class A4, class A5,
+         class A6, class A7, class A8, class A9, class A10,
+         class A11, class A12>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const A7& a7,
+    const A8& a8,
+    const A9& a9,
+    const A10& a10,
+    const A11& a11,
+    const A12& a12,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    kernel_.setArg(6,a7);
+    kernel_.setArg(7,a8);
+    kernel_.setArg(8,a9);
+    kernel_.setArg(9,a10);
+    kernel_.setArg(10,a11);
+    kernel_.setArg(11,a12);
+
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_, offset_, global_, local_, events, &event);
+
+    return event;
+}
+
+/*! \brief Sets kernel arguments 0..12 and enqueues the bound kernel.
+ *
+ *  The setArg return values are not checked here.
+ *  \param events optional wait list, forwarded to enqueueNDRangeKernel
+ *         (it was previously accepted but ignored).
+ *  \return the Event of the enqueue; its status is stored for getError().
+ */
+template<class A1, class A2, class A3, class A4, class A5,
+         class A6, class A7, class A8, class A9, class A10,
+         class A11, class A12, class A13>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const A7& a7,
+    const A8& a8,
+    const A9& a9,
+    const A10& a10,
+    const A11& a11,
+    const A12& a12,
+    const A13& a13,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    kernel_.setArg(6,a7);
+    kernel_.setArg(7,a8);
+    kernel_.setArg(8,a9);
+    kernel_.setArg(9,a10);
+    kernel_.setArg(10,a11);
+    kernel_.setArg(11,a12);
+    kernel_.setArg(12,a13);
+
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_, offset_, global_, local_, events, &event);
+
+    return event;
+}
+
+/*! \brief Sets kernel arguments 0..13 and enqueues the bound kernel.
+ *
+ *  The setArg return values are not checked here.
+ *  \param events optional wait list, forwarded to enqueueNDRangeKernel
+ *         (it was previously accepted but ignored).
+ *  \return the Event of the enqueue; its status is stored for getError().
+ */
+template<class A1, class A2, class A3, class A4, class A5,
+         class A6, class A7, class A8, class A9, class A10,
+         class A11, class A12, class A13, class A14>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const A7& a7,
+    const A8& a8,
+    const A9& a9,
+    const A10& a10,
+    const A11& a11,
+    const A12& a12,
+    const A13& a13,
+    const A14& a14,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    kernel_.setArg(6,a7);
+    kernel_.setArg(7,a8);
+    kernel_.setArg(8,a9);
+    kernel_.setArg(9,a10);
+    kernel_.setArg(10,a11);
+    kernel_.setArg(11,a12);
+    kernel_.setArg(12,a13);
+    kernel_.setArg(13,a14);
+
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_, offset_, global_, local_, events, &event);
+
+    return event;
+}
+
+/*! \brief Sets kernel arguments 0..14 and enqueues the bound kernel.
+ *
+ *  The setArg return values are not checked here.
+ *  \param events optional wait list, forwarded to enqueueNDRangeKernel
+ *         (it was previously accepted but ignored).
+ *  \return the Event of the enqueue; its status is stored for getError().
+ */
+template<class A1, class A2, class A3, class A4, class A5,
+         class A6, class A7, class A8, class A9, class A10,
+         class A11, class A12, class A13, class A14, class A15>
+Event KernelFunctor::operator()(
+    const A1& a1,
+    const A2& a2,
+    const A3& a3,
+    const A4& a4,
+    const A5& a5,
+    const A6& a6,
+    const A7& a7,
+    const A8& a8,
+    const A9& a9,
+    const A10& a10,
+    const A11& a11,
+    const A12& a12,
+    const A13& a13,
+    const A14& a14,
+    const A15& a15,
+    const VECTOR_CLASS<Event>* events)
+{
+    Event event;
+
+    kernel_.setArg(0,a1);
+    kernel_.setArg(1,a2);
+    kernel_.setArg(2,a3);
+    kernel_.setArg(3,a4);
+    kernel_.setArg(4,a5);
+    kernel_.setArg(5,a6);
+    kernel_.setArg(6,a7);
+    kernel_.setArg(7,a8);
+    kernel_.setArg(8,a9);
+    kernel_.setArg(9,a10);
+    kernel_.setArg(10,a11);
+    kernel_.setArg(11,a12);
+    kernel_.setArg(12,a13);
+    kernel_.setArg(13,a14);
+    kernel_.setArg(14,a15);
+
+    err_ = queue_.enqueueNDRangeKernel(
+        kernel_, offset_, global_, local_, events, &event);
+
+    return event;
+}
+
+#undef __ERR_STR
+#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
+#undef __GET_DEVICE_INFO_ERR
+#undef __GET_PLATFORM_INFO_ERR
+#undef __GET_DEVICE_IDS_ERR
+#undef __GET_CONTEXT_INFO_ERR
+#undef __GET_EVENT_INFO_ERR
+#undef __GET_EVENT_PROFILE_INFO_ERR
+#undef __GET_MEM_OBJECT_INFO_ERR
+#undef __GET_IMAGE_INFO_ERR
+#undef __GET_SAMPLER_INFO_ERR
+#undef __GET_KERNEL_INFO_ERR
+#undef __GET_KERNEL_WORK_GROUP_INFO_ERR
+#undef __GET_PROGRAM_INFO_ERR
+#undef __GET_PROGRAM_BUILD_INFO_ERR
+#undef __GET_COMMAND_QUEUE_INFO_ERR
+
+#undef __CREATE_CONTEXT_FROM_TYPE_ERR
+#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR
+
+#undef __CREATE_BUFFER_ERR
+#undef __CREATE_SUBBUFFER_ERR
+#undef __CREATE_IMAGE2D_ERR
+#undef __CREATE_IMAGE3D_ERR
+#undef __CREATE_SAMPLER_ERR
+#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR
+
+#undef __CREATE_USER_EVENT_ERR
+#undef __SET_USER_EVENT_STATUS_ERR
+#undef __SET_EVENT_CALLBACK_ERR
+
+#undef __WAIT_FOR_EVENTS_ERR
+
+#undef __CREATE_KERNEL_ERR
+#undef __SET_KERNEL_ARGS_ERR
+#undef __CREATE_PROGRAM_WITH_SOURCE_ERR
+#undef __CREATE_PROGRAM_WITH_BINARY_ERR
+#undef __BUILD_PROGRAM_ERR
+#undef __CREATE_KERNELS_IN_PROGRAM_ERR
+
+#undef __CREATE_COMMAND_QUEUE_ERR
+#undef __SET_COMMAND_QUEUE_PROPERTY_ERR
+#undef __ENQUEUE_READ_BUFFER_ERR
+#undef __ENQUEUE_WRITE_BUFFER_ERR
+#undef __ENQUEUE_READ_BUFFER_RECT_ERR
+#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR
+#undef __ENQEUE_COPY_BUFFER_ERR
+#undef __ENQEUE_COPY_BUFFER_RECT_ERR
+#undef __ENQUEUE_READ_IMAGE_ERR
+#undef __ENQUEUE_WRITE_IMAGE_ERR
+#undef __ENQUEUE_COPY_IMAGE_ERR
+#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR
+#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR
+#undef __ENQUEUE_MAP_BUFFER_ERR
+#undef __ENQUEUE_MAP_IMAGE_ERR
+#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR
+#undef __ENQUEUE_NDRANGE_KERNEL_ERR
+#undef __ENQUEUE_TASK_ERR
+#undef __ENQUEUE_NATIVE_KERNEL
+
+#undef __UNLOAD_COMPILER_ERR
+#endif //__CL_USER_OVERRIDE_ERROR_STRINGS
+
+#undef __GET_INFO_HELPER_WITH_RETAIN
+
+// Extensions
+#undef __INIT_CL_EXT_FCN_PTR
+#undef __CREATE_SUB_DEVICES
+
+#if defined(USE_CL_DEVICE_FISSION)
+#undef __PARAM_NAME_DEVICE_FISSION
+#endif // USE_CL_DEVICE_FISSION
+
+} // namespace cl
+
+#endif // CL_HPP_
diff --git a/include/CL/cl_d3d10.h b/include/CL/cl_d3d10.h
new file mode 100644
index 0000000..2efb934
--- /dev/null
+++ b/include/CL/cl_d3d10.h
@@ -0,0 +1,126 @@
+/******************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_CL_D3D10_H
+#define __OPENCL_CL_D3D10_H
+
+#include <d3d10.h>
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_khr_d3d10_sharing */
+#define cl_khr_d3d10_sharing 1
+
+typedef cl_uint cl_d3d10_device_source_khr;
+typedef cl_uint cl_d3d10_device_set_khr;
+
+/******************************************************************************/
+
+// Error Codes
+#define CL_INVALID_D3D10_DEVICE_KHR -1002
+#define CL_INVALID_D3D10_RESOURCE_KHR -1003
+#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004
+#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005
+
+// cl_d3d10_device_source_khr
+#define CL_D3D10_DEVICE_KHR 0x4010
+#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011
+
+// cl_d3d10_device_set_khr
+#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012
+#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013
+
+// cl_context_info
+#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014
+#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
+
+// cl_mem_info
+#define CL_MEM_D3D10_RESOURCE_KHR 0x4015
+
+// cl_image_info
+#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016
+
+// cl_command_type
+#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017
+#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
+ cl_platform_id platform,
+ cl_d3d10_device_source_khr d3d_device_source,
+ void * d3d_object,
+ cl_d3d10_device_set_khr d3d_device_set,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D10Buffer * resource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D10Texture2D * resource,
+ UINT subresource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D10Texture3D * resource,
+ UINT subresource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects, /* read-only list, same as the Acquire typedef */
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_0;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __OPENCL_CL_D3D10_H
+
diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h
new file mode 100644
index 0000000..a572590
--- /dev/null
+++ b/include/CL/cl_ext.h
@@ -0,0 +1,228 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */
+
+/* cl_ext.h contains OpenCL extensions which don't have external */
+/* (OpenGL, D3D) dependencies. */
+
+#ifndef __CL_EXT_H
+#define __CL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+ #include <OpenCL/cl.h>
+ #include <AvailabilityMacros.h>
+#else
+ #include <CL/cl.h>
+#endif
+
+/* cl_khr_fp64 extension - no extension #define since it has no functions */
+#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
+
+/* cl_khr_fp16 extension - no extension #define since it has no functions */
+#define CL_DEVICE_HALF_FP_CONFIG 0x1033
+
+/* Memory object destruction
+ *
+ * Apple extension for managing externally allocated buffers used with cl_mem objects created with CL_MEM_USE_HOST_PTR
+ *
+ * Registers a user callback function that will be called when the memory object is deleted and its resources
+ * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
+ * stack associated with memobj. The registered user callback functions are called in the reverse order in
+ * which they were registered. The user callback functions are called and then the memory object is deleted
+ * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
+ * notified when the memory referenced by host_ptr, specified when the memory object is created and used as
+ * the storage bits for the memory object, can be reused or freed.
+ *
+ * The application may not call CL APIs with the cl_mem object passed to the pfn_notify.
+ *
+ * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ */
+#define cl_APPLE_SetMemObjectDestructor 1
+cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */,
+ void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+ void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
+
+/* Context Logging Functions
+ *
+ * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
+ * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ *
+ * clLogMessagesToSystemLog forwards all log messages to the Apple System Logger
+ */
+#define cl_APPLE_ContextLoggingFunctions 1
+extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */,
+ const void * /* private_info */,
+ size_t /* cb */,
+ void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
+extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */,
+ const void * /* private_info */,
+ size_t /* cb */,
+ void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
+extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */,
+ const void * /* private_info */,
+ size_t /* cb */,
+ void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
+
+/************************
+* cl_khr_icd extension *
+************************/
+#define cl_khr_icd 0
+
+/* cl_platform_info */
+#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920
+
+/* Additional Error Codes */
+#define CL_PLATFORM_NOT_FOUND_KHR -1001
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clIcdGetPlatformIDsKHR(cl_uint /* num_entries */,
+ cl_platform_id * /* platforms */,
+ cl_uint * /* num_platforms */);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
+ cl_uint /* num_entries */,
+ cl_platform_id * /* platforms */,
+ cl_uint * /* num_platforms */);
+
+
+/******************************************
+* cl_nv_device_attribute_query extension *
+******************************************/
+/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
+#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
+#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
+#define CL_DEVICE_WARP_SIZE_NV 0x4003
+#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
+#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
+#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
+
+
+/*********************************
+* cl_amd_device_attribute_query *
+*********************************/
+#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036
+
+/*********************************
+ * TI extensions
+*********************************/
+#define CL_DEVICE_MSMC_MEM_SIZE_TI 0x4060
+#define CL_DEVICE_GLOBAL_EXT1_MEM_SIZE_TI 0x4061
+#define CL_DEVICE_GLOBAL_EXT2_MEM_SIZE_TI 0x4062
+
+#define CL_DEVICE_GLOBAL_MEM_MAX_ALLOC_TI 0x4063
+#define CL_DEVICE_GLOBAL_EXT1_MEM_MAX_ALLOC_TI 0x4064
+#define CL_DEVICE_GLOBAL_EXT2_MEM_MAX_ALLOC_TI 0x4065
+#define CL_DEVICE_MSMC_MEM_MAX_ALLOC_TI 0x4066
+#define CL_DEVICE_LOCAL_MEM_MAX_ALLOC_TI 0x4067
+
+#define CL_MEM_USE_MSMC_TI (1 << 20)
+
+
+#ifdef CL_VERSION_1_1
+ /***********************************
+ * cl_ext_device_fission extension *
+ ***********************************/
+ #define cl_ext_device_fission 1
+
+ extern CL_API_ENTRY cl_int CL_API_CALL
+ clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef CL_API_ENTRY cl_int
+ (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ extern CL_API_ENTRY cl_int CL_API_CALL
+ clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef CL_API_ENTRY cl_int
+ (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef cl_ulong cl_device_partition_property_ext;
+ extern CL_API_ENTRY cl_int CL_API_CALL
+ clCreateSubDevicesEXT( cl_device_id /*in_device*/,
+ const cl_device_partition_property_ext * /* properties */,
+ cl_uint /*num_entries*/,
+ cl_device_id * /*out_devices*/,
+ cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef CL_API_ENTRY cl_int
+ ( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/,
+ const cl_device_partition_property_ext * /* properties */,
+ cl_uint /*num_entries*/,
+ cl_device_id * /*out_devices*/,
+ cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ /* cl_device_partition_property_ext */
+ #define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050
+ #define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051
+ #define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052
+ #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053
+
+ /* clGetDeviceInfo selectors */
+ #define CL_DEVICE_PARENT_DEVICE_EXT 0x4054
+ #define CL_DEVICE_PARTITION_TYPES_EXT 0x4055
+ #define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056
+ #define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057
+ #define CL_DEVICE_PARTITION_STYLE_EXT 0x4058
+
+ /* error codes */
+ #define CL_DEVICE_PARTITION_FAILED_EXT -1057
+ #define CL_INVALID_PARTITION_COUNT_EXT -1058
+ #define CL_INVALID_PARTITION_NAME_EXT -1059
+
+ /* CL_AFFINITY_DOMAINs */
+ #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1
+ #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2
+ #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3
+ #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4
+ #define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10
+ #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100
+
+ /* cl_device_partition_property_ext list terminators */
+ #define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0)
+ #define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0)
+ #define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1)
+
+
+
+#endif /* CL_VERSION_1_1 */
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* __CL_EXT_H */
diff --git a/include/CL/cl_gl.h b/include/CL/cl_gl.h
new file mode 100644
index 0000000..e22fa7d
--- /dev/null
+++ b/include/CL/cl_gl.h
@@ -0,0 +1,155 @@
+/******************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+/*
+ * cl_gl.h contains Khronos-approved (KHR) OpenCL extensions which have
+ * OpenGL dependencies. The application is responsible for #including
+ * OpenGL or OpenGL ES headers before #including cl_gl.h.
+ */
+
+#ifndef __OPENCL_CL_GL_H
+#define __OPENCL_CL_GL_H
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#include <OpenGL/CGLDevice.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef cl_uint cl_gl_object_type;
+typedef cl_uint cl_gl_texture_info;
+typedef cl_uint cl_gl_platform_info;
+typedef struct __GLsync *cl_GLsync;
+
+/* cl_gl_object_type */
+#define CL_GL_OBJECT_BUFFER 0x2000
+#define CL_GL_OBJECT_TEXTURE2D 0x2001
+#define CL_GL_OBJECT_TEXTURE3D 0x2002
+#define CL_GL_OBJECT_RENDERBUFFER 0x2003
+
+/* cl_gl_texture_info */
+#define CL_GL_TEXTURE_TARGET 0x2004
+#define CL_GL_MIPMAP_LEVEL 0x2005
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLBuffer(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLuint /* bufobj */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLTexture2D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLenum /* target */,
+ cl_GLint /* miplevel */,
+ cl_GLuint /* texture */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLTexture3D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLenum /* target */,
+ cl_GLint /* miplevel */,
+ cl_GLuint /* texture */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLRenderbuffer(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLuint /* renderbuffer */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLObjectInfo(cl_mem /* memobj */,
+ cl_gl_object_type * /* gl_object_type */,
+ cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLTextureInfo(cl_mem /* memobj */,
+ cl_gl_texture_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */,
+ cl_uint /* num_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */,
+ cl_uint /* num_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+/* cl_khr_gl_sharing extension */
+
+#define cl_khr_gl_sharing 1
+
+typedef cl_uint cl_gl_context_info;
+
+/* Additional Error Codes */
+#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
+
+/* cl_gl_context_info */
+#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
+#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
+
+/* Additional cl_context_properties */
+#define CL_GL_CONTEXT_KHR 0x2008
+#define CL_EGL_DISPLAY_KHR 0x2009
+#define CL_GLX_DISPLAY_KHR 0x200A
+#define CL_WGL_HDC_KHR 0x200B
+#define CL_CGL_SHAREGROUP_KHR 0x200C
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
+ cl_gl_context_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
+ const cl_context_properties * properties,
+ cl_gl_context_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_GL_H */
diff --git a/include/CL/cl_gl_ext.h b/include/CL/cl_gl_ext.h
new file mode 100644
index 0000000..2f1ef47
--- /dev/null
+++ b/include/CL/cl_gl_ext.h
@@ -0,0 +1,69 @@
+/******************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */
+/* OpenGL dependencies. */
+
+#ifndef __OPENCL_CL_GL_EXT_H
+#define __OPENCL_CL_GL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+ #include <OpenCL/cl_gl.h>
+#else
+ #include <CL/cl_gl.h>
+#endif
+
+/*
+ * For each extension, follow this template
+ * // cl_VEN_extname extension
+ * #define cl_VEN_extname 1
+ * ... define new types, if any
+ * ... define new tokens, if any
+ * ... define new APIs, if any
+ *
+ * If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header
+ * This allows us to avoid having to decide whether to include GL headers or GLES here.
+ */
+
+/*
+ * cl_khr_gl_event extension
+ * See section 9.9 in the OpenCL 1.1 spec for more information
+ */
+#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromGLsyncKHR(cl_context /* context */,
+ cl_GLsync /* cl_GLsync */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_GL_EXT_H */
diff --git a/include/CL/cl_platform.h b/include/CL/cl_platform.h
new file mode 100644
index 0000000..52b79f3
--- /dev/null
+++ b/include/CL/cl_platform.h
@@ -0,0 +1,1198 @@
+/******************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */
+
+#ifndef __CL_PLATFORM_H
+#define __CL_PLATFORM_H
+
+#ifdef __APPLE__
+ /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */
+ #include <AvailabilityMacros.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_WIN32)
+ #define CL_API_ENTRY
+ #define CL_API_CALL __stdcall
+ #define CL_CALLBACK __stdcall
+#else
+ #define CL_API_ENTRY
+ #define CL_API_CALL
+ #define CL_CALLBACK
+#endif
+
+#ifdef __APPLE__
+ #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import))
+ #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+ #define CL_API_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK
+ #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+#else
+ #define CL_EXTENSION_WEAK_LINK
+ #define CL_API_SUFFIX__VERSION_1_0
+ #define CL_EXT_SUFFIX__VERSION_1_0
+ #define CL_API_SUFFIX__VERSION_1_1
+ #define CL_EXT_SUFFIX__VERSION_1_1
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+#endif
+
+#if (defined (_WIN32) && defined(_MSC_VER))
+
+/* scalar types */
+typedef signed __int8 cl_char;
+typedef unsigned __int8 cl_uchar;
+typedef signed __int16 cl_short;
+typedef unsigned __int16 cl_ushort;
+typedef signed __int32 cl_int;
+typedef unsigned __int32 cl_uint;
+typedef signed __int64 cl_long;
+typedef unsigned __int64 cl_ulong;
+
+typedef unsigned __int16 cl_half;
+typedef float cl_float;
+typedef double cl_double;
+
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT 8
+#define CL_SCHAR_MAX 127
+#define CL_SCHAR_MIN (-127-1)
+#define CL_CHAR_MAX CL_SCHAR_MAX
+#define CL_CHAR_MIN CL_SCHAR_MIN
+#define CL_UCHAR_MAX 255
+#define CL_SHRT_MAX 32767
+#define CL_SHRT_MIN (-32767-1)
+#define CL_USHRT_MAX 65535
+#define CL_INT_MAX 2147483647
+#define CL_INT_MIN (-2147483647-1)
+#define CL_UINT_MAX 0xffffffffU
+#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+
+#define CL_FLT_DIG 6
+#define CL_FLT_MANT_DIG 24
+#define CL_FLT_MAX_10_EXP +38
+#define CL_FLT_MAX_EXP +128
+#define CL_FLT_MIN_10_EXP -37
+#define CL_FLT_MIN_EXP -125
+#define CL_FLT_RADIX 2
+#define CL_FLT_MAX 340282346638528859811704183484516925440.0f
+#define CL_FLT_MIN 1.175494350822287507969e-38f
+#define CL_FLT_EPSILON 1.1920928955078125e-7f /* 2^-23, decimal like the other constants in this branch */
+
+#define CL_DBL_DIG 15
+#define CL_DBL_MANT_DIG 53
+#define CL_DBL_MAX_10_EXP +308
+#define CL_DBL_MAX_EXP +1024
+#define CL_DBL_MIN_10_EXP -307
+#define CL_DBL_MIN_EXP -1021
+#define CL_DBL_RADIX 2
+#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
+#define CL_DBL_MIN 2.225073858507201383090e-308
+#define CL_DBL_EPSILON 2.220446049250313080847e-16
+
+#define CL_M_E 2.718281828459045090796
+#define CL_M_LOG2E 1.442695040888963387005
+#define CL_M_LOG10E 0.434294481903251816668
+#define CL_M_LN2 0.693147180559945286227
+#define CL_M_LN10 2.302585092994045901094
+#define CL_M_PI 3.141592653589793115998
+#define CL_M_PI_2 1.570796326794896557999
+#define CL_M_PI_4 0.785398163397448278999
+#define CL_M_1_PI 0.318309886183790691216
+#define CL_M_2_PI 0.636619772367581382433
+#define CL_M_2_SQRTPI 1.128379167095512558561
+#define CL_M_SQRT2 1.414213562373095145475
+#define CL_M_SQRT1_2 0.707106781186547572737
+
+#define CL_M_E_F 2.71828174591064f
+#define CL_M_LOG2E_F 1.44269502162933f
+#define CL_M_LOG10E_F 0.43429449200630f
+#define CL_M_LN2_F 0.69314718246460f
+#define CL_M_LN10_F 2.30258512496948f
+#define CL_M_PI_F 3.14159274101257f
+#define CL_M_PI_2_F 1.57079637050629f
+#define CL_M_PI_4_F 0.78539818525314f
+#define CL_M_1_PI_F 0.31830987334251f
+#define CL_M_2_PI_F 0.63661974668503f
+#define CL_M_2_SQRTPI_F 1.12837922573090f
+#define CL_M_SQRT2_F 1.41421353816986f
+#define CL_M_SQRT1_2_F 0.70710676908493f
+
+#define CL_NAN (CL_INFINITY - CL_INFINITY)
+#define CL_HUGE_VALF ((cl_float) 1e50)
+#define CL_HUGE_VAL ((cl_double) 1e500)
+#define CL_MAXFLOAT CL_FLT_MAX
+#define CL_INFINITY CL_HUGE_VALF
+
+#else
+
+#include <stdint.h>
+
+/* scalar types */
+typedef int8_t cl_char;
+typedef uint8_t cl_uchar;
+typedef int16_t cl_short __attribute__((aligned(2)));
+typedef uint16_t cl_ushort __attribute__((aligned(2)));
+typedef int32_t cl_int __attribute__((aligned(4)));
+typedef uint32_t cl_uint __attribute__((aligned(4)));
+typedef int64_t cl_long __attribute__((aligned(8)));
+typedef uint64_t cl_ulong __attribute__((aligned(8)));
+
+typedef uint16_t cl_half __attribute__((aligned(2)));
+typedef float cl_float __attribute__((aligned(4)));
+typedef double cl_double __attribute__((aligned(8)));
+
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT 8
+#define CL_SCHAR_MAX 127
+#define CL_SCHAR_MIN (-127-1)
+#define CL_CHAR_MAX CL_SCHAR_MAX
+#define CL_CHAR_MIN CL_SCHAR_MIN
+#define CL_UCHAR_MAX 255
+#define CL_SHRT_MAX 32767
+#define CL_SHRT_MIN (-32767-1)
+#define CL_USHRT_MAX 65535
+#define CL_INT_MAX 2147483647
+#define CL_INT_MIN (-2147483647-1)
+#define CL_UINT_MAX 0xffffffffU
+#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+
+#define CL_FLT_DIG 6
+#define CL_FLT_MANT_DIG 24
+#define CL_FLT_MAX_10_EXP +38
+#define CL_FLT_MAX_EXP +128
+#define CL_FLT_MIN_10_EXP -37
+#define CL_FLT_MIN_EXP -125
+#define CL_FLT_RADIX 2
+#define CL_FLT_MAX 0x1.fffffep127f
+#define CL_FLT_MIN 0x1.0p-126f
+#define CL_FLT_EPSILON 0x1.0p-23f
+
+#define CL_DBL_DIG 15
+#define CL_DBL_MANT_DIG 53
+#define CL_DBL_MAX_10_EXP +308
+#define CL_DBL_MAX_EXP +1024
+#define CL_DBL_MIN_10_EXP -307
+#define CL_DBL_MIN_EXP -1021
+#define CL_DBL_RADIX 2
+#define CL_DBL_MAX 0x1.fffffffffffffp1023
+#define CL_DBL_MIN 0x1.0p-1022
+#define CL_DBL_EPSILON 0x1.0p-52
+
+#define CL_M_E 2.718281828459045090796
+#define CL_M_LOG2E 1.442695040888963387005
+#define CL_M_LOG10E 0.434294481903251816668
+#define CL_M_LN2 0.693147180559945286227
+#define CL_M_LN10 2.302585092994045901094
+#define CL_M_PI 3.141592653589793115998
+#define CL_M_PI_2 1.570796326794896557999
+#define CL_M_PI_4 0.785398163397448278999
+#define CL_M_1_PI 0.318309886183790691216
+#define CL_M_2_PI 0.636619772367581382433
+#define CL_M_2_SQRTPI 1.128379167095512558561
+#define CL_M_SQRT2 1.414213562373095145475
+#define CL_M_SQRT1_2 0.707106781186547572737
+
+#define CL_M_E_F 2.71828174591064f
+#define CL_M_LOG2E_F 1.44269502162933f
+#define CL_M_LOG10E_F 0.43429449200630f
+#define CL_M_LN2_F 0.69314718246460f
+#define CL_M_LN10_F 2.30258512496948f
+#define CL_M_PI_F 3.14159274101257f
+#define CL_M_PI_2_F 1.57079637050629f
+#define CL_M_PI_4_F 0.78539818525314f
+#define CL_M_1_PI_F 0.31830987334251f
+#define CL_M_2_PI_F 0.63661974668503f
+#define CL_M_2_SQRTPI_F 1.12837922573090f
+#define CL_M_SQRT2_F 1.41421353816986f
+#define CL_M_SQRT1_2_F 0.70710676908493f
+
+#if defined( __GNUC__ )
+ #define CL_HUGE_VALF __builtin_huge_valf()
+ #define CL_HUGE_VAL __builtin_huge_val()
+ #define CL_NAN __builtin_nanf( "" )
+#else
+ #define CL_HUGE_VALF ((cl_float) 1e50)
+ #define CL_HUGE_VAL ((cl_double) 1e500)
+ float nanf( const char * );
+ #define CL_NAN nanf( "" )
+#endif
+#define CL_MAXFLOAT CL_FLT_MAX
+#define CL_INFINITY CL_HUGE_VALF
+
+#endif
+
+#include <stddef.h>
+
+/* Mirror types to GL types. Mirror types allow us to avoid deciding which headers to load based on whether we are using GL or GLES here. */
+typedef unsigned int cl_GLuint;
+typedef int cl_GLint;
+typedef unsigned int cl_GLenum;
+
+/*
+ * Vector types
+ *
+ * Note: OpenCL requires that all types be naturally aligned.
+ * This means that vector types must be naturally aligned.
+ * For example, a vector of four floats must be aligned to
+ * a 16 byte boundary (calculated as 4 * the natural 4-byte
+ * alignment of the float). The alignment qualifiers here
+ * will only function properly if your compiler supports them
+ * and if you don't actively work to defeat them. For example,
+ * in order for a cl_float4 to be 16 byte aligned in a struct,
+ * the start of the struct must itself be 16-byte aligned.
+ *
+ * Maintaining proper alignment is the user's responsibility.
+ */
+
+/* Define basic vector types: each instruction-set section declares native __cl_<type><n> types plus a matching __CL_<TYPE><N>__ macro that the cl_<type><n> unions test before adding a native member */
+#if defined( __VEC__ ) /* AltiVec */
+ #include <altivec.h> /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
+ typedef vector unsigned char __cl_uchar16;
+ typedef vector signed char __cl_char16;
+ typedef vector unsigned short __cl_ushort8;
+ typedef vector signed short __cl_short8;
+ typedef vector unsigned int __cl_uint4;
+ typedef vector signed int __cl_int4;
+ typedef vector float __cl_float4;
+ #define __CL_UCHAR16__ 1 /* one availability flag per typedef above */
+ #define __CL_CHAR16__ 1
+ #define __CL_USHORT8__ 1
+ #define __CL_SHORT8__ 1
+ #define __CL_UINT4__ 1
+ #define __CL_INT4__ 1
+ #define __CL_FLOAT4__ 1
+#endif /* __VEC__ */
+
+#if defined( __SSE__ ) /* SSE: provides the 4 x float native type */
+ #if defined( __MINGW64__ ) /* MinGW-w64 takes the intrinsics from <intrin.h> */
+ #include <intrin.h>
+ #else
+ #include <xmmintrin.h>
+ #endif /* __MINGW64__ */
+ #if defined( __GNUC__ ) /* GNU C: use the vector_size extension */
+ typedef float __cl_float4 __attribute__((vector_size(16)));
+ #else /* other compilers: reuse the intrinsic header's __m128 */
+ typedef __m128 __cl_float4;
+ #endif /* __GNUC__ */
+ #define __CL_FLOAT4__ 1 /* tells the cl_float unions that __cl_float4 exists */
+#endif /* __SSE__ */
+
+#if defined( __SSE2__ ) /* SSE2: provides the 16-byte integer and 2 x double native types */
+ #if defined( __MINGW64__ ) /* MinGW-w64 takes the intrinsics from <intrin.h> */
+ #include <intrin.h>
+ #else
+ #include <emmintrin.h>
+ #endif /* __MINGW64__ */
+ #if defined( __GNUC__ ) /* GNU C: element-typed vectors via vector_size */
+ typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16)));
+ typedef cl_char __cl_char16 __attribute__((vector_size(16)));
+ typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16)));
+ typedef cl_short __cl_short8 __attribute__((vector_size(16)));
+ typedef cl_uint __cl_uint4 __attribute__((vector_size(16)));
+ typedef cl_int __cl_int4 __attribute__((vector_size(16)));
+ typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16)));
+ typedef cl_long __cl_long2 __attribute__((vector_size(16)));
+ typedef cl_double __cl_double2 __attribute__((vector_size(16)));
+ #else /* other compilers: every integer width maps to __m128i, double to __m128d */
+ typedef __m128i __cl_uchar16;
+ typedef __m128i __cl_char16;
+ typedef __m128i __cl_ushort8;
+ typedef __m128i __cl_short8;
+ typedef __m128i __cl_uint4;
+ typedef __m128i __cl_int4;
+ typedef __m128i __cl_ulong2;
+ typedef __m128i __cl_long2;
+ typedef __m128d __cl_double2;
+ #endif /* __GNUC__ */
+ #define __CL_UCHAR16__ 1 /* one availability flag per typedef above */
+ #define __CL_CHAR16__ 1
+ #define __CL_USHORT8__ 1
+ #define __CL_SHORT8__ 1
+ #define __CL_INT4__ 1
+ #define __CL_UINT4__ 1
+ #define __CL_ULONG2__ 1
+ #define __CL_LONG2__ 1
+ #define __CL_DOUBLE2__ 1
+#endif /* __SSE2__ */
+
+#if defined( __MMX__ ) /* MMX: provides the 8-byte native types */
+ #include <mmintrin.h>
+ #if defined( __GNUC__ ) /* GNU C: element-typed vectors via vector_size */
+ typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8)));
+ typedef cl_char __cl_char8 __attribute__((vector_size(8)));
+ typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8)));
+ typedef cl_short __cl_short4 __attribute__((vector_size(8)));
+ typedef cl_uint __cl_uint2 __attribute__((vector_size(8)));
+ typedef cl_int __cl_int2 __attribute__((vector_size(8)));
+ typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8)));
+ typedef cl_long __cl_long1 __attribute__((vector_size(8)));
+ typedef cl_float __cl_float2 __attribute__((vector_size(8)));
+ #else /* other compilers: every type maps to __m64 */
+ typedef __m64 __cl_uchar8;
+ typedef __m64 __cl_char8;
+ typedef __m64 __cl_ushort4;
+ typedef __m64 __cl_short4;
+ typedef __m64 __cl_uint2;
+ typedef __m64 __cl_int2;
+ typedef __m64 __cl_ulong1;
+ typedef __m64 __cl_long1;
+ typedef __m64 __cl_float2;
+ #endif /* __GNUC__ */
+ #define __CL_UCHAR8__ 1 /* one availability flag per typedef above */
+ #define __CL_CHAR8__ 1
+ #define __CL_USHORT4__ 1
+ #define __CL_SHORT4__ 1
+ #define __CL_INT2__ 1
+ #define __CL_UINT2__ 1
+ #define __CL_ULONG1__ 1
+ #define __CL_LONG1__ 1
+ #define __CL_FLOAT2__ 1
+#endif /* __MMX__ */
+
+#if defined( __AVX__ ) /* AVX: provides the 32-byte float and double native types */
+ #if defined( __MINGW64__ ) /* MinGW-w64 takes the intrinsics from <intrin.h> */
+ #include <intrin.h>
+ #else
+ #include <immintrin.h>
+ #endif /* __MINGW64__ */
+ #if defined( __GNUC__ ) /* GNU C: element-typed vectors via vector_size */
+ typedef cl_float __cl_float8 __attribute__((vector_size(32)));
+ typedef cl_double __cl_double4 __attribute__((vector_size(32)));
+ #else /* other compilers: reuse __m256 / __m256d */
+ typedef __m256 __cl_float8;
+ typedef __m256d __cl_double4;
+ #endif /* __GNUC__ */
+ #define __CL_FLOAT8__ 1 /* one availability flag per typedef above */
+ #define __CL_DOUBLE4__ 1
+#endif /* __AVX__ */
+
+/* Define alignment keys: CL_ALIGNED(n) requests n-byte alignment where the compiler branch below supports it and expands to nothing otherwise */
+#if defined( __GNUC__ )
+ #define CL_ALIGNED(_x) __attribute__ ((aligned(_x)))
+#elif defined( _WIN32) && (_MSC_VER)
+ /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */
+ /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */
+ /* #include <crtdefs.h> */
+ /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */
+ #define CL_ALIGNED(_x) /* intentionally empty: no alignment is applied on MSVC */
+#else
+ #warning Need to implement some method to align data here
+ #define CL_ALIGNED(_x) /* empty fallback: no alignment is applied on unrecognised compilers */
+#endif /* alignment keys */
+
+/* Indicate whether .xyzw, .s0123 and .hi.lo are supported; the unions below add those members under this same compiler condition */
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ /* .xyzw and .s0123...{f|F} are supported */
+ #define CL_HAS_NAMED_VECTOR_FIELDS 1
+ /* .hi and .lo are supported */
+ #define CL_HAS_HI_LO_VECTOR_FIELDS 1
+#endif /* __GNUC__ && !__STRICT_ANSI__ */
+
+/* Define cl_vector types */
+
+/* ---- cl_charn ---- */
+typedef union /* 2 x cl_char, CL_ALIGNED(2): s[] is always present, every other member is an optional overlay */
+{
+ cl_char CL_ALIGNED(2) s[2];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_char x; cl_char y; };
+ __extension__ struct { cl_char s0; cl_char s1; };
+ __extension__ struct { cl_char lo; cl_char hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_CHAR2__
+ __cl_char2 v2;
+#endif /* __CL_CHAR2__ */
+} cl_char2;
+
+typedef union /* 4 x cl_char, CL_ALIGNED(4): s[] is always present, every other member is an optional overlay */
+{
+ cl_char CL_ALIGNED(4) s[4];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_char x, y, z, w; };
+ __extension__ struct { cl_char s0, s1, s2, s3; };
+ __extension__ struct { cl_char2 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_CHAR2__
+ __cl_char2 v2[2];
+#endif /* __CL_CHAR2__ */
+#ifdef __CL_CHAR4__
+ __cl_char4 v4;
+#endif /* __CL_CHAR4__ */
+} cl_char4;
+
+/* A 3-element vector has the size, alignment and behavior of the 4-element one (see section 6.1.5), so cl_char3 is an alias of cl_char4. */
+typedef cl_char4 cl_char3;
+
+typedef union /* 8 x cl_char, CL_ALIGNED(8): s[] is always present, every other member is an optional overlay */
+{
+ cl_char CL_ALIGNED(8) s[8];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_char x, y, z, w; };
+ __extension__ struct { cl_char s0, s1, s2, s3; cl_char s4, s5, s6, s7; };
+ __extension__ struct { cl_char4 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_CHAR2__
+ __cl_char2 v2[4];
+#endif /* __CL_CHAR2__ */
+#ifdef __CL_CHAR4__
+ __cl_char4 v4[2];
+#endif /* __CL_CHAR4__ */
+#ifdef __CL_CHAR8__
+ __cl_char8 v8;
+#endif /* __CL_CHAR8__ */
+} cl_char8;
+
+typedef union /* 16 x cl_char, CL_ALIGNED(16): s[] is always present, the rest are optional overlays (sa..sf and sA..sF both name lanes 10-15) */
+{
+ cl_char CL_ALIGNED(16) s[16];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_char x, y, z, w; cl_char __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9; cl_char sa, sb, sc, sd, se, sf; };
+ __extension__ struct { cl_char s0, s1, s2, s3, s4, s5, s6, s7; cl_char s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct { cl_char8 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_CHAR2__
+ __cl_char2 v2[8];
+#endif /* __CL_CHAR2__ */
+#ifdef __CL_CHAR4__
+ __cl_char4 v4[4];
+#endif /* __CL_CHAR4__ */
+#ifdef __CL_CHAR8__
+ __cl_char8 v8[2];
+#endif /* __CL_CHAR8__ */
+#ifdef __CL_CHAR16__
+ __cl_char16 v16;
+#endif /* __CL_CHAR16__ */
+} cl_char16;
+
+
+/* ---- cl_ucharn ---- */
+typedef union /* 2 x cl_uchar, CL_ALIGNED(2): s[] is always present, every other member is an optional overlay */
+{
+ cl_uchar CL_ALIGNED(2) s[2];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_uchar x; cl_uchar y; };
+ __extension__ struct { cl_uchar s0; cl_uchar s1; };
+ __extension__ struct { cl_uchar lo; cl_uchar hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_UCHAR2__ /* upper-case flag, the same one the v2[] members of cl_uchar4/8/16 test */
+ __cl_uchar2 v2;
+#endif /* __CL_UCHAR2__ */
+} cl_uchar2;
+
+typedef union /* 4 x cl_uchar, CL_ALIGNED(4): s[] is always present, every other member is an optional overlay */
+{
+ cl_uchar CL_ALIGNED(4) s[4];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_uchar x, y, z, w; };
+ __extension__ struct { cl_uchar s0, s1, s2, s3; };
+ __extension__ struct { cl_uchar2 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_UCHAR2__
+ __cl_uchar2 v2[2];
+#endif /* __CL_UCHAR2__ */
+#ifdef __CL_UCHAR4__
+ __cl_uchar4 v4;
+#endif /* __CL_UCHAR4__ */
+} cl_uchar4;
+
+/* A 3-element vector has the size, alignment and behavior of the 4-element one (see section 6.1.5), so cl_uchar3 is an alias of cl_uchar4. */
+typedef cl_uchar4 cl_uchar3;
+
+typedef union /* 8 x cl_uchar, CL_ALIGNED(8): s[] is always present, every other member is an optional overlay */
+{
+ cl_uchar CL_ALIGNED(8) s[8];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_uchar x, y, z, w; };
+ __extension__ struct { cl_uchar s0, s1, s2, s3; cl_uchar s4, s5, s6, s7; };
+ __extension__ struct { cl_uchar4 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_UCHAR2__
+ __cl_uchar2 v2[4];
+#endif /* __CL_UCHAR2__ */
+#ifdef __CL_UCHAR4__
+ __cl_uchar4 v4[2];
+#endif /* __CL_UCHAR4__ */
+#ifdef __CL_UCHAR8__
+ __cl_uchar8 v8;
+#endif /* __CL_UCHAR8__ */
+} cl_uchar8;
+
+typedef union /* 16 x cl_uchar, CL_ALIGNED(16): s[] is always present, the rest are optional overlays (sa..sf and sA..sF both name lanes 10-15) */
+{
+ cl_uchar CL_ALIGNED(16) s[16];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_uchar x, y, z, w; cl_uchar __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9; cl_uchar sa, sb, sc, sd, se, sf; };
+ __extension__ struct { cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; cl_uchar s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct { cl_uchar8 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_UCHAR2__
+ __cl_uchar2 v2[8];
+#endif /* __CL_UCHAR2__ */
+#ifdef __CL_UCHAR4__
+ __cl_uchar4 v4[4];
+#endif /* __CL_UCHAR4__ */
+#ifdef __CL_UCHAR8__
+ __cl_uchar8 v8[2];
+#endif /* __CL_UCHAR8__ */
+#ifdef __CL_UCHAR16__
+ __cl_uchar16 v16;
+#endif /* __CL_UCHAR16__ */
+} cl_uchar16;
+
+
+/* ---- cl_shortn ---- */
+typedef union /* 2 x cl_short, CL_ALIGNED(4): s[] is always present, every other member is an optional overlay */
+{
+ cl_short CL_ALIGNED(4) s[2];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_short x; cl_short y; };
+ __extension__ struct { cl_short s0; cl_short s1; };
+ __extension__ struct { cl_short lo; cl_short hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_SHORT2__
+ __cl_short2 v2;
+#endif /* __CL_SHORT2__ */
+} cl_short2;
+
+typedef union /* 4 x cl_short, CL_ALIGNED(8): s[] is always present, every other member is an optional overlay */
+{
+ cl_short CL_ALIGNED(8) s[4];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_short x, y, z, w; };
+ __extension__ struct { cl_short s0, s1, s2, s3; };
+ __extension__ struct { cl_short2 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_SHORT2__
+ __cl_short2 v2[2];
+#endif /* __CL_SHORT2__ */
+#ifdef __CL_SHORT4__
+ __cl_short4 v4;
+#endif /* __CL_SHORT4__ */
+} cl_short4;
+
+/* A 3-element vector has the size, alignment and behavior of the 4-element one (see section 6.1.5), so cl_short3 is an alias of cl_short4. */
+typedef cl_short4 cl_short3;
+
+typedef union /* 8 x cl_short, CL_ALIGNED(16): s[] is always present, every other member is an optional overlay */
+{
+ cl_short CL_ALIGNED(16) s[8];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_short x, y, z, w; };
+ __extension__ struct { cl_short s0, s1, s2, s3; cl_short s4, s5, s6, s7; };
+ __extension__ struct { cl_short4 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_SHORT2__
+ __cl_short2 v2[4];
+#endif /* __CL_SHORT2__ */
+#ifdef __CL_SHORT4__
+ __cl_short4 v4[2];
+#endif /* __CL_SHORT4__ */
+#ifdef __CL_SHORT8__
+ __cl_short8 v8;
+#endif /* __CL_SHORT8__ */
+} cl_short8;
+
+typedef union /* 16 x cl_short, CL_ALIGNED(32): s[] is always present, the rest are optional overlays (sa..sf and sA..sF both name lanes 10-15) */
+{
+ cl_short CL_ALIGNED(32) s[16];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_short x, y, z, w; cl_short __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9; cl_short sa, sb, sc, sd, se, sf; };
+ __extension__ struct { cl_short s0, s1, s2, s3, s4, s5, s6, s7; cl_short s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct { cl_short8 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_SHORT2__
+ __cl_short2 v2[8];
+#endif /* __CL_SHORT2__ */
+#ifdef __CL_SHORT4__
+ __cl_short4 v4[4];
+#endif /* __CL_SHORT4__ */
+#ifdef __CL_SHORT8__
+ __cl_short8 v8[2];
+#endif /* __CL_SHORT8__ */
+#ifdef __CL_SHORT16__
+ __cl_short16 v16;
+#endif /* __CL_SHORT16__ */
+} cl_short16;
+
+
+/* ---- cl_ushortn ---- */
+typedef union /* 2 x cl_ushort, CL_ALIGNED(4): s[] is always present, every other member is an optional overlay */
+{
+ cl_ushort CL_ALIGNED(4) s[2];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_ushort x; cl_ushort y; };
+ __extension__ struct { cl_ushort s0; cl_ushort s1; };
+ __extension__ struct { cl_ushort lo; cl_ushort hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_USHORT2__
+ __cl_ushort2 v2;
+#endif /* __CL_USHORT2__ */
+} cl_ushort2;
+
+typedef union /* 4 x cl_ushort, CL_ALIGNED(8): s[] is always present, every other member is an optional overlay */
+{
+ cl_ushort CL_ALIGNED(8) s[4];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_ushort x, y, z, w; };
+ __extension__ struct { cl_ushort s0, s1, s2, s3; };
+ __extension__ struct { cl_ushort2 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_USHORT2__
+ __cl_ushort2 v2[2];
+#endif /* __CL_USHORT2__ */
+#ifdef __CL_USHORT4__
+ __cl_ushort4 v4;
+#endif /* __CL_USHORT4__ */
+} cl_ushort4;
+
+/* A 3-element vector has the size, alignment and behavior of the 4-element one (see section 6.1.5), so cl_ushort3 is an alias of cl_ushort4. */
+typedef cl_ushort4 cl_ushort3;
+
+typedef union /* 8 x cl_ushort, CL_ALIGNED(16): s[] is always present, every other member is an optional overlay */
+{
+ cl_ushort CL_ALIGNED(16) s[8];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_ushort x, y, z, w; };
+ __extension__ struct { cl_ushort s0, s1, s2, s3; cl_ushort s4, s5, s6, s7; };
+ __extension__ struct { cl_ushort4 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_USHORT2__
+ __cl_ushort2 v2[4];
+#endif /* __CL_USHORT2__ */
+#ifdef __CL_USHORT4__
+ __cl_ushort4 v4[2];
+#endif /* __CL_USHORT4__ */
+#ifdef __CL_USHORT8__
+ __cl_ushort8 v8;
+#endif /* __CL_USHORT8__ */
+} cl_ushort8;
+
+typedef union /* 16 x cl_ushort, CL_ALIGNED(32): s[] is always present, the rest are optional overlays (sa..sf and sA..sF both name lanes 10-15) */
+{
+ cl_ushort CL_ALIGNED(32) s[16];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_ushort x, y, z, w; cl_ushort __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9; cl_ushort sa, sb, sc, sd, se, sf; };
+ __extension__ struct { cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; cl_ushort s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct { cl_ushort8 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_USHORT2__
+ __cl_ushort2 v2[8];
+#endif /* __CL_USHORT2__ */
+#ifdef __CL_USHORT4__
+ __cl_ushort4 v4[4];
+#endif /* __CL_USHORT4__ */
+#ifdef __CL_USHORT8__
+ __cl_ushort8 v8[2];
+#endif /* __CL_USHORT8__ */
+#ifdef __CL_USHORT16__
+ __cl_ushort16 v16;
+#endif /* __CL_USHORT16__ */
+} cl_ushort16;
+
+/* ---- cl_intn ---- */
+typedef union /* 2 x cl_int, CL_ALIGNED(8): s[] is always present, every other member is an optional overlay */
+{
+ cl_int CL_ALIGNED(8) s[2];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_int x; cl_int y; };
+ __extension__ struct { cl_int s0; cl_int s1; };
+ __extension__ struct { cl_int lo; cl_int hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_INT2__
+ __cl_int2 v2;
+#endif /* __CL_INT2__ */
+} cl_int2;
+
+typedef union /* 4 x cl_int, CL_ALIGNED(16): s[] is always present, every other member is an optional overlay */
+{
+ cl_int CL_ALIGNED(16) s[4];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_int x, y, z, w; };
+ __extension__ struct { cl_int s0, s1, s2, s3; };
+ __extension__ struct { cl_int2 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_INT2__
+ __cl_int2 v2[2];
+#endif /* __CL_INT2__ */
+#ifdef __CL_INT4__
+ __cl_int4 v4;
+#endif /* __CL_INT4__ */
+} cl_int4;
+
+/* A 3-element vector has the size, alignment and behavior of the 4-element one (see section 6.1.5), so cl_int3 is an alias of cl_int4. */
+typedef cl_int4 cl_int3;
+
+typedef union /* 8 x cl_int, CL_ALIGNED(32): s[] is always present, every other member is an optional overlay */
+{
+ cl_int CL_ALIGNED(32) s[8];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_int x, y, z, w; };
+ __extension__ struct { cl_int s0, s1, s2, s3; cl_int s4, s5, s6, s7; };
+ __extension__ struct { cl_int4 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_INT2__
+ __cl_int2 v2[4];
+#endif /* __CL_INT2__ */
+#ifdef __CL_INT4__
+ __cl_int4 v4[2];
+#endif /* __CL_INT4__ */
+#ifdef __CL_INT8__
+ __cl_int8 v8;
+#endif /* __CL_INT8__ */
+} cl_int8;
+
+typedef union /* 16 x cl_int, CL_ALIGNED(64): s[] is always present, the rest are optional overlays (sa..sf and sA..sF both name lanes 10-15) */
+{
+ cl_int CL_ALIGNED(64) s[16];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_int x, y, z, w; cl_int __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9; cl_int sa, sb, sc, sd, se, sf; };
+ __extension__ struct { cl_int s0, s1, s2, s3, s4, s5, s6, s7; cl_int s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct { cl_int8 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_INT2__
+ __cl_int2 v2[8];
+#endif /* __CL_INT2__ */
+#ifdef __CL_INT4__
+ __cl_int4 v4[4];
+#endif /* __CL_INT4__ */
+#ifdef __CL_INT8__
+ __cl_int8 v8[2];
+#endif /* __CL_INT8__ */
+#ifdef __CL_INT16__
+ __cl_int16 v16;
+#endif /* __CL_INT16__ */
+} cl_int16;
+
+
+/* ---- cl_uintn ---- */
+typedef union /* 2 x cl_uint, CL_ALIGNED(8): s[] is always present, every other member is an optional overlay */
+{
+ cl_uint CL_ALIGNED(8) s[2];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_uint x; cl_uint y; };
+ __extension__ struct { cl_uint s0; cl_uint s1; };
+ __extension__ struct { cl_uint lo; cl_uint hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_UINT2__
+ __cl_uint2 v2;
+#endif /* __CL_UINT2__ */
+} cl_uint2;
+
+typedef union /* 4 x cl_uint, CL_ALIGNED(16): s[] is always present, every other member is an optional overlay */
+{
+ cl_uint CL_ALIGNED(16) s[4];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_uint x, y, z, w; };
+ __extension__ struct { cl_uint s0, s1, s2, s3; };
+ __extension__ struct { cl_uint2 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_UINT2__
+ __cl_uint2 v2[2];
+#endif /* __CL_UINT2__ */
+#ifdef __CL_UINT4__
+ __cl_uint4 v4;
+#endif /* __CL_UINT4__ */
+} cl_uint4;
+
+/* A 3-element vector has the size, alignment and behavior of the 4-element one (see section 6.1.5), so cl_uint3 is an alias of cl_uint4. */
+typedef cl_uint4 cl_uint3;
+
+typedef union /* 8 x cl_uint, CL_ALIGNED(32): s[] is always present, every other member is an optional overlay */
+{
+ cl_uint CL_ALIGNED(32) s[8];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_uint x, y, z, w; };
+ __extension__ struct { cl_uint s0, s1, s2, s3; cl_uint s4, s5, s6, s7; };
+ __extension__ struct { cl_uint4 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_UINT2__
+ __cl_uint2 v2[4];
+#endif /* __CL_UINT2__ */
+#ifdef __CL_UINT4__
+ __cl_uint4 v4[2];
+#endif /* __CL_UINT4__ */
+#ifdef __CL_UINT8__
+ __cl_uint8 v8;
+#endif /* __CL_UINT8__ */
+} cl_uint8;
+
+typedef union /* 16 x cl_uint, CL_ALIGNED(64): s[] is always present, the rest are optional overlays (sa..sf and sA..sF both name lanes 10-15) */
+{
+ cl_uint CL_ALIGNED(64) s[16];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_uint x, y, z, w; cl_uint __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9; cl_uint sa, sb, sc, sd, se, sf; };
+ __extension__ struct { cl_uint s0, s1, s2, s3, s4, s5, s6, s7; cl_uint s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct { cl_uint8 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_UINT2__
+ __cl_uint2 v2[8];
+#endif /* __CL_UINT2__ */
+#ifdef __CL_UINT4__
+ __cl_uint4 v4[4];
+#endif /* __CL_UINT4__ */
+#ifdef __CL_UINT8__
+ __cl_uint8 v8[2];
+#endif /* __CL_UINT8__ */
+#ifdef __CL_UINT16__
+ __cl_uint16 v16;
+#endif /* __CL_UINT16__ */
+} cl_uint16;
+
+/* ---- cl_longn ---- */
+typedef union /* 2 x cl_long, CL_ALIGNED(16): s[] is always present, every other member is an optional overlay */
+{
+ cl_long CL_ALIGNED(16) s[2];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_long x; cl_long y; };
+ __extension__ struct { cl_long s0; cl_long s1; };
+ __extension__ struct { cl_long lo; cl_long hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_LONG2__
+ __cl_long2 v2;
+#endif /* __CL_LONG2__ */
+} cl_long2;
+
+typedef union /* 4 x cl_long, CL_ALIGNED(32): s[] is always present, every other member is an optional overlay */
+{
+ cl_long CL_ALIGNED(32) s[4];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_long x, y, z, w; };
+ __extension__ struct { cl_long s0, s1, s2, s3; };
+ __extension__ struct { cl_long2 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_LONG2__
+ __cl_long2 v2[2];
+#endif /* __CL_LONG2__ */
+#ifdef __CL_LONG4__
+ __cl_long4 v4;
+#endif /* __CL_LONG4__ */
+} cl_long4;
+
+/* A 3-element vector has the size, alignment and behavior of the 4-element one (see section 6.1.5), so cl_long3 is an alias of cl_long4. */
+typedef cl_long4 cl_long3;
+
+typedef union /* 8 x cl_long, CL_ALIGNED(64): s[] is always present, every other member is an optional overlay */
+{
+ cl_long CL_ALIGNED(64) s[8];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_long x, y, z, w; };
+ __extension__ struct { cl_long s0, s1, s2, s3; cl_long s4, s5, s6, s7; };
+ __extension__ struct { cl_long4 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_LONG2__
+ __cl_long2 v2[4];
+#endif /* __CL_LONG2__ */
+#ifdef __CL_LONG4__
+ __cl_long4 v4[2];
+#endif /* __CL_LONG4__ */
+#ifdef __CL_LONG8__
+ __cl_long8 v8;
+#endif /* __CL_LONG8__ */
+} cl_long8;
+
+typedef union /* 16 x cl_long, CL_ALIGNED(128): s[] is always present, the rest are optional overlays (sa..sf and sA..sF both name lanes 10-15) */
+{
+ cl_long CL_ALIGNED(128) s[16];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_long x, y, z, w; cl_long __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9; cl_long sa, sb, sc, sd, se, sf; };
+ __extension__ struct { cl_long s0, s1, s2, s3, s4, s5, s6, s7; cl_long s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct { cl_long8 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_LONG2__
+ __cl_long2 v2[8];
+#endif /* __CL_LONG2__ */
+#ifdef __CL_LONG4__
+ __cl_long4 v4[4];
+#endif /* __CL_LONG4__ */
+#ifdef __CL_LONG8__
+ __cl_long8 v8[2];
+#endif /* __CL_LONG8__ */
+#ifdef __CL_LONG16__
+ __cl_long16 v16;
+#endif /* __CL_LONG16__ */
+} cl_long16;
+
+
+/* ---- cl_ulongn ---- */
+typedef union /* 2 x cl_ulong, CL_ALIGNED(16): s[] is always present, every other member is an optional overlay */
+{
+ cl_ulong CL_ALIGNED(16) s[2];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_ulong x; cl_ulong y; };
+ __extension__ struct { cl_ulong s0; cl_ulong s1; };
+ __extension__ struct { cl_ulong lo; cl_ulong hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_ULONG2__
+ __cl_ulong2 v2;
+#endif /* __CL_ULONG2__ */
+} cl_ulong2;
+
+typedef union /* 4 x cl_ulong, CL_ALIGNED(32): s[] is always present, every other member is an optional overlay */
+{
+ cl_ulong CL_ALIGNED(32) s[4];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_ulong x, y, z, w; };
+ __extension__ struct { cl_ulong s0, s1, s2, s3; };
+ __extension__ struct { cl_ulong2 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_ULONG2__
+ __cl_ulong2 v2[2];
+#endif /* __CL_ULONG2__ */
+#ifdef __CL_ULONG4__
+ __cl_ulong4 v4;
+#endif /* __CL_ULONG4__ */
+} cl_ulong4;
+
+/* A 3-element vector has the size, alignment and behavior of the 4-element one (see section 6.1.5), so cl_ulong3 is an alias of cl_ulong4. */
+typedef cl_ulong4 cl_ulong3;
+
+typedef union /* 8 x cl_ulong, CL_ALIGNED(64): s[] is always present, every other member is an optional overlay */
+{
+ cl_ulong CL_ALIGNED(64) s[8];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_ulong x, y, z, w; };
+ __extension__ struct { cl_ulong s0, s1, s2, s3; cl_ulong s4, s5, s6, s7; };
+ __extension__ struct { cl_ulong4 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_ULONG2__
+ __cl_ulong2 v2[4];
+#endif /* __CL_ULONG2__ */
+#ifdef __CL_ULONG4__
+ __cl_ulong4 v4[2];
+#endif /* __CL_ULONG4__ */
+#ifdef __CL_ULONG8__
+ __cl_ulong8 v8;
+#endif /* __CL_ULONG8__ */
+} cl_ulong8;
+
+typedef union /* 16 x cl_ulong, CL_ALIGNED(128): s[] is always present, the rest are optional overlays (sa..sf and sA..sF both name lanes 10-15) */
+{
+ cl_ulong CL_ALIGNED(128) s[16];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_ulong x, y, z, w; cl_ulong __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9; cl_ulong sa, sb, sc, sd, se, sf; };
+ __extension__ struct { cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; cl_ulong s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct { cl_ulong8 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_ULONG2__
+ __cl_ulong2 v2[8];
+#endif /* __CL_ULONG2__ */
+#ifdef __CL_ULONG4__
+ __cl_ulong4 v4[4];
+#endif /* __CL_ULONG4__ */
+#ifdef __CL_ULONG8__
+ __cl_ulong8 v8[2];
+#endif /* __CL_ULONG8__ */
+#ifdef __CL_ULONG16__
+ __cl_ulong16 v16;
+#endif /* __CL_ULONG16__ */
+} cl_ulong16;
+
+
+/* --- cl_floatn ---- */
+
+typedef union /* 2 x cl_float, CL_ALIGNED(8): s[] is always present, every other member is an optional overlay */
+{
+ cl_float CL_ALIGNED(8) s[2];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_float x; cl_float y; };
+ __extension__ struct { cl_float s0; cl_float s1; };
+ __extension__ struct { cl_float lo; cl_float hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_FLOAT2__
+ __cl_float2 v2;
+#endif /* __CL_FLOAT2__ */
+} cl_float2;
+
+typedef union /* 4 x cl_float, CL_ALIGNED(16): s[] is always present, every other member is an optional overlay */
+{
+ cl_float CL_ALIGNED(16) s[4];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_float x, y, z, w; };
+ __extension__ struct { cl_float s0, s1, s2, s3; };
+ __extension__ struct { cl_float2 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_FLOAT2__
+ __cl_float2 v2[2];
+#endif /* __CL_FLOAT2__ */
+#ifdef __CL_FLOAT4__
+ __cl_float4 v4;
+#endif /* __CL_FLOAT4__ */
+} cl_float4;
+
+/* A 3-element vector has the size, alignment and behavior of the 4-element one (see section 6.1.5), so cl_float3 is an alias of cl_float4. */
+typedef cl_float4 cl_float3;
+
+typedef union /* 8 x cl_float, CL_ALIGNED(32): s[] is always present, every other member is an optional overlay */
+{
+ cl_float CL_ALIGNED(32) s[8];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_float x, y, z, w; };
+ __extension__ struct { cl_float s0, s1, s2, s3; cl_float s4, s5, s6, s7; };
+ __extension__ struct { cl_float4 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_FLOAT2__
+ __cl_float2 v2[4];
+#endif /* __CL_FLOAT2__ */
+#ifdef __CL_FLOAT4__
+ __cl_float4 v4[2];
+#endif /* __CL_FLOAT4__ */
+#ifdef __CL_FLOAT8__
+ __cl_float8 v8;
+#endif /* __CL_FLOAT8__ */
+} cl_float8;
+
+typedef union /* 16 x cl_float, CL_ALIGNED(64): s[] is always present, the rest are optional overlays (sa..sf and sA..sF both name lanes 10-15) */
+{
+ cl_float CL_ALIGNED(64) s[16];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_float x, y, z, w; cl_float __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9; cl_float sa, sb, sc, sd, se, sf; };
+ __extension__ struct { cl_float s0, s1, s2, s3, s4, s5, s6, s7; cl_float s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct { cl_float8 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_FLOAT2__
+ __cl_float2 v2[8];
+#endif /* __CL_FLOAT2__ */
+#ifdef __CL_FLOAT4__
+ __cl_float4 v4[4];
+#endif /* __CL_FLOAT4__ */
+#ifdef __CL_FLOAT8__
+ __cl_float8 v8[2];
+#endif /* __CL_FLOAT8__ */
+#ifdef __CL_FLOAT16__
+ __cl_float16 v16;
+#endif /* __CL_FLOAT16__ */
+} cl_float16;
+
+/* --- cl_doublen ---- */
+
+typedef union /* 2 x cl_double, CL_ALIGNED(16): s[] is always present, every other member is an optional overlay */
+{
+ cl_double CL_ALIGNED(16) s[2];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_double x; cl_double y; };
+ __extension__ struct { cl_double s0; cl_double s1; };
+ __extension__ struct { cl_double lo; cl_double hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_DOUBLE2__
+ __cl_double2 v2;
+#endif /* __CL_DOUBLE2__ */
+} cl_double2;
+
+typedef union /* 4 x cl_double, CL_ALIGNED(32): s[] is always present, every other member is an optional overlay */
+{
+ cl_double CL_ALIGNED(32) s[4];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_double x, y, z, w; };
+ __extension__ struct { cl_double s0, s1, s2, s3; };
+ __extension__ struct { cl_double2 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_DOUBLE2__
+ __cl_double2 v2[2];
+#endif /* __CL_DOUBLE2__ */
+#ifdef __CL_DOUBLE4__
+ __cl_double4 v4;
+#endif /* __CL_DOUBLE4__ */
+} cl_double4;
+
+/* A 3-element vector has the size, alignment and behavior of the 4-element one (see section 6.1.5), so cl_double3 is an alias of cl_double4. */
+typedef cl_double4 cl_double3;
+
+typedef union /* 8 x cl_double, CL_ALIGNED(64): s[] is always present, every other member is an optional overlay */
+{
+ cl_double CL_ALIGNED(64) s[8];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_double x, y, z, w; };
+ __extension__ struct { cl_double s0, s1, s2, s3; cl_double s4, s5, s6, s7; };
+ __extension__ struct { cl_double4 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_DOUBLE2__
+ __cl_double2 v2[4];
+#endif /* __CL_DOUBLE2__ */
+#ifdef __CL_DOUBLE4__
+ __cl_double4 v4[2];
+#endif /* __CL_DOUBLE4__ */
+#ifdef __CL_DOUBLE8__
+ __cl_double8 v8;
+#endif /* __CL_DOUBLE8__ */
+} cl_double8;
+
+typedef union /* 16 x cl_double, CL_ALIGNED(128): s[] is always present, the rest are optional overlays (sa..sf and sA..sF both name lanes 10-15) */
+{
+ cl_double CL_ALIGNED(128) s[16];
+#if defined( __GNUC__ ) && !defined( __STRICT_ANSI__ )
+ __extension__ struct { cl_double x, y, z, w; cl_double __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9; cl_double sa, sb, sc, sd, se, sf; };
+ __extension__ struct { cl_double s0, s1, s2, s3, s4, s5, s6, s7; cl_double s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct { cl_double8 lo, hi; };
+#endif /* GNU C named fields */
+#ifdef __CL_DOUBLE2__
+ __cl_double2 v2[8];
+#endif /* __CL_DOUBLE2__ */
+#ifdef __CL_DOUBLE4__
+ __cl_double4 v4[4];
+#endif /* __CL_DOUBLE4__ */
+#ifdef __CL_DOUBLE8__
+ __cl_double8 v8[2];
+#endif /* __CL_DOUBLE8__ */
+#ifdef __CL_DOUBLE16__
+ __cl_double16 v16;
+#endif /* __CL_DOUBLE16__ */
+} cl_double16;
+
+/* Macro to facilitate debugging
+ * Usage:
+ * Place CL_PROGRAM_STRING_DEBUG_INFO immediately before the string literal that holds your source.
+ * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO "\
+ * Each line thereafter of OpenCL C source must end with: \n\
+ * The last line ends in ";
+ *
+ * Example:
+ *
+ * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
+ * kernel void foo( int a, float * b ) \n\
+ * { \n\
+ * // my comment \n\
+ * b[ get_global_id(0)] = a; \n\
+ * } \n\
+ * ";
+ *
+ * This should correctly set up the line, (column) and file information for your source
+ * string so you can do source level debugging.
+ */
+#define __CL_STRINGIFY( _x ) # _x /* stringizes _x exactly as written */
+#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) /* extra level so _x (e.g. __LINE__) is macro-expanded before being stringized */
+#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" /* a #line directive naming the host line and file, followed by two newlines */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CL_PLATFORM_H */
diff --git a/include/CL/opencl.h b/include/CL/opencl.h
new file mode 100644
index 0000000..26a6389
--- /dev/null
+++ b/include/CL/opencl.h
@@ -0,0 +1,54 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_H /* include guard */
+#define __OPENCL_H
+
+#ifdef __cplusplus
+extern "C" { /* wrap the includes below in C linkage when compiled as C++ */
+#endif /* __cplusplus */
+
+#ifdef __APPLE__ /* Apple: the headers are included from the OpenCL/ directory */
+
+#include <OpenCL/cl.h>
+#include <OpenCL/cl_gl.h>
+#include <OpenCL/cl_gl_ext.h>
+#include <OpenCL/cl_ext.h>
+
+#else /* everything else: the same four headers from the CL/ directory */
+
+#include <CL/cl.h>
+#include <CL/cl_gl.h>
+#include <CL/cl_gl_ext.h>
+#include <CL/cl_ext.h>
+
+#endif /* __APPLE__ */
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* __OPENCL_H */
+
diff --git a/include/clc.h b/include/clc.h
new file mode 100644
index 0000000..c6e6b5d
--- /dev/null
+++ b/include/clc.h
@@ -0,0 +1,1939 @@
+/*
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2011-2014, Peter Collingbourne <peter@pcc.me.uk>
+ * Copyright (c) 2013 Victor Oliveira <victormatheus@gmail.com>
+ * Copyright (c) 2013 Jesse Towner <jessetowner@lavabit.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _CLC_H_
+#define _CLC_H_
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+#define _CLC_PROTECTED __attribute__((visibility("protected")))
+#define _CLC_OVERLOAD __attribute__((overloadable))
+#define _CLC_DECL __attribute__((visibility("protected")))
+#define _CLC_DEF __attribute__((visibility("protected")))
+#define _CLC_INLINE __attribute__((always_inline)) inline
+
+#define UNARY_VEC_DECL(type,utype,name) \
+_CLC_OVERLOAD _CLC_DECL utype##2 name(type##2 x); \
+_CLC_OVERLOAD _CLC_DECL utype##3 name(type##3 x); \
+_CLC_OVERLOAD _CLC_DECL utype##4 name(type##4 x); \
+_CLC_OVERLOAD _CLC_DECL utype##8 name(type##8 x); \
+_CLC_OVERLOAD _CLC_DECL utype##16 name(type##16 x);\
+
+#define BINARY_VEC_DECL(type,utype,name) \
+_CLC_OVERLOAD _CLC_DECL utype##2 name(type##2 x, type##2 y); \
+_CLC_OVERLOAD _CLC_DECL utype##3 name(type##3 x, type##3 y); \
+_CLC_OVERLOAD _CLC_DECL utype##4 name(type##4 x, type##4 y); \
+_CLC_OVERLOAD _CLC_DECL utype##8 name(type##8 x, type##8 y); \
+_CLC_OVERLOAD _CLC_DECL utype##16 name(type##16 x, type##16 y);\
+
+#define BINARY_VEC_DECL_ALT(type,utype,type2,name) \
+_CLC_OVERLOAD _CLC_DECL utype##2 name(type##2 x, type2##2 y); \
+_CLC_OVERLOAD _CLC_DECL utype##3 name(type##3 x, type2##3 y); \
+_CLC_OVERLOAD _CLC_DECL utype##4 name(type##4 x, type2##4 y); \
+_CLC_OVERLOAD _CLC_DECL utype##8 name(type##8 x, type2##8 y); \
+_CLC_OVERLOAD _CLC_DECL utype##16 name(type##16 x, type2##16 y);\
+
+#define TERNARY_VEC_DECL(type,utype,name) \
+_CLC_OVERLOAD _CLC_DECL utype##2 name(type##2 x, type##2 y, type##2 z); \
+_CLC_OVERLOAD _CLC_DECL utype##3 name(type##3 x, type##3 y, type##3 z); \
+_CLC_OVERLOAD _CLC_DECL utype##4 name(type##4 x, type##4 y, type##4 z); \
+_CLC_OVERLOAD _CLC_DECL utype##8 name(type##8 x, type##8 y, type##8 z); \
+_CLC_OVERLOAD _CLC_DECL utype##16 name(type##16 x, type##16 y,type##16 z);\
+
+#define UNARY_INLINE(type,utype,name,op) \
+_CLC_PROTECTED utype op(type x); \
+_CLC_OVERLOAD _CLC_INLINE utype name(type x) { return op(x); }
+
+#define BINARY_INLINE(type,utype,name,op) \
+_CLC_PROTECTED utype op(type x, type y); \
+_CLC_OVERLOAD _CLC_INLINE utype name(type x, type y) { return op(x, y); }
+
+#define BINARY_INLINE_ALT(type,utype,type2,name,op) \
+_CLC_PROTECTED utype op(type x, type2 y); \
+_CLC_OVERLOAD _CLC_INLINE utype name(type x, type2 y) { return op(x, y); }
+
+#define BINARY_INLINE_ALT2(type,utype,type2,name,op) \
+_CLC_OVERLOAD _CLC_INLINE utype name(type x, type2 y) { return op(x, (type)y); }
+
+#define TERNARY_INLINE(type,utype,name,op) \
+_CLC_PROTECTED utype op(type x, type y, type z); \
+ _CLC_OVERLOAD _CLC_INLINE utype name(type x, type y, type z) { return op(x, y, z); }
+
+#define UNARY_VEC_DEF(type,utype,name,op)\
+_CLC_OVERLOAD _CLC_DEF utype##2 name(type##2 x) \
+{ return (utype##2) (op(x.s0), op(x.s1)); }\
+_CLC_OVERLOAD _CLC_DEF utype##3 name(type##3 x) \
+{ return (utype##3) (op(x.s0), op(x.s1), op(x.s2)); }\
+_CLC_OVERLOAD _CLC_DEF utype##4 name(type##4 x) \
+{ return (utype##4) (op(x.s0), op(x.s1), op(x.s2), op(x.s3)); }\
+_CLC_OVERLOAD _CLC_DEF utype##8 name(type##8 x) \
+{ return (utype##8) (op(x.s0), op(x.s1), op(x.s2), op(x.s3),\
+ op(x.s4), op(x.s5), op(x.s6), op(x.s7)); }\
+_CLC_OVERLOAD _CLC_DEF utype##16 name(type##16 x) \
+{ return (utype##16) (op(x.s0), op(x.s1), op(x.s2), op(x.s3),\
+ op(x.s4), op(x.s5), op(x.s6), op(x.s7),\
+ op(x.s8), op(x.s9), op(x.sa), op(x.sb),\
+ op(x.sc), op(x.sd), op(x.se), op(x.sf)); }
+
+#define BINARY_VEC_DEF(type,utype,name,op)\
+_CLC_OVERLOAD _CLC_DEF utype##2 name(type##2 x, type##2 y) \
+{ return (utype##2) (op(x.s0,y.s0), op(x.s1,y.s1)); }\
+_CLC_OVERLOAD _CLC_DEF utype##3 name(type##3 x, type##3 y) \
+{ return (utype##3) (op(x.s0,y.s0), op(x.s1,y.s1), op(x.s2,y.s2)); }\
+_CLC_OVERLOAD _CLC_DEF utype##4 name(type##4 x, type##4 y) \
+{ return (utype##4) (op(x.s0,y.s0), op(x.s1,y.s1), op(x.s2,y.s2), op(x.s3,y.s3)); }\
+_CLC_OVERLOAD _CLC_DEF utype##8 name(type##8 x, type##8 y) \
+{ return (utype##8) (op(x.s0,y.s0), op(x.s1,y.s1), op(x.s2,y.s2), op(x.s3,y.s3),\
+ op(x.s4,y.s4), op(x.s5,y.s5), op(x.s6,y.s6), op(x.s7,y.s7)); }\
+_CLC_OVERLOAD _CLC_DEF utype##16 name(type##16 x, type##16 y) \
+{ return (utype##16) (op(x.s0,y.s0), op(x.s1,y.s1), op(x.s2,y.s2), op(x.s3,y.s3),\
+ op(x.s4,y.s4), op(x.s5,y.s5), op(x.s6,y.s6), op(x.s7,y.s7),\
+ op(x.s8,y.s8), op(x.s9,y.s9), op(x.sa,y.sa), op(x.sb,y.sb),\
+ op(x.sc,y.sc), op(x.sd,y.sd), op(x.se,y.se), op(x.sf,y.sf)); }
+
+#define BINARY_VEC_DEF_ALT(type,utype,type2,name,op)\
+_CLC_OVERLOAD _CLC_DEF utype##2 name(type##2 x, type2##2 y) \
+{ return (utype##2) (op(x.s0,y.s0), op(x.s1,y.s1)); }\
+_CLC_OVERLOAD _CLC_DEF utype##3 name(type##3 x, type2##3 y) \
+{ return (utype##3) (op(x.s0,y.s0), op(x.s1,y.s1), op(x.s2,y.s2)); }\
+_CLC_OVERLOAD _CLC_DEF utype##4 name(type##4 x, type2##4 y) \
+{ return (utype##4) (op(x.s0,y.s0), op(x.s1,y.s1), op(x.s2,y.s2), op(x.s3,y.s3)); }\
+_CLC_OVERLOAD _CLC_DEF utype##8 name(type##8 x, type2##8 y) \
+{ return (utype##8) (op(x.s0,y.s0), op(x.s1,y.s1), op(x.s2,y.s2), op(x.s3,y.s3),\
+ op(x.s4,y.s4), op(x.s5,y.s5), op(x.s6,y.s6), op(x.s7,y.s7)); }\
+_CLC_OVERLOAD _CLC_DEF utype##16 name(type##16 x, type2##16 y) \
+{ return (utype##16) (op(x.s0,y.s0), op(x.s1,y.s1), op(x.s2,y.s2), op(x.s3,y.s3),\
+ op(x.s4,y.s4), op(x.s5,y.s5), op(x.s6,y.s6), op(x.s7,y.s7),\
+ op(x.s8,y.s8), op(x.s9,y.s9), op(x.sa,y.sa), op(x.sb,y.sb),\
+ op(x.sc,y.sc), op(x.sd,y.sd), op(x.se,y.se), op(x.sf,y.sf)); }
+
+#define TERNARY_VEC_DEF(type,utype,name,op)\
+_CLC_OVERLOAD _CLC_DEF utype##2 name(type##2 x, type##2 y, type##2 z) \
+{ return (utype##2) (op(x.s0,y.s0,z.s0), op(x.s1,y.s1,z.s1)); }\
+_CLC_OVERLOAD _CLC_DEF utype##3 name(type##3 x, type##3 y, type##3 z) \
+{ return (utype##3) (op(x.s0,y.s0,z.s0), op(x.s1,y.s1,z.s1), op(x.s2,y.s2,z.s2)); }\
+_CLC_OVERLOAD _CLC_DEF utype##4 name(type##4 x, type##4 y, type##4 z) \
+{ return (utype##4) (op(x.s0,y.s0,z.s0), op(x.s1,y.s1,z.s1), \
+ op(x.s2,y.s2,z.s2), op(x.s3,y.s3,z.s3)); }\
+_CLC_OVERLOAD _CLC_DEF utype##8 name(type##8 x, type##8 y, type##8 z) \
+{ return (utype##8) (op(x.s0,y.s0,z.s0), op(x.s1,y.s1,z.s1), \
+ op(x.s2,y.s2,z.s2), op(x.s3,y.s3,z.s3),\
+ op(x.s4,y.s4,z.s4), op(x.s5,y.s5,z.s5), \
+ op(x.s6,y.s6,z.s6), op(x.s7,y.s7,z.s7)); }\
+_CLC_OVERLOAD _CLC_DEF utype##16 name(type##16 x, type##16 y, type##16 z) \
+{ return (utype##16) (op(x.s0,y.s0,z.s0), op(x.s1,y.s1,z.s1), \
+ op(x.s2,y.s2,z.s2), op(x.s3,y.s3,z.s3),\
+ op(x.s4,y.s4,z.s4), op(x.s5,y.s5,z.s5), \
+ op(x.s6,y.s6,z.s6), op(x.s7,y.s7,z.s7),\
+ op(x.s8,y.s8,z.s8), op(x.s9,y.s9,z.s9), \
+ op(x.sa,y.sa,z.sa), op(x.sb,y.sb,z.sb),\
+ op(x.sc,y.sc,z.sc), op(x.sd,y.sd,z.sd), \
+ op(x.se,y.se,z.se), op(x.sf,y.sf,z.sf)); }
+
+
+/* Pastes a scalar type name and a vector width into a vector type name. */
+#define _VEC_TYPE(type,sz) type##sz
+
+/*
+ * Applies EXPAND_SIZES once to each of the ten scalar types (integer types,
+ * float and double). EXPAND_SIZES is not defined in this part of the header;
+ * presumably the user of this macro defines it before expansion -- confirm.
+ */
+#define _EXPAND_TYPES() \
+ EXPAND_SIZES(char) \
+ EXPAND_SIZES(uchar) \
+ EXPAND_SIZES(short) \
+ EXPAND_SIZES(ushort) \
+ EXPAND_SIZES(int) \
+ EXPAND_SIZES(uint) \
+ EXPAND_SIZES(long) \
+ EXPAND_SIZES(ulong) \
+ EXPAND_SIZES(float) \
+ EXPAND_SIZES(double)
+
+/* Same as _EXPAND_TYPES() but restricted to the eight integer types. */
+#define _EXPAND_INTEGER_TYPES() \
+ EXPAND_SIZES(char) \
+ EXPAND_SIZES(uchar) \
+ EXPAND_SIZES(short) \
+ EXPAND_SIZES(ushort) \
+ EXPAND_SIZES(int) \
+ EXPAND_SIZES(uint) \
+ EXPAND_SIZES(long) \
+ EXPAND_SIZES(ulong)
+
+typedef unsigned int cl_mem_fence_flags;
+
+/*-----------------------------------------------------------------------------
+* Standard types from Clang's stddef and stdint, Copyright (C) 2008 Eli Friedman
+*----------------------------------------------------------------------------*/
+typedef __INT64_TYPE__ int64_t;
+typedef __UINT64_TYPE__ uint64_t;
+typedef __INT32_TYPE__ int32_t;
+typedef __UINT32_TYPE__ uint32_t;
+typedef __INT16_TYPE__ int16_t;
+typedef __UINT16_TYPE__ uint16_t;
+typedef __INT8_TYPE__ int8_t;
+typedef __UINT8_TYPE__ uint8_t;
+
+#define __stdint_join3(a,b,c) a ## b ## c
+#define __intn_t(n) __stdint_join3( int, n, _t)
+#define __uintn_t(n) __stdint_join3(uint, n, _t)
+
+typedef __typeof__(((int*)0)-((int*)0)) ptrdiff_t;
+typedef __typeof__(sizeof(int)) size_t;
+typedef __intn_t(__INTPTR_WIDTH__) intptr_t;
+typedef __uintn_t(__INTPTR_WIDTH__) uintptr_t;
+
+#undef __stdint_join3
+#undef __intn_t
+#undef __uintn_t
+
+/*-----------------------------------------------------------------------------
+* OpenCL types
+*----------------------------------------------------------------------------*/
+typedef uint8_t uchar;
+typedef uint16_t ushort;
+typedef uint32_t uint;
+typedef uint64_t ulong;
+
+#if defined(CLANG_OLDER_THAN_3_3)
+typedef unsigned int sampler_t;
+typedef struct image2d *image2d_t;
+typedef struct image3d *image3d_t;
+#endif
+
+/*-----------------------------------------------------------------------------
+* Vectors
+*----------------------------------------------------------------------------*/
+/* Declares the vector type <type><len> using Clang's ext_vector_type. */
+#define COAL_VECTOR(type, len) \
+ typedef type type##len __attribute__((ext_vector_type(len)))
+
+/*
+ * Declares the 2-, 3-, 4-, 8- and 16-wide vector types of one scalar type.
+ * Each typedef carries its own ';', so the invocations below take none.
+ */
+#define COAL_VECTOR_SET(type) \
+ COAL_VECTOR(type, 2); \
+ COAL_VECTOR(type, 3); \
+ COAL_VECTOR(type, 4); \
+ COAL_VECTOR(type, 8); \
+ COAL_VECTOR(type, 16);
+
+COAL_VECTOR_SET(char)
+COAL_VECTOR_SET(uchar)
+COAL_VECTOR_SET(short)
+COAL_VECTOR_SET(ushort)
+COAL_VECTOR_SET(int)
+COAL_VECTOR_SET(uint)
+COAL_VECTOR_SET(long)
+COAL_VECTOR_SET(ulong)
+COAL_VECTOR_SET(float)
+COAL_VECTOR_SET(double)
+
+/* The generator macros are only needed for the declarations above. */
+#undef COAL_VECTOR_SET
+#undef COAL_VECTOR
+
+#define CL_VERSION_1_0 100
+#define CL_VERSION_1_1 110
+#define __OPENCL_VERSION__ 110
+#define __ENDIAN_LITTLE__ 1
+#define __kernel_exec(X, typen) __kernel __attribute__((work_group_size_hint(X, 1, 1))) \
+ __attribute__((vec_type_hint(typen)))
+#define kernel_exec __kernel_exec
+
+#define __write_only
+#define __read_only const
+
+#define write_only __write_only
+#define read_only __read_only
+
+#define CLK_NORMALIZED_COORDS_FALSE 0x00000000
+#define CLK_NORMALIZED_COORDS_TRUE 0x00000001
+#define CLK_ADDRESS_NONE 0x00000000
+#define CLK_ADDRESS_MIRRORED_REPEAT 0x00000010
+#define CLK_ADDRESS_REPEAT 0x00000020
+#define CLK_ADDRESS_CLAMP_TO_EDGE 0x00000030
+#define CLK_ADDRESS_CLAMP 0x00000040
+#define CLK_FILTER_NEAREST 0x00000000
+#define CLK_FILTER_LINEAR 0x00000100
+#define CLK_LOCAL_MEM_FENCE 0x00000001
+#define CLK_GLOBAL_MEM_FENCE 0x00000002
+#define CLK_R 0x10B0
+#define CLK_A 0x10B1
+#define CLK_RG 0x10B2
+#define CLK_RA 0x10B3
+#define CLK_RGB 0x10B4
+#define CLK_RGBA 0x10B5
+#define CLK_BGRA 0x10B6
+#define CLK_ARGB 0x10B7
+#define CLK_INTENSITY 0x10B8
+#define CLK_LUMINANCE 0x10B9
+#define CLK_Rx 0x10BA
+#define CLK_RGx 0x10BB
+#define CLK_RGBx 0x10BC
+#define CLK_SNORM_INT8 0x10D0
+#define CLK_SNORM_INT16 0x10D1
+#define CLK_UNORM_INT8 0x10D2
+#define CLK_UNORM_INT16 0x10D3
+#define CLK_UNORM_SHORT_565 0x10D4
+#define CLK_UNORM_SHORT_555 0x10D5
+#define CLK_UNORM_INT_101010 0x10D6
+#define CLK_SIGNED_INT8 0x10D7
+#define CLK_SIGNED_INT16 0x10D8
+#define CLK_SIGNED_INT32 0x10D9
+#define CLK_UNSIGNED_INT8 0x10DA
+#define CLK_UNSIGNED_INT16 0x10DB
+#define CLK_UNSIGNED_INT32 0x10DC
+#define CLK_HALF_FLOAT 0x10DD
+#define CLK_FLOAT 0x10DE
+
+_CLC_PROTECTED void barrier (cl_mem_fence_flags flags);
+_CLC_PROTECTED void mem_fence (cl_mem_fence_flags flags);
+_CLC_PROTECTED void read_mem_fence (cl_mem_fence_flags flags);
+_CLC_PROTECTED void write_mem_fence (cl_mem_fence_flags flags);
+
+/******************************************************************************
+* AS_<type> functions
+******************************************************************************/
+#define as_char(x) __builtin_astype(x, char)
+#define as_uchar(x) __builtin_astype(x, uchar)
+#define as_short(x) __builtin_astype(x, short)
+#define as_ushort(x) __builtin_astype(x, ushort)
+#define as_int(x) __builtin_astype(x, int)
+#define as_uint(x) __builtin_astype(x, uint)
+#define as_long(x) __builtin_astype(x, long)
+#define as_ulong(x) __builtin_astype(x, ulong)
+#define as_float(x) __builtin_astype(x, float)
+#define as_double(x) __builtin_astype(x, double)
+
+#define as_char2(x) __builtin_astype(x, char2)
+#define as_uchar2(x) __builtin_astype(x, uchar2)
+#define as_short2(x) __builtin_astype(x, short2)
+#define as_ushort2(x) __builtin_astype(x, ushort2)
+#define as_int2(x) __builtin_astype(x, int2)
+#define as_uint2(x) __builtin_astype(x, uint2)
+#define as_long2(x) __builtin_astype(x, long2)
+#define as_ulong2(x) __builtin_astype(x, ulong2)
+#define as_float2(x) __builtin_astype(x, float2)
+#define as_double2(x) __builtin_astype(x, double2)
+
+#define as_char3(x) __builtin_astype(x, char3)
+#define as_uchar3(x) __builtin_astype(x, uchar3)
+#define as_short3(x) __builtin_astype(x, short3)
+#define as_ushort3(x) __builtin_astype(x, ushort3)
+#define as_int3(x) __builtin_astype(x, int3)
+#define as_uint3(x) __builtin_astype(x, uint3)
+#define as_long3(x) __builtin_astype(x, long3)
+#define as_ulong3(x) __builtin_astype(x, ulong3)
+#define as_float3(x) __builtin_astype(x, float3)
+#define as_double3(x) __builtin_astype(x, double3)
+
+#define as_char4(x) __builtin_astype(x, char4)
+#define as_uchar4(x) __builtin_astype(x, uchar4)
+#define as_short4(x) __builtin_astype(x, short4)
+#define as_ushort4(x) __builtin_astype(x, ushort4)
+#define as_int4(x) __builtin_astype(x, int4)
+#define as_uint4(x) __builtin_astype(x, uint4)
+#define as_long4(x) __builtin_astype(x, long4)
+#define as_ulong4(x) __builtin_astype(x, ulong4)
+#define as_float4(x) __builtin_astype(x, float4)
+#define as_double4(x) __builtin_astype(x, double4)
+
+#define as_char8(x) __builtin_astype(x, char8)
+#define as_uchar8(x) __builtin_astype(x, uchar8)
+#define as_short8(x) __builtin_astype(x, short8)
+#define as_ushort8(x) __builtin_astype(x, ushort8)
+#define as_int8(x) __builtin_astype(x, int8)
+#define as_uint8(x) __builtin_astype(x, uint8)
+#define as_long8(x) __builtin_astype(x, long8)
+#define as_ulong8(x) __builtin_astype(x, ulong8)
+#define as_float8(x) __builtin_astype(x, float8)
+#define as_double8(x) __builtin_astype(x, double8)
+
+#define as_char16(x) __builtin_astype(x, char16)
+#define as_uchar16(x) __builtin_astype(x, uchar16)
+#define as_short16(x) __builtin_astype(x, short16)
+#define as_ushort16(x) __builtin_astype(x, ushort16)
+#define as_int16(x) __builtin_astype(x, int16)
+#define as_uint16(x) __builtin_astype(x, uint16)
+#define as_long16(x) __builtin_astype(x, long16)
+#define as_ulong16(x) __builtin_astype(x, ulong16)
+#define as_float16(x) __builtin_astype(x, float16)
+#define as_double16(x) __builtin_astype(x, double16)
+
+/*
+ * Declaration generators for the convert_<type>[_sat][_rtX]() builtins.
+ * Nothing is defined here; the macros only emit prototypes.
+ *
+ * _CLC_CONVERT_DECL declares a single overload
+ *     TO_TYPE convert_<TO_TYPE><SUFFIX>(FROM_TYPE x);
+ */
+#define _CLC_CONVERT_DECL(FROM_TYPE, TO_TYPE, SUFFIX) \
+ _CLC_OVERLOAD _CLC_DECL TO_TYPE convert_##TO_TYPE##SUFFIX(FROM_TYPE x);
+
+/* Scalar plus the 2/3/4/8/16-wide vector forms of one FROM -> TO pair. */
+#define _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, TO_TYPE, SUFFIX) \
+ _CLC_CONVERT_DECL(FROM_TYPE, TO_TYPE, SUFFIX) \
+ _CLC_CONVERT_DECL(FROM_TYPE##2, TO_TYPE##2, SUFFIX) \
+ _CLC_CONVERT_DECL(FROM_TYPE##3, TO_TYPE##3, SUFFIX) \
+ _CLC_CONVERT_DECL(FROM_TYPE##4, TO_TYPE##4, SUFFIX) \
+ _CLC_CONVERT_DECL(FROM_TYPE##8, TO_TYPE##8, SUFFIX) \
+ _CLC_CONVERT_DECL(FROM_TYPE##16, TO_TYPE##16, SUFFIX)
+
+/* One source type to every destination type except double. */
+#define _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, char, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, uchar, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, int, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, uint, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, short, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, ushort, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, long, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, ulong, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, float, SUFFIX)
+
+/* One source type to every destination type, double included. */
+#define _CLC_VECTOR_CONVERT_FROM(FROM_TYPE, SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM1(FROM_TYPE, SUFFIX) \
+ _CLC_VECTOR_CONVERT_DECL(FROM_TYPE, double, SUFFIX)
+
+/* Every non-double source type to every destination type. */
+#define _CLC_VECTOR_CONVERT_TO1(SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(char, SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(uchar, SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(int, SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(uint, SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(short, SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(ushort, SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(long, SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(ulong, SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(float, SUFFIX)
+
+/* Full source x destination matrix for one suffix. */
+#define _CLC_VECTOR_CONVERT_TO(SUFFIX) \
+ _CLC_VECTOR_CONVERT_TO1(SUFFIX) \
+ _CLC_VECTOR_CONVERT_FROM(double, SUFFIX)
+
+/* Both the saturating (_sat<ROUND>) and plain (<ROUND>) variants. */
+#define _CLC_VECTOR_CONVERT_TO_SUFFIX(ROUND) \
+ _CLC_VECTOR_CONVERT_TO(_sat##ROUND) \
+ _CLC_VECTOR_CONVERT_TO(ROUND)
+
+/* One expansion per rounding-mode suffix; the empty argument yields the
+ * unsuffixed convert_<type>() and convert_<type>_sat() forms. */
+_CLC_VECTOR_CONVERT_TO_SUFFIX(_rtn)
+_CLC_VECTOR_CONVERT_TO_SUFFIX(_rte)
+_CLC_VECTOR_CONVERT_TO_SUFFIX(_rtz)
+_CLC_VECTOR_CONVERT_TO_SUFFIX(_rtp)
+_CLC_VECTOR_CONVERT_TO_SUFFIX()
+
+/*
+ * vloadN builtins for one element type and one address space.
+ * vload2 is defined inline: it returns the pair x[2*offset], x[2*offset + 1].
+ * The 3-, 4-, 8- and 16-wide forms are only declared here.
+ */
+#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
+ _CLC_OVERLOAD _CLC_INLINE PRIM_TYPE##2 vload2(size_t offset, const ADDR_SPACE PRIM_TYPE *x) \
+ { return (PRIM_TYPE##2)(x[offset<<1] , x[1+(offset<<1)]); } \
+ _CLC_OVERLOAD _CLC_DECL PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x); \
+ _CLC_OVERLOAD _CLC_DECL PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x); \
+ _CLC_OVERLOAD _CLC_DECL PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x); \
+ _CLC_OVERLOAD _CLC_DECL PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x);
+
+/*
+ * Instantiates VLOAD_VECTORIZE for the four address spaces.
+ * The final line has no line-continuation backslash, so the macro does not
+ * depend on the following line being blank.
+ */
+#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global)
+
+/* Instantiates the vload builtins for every scalar element type. */
+#define VLOAD_TYPES() \
+ VLOAD_ADDR_SPACES(char) \
+ VLOAD_ADDR_SPACES(uchar) \
+ VLOAD_ADDR_SPACES(short) \
+ VLOAD_ADDR_SPACES(ushort) \
+ VLOAD_ADDR_SPACES(int) \
+ VLOAD_ADDR_SPACES(uint) \
+ VLOAD_ADDR_SPACES(long) \
+ VLOAD_ADDR_SPACES(ulong) \
+ VLOAD_ADDR_SPACES(float) \
+ VLOAD_ADDR_SPACES(double)
+
+VLOAD_TYPES()
+
+/*
+ * vstoreN builtins for one element type and one address space.
+ * vstore2 is defined inline: it writes vec.s0 to mem[2*offset] and vec.s1 to
+ * mem[2*offset + 1]. The 3-, 4-, 8- and 16-wide forms are only declared here.
+ *
+ * None of the macros below ends in a line-continuation backslash, so they do
+ * not depend on the following line being blank.
+ */
+#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
+ _CLC_OVERLOAD _CLC_INLINE void vstore2(PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) \
+ { mem[offset<<1] = vec.s0; mem[1+(offset<<1)] = vec.s1; } \
+ _CLC_OVERLOAD _CLC_DECL void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem); \
+ _CLC_OVERLOAD _CLC_DECL void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem); \
+ _CLC_OVERLOAD _CLC_DECL void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem); \
+ _CLC_OVERLOAD _CLC_DECL void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem);
+
+/* Instantiates VSTORE_VECTORIZE for the three writable address spaces. */
+#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \
+ VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \
+ VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \
+ VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global)
+
+/* Instantiates the vstore builtins for every scalar element type. */
+#define VSTORE_TYPES() \
+ VSTORE_ADDR_SPACES(char) \
+ VSTORE_ADDR_SPACES(uchar) \
+ VSTORE_ADDR_SPACES(short) \
+ VSTORE_ADDR_SPACES(ushort) \
+ VSTORE_ADDR_SPACES(int) \
+ VSTORE_ADDR_SPACES(uint) \
+ VSTORE_ADDR_SPACES(long) \
+ VSTORE_ADDR_SPACES(ulong) \
+ VSTORE_ADDR_SPACES(float) \
+ VSTORE_ADDR_SPACES(double)
+
+VSTORE_TYPES()
+
+#undef VLOAD_VECTORIZE
+#undef VLOAD_ADDR_SPACES
+#undef VLOAD_TYPES
+#undef VSTORE_VECTORIZE
+#undef VSTORE_ADDR_SPACES
+#undef VSTORE_TYPES
+
+/*-----------------------------------------------------------------------------
+* Relational
+*----------------------------------------------------------------------------*/
+/*
+ * bitselect(a, b, c): each result bit is taken from b where the matching bit
+ * of c is set and from a where it is clear.
+ *
+ * INLN defines the overload inline using the xor form a ^ (c & (b ^ a)):
+ * where c is 0 the masked term vanishes and a is returned; where c is 1 the
+ * expression reduces to a ^ b ^ a == b.
+ */
+#define INLN(type) \
+_CLC_OVERLOAD _CLC_INLINE type bitselect(type a, type b, type c) { return a^(c&(b^a)); }
+
+/* DECL only declares the overload; no definition is provided in this block. */
+#define DECL(type) \
+_CLC_OVERLOAD _CLC_DECL type bitselect(type a, type b, type c);
+
+INLN(char)
+INLN(uchar)
+INLN(short)
+INLN(ushort)
+INLN(int)
+INLN(uint)
+INLN(long)
+INLN(ulong)
+
+DECL(char2)
+DECL(uchar2)
+INLN(short2)
+INLN(ushort2)
+INLN(int2)
+INLN(uint2)
+DECL(long2)
+DECL(ulong2)
+
+DECL(char3)
+DECL(uchar3)
+DECL(short3)
+DECL(ushort3)
+DECL(int3)
+DECL(uint3)
+DECL(long3)
+DECL(ulong3)
+
+INLN(char4)
+INLN(uchar4)
+INLN(short4)
+INLN(ushort4)
+DECL(int4)
+DECL(uint4)
+DECL(long4)
+DECL(ulong4)
+
+INLN(char8)
+INLN(uchar8)
+DECL(short8)
+DECL(ushort8)
+DECL(int8)
+DECL(uint8)
+DECL(long8)
+DECL(ulong8)
+
+DECL(char16)
+DECL(uchar16)
+DECL(short16)
+DECL(ushort16)
+DECL(int16)
+DECL(uint16)
+DECL(long16)
+DECL(ulong16)
+
+DECL(float)
+DECL(float2)
+DECL(float3)
+DECL(float4)
+DECL(float8)
+DECL(float16)
+
+DECL(double)
+DECL(double2)
+DECL(double3)
+DECL(double4)
+DECL(double8)
+DECL(double16)
+
+#undef INLN
+#undef DECL
+
+/*
+ * Bit-field helpers for picking apart IEEE-754 values by reinterpreting them
+ * as unsigned integers.
+ *
+ * EXTU(x, l, r) shifts x left by l and then right by r, which extracts an
+ * unsigned field when x is an unsigned value. All three arguments are
+ * parenthesized so that expression arguments keep their intended grouping.
+ */
+#define EXTU(x,l,r) (((x) << (l)) >> (r))
+
+/*
+ * double helpers. They operate on the two 32-bit halves produced by
+ * as_uint2(); .hi is expected to hold the sign/exponent word (this header
+ * defines __ENDIAN_LITTLE__).
+ *
+ * NOTE(review): ANY_ZEROD and SUBNORMD are not #undef'd together with the
+ * other helpers further down, and SUBNORMD expands to EXPD/MANTD_ZERO, which
+ * are. Confirm whether any later code still relies on them.
+ */
+#define SIGND(x) (as_uint2(x).hi >> 31)
+#define EXPD(x) EXTU(as_uint2(x).hi, 1, 21)
+#define MANTD_HI(x) EXTU(as_uint2(x).hi, 12, 12)
+#define MANTD_LO(x) as_uint2(x).lo
+#define MANTD_ZERO(x) (MANTD_HI(x) == 0 && MANTD_LO(x) == 0)
+#define ANY_ZEROD(x) ((as_ulong(x) << 1) == 0)
+#define SUBNORMD(x) (EXPD(x) == 0 && !MANTD_ZERO(x))
+
+/*
+ * float helpers on the 32-bit pattern from as_uint():
+ *   FABSF - pattern with the top (sign) bit cleared
+ *   SIGNF - top bit
+ *   EXPF  - the 8 bits below the sign bit (biased exponent)
+ *   MANTF - the low 23 bits (mantissa)
+ */
+#define FABSF(x) ((as_uint(x) << 1) >> 1)
+#define SIGNF(x) (as_uint(x) >> 31)
+#define EXPF(x) ((as_uint(x) << 1) >> 24)
+#define MANTF(x) ((as_uint(x) << 9) >> 9)
+
+/* Ordered / unordered tests expressed through isnan(). */
+#define isordered(x,y) (!isnan(x) & !isnan(y))
+#define isunordered(x,y) (isnan(x) | isnan(y))
+
+/*
+ * isnan: the float form is true when the sign-stripped bit pattern is above
+ * 0x7F800000; the double form forwards to __builtin_isnan. Vector forms are
+ * declared only.
+ */
+_CLC_OVERLOAD _CLC_INLINE int isnan(float x)
+{
+    const uint magnitude_bits = FABSF(x);
+    return magnitude_bits > 0x7F800000;
+}
+UNARY_INLINE(double, int, isnan, __builtin_isnan)
+UNARY_VEC_DECL(float, int, isnan)
+UNARY_VEC_DECL(double, long, isnan)
+
+/*
+ * isfinite: the float form is true when the biased exponent field is not
+ * 255; the double form forwards to __builtin_isfinite. Vector forms are
+ * declared only.
+ */
+_CLC_OVERLOAD _CLC_INLINE int isfinite(float x)
+{
+    const uint biased_exp = EXPF(x);
+    return biased_exp != 255;
+}
+UNARY_INLINE(double, int, isfinite, __builtin_isfinite)
+UNARY_VEC_DECL(float, int, isfinite)
+UNARY_VEC_DECL(double, long, isfinite)
+
+/*
+ * isinf: the float form is true when the sign-stripped bit pattern equals
+ * 0x7F800000; the double form forwards to __builtin_isinf. Vector forms are
+ * declared only.
+ */
+_CLC_OVERLOAD _CLC_INLINE int isinf(float x)
+{
+    const uint magnitude_bits = FABSF(x);
+    return magnitude_bits == 0x7F800000;
+}
+UNARY_INLINE(double, int, isinf, __builtin_isinf)
+UNARY_VEC_DECL(float, int, isinf)
+UNARY_VEC_DECL(double, long, isinf)
+
+/*
+ * isnormal: the float form is true when the biased exponent field is neither
+ * 0 nor 255; the double form forwards to __builtin_isnormal. Vector forms are
+ * declared only.
+ */
+_CLC_OVERLOAD _CLC_INLINE int isnormal(float x)
+{
+    const uint biased_exp = EXPF(x);
+    return biased_exp != 0 && biased_exp != 255;
+}
+UNARY_INLINE(double, int, isnormal, __builtin_isnormal)
+UNARY_VEC_DECL(float, int, isnormal)
+UNARY_VEC_DECL(double, long, isnormal)
+
+/*
+ * signbit: returns the top bit of the value's bit pattern (0 or 1) for the
+ * scalar forms. Vector forms are declared only.
+ */
+_CLC_OVERLOAD _CLC_INLINE int signbit(float x)
+{
+    const uint sign = SIGNF(x);
+    return sign;
+}
+_CLC_OVERLOAD _CLC_INLINE int signbit(double x)
+{
+    const uint sign = SIGND(x);
+    return sign;
+}
+UNARY_VEC_DECL(float, int, signbit)
+UNARY_VEC_DECL(double, long, signbit)
+
+/*
+ * copysign(x, y): combines the bit pattern of x with its top bit cleared and
+ * the top bit of y. Vector forms are declared only.
+ */
+_CLC_OVERLOAD _CLC_INLINE float copysign(float x, float y)
+{
+    const uint magnitude = FABSF(x);
+    const uint sign = SIGNF(y) << 31;
+    return as_float(magnitude | sign);
+}
+
+_CLC_OVERLOAD _CLC_INLINE double copysign(double x, double y)
+{
+    const ulong magnitude = (as_ulong(x) << 1) >> 1;
+    const ulong sign = (as_ulong(y) >> 63) << 63;
+    return as_double(magnitude | sign);
+}
+
+BINARY_VEC_DECL(float, float, copysign)
+BINARY_VEC_DECL(double, double, copysign)
+
+/*
+ * isequal: the float form is the plain == comparison; the double form
+ * returns 0 outright when either operand is NaN. Vector forms are declared
+ * only.
+ */
+_CLC_OVERLOAD _CLC_INLINE int isequal(float x, float y)
+{
+    return x == y;
+}
+_CLC_OVERLOAD _CLC_INLINE int isequal(double x, double y)
+{
+    return (isnan(x) || isnan(y)) ? 0 : (x == y);
+}
+BINARY_VEC_DECL(float, int, isequal)
+BINARY_VEC_DECL(double, long, isequal)
+
+/*
+ * isnotequal: the float form is the plain != comparison; the double form
+ * returns 1 outright when either operand is NaN. Vector forms are declared
+ * only.
+ */
+_CLC_OVERLOAD _CLC_INLINE int isnotequal(float x, float y)
+{
+    return x != y;
+}
+_CLC_OVERLOAD _CLC_INLINE int isnotequal(double x, double y)
+{
+    return (isnan(x) || isnan(y)) ? 1 : (x != y);
+}
+BINARY_VEC_DECL(float, int, isnotequal)
+BINARY_VEC_DECL(double, long, isnotequal)
+
+/*
+ * isless: the float form is the plain < comparison; the double form returns
+ * 0 outright when either operand is NaN. Vector forms are declared only.
+ */
+_CLC_OVERLOAD _CLC_INLINE int isless(float x, float y)
+{
+    return x < y;
+}
+_CLC_OVERLOAD _CLC_INLINE int isless(double x, double y)
+{
+    return (isnan(x) || isnan(y)) ? 0 : (x < y);
+}
+BINARY_VEC_DECL(float, int, isless)
+BINARY_VEC_DECL(double, long, isless)
+
+/*
+ * islessequal: the float form is the plain <= comparison; the double form
+ * returns 0 outright when either operand is NaN. Vector forms are declared
+ * only.
+ */
+_CLC_OVERLOAD _CLC_INLINE int islessequal(float x, float y)
+{
+    return x <= y;
+}
+_CLC_OVERLOAD _CLC_INLINE int islessequal(double x, double y)
+{
+    return (isnan(x) || isnan(y)) ? 0 : (x <= y);
+}
+BINARY_VEC_DECL(float, int, islessequal)
+BINARY_VEC_DECL(double, long, islessequal)
+
+/*
+ * isgreater: the float form is the plain > comparison; the double form
+ * returns 0 outright when either operand is NaN. Vector forms are declared
+ * only.
+ */
+_CLC_OVERLOAD _CLC_INLINE int isgreater(float x, float y)
+{
+    return x > y;
+}
+_CLC_OVERLOAD _CLC_INLINE int isgreater(double x, double y)
+{
+    return (isnan(x) || isnan(y)) ? 0 : (x > y);
+}
+BINARY_VEC_DECL(float, int, isgreater)
+BINARY_VEC_DECL(double, long, isgreater)
+
+/*
+ * isgreaterequal: the float form is the plain >= comparison; the double form
+ * returns 0 outright when either operand is NaN. Vector forms are declared
+ * only.
+ */
+_CLC_OVERLOAD _CLC_INLINE int isgreaterequal(float x, float y)
+{
+    return x >= y;
+}
+_CLC_OVERLOAD _CLC_INLINE int isgreaterequal(double x, double y)
+{
+    return (isnan(x) || isnan(y)) ? 0 : (x >= y);
+}
+BINARY_VEC_DECL(float, int, isgreaterequal)
+BINARY_VEC_DECL(double, long, isgreaterequal)
+
+/*
+ * islessgreater: bitwise OR of isless(x, y) and isgreater(x, y). Vector
+ * forms are declared only.
+ */
+_CLC_OVERLOAD _CLC_INLINE int islessgreater(float x, float y)
+{
+    const int below = isless(x, y);
+    const int above = isgreater(x, y);
+    return below | above;
+}
+_CLC_OVERLOAD _CLC_INLINE int islessgreater(double x, double y)
+{
+    const int below = isless(x, y);
+    const int above = isgreater(x, y);
+    return below | above;
+}
+BINARY_VEC_DECL(float, int, islessgreater)
+BINARY_VEC_DECL(double, long, islessgreater)
+
+#undef EXPD
+#undef MANTD_HI
+#undef MANTD_LO
+#undef MANTD_ZERO
+#undef SIGND
+#undef FABSF
+#undef SIGNF
+#undef EXPF
+#undef MANTF
+#undef EXTU
+
+
+/*
+ * any(): the scalar form is true when x is negative; the 2-wide form is true
+ * when the OR of both components is negative. The 3-, 4-, 8- and 16-wide
+ * forms are only declared here.
+ *
+ * The macro's final line has no line-continuation backslash, so it does not
+ * depend on the following line being blank.
+ */
+#define TEMPLATE(type) \
+_CLC_OVERLOAD _CLC_INLINE int any(type x) { return x < 0; } \
+_CLC_OVERLOAD _CLC_INLINE int any(type##2 x) { return (x.s0 | x.s1) < 0; } \
+_CLC_OVERLOAD _CLC_DECL int any(type##3 x); \
+_CLC_OVERLOAD _CLC_DECL int any(type##4 x); \
+_CLC_OVERLOAD _CLC_DECL int any(type##8 x); \
+_CLC_OVERLOAD _CLC_DECL int any(type##16 x);
+
+TEMPLATE(char)
+TEMPLATE(short)
+TEMPLATE(int)
+TEMPLATE(long)
+
+#undef TEMPLATE
+
+/*
+ * all(): the scalar form is true when x is negative; the 2-wide form is true
+ * when the AND of both components is negative. The 3-, 4-, 8- and 16-wide
+ * forms are only declared here.
+ *
+ * The macro's final line has no line-continuation backslash, so it does not
+ * depend on the following line being blank.
+ */
+#define TEMPLATE(type) \
+_CLC_OVERLOAD _CLC_INLINE int all(type x) { return x < 0; } \
+_CLC_OVERLOAD _CLC_INLINE int all(type##2 x) { return (x.s0 & x.s1) < 0; } \
+_CLC_OVERLOAD _CLC_DECL int all(type##3 x); \
+_CLC_OVERLOAD _CLC_DECL int all(type##4 x); \
+_CLC_OVERLOAD _CLC_DECL int all(type##8 x); \
+_CLC_OVERLOAD _CLC_DECL int all(type##16 x);
+
+TEMPLATE(char)
+TEMPLATE(short)
+TEMPLATE(int)
+TEMPLATE(long)
+
+#undef TEMPLATE
+
+/*
+ * Scalar select(a, b, c): yields b when c is non-zero and a otherwise.
+ * One overload is generated per (value type, selector type) pair listed
+ * below.
+ */
+#define DEFINE(gentype, seltype) \
+_CLC_OVERLOAD _CLC_INLINE gentype select(gentype a, gentype b, seltype c) \
+{ \
+    if (c) return b; \
+    return a; \
+}
+
+DEFINE(char, char)
+DEFINE(char, uchar)
+DEFINE(uchar, char)
+DEFINE(uchar, uchar)
+DEFINE(short, short)
+DEFINE(short, ushort)
+DEFINE(ushort, short)
+DEFINE(ushort, ushort)
+DEFINE(int, int)
+DEFINE(int, uint)
+DEFINE(uint, int)
+DEFINE(uint, uint)
+DEFINE(long, long)
+DEFINE(long, ulong)
+DEFINE(ulong, long)
+DEFINE(ulong, ulong)
+DEFINE(float, int)
+DEFINE(float, uint)
+DEFINE(double, long)
+DEFINE(double, ulong)
+
+#undef DEFINE
+
+/*
+ * Vector select() prototypes. DECLARATION emits the signed- and
+ * unsigned-selector overloads for one value type; nothing is defined here.
+ */
+#define DECLARATION(type, itype, utype) \
+_CLC_OVERLOAD _CLC_DECL type select(type a, type b, itype c);\
+_CLC_OVERLOAD _CLC_DECL type select(type a, type b, utype c);
+
+/*
+ * Applies DECLARATION to the 2/3/4/8/16-wide vectors of one type triple.
+ * The final line has no line-continuation backslash, so the macro does not
+ * depend on the following line being blank.
+ */
+#define SELECT_EXPAND_SIZES(type,itype,utype) \
+ DECLARATION(_VEC_TYPE(type,2), _VEC_TYPE(itype,2), _VEC_TYPE(utype,2)) \
+ DECLARATION(_VEC_TYPE(type,3), _VEC_TYPE(itype,3), _VEC_TYPE(utype,3)) \
+ DECLARATION(_VEC_TYPE(type,4), _VEC_TYPE(itype,4), _VEC_TYPE(utype,4)) \
+ DECLARATION(_VEC_TYPE(type,8), _VEC_TYPE(itype,8), _VEC_TYPE(utype,8)) \
+ DECLARATION(_VEC_TYPE(type,16), _VEC_TYPE(itype,16), _VEC_TYPE(utype,16))
+
+/* Value type paired with the signed/unsigned selector types it accepts. */
+#define SELECT_EXPAND_TYPES \
+ SELECT_EXPAND_SIZES(char, char, uchar) \
+ SELECT_EXPAND_SIZES(uchar, char, uchar) \
+ SELECT_EXPAND_SIZES(short, short, ushort) \
+ SELECT_EXPAND_SIZES(ushort, short, ushort) \
+ SELECT_EXPAND_SIZES(int, int, uint) \
+ SELECT_EXPAND_SIZES(uint, int, uint) \
+ SELECT_EXPAND_SIZES(long, long, ulong) \
+ SELECT_EXPAND_SIZES(ulong, long, ulong) \
+ SELECT_EXPAND_SIZES(float, int, uint) \
+ SELECT_EXPAND_SIZES(double, long, ulong)
+
+SELECT_EXPAND_TYPES
+
+#undef DECLARATION
+#undef SELECT_EXPAND_SIZES
+#undef SELECT_EXPAND_TYPES
+
+/*-----------------------------------------------------------------------------
+* Math
+*----------------------------------------------------------------------------*/
+#define CHAR_BIT 8
+#define CHAR_MAX SCHAR_MAX
+#define CHAR_MIN SCHAR_MIN
+#define INT_MAX 2147483647
+#define INT_MIN (-2147483647 - 1)
+#define LONG_MAX 0x7fffffffffffffffL
+#define LONG_MIN (-0x7fffffffffffffffL - 1)
+#define SCHAR_MAX 127
+#define SCHAR_MIN (-127 - 1)
+#define SHRT_MAX 32767
+#define SHRT_MIN (-32767 - 1)
+#define UCHAR_MAX 255
+#define USHRT_MAX 65535
+#define UINT_MAX 0xffffffff
+#define ULONG_MAX 0xffffffffffffffffUL
+
+#define FLT_DIG 6
+#define FLT_MANT_DIG 24
+#define FLT_MAX_10_EXP +38
+#define FLT_MAX_EXP +128
+#define FLT_MIN_10_EXP -37
+#define FLT_MIN_EXP -125
+#define FLT_RADIX 2
+#define FLT_MAX 0x1.fffffep127f
+#define FLT_MIN 0x1.0p-126f
+#define FLT_EPSILON 0x1.0p-23f
+
+#define DBL_DIG 15
+#define DBL_MANT_DIG 53
+#define DBL_MAX_10_EXP +308
+#define DBL_MAX_EXP +1024
+#define DBL_MIN_10_EXP -307
+#define DBL_MIN_EXP -1021
+#define DBL_RADIX 2
+#define DBL_MAX 0x1.fffffffffffffp1023
+#define DBL_MIN 0x1.0p-1022
+#define DBL_EPSILON 0x1.0p-52
+
+#define M_E 2.7182818284590452354 /* e */
+#define M_LOG2E 1.4426950408889634074 /* log_2 e */
+#define M_LOG10E 0.43429448190325182765 /* log_10 e */
+#define M_LN2 0.69314718055994530942 /* log_e 2 */
+#define M_LN10 2.30258509299404568402 /* log_e 10 */
+#define M_PI 3.14159265358979323846 /* pi */
+#define M_PI_2 1.57079632679489661923 /* pi/2 */
+#define M_PI_4 0.78539816339744830962 /* pi/4 */
+#define M_1_PI 0.31830988618379067154 /* 1/pi */
+#define M_2_PI 0.63661977236758134308 /* 2/pi */
+#define M_2_SQRTPI 1.12837916709551257390 /* 2/sqrt(pi) */
+#define M_SQRT2 1.41421356237309504880 /* sqrt(2) */
+#define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */
+
+/*
+ * Single-precision counterparts of the M_* constants. Each literal carries an
+ * 'f' suffix so the constant has type float, as OpenCL C specifies for the
+ * M_*_F names; aliasing the double constants would silently promote float
+ * expressions that use them to double.
+ */
+#define M_E_F 2.7182818284590452354f /* e */
+#define M_LOG2E_F 1.4426950408889634074f /* log_2 e */
+#define M_LOG10E_F 0.43429448190325182765f /* log_10 e */
+#define M_LN2_F 0.69314718055994530942f /* log_e 2 */
+#define M_LN10_F 2.30258509299404568402f /* log_e 10 */
+#define M_PI_F 3.14159265358979323846f /* pi */
+#define M_PI_2_F 1.57079632679489661923f /* pi/2 */
+#define M_PI_4_F 0.78539816339744830962f /* pi/4 */
+#define M_1_PI_F 0.31830988618379067154f /* 1/pi */
+#define M_2_PI_F 0.63661977236758134308f /* 2/pi */
+#define M_2_SQRTPI_F 1.12837916709551257390f /* 2/sqrt(pi) */
+#define M_SQRT2_F 1.41421356237309504880f /* sqrt(2) */
+#define M_SQRT1_2_F 0.70710678118654752440f /* 1/sqrt(2) */
+
+#define MAXFLOAT FLT_MAX
+#define HUGE_VALF __builtin_huge_valf()
+#define INFINITY (1.0f / 0.0f)
+#define NAN (0.0f / 0.0f)
+
+#define HUGE_VAL __builtin_huge_val()
+
+#define FP_ILOGB0 (-INT_MAX)
+#define FP_ILOGBNAN (INT_MAX)
+
+/*
+ * Shorthands that declare a whole math builtin family in one line.
+ * None of these macros ends in a line-continuation backslash, so they do not
+ * depend on the following line being blank.
+ *
+ * UNARY(function): scalar float/double overloads defined inline as calls to
+ * function##f / function##d, plus prototypes for the float and double vector
+ * overloads.
+ */
+#define UNARY(function) \
+UNARY_INLINE (float, float, function, function##f)\
+UNARY_INLINE (double, double, function, function##d)\
+UNARY_VEC_DECL(float, float, function)\
+UNARY_VEC_DECL(double, double, function)
+
+/*
+ * UNARYT: like UNARY, but for a single argument type (type1), result type
+ * (type2) and explicitly named scalar implementation (op).
+ */
+#define UNARYT(type1, type2, function,op) \
+UNARY_INLINE (type1, type2, function, op)\
+UNARY_VEC_DECL(type1, type2, function)
+
+/* UNARYT_ALT expands to exactly the same declarations as UNARYT. */
+#define UNARYT_ALT(type1, type2, function, op) \
+UNARYT(type1, type2, function, op)
+
+/* BINARY(function): two-argument counterpart of UNARY. */
+#define BINARY(function) \
+BINARY_INLINE (float, float, function, function##f)\
+BINARY_INLINE (double, double, function, function##d)\
+BINARY_VEC_DECL(float, float, function)\
+BINARY_VEC_DECL(double, double, function)
+
+/* TERNARY(function): three-argument counterpart of UNARY. */
+#define TERNARY(function) \
+TERNARY_INLINE (float, float, function, function##f)\
+TERNARY_INLINE (double, double, function, function##d)\
+TERNARY_VEC_DECL(float, float, function)\
+TERNARY_VEC_DECL(double, double, function)
+
+/*-------------------------------------------------------------------------
+* Prototypes for the math builtins
+*------------------------------------------------------------------------*/
+UNARY(acos)
+UNARY(acosh)
+
+/*
+ * acospi(x) = acos(x) / pi.
+ * The float overload scales by a float constant so the multiply is not
+ * promoted to double (same form as atan2pi below).
+ */
+_CLC_OVERLOAD _CLC_INLINE float acospi(float x) { return acosf(x) * (float) M_1_PI; }
+_CLC_OVERLOAD _CLC_INLINE double acospi(double x) { return acosd(x) * M_1_PI; }
+UNARY_VEC_DECL(float, float, acospi)
+UNARY_VEC_DECL(double, double, acospi)
+
+UNARY(asin)
+UNARY(asinh)
+
+/* asinpi(x) = asin(x) / pi. */
+_CLC_OVERLOAD _CLC_INLINE float asinpi(float x) { return asinf(x) * (float) M_1_PI; }
+_CLC_OVERLOAD _CLC_INLINE double asinpi(double x) { return asind(x) * M_1_PI; }
+UNARY_VEC_DECL(float, float, asinpi)
+UNARY_VEC_DECL(double, double, asinpi)
+
+UNARY(atan)
+UNARY(atanh)
+
+/* atanpi(x) = atan(x) / pi. */
+_CLC_OVERLOAD _CLC_INLINE float atanpi(float x) { return atanf(x) * (float) M_1_PI; }
+_CLC_OVERLOAD _CLC_INLINE double atanpi(double x) { return atand(x) * M_1_PI; }
+UNARY_VEC_DECL(float, float, atanpi)
+UNARY_VEC_DECL(double, double, atanpi)
+
+BINARY(atan2)
+
+/* atan2pi(y, x) = atan2(y, x) / pi. */
+_CLC_OVERLOAD _CLC_INLINE float atan2pi(float y, float x)
+ { return atan2f(y,x) * (float) M_1_PI; }
+_CLC_OVERLOAD _CLC_INLINE double atan2pi(double y, double x)
+ { return atan2d(y,x) * M_1_PI; }
+BINARY_VEC_DECL(float, float, atan2pi)
+BINARY_VEC_DECL(double, double, atan2pi)
+
+UNARY(cbrt)
+UNARY(ceil)
+
+UNARY(cos)
+UNARY(cosh)
+
+/*
+ * cospi(x) = cos(pi * x).
+ * pi scales the ARGUMENT, not the result: cospi(1) is -1, not cos(1) * pi.
+ * The argument is scaled directly with no range reduction, so accuracy
+ * degrades for large |x|.
+ */
+_CLC_OVERLOAD _CLC_INLINE float cospi(float x) { return cosf(x * (float) M_PI); }
+_CLC_OVERLOAD _CLC_INLINE double cospi(double x) { return cosd(x * M_PI); }
+UNARY_VEC_DECL(float, float, cospi)
+UNARY_VEC_DECL(double, double, cospi)
+
+UNARY(erf)
+UNARY(erfc)
+UNARY(exp)
+UNARY(exp2)
+
+/* exp10 forwards to the builtin_exp10f / builtin_exp10 entry points. */
+UNARYT(float, float, exp10, builtin_exp10f)
+UNARYT(double, double, exp10, builtin_exp10)
+
+UNARY(expm1)
+
+/*
+ * fabs forwards to _fabsf / _fabs.
+ * These lines are plain macro invocations, not part of a #define, so they
+ * must not end in a line-continuation backslash.
+ */
+UNARY_INLINE (float, float, fabs, _fabsf)
+UNARY_INLINE (double, double, fabs, _fabs)
+UNARY_VEC_DECL(float, float, fabs)
+UNARY_VEC_DECL(double, double, fabs)
+
+BINARY(fdim)
+UNARY(floor)
+
+TERNARY(fma)
+
+BINARY(fmax)
+BINARY(fmin)
+BINARY(fmod)
+BINARY(hypot)
+
+UNARYT_ALT(float, int, ilogb, ilogbf) /* forwards to ilogbf */
+UNARYT_ALT(double, int, ilogb, ilogbd) /* forwards to ilogbd */
+
+BINARY_INLINE_ALT (float, float, int, ldexp, ldexpf) /* forwards to ldexpf; third type slot is int */
+BINARY_INLINE_ALT (double, double, int, ldexp, __builtin_ldexp) /* forwards to the compiler builtin */
+BINARY_VEC_DECL_ALT(float, float, int, ldexp)
+BINARY_VEC_DECL_ALT(double, double, int, ldexp)
+
+UNARY(lgamma)
+UNARY(log)
+UNARY(log2)
+UNARY(log10)
+UNARY(log1p)
+UNARY(logb)
+
+_CLC_OVERLOAD _CLC_INLINE float mad(float a, float b, float c) { return (a*b)+c; } /* plain multiply then add */
+_CLC_OVERLOAD _CLC_INLINE double mad(double a, double b, double c) { return (a*b)+c; } /* plain multiply then add */
+TERNARY_VEC_DECL(float, float, mad)
+TERNARY_VEC_DECL(double, double, mad)
+
+/*
+ * maxmag(x, y): returns x if |x| > |y|, y if |y| > |x|, otherwise fmax(x, y).
+ * The result is one of the operands with its sign intact, not a magnitude:
+ * maxmag(-3, 2) is -3.
+ */
+_CLC_OVERLOAD _CLC_INLINE float maxmag(float x, float y)
+{
+    float ax = fabs(x);
+    float ay = fabs(y);
+    if (ax > ay) return x;
+    if (ay > ax) return y;
+    return fmax(x, y);
+}
+_CLC_OVERLOAD _CLC_INLINE double maxmag(double x, double y)
+{
+    double ax = fabs(x);
+    double ay = fabs(y);
+    if (ax > ay) return x;
+    if (ay > ax) return y;
+    return fmax(x, y);
+}
+BINARY_VEC_DECL(float, float, maxmag)
+BINARY_VEC_DECL(double, double, maxmag)
+
+/*
+ * minmag(x, y): returns x if |x| < |y|, y if |y| < |x|, otherwise fmin(x, y).
+ * The result is one of the operands with its sign intact:
+ * minmag(-1, 2) is -1.
+ */
+_CLC_OVERLOAD _CLC_INLINE float minmag(float x, float y)
+{
+    float ax = fabs(x);
+    float ay = fabs(y);
+    if (ax < ay) return x;
+    if (ay < ax) return y;
+    return fmin(x, y);
+}
+_CLC_OVERLOAD _CLC_INLINE double minmag(double x, double y)
+{
+    double ax = fabs(x);
+    double ay = fabs(y);
+    if (ax < ay) return x;
+    if (ay < ax) return y;
+    return fmin(x, y);
+}
+BINARY_VEC_DECL(float, float, minmag)
+BINARY_VEC_DECL(double, double, minmag)
+
+/*
+ * nan(nancode): ORs the caller's code into a fixed bit pattern
+ * (0x7FC00000 for float, 0x7FF8000000000000 for double) and reinterprets
+ * the resulting integer as a floating-point value.
+ */
+_CLC_OVERLOAD _CLC_INLINE float nan(uint nancode)
+{
+    const uint base_bits = 0x7FC00000;
+    return as_float(base_bits | nancode);
+}
+_CLC_OVERLOAD _CLC_INLINE double nan(ulong nancode)
+{
+    const ulong base_bits = 0x7FF8000000000000ul;
+    return as_double(base_bits | nancode);
+}
+UNARY_VEC_DECL(uint, float, nan)
+UNARY_VEC_DECL(ulong, double, nan)
+
+BINARY(nextafter)
+BINARY(pow)
+
+/* Double-precision power routine used by the pown generators below. */
+_CLC_PROTECTED double builtin_pow(double x, double y);
+
+/* pown: integer exponent; float forwards to powf, double to builtin_pow. */
+BINARY_INLINE_ALT2 (float, float, int, pown, powf)
+BINARY_INLINE_ALT2 (double, double, int, pown, builtin_pow)
+BINARY_VEC_DECL_ALT(float, float, int, pown)
+BINARY_VEC_DECL_ALT(double, double, int, pown)
+
+/* powr: forwards straight to powf (float) and pow (double). */
+_CLC_OVERLOAD _CLC_INLINE float powr(float x, float y)
+{
+    return powf(x, y);
+}
+_CLC_OVERLOAD _CLC_INLINE double powr(double x, double y)
+{
+    return pow(x, y);
+}
+BINARY_VEC_DECL(float, float, powr)
+BINARY_VEC_DECL(double, double, powr)
+
+BINARY(remainder)
+UNARY(rint)
+
+/*
+ * rootn(a, b) is evaluated as builtin_pow(a, 1 / b).
+ * Both macro parameters are parenthesised so that an expression argument
+ * (for example n + 1) is cast and divided as a whole.
+ */
+#define builtin_rootnf(a,b) (builtin_pow((a), 1.0f / (float)(b)))
+#define builtin_rootn(a,b) (builtin_pow((a), 1.0 / (double)(b)))
+
+BINARY_INLINE_ALT2 (float, float, int, rootn, builtin_rootnf)
+BINARY_INLINE_ALT2 (double, double, int, rootn, builtin_rootn)
+BINARY_VEC_DECL_ALT(float, float, int, rootn)
+BINARY_VEC_DECL_ALT(double, double, int, rootn)
+
+UNARY(round)
+UNARY(sqrt)
+
+/* rsqrt(x): reciprocal of the square root, computed as 1 / sqrt(x). */
+_CLC_OVERLOAD _CLC_INLINE float rsqrt(float x)
+{
+    return 1.0f / sqrtf(x);
+}
+_CLC_OVERLOAD _CLC_INLINE double rsqrt(double x)
+{
+    return 1.0 / sqrt(x);
+}
+UNARY_VEC_DECL(float, float, rsqrt)
+UNARY_VEC_DECL(double, double, rsqrt)
+
+UNARY(sin)
+UNARY(sinh)
+
+/*
+ * sinpi(x) = sin(pi * x).
+ * pi scales the ARGUMENT, not the result: sinpi(0.5) is 1, not sin(0.5) * pi.
+ * The argument is scaled directly with no range reduction, so accuracy
+ * degrades for large |x|.
+ */
+_CLC_OVERLOAD _CLC_INLINE float sinpi(float x) { return sinf(x * (float) M_PI); }
+_CLC_OVERLOAD _CLC_INLINE double sinpi(double x) { return sind(x * M_PI); }
+UNARY_VEC_DECL(float, float, sinpi)
+UNARY_VEC_DECL(double, double, sinpi)
+
+UNARY(tan)
+UNARY(tanh)
+UNARY(trunc)
+
+/*
+ * tanpi(x) = tan(pi * x).
+ * pi scales the ARGUMENT, not the result: tanpi(0.25) is 1, not
+ * tan(0.25) * pi. The argument is scaled directly with no range reduction,
+ * so accuracy degrades for large |x|.
+ */
+_CLC_OVERLOAD _CLC_INLINE float tanpi(float x) { return tanf(x * (float) M_PI); }
+_CLC_OVERLOAD _CLC_INLINE double tanpi(double x) { return tand(x * M_PI); }
+UNARY_VEC_DECL(float, float, tanpi)
+UNARY_VEC_DECL(double, double, tanpi)
+
+UNARY(tgamma)
+
+/*-----------------------------------------------------------------------------
+* Native versions
+*----------------------------------------------------------------------------*/
+/*
+ * The native_* names are macro aliases for the full-precision builtins.
+ * native_log is provided alongside native_log2 / native_log10 so that every
+ * logarithm variant has a native_ spelling, mirroring the half_* list.
+ */
+#define native_sin(x) sin(x)
+#define native_cos(x) cos(x)
+#define native_tan(x) tan(x)
+#define native_powr(x,y) powr(x,y)
+#define native_exp(x) exp(x)
+#define native_exp2(x) exp2(x)
+#define native_exp10(x) exp10(x)
+#define native_log(x) log(x)
+#define native_log2(x) log2(x)
+#define native_log10(x) log10(x)
+
+/* native_divide(x, y): plain x / y. */
+_CLC_OVERLOAD _CLC_INLINE float native_divide(float x, float y) { return x/y; }
+_CLC_OVERLOAD _CLC_INLINE double native_divide(double x, double y) { return x/y; }
+BINARY_VEC_DECL(float, float, native_divide)
+BINARY_VEC_DECL(double, double, native_divide)
+
+/* native_recip(x): plain 1 / x in the operand's own type. */
+_CLC_OVERLOAD _CLC_INLINE float native_recip(float x) { return (float)1/x; }
+_CLC_OVERLOAD _CLC_INLINE double native_recip(double x) { return (double)1/x; }
+UNARY_VEC_DECL(float, float, native_recip)
+UNARY_VEC_DECL(double, double, native_recip)
+
+#define native_rsqrt(x) rsqrt(x)
+#define native_sqrt(x) sqrt(x)
+
+/*-----------------------------------------------------------------------------
+* Half versions
+*
+* Every half_* macro below forwards to the builtin of the same base name
+* (half_sin -> sin, half_powr -> powr, ...). half_divide and half_recip are
+* inline functions that perform an ordinary x/y and 1/x; their vector forms
+* are only declared here.
+*----------------------------------------------------------------------------*/
+#define half_sin(x) sin(x)
+#define half_cos(x) cos(x)
+#define half_tan(x) tan(x)
+#define half_powr(x,y) powr(x,y)
+#define half_exp(x) exp(x)
+#define half_exp2(x) exp2(x)
+#define half_exp10(x) exp10(x)
+#define half_log(x) log(x)
+#define half_log2(x) log2(x)
+#define half_log10(x) log10(x)
+
+_CLC_OVERLOAD _CLC_INLINE float half_divide(float x, float y) { return x/y; }
+_CLC_OVERLOAD _CLC_INLINE double half_divide(double x, double y) { return x/y; }
+BINARY_VEC_DECL(float, float, half_divide)
+BINARY_VEC_DECL(double, double, half_divide)
+
+_CLC_OVERLOAD _CLC_INLINE float half_recip(float x) { return (float)1/x; }
+_CLC_OVERLOAD _CLC_INLINE double half_recip(double x) { return (double)1/x; }
+UNARY_VEC_DECL(float, float, half_recip)
+UNARY_VEC_DECL(double, double, half_recip)
+
+#define half_rsqrt(x) rsqrt(x)
+#define half_sqrt(x) sqrt(x)
+
+/* Retire the prototype generators; names must match their #define exactly. */
+#undef UNARY
+#undef UNARYT
+#undef UNARYT_ALT
+#undef BINARY
+#undef TERNARY
+
+/*-----------------------------------------------------------------------------
+* Functions requiring change of pointer to address spaces
+*
+* The macros below expand to function bodies; they refer to identifiers named
+* x and ptr, which the enclosing function must provide.
+*
+* SCALAR_BODY   - calls op(x, &power) with a local temporary, copies the
+*                 temporary out through ptr and returns op's result.
+* VECTOR_BODY_N - applies op to components s0..s(N-1) of x, storing each
+*                 result in temp and each second output in element i of
+*                 itemp (addressed through a ptr_type* cast). Each size
+*                 reuses the next smaller one.
+* VECTOR_BODY   - declares temp and itemp, expands VECTOR_BODY_<num>, writes
+*                 itemp through ptr and returns temp.
+*----------------------------------------------------------------------------*/
+#define SCALAR_BODY(type, op, ptr_type) \
+{ \
+ ptr_type power; \
+ type result = op(x, &power); \
+ *ptr = power; \
+ return result; \
+} \
+
+#define VECTOR_BODY_2(op, ptr_type) \
+ temp.s0 = op(x.s0, &(((ptr_type*)&itemp)[0])); \
+ temp.s1 = op(x.s1, &(((ptr_type*)&itemp)[1])); \
+
+#define VECTOR_BODY_3(op, ptr_type) \
+ VECTOR_BODY_2(op, ptr_type) \
+ temp.s2 = op(x.s2, &(((ptr_type*)&itemp)[2])); \
+
+#define VECTOR_BODY_4(op, ptr_type) \
+ VECTOR_BODY_3(op, ptr_type) \
+ temp.s3 = op(x.s3, &(((ptr_type*)&itemp)[3])); \
+
+#define VECTOR_BODY_8(op, ptr_type) \
+ VECTOR_BODY_4(op, ptr_type) \
+ temp.s4 = op(x.s4, &(((ptr_type*)&itemp)[4])); \
+ temp.s5 = op(x.s5, &(((ptr_type*)&itemp)[5])); \
+ temp.s6 = op(x.s6, &(((ptr_type*)&itemp)[6])); \
+ temp.s7 = op(x.s7, &(((ptr_type*)&itemp)[7])); \
+
+#define VECTOR_BODY_16(op, ptr_type) \
+ VECTOR_BODY_8(op, ptr_type) \
+ temp.s8 = op(x.s8, &(((ptr_type*)&itemp)[8])); \
+ temp.s9 = op(x.s9, &(((ptr_type*)&itemp)[9])); \
+ temp.sa = op(x.sa, &(((ptr_type*)&itemp)[10])); \
+ temp.sb = op(x.sb, &(((ptr_type*)&itemp)[11])); \
+ temp.sc = op(x.sc, &(((ptr_type*)&itemp)[12])); \
+ temp.sd = op(x.sd, &(((ptr_type*)&itemp)[13])); \
+ temp.se = op(x.se, &(((ptr_type*)&itemp)[14])); \
+ temp.sf = op(x.sf, &(((ptr_type*)&itemp)[15])); \
+
+#define VECTOR_BODY(prim_type, num, op, ptr_type) \
+{ \
+ prim_type##num temp; \
+ ptr_type##num itemp; \
+ VECTOR_BODY_##num(op, ptr_type)\
+ *ptr = itemp; \
+ return temp; \
+} \
+
+_CLC_OVERLOAD _CLC_DECL float modf(float x, global float * iptr);
+_CLC_OVERLOAD _CLC_DECL float modf(float x, local float * iptr);
+_CLC_OVERLOAD _CLC_DECL float modf(float x, private float * iptr);
+
+_CLC_OVERLOAD _CLC_DECL float2 modf(float2 x, global float2 * iptr);
+_CLC_OVERLOAD _CLC_DECL float2 modf(float2 x, local float2 * iptr);
+_CLC_OVERLOAD _CLC_DECL float2 modf(float2 x, private float2 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL float3 modf(float3 x, global float3 * iptr);
+_CLC_OVERLOAD _CLC_DECL float3 modf(float3 x, local float3 * iptr);
+_CLC_OVERLOAD _CLC_DECL float3 modf(float3 x, private float3 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL float4 modf(float4 x, global float4 * iptr);
+_CLC_OVERLOAD _CLC_DECL float4 modf(float4 x, local float4 * iptr);
+_CLC_OVERLOAD _CLC_DECL float4 modf(float4 x, private float4 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL float8 modf(float8 x, global float8 * iptr);
+_CLC_OVERLOAD _CLC_DECL float8 modf(float8 x, local float8 * iptr);
+_CLC_OVERLOAD _CLC_DECL float8 modf(float8 x, private float8 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL float16 modf(float16 x, global float16 * iptr);
+_CLC_OVERLOAD _CLC_DECL float16 modf(float16 x, local float16 * iptr);
+_CLC_OVERLOAD _CLC_DECL float16 modf(float16 x, private float16 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL double modf(double x, global double * iptr);
+_CLC_OVERLOAD _CLC_DECL double modf(double x, local double * iptr);
+_CLC_OVERLOAD _CLC_DECL double modf(double x, private double * iptr);
+
+_CLC_OVERLOAD _CLC_DECL double2 modf(double2 x, global double2 * iptr);
+_CLC_OVERLOAD _CLC_DECL double2 modf(double2 x, local double2 * iptr);
+_CLC_OVERLOAD _CLC_DECL double2 modf(double2 x, private double2 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL double3 modf(double3 x, global double3 * iptr);
+_CLC_OVERLOAD _CLC_DECL double3 modf(double3 x, local double3 * iptr);
+_CLC_OVERLOAD _CLC_DECL double3 modf(double3 x, private double3 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL double4 modf(double4 x, global double4 * iptr);
+_CLC_OVERLOAD _CLC_DECL double4 modf(double4 x, local double4 * iptr);
+_CLC_OVERLOAD _CLC_DECL double4 modf(double4 x, private double4 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL double8 modf(double8 x, global double8 * iptr);
+_CLC_OVERLOAD _CLC_DECL double8 modf(double8 x, local double8 * iptr);
+_CLC_OVERLOAD _CLC_DECL double8 modf(double8 x, private double8 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL double16 modf(double16 x, global double16 * iptr);
+_CLC_OVERLOAD _CLC_DECL double16 modf(double16 x, local double16 * iptr);
+_CLC_OVERLOAD _CLC_DECL double16 modf(double16 x, private double16 * iptr);
+
+_CLC_OVERLOAD _CLC_DECL float frexp(float x, global int * ptr);
+_CLC_OVERLOAD _CLC_DECL float frexp(float x, local int * ptr);
+_CLC_OVERLOAD _CLC_DECL float frexp(float x, private int * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float2 frexp(float2 x, global int2 * ptr);
+_CLC_OVERLOAD _CLC_DECL float2 frexp(float2 x, local int2 * ptr);
+_CLC_OVERLOAD _CLC_DECL float2 frexp(float2 x, private int2 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float3 frexp(float3 x, global int3 * ptr);
+_CLC_OVERLOAD _CLC_DECL float3 frexp(float3 x, local int3 * ptr);
+_CLC_OVERLOAD _CLC_DECL float3 frexp(float3 x, private int3 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float4 frexp(float4 x, global int4 * ptr);
+_CLC_OVERLOAD _CLC_DECL float4 frexp(float4 x, local int4 * ptr);
+_CLC_OVERLOAD _CLC_DECL float4 frexp(float4 x, private int4 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float8 frexp(float8 x, global int8 * ptr);
+_CLC_OVERLOAD _CLC_DECL float8 frexp(float8 x, local int8 * ptr);
+_CLC_OVERLOAD _CLC_DECL float8 frexp(float8 x, private int8 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float16 frexp(float16 x, global int16 * ptr);
+_CLC_OVERLOAD _CLC_DECL float16 frexp(float16 x, local int16 * ptr);
+_CLC_OVERLOAD _CLC_DECL float16 frexp(float16 x, private int16 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double frexp(double x, global int * ptr);
+_CLC_OVERLOAD _CLC_DECL double frexp(double x, local int * ptr);
+_CLC_OVERLOAD _CLC_DECL double frexp(double x, private int * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double2 frexp(double2 x, global int2 * ptr);
+_CLC_OVERLOAD _CLC_DECL double2 frexp(double2 x, local int2 * ptr);
+_CLC_OVERLOAD _CLC_DECL double2 frexp(double2 x, private int2 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double3 frexp(double3 x, global int3 * ptr);
+_CLC_OVERLOAD _CLC_DECL double3 frexp(double3 x, local int3 * ptr);
+_CLC_OVERLOAD _CLC_DECL double3 frexp(double3 x, private int3 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double4 frexp(double4 x, global int4 * ptr);
+_CLC_OVERLOAD _CLC_DECL double4 frexp(double4 x, local int4 * ptr);
+_CLC_OVERLOAD _CLC_DECL double4 frexp(double4 x, private int4 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double8 frexp(double8 x, global int8 * ptr);
+_CLC_OVERLOAD _CLC_DECL double8 frexp(double8 x, local int8 * ptr);
+_CLC_OVERLOAD _CLC_DECL double8 frexp(double8 x, private int8 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double16 frexp(double16 x, global int16 * ptr);
+_CLC_OVERLOAD _CLC_DECL double16 frexp(double16 x, local int16 * ptr);
+_CLC_OVERLOAD _CLC_DECL double16 frexp(double16 x, private int16 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float lgamma_r(float x, global int * ptr);
+_CLC_OVERLOAD _CLC_DECL float lgamma_r(float x, local int * ptr);
+_CLC_OVERLOAD _CLC_DECL float lgamma_r(float x, private int * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float2 lgamma_r(float2 x, global int2 * ptr);
+_CLC_OVERLOAD _CLC_DECL float2 lgamma_r(float2 x, local int2 * ptr);
+_CLC_OVERLOAD _CLC_DECL float2 lgamma_r(float2 x, private int2 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float3 lgamma_r(float3 x, global int3 * ptr);
+_CLC_OVERLOAD _CLC_DECL float3 lgamma_r(float3 x, local int3 * ptr);
+_CLC_OVERLOAD _CLC_DECL float3 lgamma_r(float3 x, private int3 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float4 lgamma_r(float4 x, global int4 * ptr);
+_CLC_OVERLOAD _CLC_DECL float4 lgamma_r(float4 x, local int4 * ptr);
+_CLC_OVERLOAD _CLC_DECL float4 lgamma_r(float4 x, private int4 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float8 lgamma_r(float8 x, global int8 * ptr);
+_CLC_OVERLOAD _CLC_DECL float8 lgamma_r(float8 x, local int8 * ptr);
+_CLC_OVERLOAD _CLC_DECL float8 lgamma_r(float8 x, private int8 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float16 lgamma_r(float16 x, global int16 * ptr);
+_CLC_OVERLOAD _CLC_DECL float16 lgamma_r(float16 x, local int16 * ptr);
+_CLC_OVERLOAD _CLC_DECL float16 lgamma_r(float16 x, private int16 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double lgamma_r(double x, global int * ptr);
+_CLC_OVERLOAD _CLC_DECL double lgamma_r(double x, local int * ptr);
+_CLC_OVERLOAD _CLC_DECL double lgamma_r(double x, private int * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double2 lgamma_r(double2 x, global int2 * ptr);
+_CLC_OVERLOAD _CLC_DECL double2 lgamma_r(double2 x, local int2 * ptr);
+_CLC_OVERLOAD _CLC_DECL double2 lgamma_r(double2 x, private int2 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double3 lgamma_r(double3 x, global int3 * ptr);
+_CLC_OVERLOAD _CLC_DECL double3 lgamma_r(double3 x, local int3 * ptr);
+_CLC_OVERLOAD _CLC_DECL double3 lgamma_r(double3 x, private int3 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double4 lgamma_r(double4 x, global int4 * ptr);
+_CLC_OVERLOAD _CLC_DECL double4 lgamma_r(double4 x, local int4 * ptr);
+_CLC_OVERLOAD _CLC_DECL double4 lgamma_r(double4 x, private int4 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double8 lgamma_r(double8 x, global int8 * ptr);
+_CLC_OVERLOAD _CLC_DECL double8 lgamma_r(double8 x, local int8 * ptr);
+_CLC_OVERLOAD _CLC_DECL double8 lgamma_r(double8 x, private int8 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double16 lgamma_r(double16 x, global int16 * ptr);
+_CLC_OVERLOAD _CLC_DECL double16 lgamma_r(double16 x, local int16 * ptr);
+_CLC_OVERLOAD _CLC_DECL double16 lgamma_r(double16 x, private int16 * ptr);
+
+
+_CLC_OVERLOAD _CLC_DECL float fract(float x, global float * ptr);
+_CLC_OVERLOAD _CLC_DECL float fract(float x, local float * ptr);
+_CLC_OVERLOAD _CLC_DECL float fract(float x, private float * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float2 fract(float2 x, global float2 * ptr);
+_CLC_OVERLOAD _CLC_DECL float2 fract(float2 x, local float2 * ptr);
+_CLC_OVERLOAD _CLC_DECL float2 fract(float2 x, private float2 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float3 fract(float3 x, global float3 * ptr);
+_CLC_OVERLOAD _CLC_DECL float3 fract(float3 x, local float3 * ptr);
+_CLC_OVERLOAD _CLC_DECL float3 fract(float3 x, private float3 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float4 fract(float4 x, global float4 * ptr);
+_CLC_OVERLOAD _CLC_DECL float4 fract(float4 x, local float4 * ptr);
+_CLC_OVERLOAD _CLC_DECL float4 fract(float4 x, private float4 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float8 fract(float8 x, global float8 * ptr);
+_CLC_OVERLOAD _CLC_DECL float8 fract(float8 x, local float8 * ptr);
+_CLC_OVERLOAD _CLC_DECL float8 fract(float8 x, private float8 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float16 fract(float16 x, global float16 * ptr);
+_CLC_OVERLOAD _CLC_DECL float16 fract(float16 x, local float16 * ptr);
+_CLC_OVERLOAD _CLC_DECL float16 fract(float16 x, private float16 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double fract(double x, global double * ptr);
+_CLC_OVERLOAD _CLC_DECL double fract(double x, local double * ptr);
+_CLC_OVERLOAD _CLC_DECL double fract(double x, private double * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double2 fract(double2 x, global double2 * ptr);
+_CLC_OVERLOAD _CLC_DECL double2 fract(double2 x, local double2 * ptr);
+_CLC_OVERLOAD _CLC_DECL double2 fract(double2 x, private double2 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double3 fract(double3 x, global double3 * ptr);
+_CLC_OVERLOAD _CLC_DECL double3 fract(double3 x, local double3 * ptr);
+_CLC_OVERLOAD _CLC_DECL double3 fract(double3 x, private double3 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double4 fract(double4 x, global double4 * ptr);
+_CLC_OVERLOAD _CLC_DECL double4 fract(double4 x, local double4 * ptr);
+_CLC_OVERLOAD _CLC_DECL double4 fract(double4 x, private double4 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double8 fract(double8 x, global double8 * ptr);
+_CLC_OVERLOAD _CLC_DECL double8 fract(double8 x, local double8 * ptr);
+_CLC_OVERLOAD _CLC_DECL double8 fract(double8 x, private double8 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL double16 fract(double16 x, global double16 * ptr);
+_CLC_OVERLOAD _CLC_DECL double16 fract(double16 x, local double16 * ptr);
+_CLC_OVERLOAD _CLC_DECL double16 fract(double16 x, private double16 * ptr);
+
+_CLC_OVERLOAD _CLC_DECL float remquo(float x, float y, global int * quo);
+_CLC_OVERLOAD _CLC_DECL float remquo(float x, float y, local int * quo);
+_CLC_OVERLOAD _CLC_DECL float remquo(float x, float y, private int * quo);
+
+_CLC_OVERLOAD _CLC_DECL float2 remquo(float2 x, float2 y, global int2 * quo);
+_CLC_OVERLOAD _CLC_DECL float2 remquo(float2 x, float2 y, local int2 * quo);
+_CLC_OVERLOAD _CLC_DECL float2 remquo(float2 x, float2 y, private int2 * quo);
+
+_CLC_OVERLOAD _CLC_DECL float3 remquo(float3 x, float3 y, global int3 * quo);
+_CLC_OVERLOAD _CLC_DECL float3 remquo(float3 x, float3 y, local int3 * quo);
+_CLC_OVERLOAD _CLC_DECL float3 remquo(float3 x, float3 y, private int3 * quo);
+
+_CLC_OVERLOAD _CLC_DECL float4 remquo(float4 x, float4 y, global int4 * quo);
+_CLC_OVERLOAD _CLC_DECL float4 remquo(float4 x, float4 y, local int4 * quo);
+_CLC_OVERLOAD _CLC_DECL float4 remquo(float4 x, float4 y, private int4 * quo);
+
+_CLC_OVERLOAD _CLC_DECL float8 remquo(float8 x, float8 y, global int8 * quo);
+_CLC_OVERLOAD _CLC_DECL float8 remquo(float8 x, float8 y, local int8 * quo);
+_CLC_OVERLOAD _CLC_DECL float8 remquo(float8 x, float8 y, private int8 * quo);
+
+_CLC_OVERLOAD _CLC_DECL float16 remquo(float16 x, float16 y, global int16 * quo);
+_CLC_OVERLOAD _CLC_DECL float16 remquo(float16 x, float16 y, local int16 * quo);
+_CLC_OVERLOAD _CLC_DECL float16 remquo(float16 x, float16 y, private int16 * quo);
+
+_CLC_OVERLOAD _CLC_DECL double remquo(double x, double y, global int * quo);
+_CLC_OVERLOAD _CLC_DECL double remquo(double x, double y, local int * quo);
+_CLC_OVERLOAD _CLC_DECL double remquo(double x, double y, private int * quo);
+
+_CLC_OVERLOAD _CLC_DECL double2 remquo(double2 x, double2 y, global int2 * quo);
+_CLC_OVERLOAD _CLC_DECL double2 remquo(double2 x, double2 y, local int2 * quo);
+_CLC_OVERLOAD _CLC_DECL double2 remquo(double2 x, double2 y, private int2 * quo);
+
+_CLC_OVERLOAD _CLC_DECL double3 remquo(double3 x, double3 y, global int3 * quo);
+_CLC_OVERLOAD _CLC_DECL double3 remquo(double3 x, double3 y, local int3 * quo);
+_CLC_OVERLOAD _CLC_DECL double3 remquo(double3 x, double3 y, private int3 * quo);
+
+_CLC_OVERLOAD _CLC_DECL double4 remquo(double4 x, double4 y, global int4 * quo);
+_CLC_OVERLOAD _CLC_DECL double4 remquo(double4 x, double4 y, local int4 * quo);
+_CLC_OVERLOAD _CLC_DECL double4 remquo(double4 x, double4 y, private int4 * quo);
+
+_CLC_OVERLOAD _CLC_DECL double8 remquo(double8 x, double8 y, global int8 * quo);
+_CLC_OVERLOAD _CLC_DECL double8 remquo(double8 x, double8 y, local int8 * quo);
+_CLC_OVERLOAD _CLC_DECL double8 remquo(double8 x, double8 y, private int8 * quo);
+
+_CLC_OVERLOAD _CLC_DECL double16 remquo(double16 x, double16 y, global int16 * quo);
+_CLC_OVERLOAD _CLC_DECL double16 remquo(double16 x, double16 y, local int16 * quo);
+_CLC_OVERLOAD _CLC_DECL double16 remquo(double16 x, double16 y, private int16 * quo);
+
+_CLC_OVERLOAD _CLC_DECL float sincos(float x, global float * cosval);
+_CLC_OVERLOAD _CLC_DECL float sincos(float x, local float * cosval);
+_CLC_OVERLOAD _CLC_DECL float sincos(float x, private float * cosval);
+
+_CLC_OVERLOAD _CLC_DECL float2 sincos(float2 x, global float2 * cosval);
+_CLC_OVERLOAD _CLC_DECL float2 sincos(float2 x, local float2 * cosval);
+_CLC_OVERLOAD _CLC_DECL float2 sincos(float2 x, private float2 * cosval);
+
+_CLC_OVERLOAD _CLC_DECL float3 sincos(float3 x, global float3 * cosval);
+_CLC_OVERLOAD _CLC_DECL float3 sincos(float3 x, local float3 * cosval);
+_CLC_OVERLOAD _CLC_DECL float3 sincos(float3 x, private float3 * cosval);
+
+_CLC_OVERLOAD _CLC_DECL float4 sincos(float4 x, global float4 * cosval);
+_CLC_OVERLOAD _CLC_DECL float4 sincos(float4 x, local float4 * cosval);
+_CLC_OVERLOAD _CLC_DECL float4 sincos(float4 x, private float4 * cosval);
+
+_CLC_OVERLOAD _CLC_DECL float8 sincos(float8 x, global float8 * cosval);
+_CLC_OVERLOAD _CLC_DECL float8 sincos(float8 x, local float8 * cosval);
+_CLC_OVERLOAD _CLC_DECL float8 sincos(float8 x, private float8 * cosval);
+
+_CLC_OVERLOAD _CLC_DECL float16 sincos(float16 x, global float16 * cosval);
+_CLC_OVERLOAD _CLC_DECL float16 sincos(float16 x, local float16 * cosval);
+_CLC_OVERLOAD _CLC_DECL float16 sincos(float16 x, private float16 * cosval);
+
+_CLC_OVERLOAD _CLC_DECL double sincos(double x, global double * cosval);
+_CLC_OVERLOAD _CLC_DECL double sincos(double x, local double * cosval);
+_CLC_OVERLOAD _CLC_DECL double sincos(double x, private double * cosval);
+
+_CLC_OVERLOAD _CLC_DECL double2 sincos(double2 x, global double2 * cosval);
+_CLC_OVERLOAD _CLC_DECL double2 sincos(double2 x, local double2 * cosval);
+_CLC_OVERLOAD _CLC_DECL double2 sincos(double2 x, private double2 * cosval);
+
+_CLC_OVERLOAD _CLC_DECL double3 sincos(double3 x, global double3 * cosval);
+_CLC_OVERLOAD _CLC_DECL double3 sincos(double3 x, local double3 * cosval);
+_CLC_OVERLOAD _CLC_DECL double3 sincos(double3 x, private double3 * cosval);
+
+_CLC_OVERLOAD _CLC_DECL double4 sincos(double4 x, global double4 * cosval);
+_CLC_OVERLOAD _CLC_DECL double4 sincos(double4 x, local double4 * cosval);
+_CLC_OVERLOAD _CLC_DECL double4 sincos(double4 x, private double4 * cosval);
+
+_CLC_OVERLOAD _CLC_DECL double8 sincos(double8 x, global double8 * cosval);
+_CLC_OVERLOAD _CLC_DECL double8 sincos(double8 x, local double8 * cosval);
+_CLC_OVERLOAD _CLC_DECL double8 sincos(double8 x, private double8 * cosval);
+
+_CLC_OVERLOAD _CLC_DECL double16 sincos(double16 x, global double16 * cosval);
+_CLC_OVERLOAD _CLC_DECL double16 sincos(double16 x, local double16 * cosval);
+_CLC_OVERLOAD _CLC_DECL double16 sincos(double16 x, private double16 * cosval);
+
+/*-----------------------------------------------------------------------------
+* Integer
+*----------------------------------------------------------------------------*/
+/*
+ * hadd / rhadd.
+ * Scalars are defined inline; vector widths 2..16 are only declared.
+ * Both halve each operand first and then add a carry term built from the
+ * operands' low bits:
+ *   hadd  adds 1 only when both low bits are set,
+ *   rhadd adds 1 when either low bit is set.
+ * _EXPAND_INTEGER_TYPES() drives EXPAND_SIZES.
+ */
+#define SCALAR(T) \
+    _CLC_OVERLOAD _CLC_INLINE T hadd(T x, T y) \
+    { return (x >> 1) + (y >> 1) + ((x & y) & 1); } \
+    _CLC_OVERLOAD _CLC_INLINE T rhadd(T x, T y) \
+    { return (x >> 1) + (y >> 1) + ((x | y) & 1); }
+
+#define TEMPLATE(T) \
+    _CLC_OVERLOAD _CLC_DECL T hadd(T x1, T x2); \
+    _CLC_OVERLOAD _CLC_DECL T rhadd(T x1, T x2);
+
+#define EXPAND_SIZES(base) \
+    SCALAR(base) \
+    TEMPLATE(_VEC_TYPE(base,2)) \
+    TEMPLATE(_VEC_TYPE(base,3)) \
+    TEMPLATE(_VEC_TYPE(base,4)) \
+    TEMPLATE(_VEC_TYPE(base,8)) \
+    TEMPLATE(_VEC_TYPE(base,16))
+
+_EXPAND_INTEGER_TYPES()
+
+#undef EXPAND_SIZES
+#undef SCALAR
+#undef TEMPLATE
+
+/*
+ * clamp.
+ * The scalar overload is defined inline: maxval when x exceeds it, otherwise
+ * minval when x is below it, otherwise x. Vector widths are only declared,
+ * once with vector bounds and once with scalar bounds.
+ * _EXPAND_TYPES() drives EXPAND_SIZES.
+ */
+#define SCALAR_IMPLEMENTATION(T) \
+    _CLC_OVERLOAD _CLC_INLINE T clamp(T x, T minval, T maxval) \
+    { return (x > maxval) ? maxval : ((x < minval) ? minval : x); }
+
+#define DECLARATION(V, S) \
+    _CLC_OVERLOAD _CLC_DECL V clamp(V x, V minval, V maxval); \
+    _CLC_OVERLOAD _CLC_DECL V clamp(V x, S minval, S maxval);
+
+#define EXPAND_SIZES(base) \
+    SCALAR_IMPLEMENTATION(base) \
+    DECLARATION(_VEC_TYPE(base,2), base) \
+    DECLARATION(_VEC_TYPE(base,3), base) \
+    DECLARATION(_VEC_TYPE(base,4), base) \
+    DECLARATION(_VEC_TYPE(base,8), base) \
+    DECLARATION(_VEC_TYPE(base,16), base)
+
+_EXPAND_TYPES()
+
+#undef EXPAND_SIZES
+#undef IMPLEMENTATION
+#undef DECLARATION
+#undef SCALAR_IMPLEMENTATION
+
+/*
+ * min / max.
+ * Scalars and 2-wide vectors are defined inline; widths 3..16 are only
+ * declared. Each vector width has a (vector, vector) and a (vector, scalar)
+ * form; the inline scalar-operand form casts the scalar to the vector type
+ * before comparing. _EXPAND_TYPES() drives EXPAND_SIZES.
+ */
+#define SCALAR_IMPLEMENTATION(T) \
+    _CLC_OVERLOAD _CLC_INLINE T min(T x, T y) \
+    { return (y < x) ? y : x; } \
+    _CLC_OVERLOAD _CLC_INLINE T max(T x, T y) \
+    { return (y > x) ? y : x; }
+
+#define IMPLEMENTATION(V, S) \
+    _CLC_OVERLOAD _CLC_INLINE V min(V x, V y) \
+    { return (y < x) ? y : x; } \
+    _CLC_OVERLOAD _CLC_INLINE V min(V x, S y) \
+    { return ((V)y < x) ? (V)y : x; } \
+    _CLC_OVERLOAD _CLC_INLINE V max(V x, V y) \
+    { return (y > x) ? y : x; } \
+    _CLC_OVERLOAD _CLC_INLINE V max(V x, S y) \
+    { return ((V)y > x) ? (V)y : x; }
+
+#define DECLARATION(V, S) \
+    _CLC_OVERLOAD _CLC_DECL V min(V x, V y); \
+    _CLC_OVERLOAD _CLC_DECL V min(V x, S y); \
+    _CLC_OVERLOAD _CLC_DECL V max(V x, V y); \
+    _CLC_OVERLOAD _CLC_DECL V max(V x, S y);
+
+#define EXPAND_SIZES(base) \
+    SCALAR_IMPLEMENTATION(base) \
+    IMPLEMENTATION(_VEC_TYPE(base,2), base) \
+    DECLARATION(_VEC_TYPE(base,3), base) \
+    DECLARATION(_VEC_TYPE(base,4), base) \
+    DECLARATION(_VEC_TYPE(base,8), base) \
+    DECLARATION(_VEC_TYPE(base,16), base)
+
+_EXPAND_TYPES()
+
+#undef EXPAND_SIZES
+#undef DECLARATION
+#undef IMPLEMENTATION
+#undef SCALAR_IMPLEMENTATION
+
+/*
+ * mix(x, y, a) = x + (y - x) * a.
+ * Scalars and 2-wide vectors are defined inline; widths 3..16 are only
+ * declared. Each vector width also has a form taking a scalar a; the inline
+ * one casts a to the vector type. Expanded for float and double.
+ */
+#define SCALAR_IMPLEMENTATION(T) \
+    _CLC_OVERLOAD _CLC_INLINE T mix(T x, T y, T a) \
+    { return x + (y - x) * a; }
+
+#define IMPLEMENTATION(V, S) \
+    _CLC_OVERLOAD _CLC_INLINE V mix(V x, V y, V a) \
+    { return x + (y - x) * a; } \
+    _CLC_OVERLOAD _CLC_INLINE V mix(V x, V y, S a) \
+    { return x + (y - x) * (V)a; }
+
+#define DECLARATION(V, S) \
+    _CLC_OVERLOAD _CLC_DECL V mix(V x, V y, V a); \
+    _CLC_OVERLOAD _CLC_DECL V mix(V x, V y, S a);
+
+#define EXPAND_SIZES(base) \
+    SCALAR_IMPLEMENTATION(base) \
+    IMPLEMENTATION(_VEC_TYPE(base,2), base) \
+    DECLARATION(_VEC_TYPE(base,3), base) \
+    DECLARATION(_VEC_TYPE(base,4), base) \
+    DECLARATION(_VEC_TYPE(base,8), base) \
+    DECLARATION(_VEC_TYPE(base,16), base)
+
+EXPAND_SIZES(float)
+EXPAND_SIZES(double)
+
+#undef EXPAND_SIZES
+#undef DECLARATION
+#undef IMPLEMENTATION
+#undef SCALAR_IMPLEMENTATION
+
+/*
+ * abs_diff: the result type is the unsigned counterpart of the operand type.
+ * Vector widths 2..16 are declared for every integer type. Scalars are
+ * defined inline as "larger minus smaller", except int and long, which are
+ * only declared here.
+ */
+#define TEMPLATE(V, UV) \
+    _CLC_OVERLOAD _CLC_DECL UV abs_diff(V x, V y);
+
+#define EXPAND_SIZES(base, ubase) \
+    TEMPLATE(_VEC_TYPE(base,2), _VEC_TYPE(ubase,2)) \
+    TEMPLATE(_VEC_TYPE(base,3), _VEC_TYPE(ubase,3)) \
+    TEMPLATE(_VEC_TYPE(base,4), _VEC_TYPE(ubase,4)) \
+    TEMPLATE(_VEC_TYPE(base,8), _VEC_TYPE(ubase,8)) \
+    TEMPLATE(_VEC_TYPE(base,16), _VEC_TYPE(ubase,16))
+
+EXPAND_SIZES(char, uchar)
+EXPAND_SIZES(uchar, uchar)
+EXPAND_SIZES(short, ushort)
+EXPAND_SIZES(ushort, ushort)
+EXPAND_SIZES(int, uint)
+EXPAND_SIZES(uint, uint)
+EXPAND_SIZES(long, ulong)
+EXPAND_SIZES(ulong, ulong)
+
+#define ABS_DIFF_SCALAR(T, UT) \
+    _CLC_OVERLOAD _CLC_INLINE UT abs_diff (T x, T y) \
+    { return (x > y) ? (x - y) : (y - x); }
+
+ABS_DIFF_SCALAR(char, uchar)
+ABS_DIFF_SCALAR(uchar, uchar)
+ABS_DIFF_SCALAR(short, ushort)
+ABS_DIFF_SCALAR(ushort, ushort)
+ABS_DIFF_SCALAR(uint, uint)
+ABS_DIFF_SCALAR(ulong, ulong)
+
+#undef ABS_DIFF_SCALAR
+
+_CLC_OVERLOAD _CLC_DECL uint abs_diff(int x, int y);
+_CLC_OVERLOAD _CLC_DECL ulong abs_diff(long x, long y);
+
+#undef EXPAND_SIZES
+#undef TEMPLATE
+
+#define mad_hi(a, b, c) (mul_hi((a),(b))+(c)) /* mul_hi(a, b) plus c */
+#define mul24(a, b) ((a)*(b)) /* ordinary multiply; no 24-bit narrowing is applied here */
+#define mad24(a, b, c) (((a)*(b))+(c)) /* ordinary multiply-add; no 24-bit narrowing is applied here */
+
+/*-----------------------------------------------------------------------------
+* Common
+*----------------------------------------------------------------------------*/
+/*
+ * degrees / radians.
+ * Scalars and 2-wide vectors are defined inline; widths 3..16 are only
+ * declared. Expanded for float and double.
+ *   degrees(r) = r * 180 * (1/pi)
+ *   radians(d) = d * pi / 180
+ */
+#define IMPLEMENTATION(T) \
+    _CLC_OVERLOAD _CLC_INLINE T degrees(T radians) \
+    { return radians * (T)180.0 * (T)M_1_PI; } \
+    _CLC_OVERLOAD _CLC_INLINE T radians(T degrees) \
+    { return degrees * (T)M_PI / (T)180.0; }
+
+#define DECLARATION(T) \
+    _CLC_OVERLOAD _CLC_DECL T degrees(T radians); \
+    _CLC_OVERLOAD _CLC_DECL T radians(T degrees);
+
+#define EXPAND_SIZES(base) \
+    IMPLEMENTATION(base) \
+    IMPLEMENTATION(_VEC_TYPE(base,2)) \
+    DECLARATION(_VEC_TYPE(base,3)) \
+    DECLARATION(_VEC_TYPE(base,4)) \
+    DECLARATION(_VEC_TYPE(base,8)) \
+    DECLARATION(_VEC_TYPE(base,16))
+
+EXPAND_SIZES(float)
+EXPAND_SIZES(double)
+
+#undef EXPAND_SIZES
+#undef DECLARATION
+#undef IMPLEMENTATION
+
+/*
+ * step(edge, x): 0.0 where x < edge, 1.0 otherwise.
+ * Scalars and 2-wide vectors are defined inline; widths 3..16 are only
+ * declared. Each vector width also has a form taking a scalar edge; the
+ * inline one casts edge to the vector type. Expanded for float and double.
+ */
+#define SCALAR_IMPLEMENTATION(T) \
+    _CLC_OVERLOAD _CLC_INLINE T step(T edge, T x) \
+    { return (x < edge) ? 0.0 : 1.0; }
+
+#define IMPLEMENTATION(V, S) \
+    _CLC_OVERLOAD _CLC_INLINE V step(V edge, V x) \
+    { return (x < edge) ? (V)0.0 : (V)1.0; } \
+    _CLC_OVERLOAD _CLC_INLINE V step(S edge, V x) \
+    { return (x < (V)edge) ? (V)0.0 : (V)1.0; }
+
+#define DECLARATION(V, S) \
+    _CLC_OVERLOAD _CLC_DECL V step(V edge, V x); \
+    _CLC_OVERLOAD _CLC_DECL V step(S edge, V x);
+
+#define EXPAND_SIZES(base) \
+    SCALAR_IMPLEMENTATION(base) \
+    IMPLEMENTATION(_VEC_TYPE(base,2), base) \
+    DECLARATION(_VEC_TYPE(base,3), base) \
+    DECLARATION(_VEC_TYPE(base,4), base) \
+    DECLARATION(_VEC_TYPE(base,8), base) \
+    DECLARATION(_VEC_TYPE(base,16), base)
+
+EXPAND_SIZES(float)
+EXPAND_SIZES(double)
+
+#undef EXPAND_SIZES
+#undef DECLARATION
+#undef IMPLEMENTATION
+#undef SCALAR_IMPLEMENTATION
+
+_CLC_OVERLOAD _CLC_DECL float smoothstep(float edge0, float edge1, float x); /* float: edges and x of the same type; prototypes only */
+_CLC_OVERLOAD _CLC_DECL float2 smoothstep(float2 edge0, float2 edge1,
+ float2 x);
+_CLC_OVERLOAD _CLC_DECL float3 smoothstep(float3 edge0, float3 edge1,
+ float3 x);
+_CLC_OVERLOAD _CLC_DECL float4 smoothstep(float4 edge0, float4 edge1,
+ float4 x);
+_CLC_OVERLOAD _CLC_DECL float8 smoothstep(float8 edge0, float8 edge1,
+ float8 x);
+_CLC_OVERLOAD _CLC_DECL float16 smoothstep(float16 edge0, float16 edge1,
+ float16 x);
+/* float: scalar edges with a vector x */
+_CLC_OVERLOAD _CLC_DECL float2 smoothstep(float edge0, float edge1, float2 x);
+_CLC_OVERLOAD _CLC_DECL float3 smoothstep(float edge0, float edge1, float3 x);
+_CLC_OVERLOAD _CLC_DECL float4 smoothstep(float edge0, float edge1, float4 x);
+_CLC_OVERLOAD _CLC_DECL float8 smoothstep(float edge0, float edge1, float8 x);
+_CLC_OVERLOAD _CLC_DECL float16 smoothstep(float edge0, float edge1, float16 x);
+/* double: edges and x of the same type */
+_CLC_OVERLOAD _CLC_DECL double smoothstep(double edge0, double edge1, double x);
+_CLC_OVERLOAD _CLC_DECL double2 smoothstep(double2 edge0, double2 edge1,
+ double2 x);
+_CLC_OVERLOAD _CLC_DECL double3 smoothstep(double3 edge0, double3 edge1,
+ double3 x);
+_CLC_OVERLOAD _CLC_DECL double4 smoothstep(double4 edge0, double4 edge1,
+ double4 x);
+_CLC_OVERLOAD _CLC_DECL double8 smoothstep(double8 edge0, double8 edge1,
+ double8 x);
+_CLC_OVERLOAD _CLC_DECL double16 smoothstep(double16 edge0, double16 edge1,
+ double16 x);
+/* double: scalar edges with a vector x */
+_CLC_OVERLOAD _CLC_DECL double2 smoothstep(double edge0, double edge1,
+ double2 x);
+_CLC_OVERLOAD _CLC_DECL double3 smoothstep(double edge0, double edge1,
+ double3 x);
+_CLC_OVERLOAD _CLC_DECL double4 smoothstep(double edge0, double edge1,
+ double4 x);
+_CLC_OVERLOAD _CLC_DECL double8 smoothstep(double edge0, double edge1,
+ double8 x);
+_CLC_OVERLOAD _CLC_DECL double16 smoothstep(double edge0, double edge1,
+ double16 x);
+
+#define EXPAND_SIZES(type) /* sign(): scalar and 2-wide are inline; 3/4/8/16-wide are prototypes */ \
+ IMPLEMENTATION(type) \
+ IMPLEMENTATION(_VEC_TYPE(type,2)) \
+ DECLARATION(_VEC_TYPE(type,3)) \
+ DECLARATION(_VEC_TYPE(type,4)) \
+ DECLARATION(_VEC_TYPE(type,8)) \
+ DECLARATION(_VEC_TYPE(type,16)) \
+
+#define DECLARATION(gentype) \
+_CLC_OVERLOAD _CLC_DECL gentype sign(gentype x); \
+
+#define IMPLEMENTATION(gentype) /* 1 if x > 0, -1 if x < 0, 0 for NaN, otherwise x itself (so a zero keeps its sign) */ \
+_CLC_OVERLOAD _CLC_INLINE gentype sign(gentype x) \
+{ return x > (gentype)0.0 ? (gentype) 1.0 : \
+ x < (gentype)0.0 ? (gentype)-1.0 : \
+ isnan(x) ? (gentype) 0.0 : x; } \
+
+EXPAND_SIZES(float)
+EXPAND_SIZES(double)
+
+#undef EXPAND_SIZES
+#undef DECLARATION
+#undef IMPLEMENTATION
+
+/*-----------------------------------------------------------------------------
+* Geometric
+*----------------------------------------------------------------------------*/
+_CLC_OVERLOAD _CLC_INLINE float dot(float p0, float p1) {return p0*p1;} /* scalar: plain product */
+_CLC_OVERLOAD _CLC_INLINE float dot(float2 p0, float2 p1) {return p0.x*p1.x+p0.y*p1.y;} /* sum of the component products */
+_CLC_OVERLOAD _CLC_DECL float dot(float3 p0, float3 p1);
+_CLC_OVERLOAD _CLC_DECL float dot(float4 p0, float4 p1);
+_CLC_OVERLOAD _CLC_INLINE double dot(double p0, double p1) {return p0*p1;}
+_CLC_OVERLOAD _CLC_INLINE double dot(double2 p0, double2 p1) {return p0.x*p1.x+p0.y*p1.y;}
+_CLC_OVERLOAD _CLC_DECL double dot(double3 p0, double3 p1) ;
+_CLC_OVERLOAD _CLC_DECL double dot(double4 p0, double4 p1) ;
+/* cross: prototypes only, for 3- and 4-component float/double vectors */
+_CLC_OVERLOAD _CLC_DECL float3 cross(float3 p0, float3 p1);
+_CLC_OVERLOAD _CLC_DECL float4 cross(float4 p0, float4 p1);
+_CLC_OVERLOAD _CLC_DECL double3 cross(double3 p0, double3 p1);
+_CLC_OVERLOAD _CLC_DECL double4 cross(double4 p0, double4 p1);
+
+_CLC_OVERLOAD _CLC_INLINE float length(float p) {return fabs(p);} /* scalar length and fast_length are both fabs(p) */
+_CLC_OVERLOAD _CLC_INLINE double length(double p) {return fabs(p);}
+_CLC_OVERLOAD _CLC_INLINE float fast_length(float p) {return fabs(p);}
+_CLC_OVERLOAD _CLC_INLINE double fast_length(double p) {return fabs(p);}
+/* length of 2/3/4-component vectors: prototypes only */
+_CLC_OVERLOAD _CLC_DECL float length(float2 p);
+_CLC_OVERLOAD _CLC_DECL float length(float3 p);
+_CLC_OVERLOAD _CLC_DECL float length(float4 p);
+_CLC_OVERLOAD _CLC_DECL double length(double2 p);
+_CLC_OVERLOAD _CLC_DECL double length(double3 p);
+_CLC_OVERLOAD _CLC_DECL double length(double4 p);
+/* fast_length: same overload set as length, prototypes only */
+_CLC_OVERLOAD _CLC_DECL float fast_length(float2 p);
+_CLC_OVERLOAD _CLC_DECL float fast_length(float3 p);
+_CLC_OVERLOAD _CLC_DECL float fast_length(float4 p);
+_CLC_OVERLOAD _CLC_DECL double fast_length(double2 p);
+_CLC_OVERLOAD _CLC_DECL double fast_length(double3 p);
+_CLC_OVERLOAD _CLC_DECL double fast_length(double4 p);
+
+_CLC_OVERLOAD _CLC_INLINE float distance(float p0, float p1) { float delta = p1 - p0; return fabs(delta); } /* scalar: |p1 - p0| */
+_CLC_OVERLOAD _CLC_INLINE float distance(float2 p0, float2 p1) { float2 delta = p1 - p0; return length(delta); } /* vectors: length of the difference */
+_CLC_OVERLOAD _CLC_INLINE float distance(float3 p0, float3 p1) { float3 delta = p1 - p0; return length(delta); }
+_CLC_OVERLOAD _CLC_INLINE float distance(float4 p0, float4 p1) { float4 delta = p1 - p0; return length(delta); }
+_CLC_OVERLOAD _CLC_INLINE double distance(double p0, double p1) { double delta = p1 - p0; return fabs(delta); }
+_CLC_OVERLOAD _CLC_INLINE double distance(double2 p0, double2 p1) { double2 delta = p1 - p0; return length(delta); }
+_CLC_OVERLOAD _CLC_INLINE double distance(double3 p0, double3 p1) { double3 delta = p1 - p0; return length(delta); }
+_CLC_OVERLOAD _CLC_INLINE double distance(double4 p0, double4 p1) { double4 delta = p1 - p0; return length(delta); }
+
+_CLC_OVERLOAD _CLC_INLINE float fast_distance(float p0, float p1) { float delta = p1 - p0; return fabs(delta); } /* scalar: |p1 - p0| */
+_CLC_OVERLOAD _CLC_INLINE float fast_distance(float2 p0, float2 p1) { float2 delta = p1 - p0; return fast_length(delta); } /* vectors: fast_length of the difference */
+_CLC_OVERLOAD _CLC_INLINE float fast_distance(float3 p0, float3 p1) { float3 delta = p1 - p0; return fast_length(delta); }
+_CLC_OVERLOAD _CLC_INLINE float fast_distance(float4 p0, float4 p1) { float4 delta = p1 - p0; return fast_length(delta); }
+_CLC_OVERLOAD _CLC_INLINE double fast_distance(double p0, double p1) { double delta = p1 - p0; return fabs(delta); }
+_CLC_OVERLOAD _CLC_INLINE double fast_distance(double2 p0, double2 p1) { double2 delta = p1 - p0; return fast_length(delta); }
+_CLC_OVERLOAD _CLC_INLINE double fast_distance(double3 p0, double3 p1) { double3 delta = p1 - p0; return fast_length(delta); }
+_CLC_OVERLOAD _CLC_INLINE double fast_distance(double4 p0, double4 p1) { double4 delta = p1 - p0; return fast_length(delta); }
+
+_CLC_OVERLOAD _CLC_INLINE float normalize(float p) /* scalar: 1 for positive, -1 for negative, 0 otherwise */
+{ if (p > 0.0f) return 1.0f; if (p < 0.0f) return -1.0f; return 0.0f; }
+
+_CLC_OVERLOAD _CLC_INLINE double normalize(double p)
+{ if (p > 0.0) return 1.0; if (p < 0.0) return -1.0; return 0.0; }
+/* fast_normalize on scalars uses the same three-way test */
+_CLC_OVERLOAD _CLC_INLINE float fast_normalize(float p)
+{ if (p > 0.0f) return 1.0f; if (p < 0.0f) return -1.0f; return 0.0f; }
+
+_CLC_OVERLOAD _CLC_INLINE double fast_normalize(double p)
+{ if (p > 0.0) return 1.0; if (p < 0.0) return -1.0; return 0.0; }
+
+_CLC_OVERLOAD _CLC_INLINE float2 normalize(float2 p) { float len = length(p); return p / len; } /* vector divided by its length; no zero-length guard */
+_CLC_OVERLOAD _CLC_INLINE float3 normalize(float3 p) { float len = length(p); return p / len; }
+_CLC_OVERLOAD _CLC_INLINE float4 normalize(float4 p) { float len = length(p); return p / len; }
+_CLC_OVERLOAD _CLC_INLINE double2 normalize(double2 p) { double len = length(p); return p / len; }
+_CLC_OVERLOAD _CLC_INLINE double3 normalize(double3 p) { double len = length(p); return p / len; }
+_CLC_OVERLOAD _CLC_INLINE double4 normalize(double4 p) { double len = length(p); return p / len; }
+
+_CLC_OVERLOAD _CLC_INLINE float2 fast_normalize(float2 p) { float len = fast_length(p); return p / len; } /* vector divided by its fast_length; no zero-length guard */
+_CLC_OVERLOAD _CLC_INLINE float3 fast_normalize(float3 p) { float len = fast_length(p); return p / len; }
+_CLC_OVERLOAD _CLC_INLINE float4 fast_normalize(float4 p) { float len = fast_length(p); return p / len; }
+_CLC_OVERLOAD _CLC_INLINE double2 fast_normalize(double2 p) { double len = fast_length(p); return p / len; }
+_CLC_OVERLOAD _CLC_INLINE double3 fast_normalize(double3 p) { double len = fast_length(p); return p / len; }
+_CLC_OVERLOAD _CLC_INLINE double4 fast_normalize(double4 p) { double len = fast_length(p); return p / len; }
+
+/*-----------------------------------------------------------------------------
+* Atomics (prototypes only): int/uint via global and local pointers; xchg also float
+*----------------------------------------------------------------------------*/
+_CLC_OVERLOAD _CLC_DECL int atomic_add(volatile global int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_add(volatile global uint* p, uint val);
+_CLC_OVERLOAD _CLC_DECL int atomic_add(volatile local int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_add(volatile local uint* p, uint val);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_sub(volatile global int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_sub(volatile global uint* p, uint val);
+_CLC_OVERLOAD _CLC_DECL int atomic_sub(volatile local int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_sub(volatile local uint* p, uint val);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_xchg(volatile global int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_xchg(volatile global uint* p, uint val);
+_CLC_OVERLOAD _CLC_DECL float atomic_xchg(volatile global float* p, float val);
+_CLC_OVERLOAD _CLC_DECL int atomic_xchg(volatile local int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_xchg(volatile local uint* p, uint val);
+_CLC_OVERLOAD _CLC_DECL float atomic_xchg(volatile local float* p, float val);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_inc(volatile global int* p);
+_CLC_OVERLOAD _CLC_DECL uint atomic_inc(volatile global uint* p);
+_CLC_OVERLOAD _CLC_DECL int atomic_inc(volatile local int* p);
+_CLC_OVERLOAD _CLC_DECL uint atomic_inc(volatile local uint* p);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_dec(volatile global int* p);
+_CLC_OVERLOAD _CLC_DECL uint atomic_dec(volatile global uint* p);
+_CLC_OVERLOAD _CLC_DECL int atomic_dec(volatile local int* p);
+_CLC_OVERLOAD _CLC_DECL uint atomic_dec(volatile local uint* p);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_cmpxchg(volatile global int* p, int cmp, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_cmpxchg(volatile global uint* p, uint cmp, uint val);
+_CLC_OVERLOAD _CLC_DECL int atomic_cmpxchg(volatile local int* p, int cmp, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_cmpxchg(volatile local uint* p, uint cmp, uint val);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_min(volatile global int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_min(volatile global uint* p, uint val);
+_CLC_OVERLOAD _CLC_DECL int atomic_min(volatile local int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_min(volatile local uint* p, uint val);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_max(volatile global int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_max(volatile global uint* p, uint val);
+_CLC_OVERLOAD _CLC_DECL int atomic_max(volatile local int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_max(volatile local uint* p, uint val);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_and(volatile global int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_and(volatile global uint* p, uint val);
+_CLC_OVERLOAD _CLC_DECL int atomic_and(volatile local int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_and(volatile local uint* p, uint val);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_or(volatile global int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_or(volatile global uint* p, uint val);
+_CLC_OVERLOAD _CLC_DECL int atomic_or(volatile local int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_or(volatile local uint* p, uint val);
+
+_CLC_OVERLOAD _CLC_DECL int atomic_xor(volatile global int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_xor(volatile global uint* p, uint val);
+_CLC_OVERLOAD _CLC_DECL int atomic_xor(volatile local int* p, int val);
+_CLC_OVERLOAD _CLC_DECL uint atomic_xor(volatile local uint* p, uint val);
+
+#define atom_add atomic_add /* each atom_* name is a textual alias for the matching atomic_* function */
+#define atom_sub atomic_sub
+#define atom_xchg atomic_xchg
+#define atom_inc atomic_inc
+#define atom_dec atomic_dec
+#define atom_cmpxchg atomic_cmpxchg
+#define atom_min atomic_min
+#define atom_max atomic_max
+#define atom_and atomic_and
+#define atom_or atomic_or
+#define atom_xor atomic_xor
+
+#define TEMPLATE2(res_elemt, val_vnum, mask_elemt) /* shuffle/shuffle2 prototypes: 2-wide result from val_vnum-wide input(s) */ \
+_CLC_OVERLOAD _CLC_DEF res_elemt##2 shuffle(res_elemt##val_vnum val, mask_elemt##2 mask);\
+_CLC_OVERLOAD _CLC_DEF res_elemt##2 shuffle2(res_elemt##val_vnum val1, res_elemt##val_vnum val2, mask_elemt##2 mask);
+/* NOTE(review): these prototypes use _CLC_DEF where other prototypes in this header use _CLC_DECL - confirm */
+
+#define TEMPLATE4(res_elemt, val_vnum, mask_elemt) /* same pair, 4-wide result and mask */ \
+_CLC_OVERLOAD _CLC_DEF res_elemt##4 shuffle(res_elemt##val_vnum val, mask_elemt##4 mask); \
+_CLC_OVERLOAD _CLC_DEF res_elemt##4 shuffle2(res_elemt##val_vnum val1, res_elemt##val_vnum val2, mask_elemt##4 mask);
+
+
+#define TEMPLATE8(res_elemt, val_vnum, mask_elemt) /* same pair, 8-wide result and mask */ \
+_CLC_OVERLOAD _CLC_DEF res_elemt##8 shuffle(res_elemt##val_vnum val, mask_elemt##8 mask); \
+_CLC_OVERLOAD _CLC_DEF res_elemt##8 shuffle2(res_elemt##val_vnum val1, res_elemt##val_vnum val2, mask_elemt##8 mask);
+
+
+#define TEMPLATE16(res_elemt, val_vnum, mask_elemt) /* same pair, 16-wide result and mask */ \
+_CLC_OVERLOAD _CLC_DEF res_elemt##16 shuffle(res_elemt##val_vnum val, mask_elemt##16 mask); \
+_CLC_OVERLOAD _CLC_DEF res_elemt##16 shuffle2(res_elemt##val_vnum val1, res_elemt##val_vnum val2, mask_elemt##16 mask);
+/* Every result width (2/4/8/16) is paired with every input width (2/4/8/16); no 3-wide forms are generated */
+#define CROSS_SIZE(type1, type2) \
+TEMPLATE2(type1, 2, type2) \
+TEMPLATE2(type1, 4, type2) \
+TEMPLATE2(type1, 8, type2) \
+TEMPLATE2(type1, 16, type2) \
+TEMPLATE4(type1, 2, type2) \
+TEMPLATE4(type1, 4, type2) \
+TEMPLATE4(type1, 8, type2) \
+TEMPLATE4(type1, 16, type2) \
+TEMPLATE8(type1, 2, type2) \
+TEMPLATE8(type1, 4, type2) \
+TEMPLATE8(type1, 8, type2) \
+TEMPLATE8(type1, 16, type2) \
+TEMPLATE16(type1, 2, type2) \
+TEMPLATE16(type1, 4, type2) \
+TEMPLATE16(type1, 8, type2) \
+TEMPLATE16(type1, 16, type2) \
+
+#define CROSS_MASKTYPE(type) /* one CROSS_SIZE expansion per unsigned mask element type */ \
+CROSS_SIZE(type, uchar) \
+CROSS_SIZE(type, ushort) \
+CROSS_SIZE(type, uint) \
+CROSS_SIZE(type, ulong) \
+
+CROSS_MASKTYPE(char)
+CROSS_MASKTYPE(uchar)
+CROSS_MASKTYPE(short)
+CROSS_MASKTYPE(ushort)
+CROSS_MASKTYPE(int)
+CROSS_MASKTYPE(uint)
+CROSS_MASKTYPE(long)
+CROSS_MASKTYPE(ulong)
+CROSS_MASKTYPE(float)
+CROSS_MASKTYPE(double)
+
+#undef TEMPLATE2
+#undef TEMPLATE4
+#undef TEMPLATE8
+#undef TEMPLATE16
+#undef CROSS_SIZE
+#undef CROSS_MASKTYPE
+
+#endif //_CLC_H_
diff --git a/include/cpu.h b/include/cpu.h
new file mode 100644
index 0000000..6fa0d90
--- /dev/null
+++ b/include/cpu.h
@@ -0,0 +1,262 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef _CPU_CLC_H_
+#define _CPU_CLC_H_
+
+#include "clc.h"
+
+#define PREFETCH_VECTORIZE(PRIM_TYPE) /* prefetch() prototypes for PRIM_TYPE and its 2/3/4/8/16-wide vectors */ \
+ _CLC_OVERLOAD _CLC_DECL void prefetch(const __global PRIM_TYPE *p, size_t num_gentypes); \
+ _CLC_OVERLOAD _CLC_DECL void prefetch(const __global PRIM_TYPE##2 *p, size_t num_gentypes); \
+ _CLC_OVERLOAD _CLC_DECL void prefetch(const __global PRIM_TYPE##3 *p, size_t num_gentypes); \
+ _CLC_OVERLOAD _CLC_DECL void prefetch(const __global PRIM_TYPE##4 *p, size_t num_gentypes); \
+ _CLC_OVERLOAD _CLC_DECL void prefetch(const __global PRIM_TYPE##8 *p, size_t num_gentypes); \
+ _CLC_OVERLOAD _CLC_DECL void prefetch(const __global PRIM_TYPE##16 *p, size_t num_gentypes); \
+
+PREFETCH_VECTORIZE(char) /* NOTE(review): PREFETCH_VECTORIZE is not #undef'd anywhere in this header */
+PREFETCH_VECTORIZE(uchar)
+PREFETCH_VECTORIZE(short)
+PREFETCH_VECTORIZE(ushort)
+PREFETCH_VECTORIZE(int)
+PREFETCH_VECTORIZE(uint)
+PREFETCH_VECTORIZE(long)
+PREFETCH_VECTORIZE(ulong)
+PREFETCH_VECTORIZE(float)
+PREFETCH_VECTORIZE(double)
+
+/*-----------------------------------------------------------------------------
+* wait_group_events expands to nothing, so its arguments are never evaluated.
+* That is sufficient while our copy routines are synchronous. When the copy
+* routines become asynchronous, this will need a real implementation.
+*----------------------------------------------------------------------------*/
+#define wait_group_events(num_events, event_list)
+
+#define CROSS_SIZES(type) /* expands TEMPLATE for the scalar type and its 2/3/4/8/16-wide vectors */ \
+ TEMPLATE(type) \
+ TEMPLATE(_VEC_TYPE(type,2)) \
+ TEMPLATE(_VEC_TYPE(type,3)) \
+ TEMPLATE(_VEC_TYPE(type,4)) \
+ TEMPLATE(_VEC_TYPE(type,8)) \
+ TEMPLATE(_VEC_TYPE(type,16)) \
+
+#define TEMPLATE(gentype) /* prototypes: copy global->local, local->global, global->global; strided copy global->local, local->global */ \
+_CLC_OVERLOAD _CLC_DECL event_t async_work_group_copy(local gentype *dst, const global gentype *src, \
+ size_t num_gentypes, event_t event); \
+_CLC_OVERLOAD _CLC_DECL event_t async_work_group_copy(global gentype *dst, const local gentype *src, \
+ size_t num_gentypes, event_t event); \
+_CLC_OVERLOAD _CLC_DECL event_t async_work_group_copy(global gentype *dst, const global gentype *src, \
+ size_t num_gentypes, event_t event); \
+_CLC_OVERLOAD _CLC_DECL event_t async_work_group_strided_copy(local gentype *dst, const global gentype *src, \
+ size_t num_gentypes, size_t src_stride, event_t event); \
+_CLC_OVERLOAD _CLC_DECL event_t async_work_group_strided_copy(global gentype *dst, const local gentype *src, \
+ size_t num_gentypes, size_t dst_stride, event_t event); \
+
+CROSS_SIZES(char)
+CROSS_SIZES(uchar)
+CROSS_SIZES(short)
+CROSS_SIZES(ushort)
+CROSS_SIZES(int)
+CROSS_SIZES(uint)
+CROSS_SIZES(long)
+CROSS_SIZES(ulong)
+CROSS_SIZES(float)
+CROSS_SIZES(double)
+
+#undef CROSS_SIZES
+#undef TEMPLATE
+
+_CLC_OVERLOAD _CLC_DECL char rotate(char v, char i); /* scalar rotate: prototypes only */
+_CLC_OVERLOAD _CLC_DECL uchar rotate(uchar v, uchar i);
+_CLC_OVERLOAD _CLC_DECL short rotate(short v, short i);
+_CLC_OVERLOAD _CLC_DECL ushort rotate(ushort v, ushort i);
+_CLC_OVERLOAD _CLC_DECL long rotate(long v, long i);
+_CLC_OVERLOAD _CLC_DECL ulong rotate(ulong v, ulong i);
+_CLC_OVERLOAD _CLC_DECL int rotate(int v, int i);
+_CLC_OVERLOAD _CLC_DECL uint rotate(uint v, uint i);
+/* Vector forms via BINARY_VEC_DECL, which is not defined in this header (presumably clc.h - confirm) */
+BINARY_VEC_DECL(char, char, rotate)
+BINARY_VEC_DECL(uchar, uchar, rotate)
+BINARY_VEC_DECL(short, short, rotate)
+BINARY_VEC_DECL(ushort, ushort, rotate)
+BINARY_VEC_DECL(int, int, rotate)
+BINARY_VEC_DECL(uint, uint, rotate)
+BINARY_VEC_DECL(long, long, rotate)
+BINARY_VEC_DECL(ulong, ulong, rotate)
+
+_CLC_OVERLOAD _CLC_DECL char clz(char v) ; /* scalar clz: prototypes only; result type equals argument type */
+_CLC_OVERLOAD _CLC_DECL uchar clz(uchar v) ;
+_CLC_OVERLOAD _CLC_DECL short clz(short v) ;
+_CLC_OVERLOAD _CLC_DECL ushort clz(ushort v) ;
+_CLC_OVERLOAD _CLC_DECL int clz(int v) ;
+_CLC_OVERLOAD _CLC_DECL uint clz(uint v) ;
+_CLC_OVERLOAD _CLC_DECL long clz(long v) ;
+_CLC_OVERLOAD _CLC_DECL ulong clz(ulong v);
+/* Vector forms via UNARY_VEC_DECL, which is not defined in this header (presumably clc.h - confirm) */
+UNARY_VEC_DECL(char, char, clz)
+UNARY_VEC_DECL(uchar, uchar, clz)
+UNARY_VEC_DECL(short, short, clz)
+UNARY_VEC_DECL(ushort, ushort, clz)
+UNARY_VEC_DECL(int, int, clz)
+UNARY_VEC_DECL(uint, uint, clz)
+UNARY_VEC_DECL(long, long, clz)
+UNARY_VEC_DECL(ulong, ulong, clz)
+
+_CLC_OVERLOAD _CLC_DECL uchar abs(char x) ; /* scalar abs: prototypes only; result is always the unsigned type */
+_CLC_OVERLOAD _CLC_DECL ushort abs(short x) ;
+_CLC_OVERLOAD _CLC_DECL uint abs(int x) ;
+_CLC_OVERLOAD _CLC_DECL ulong abs(long x) ;
+_CLC_OVERLOAD _CLC_DECL uchar abs(uchar x) ;
+_CLC_OVERLOAD _CLC_DECL ushort abs(ushort x) ;
+_CLC_OVERLOAD _CLC_DECL uint abs(uint x) ;
+_CLC_OVERLOAD _CLC_DECL ulong abs(ulong x) ;
+/* Signed vector forms via UNARY_VEC_DECL (signed argument type, unsigned result type) */
+UNARY_VEC_DECL(char, uchar, abs)
+UNARY_VEC_DECL(short, ushort, abs)
+UNARY_VEC_DECL(int, uint, abs)
+UNARY_VEC_DECL(long, ulong, abs)
+
+/*-----------------------------------------------------------------------------
+* abs() on unsigned vectors returns its argument (DEFINE's first parameter is unused)
+*----------------------------------------------------------------------------*/
+#define DEFINE(type, utype) \
+ _CLC_OVERLOAD _CLC_INLINE _VEC_TYPE(utype,2) abs(_VEC_TYPE(utype,2) x) {return x;}\
+ _CLC_OVERLOAD _CLC_INLINE _VEC_TYPE(utype,3) abs(_VEC_TYPE(utype,3) x) {return x;}\
+ _CLC_OVERLOAD _CLC_INLINE _VEC_TYPE(utype,4) abs(_VEC_TYPE(utype,4) x) {return x;}\
+ _CLC_OVERLOAD _CLC_INLINE _VEC_TYPE(utype,8) abs(_VEC_TYPE(utype,8) x) {return x;}\
+ _CLC_OVERLOAD _CLC_INLINE _VEC_TYPE(utype,16) abs(_VEC_TYPE(utype,16) x) {return x;}\
+
+DEFINE(uchar, uchar)
+DEFINE(ushort, ushort)
+DEFINE(uint, uint)
+DEFINE(ulong, ulong)
+
+#undef DEFINE
+
+_CLC_OVERLOAD _CLC_DECL long mul_hi(long x, long y); /* scalar mul_hi: prototypes only */
+_CLC_OVERLOAD _CLC_DECL ulong mul_hi(ulong x, ulong y);
+_CLC_OVERLOAD _CLC_DECL char mul_hi(char x, char y);
+_CLC_OVERLOAD _CLC_DECL uchar mul_hi(uchar x, uchar y);
+_CLC_OVERLOAD _CLC_DECL short mul_hi(short x, short y);
+_CLC_OVERLOAD _CLC_DECL ushort mul_hi(ushort x, ushort y);
+_CLC_OVERLOAD _CLC_DECL int mul_hi(int x, int y);
+_CLC_OVERLOAD _CLC_DECL uint mul_hi(uint x, uint y);
+/* Vector forms for each integer type via BINARY_VEC_DECL (macro defined outside this header) */
+BINARY_VEC_DECL(char, char, mul_hi)
+BINARY_VEC_DECL(uchar, uchar, mul_hi)
+BINARY_VEC_DECL(short, short, mul_hi)
+BINARY_VEC_DECL(ushort, ushort, mul_hi)
+BINARY_VEC_DECL(int, int, mul_hi)
+BINARY_VEC_DECL(uint, uint, mul_hi)
+BINARY_VEC_DECL(long, long, mul_hi)
+BINARY_VEC_DECL(ulong, ulong, mul_hi)
+
+
+_CLC_OVERLOAD _CLC_DECL char add_sat(char x, char y) ; /* scalar add_sat: prototypes only */
+_CLC_OVERLOAD _CLC_DECL uchar add_sat(uchar x, uchar y) ;
+_CLC_OVERLOAD _CLC_DECL short add_sat(short x, short y) ;
+_CLC_OVERLOAD _CLC_DECL ushort add_sat(ushort x, ushort y) ;
+_CLC_OVERLOAD _CLC_DECL int add_sat(int x, int y) ;
+_CLC_OVERLOAD _CLC_DECL uint add_sat(uint x, uint y) ;
+_CLC_OVERLOAD _CLC_DECL long add_sat(long x, long y) ;
+_CLC_OVERLOAD _CLC_DECL ulong add_sat(ulong x, ulong y) ;
+/* Vector forms for each integer type via BINARY_VEC_DECL (macro defined outside this header) */
+BINARY_VEC_DECL(char, char, add_sat)
+BINARY_VEC_DECL(uchar, uchar, add_sat)
+BINARY_VEC_DECL(short, short, add_sat)
+BINARY_VEC_DECL(ushort, ushort, add_sat)
+BINARY_VEC_DECL(int, int, add_sat)
+BINARY_VEC_DECL(uint, uint, add_sat)
+BINARY_VEC_DECL(long, long, add_sat)
+BINARY_VEC_DECL(ulong, ulong, add_sat)
+
+
+_CLC_OVERLOAD _CLC_DECL char sub_sat(char x, char y) ; /* scalar sub_sat: prototypes only */
+_CLC_OVERLOAD _CLC_DECL uchar sub_sat(uchar x, uchar y) ;
+_CLC_OVERLOAD _CLC_DECL short sub_sat(short x, short y) ;
+_CLC_OVERLOAD _CLC_DECL ushort sub_sat(ushort x, ushort y) ;
+_CLC_OVERLOAD _CLC_DECL int sub_sat(int x, int y) ;
+_CLC_OVERLOAD _CLC_DECL uint sub_sat(uint x, uint y) ;
+_CLC_OVERLOAD _CLC_DECL long sub_sat(long x, long y) ;
+_CLC_OVERLOAD _CLC_DECL ulong sub_sat(ulong x, ulong y) ;
+/* Vector forms for each integer type via BINARY_VEC_DECL (macro defined outside this header) */
+BINARY_VEC_DECL(char, char, sub_sat)
+BINARY_VEC_DECL(uchar, uchar, sub_sat)
+BINARY_VEC_DECL(short, short, sub_sat)
+BINARY_VEC_DECL(ushort, ushort, sub_sat)
+BINARY_VEC_DECL(int, int, sub_sat)
+BINARY_VEC_DECL(uint, uint, sub_sat)
+BINARY_VEC_DECL(long, long, sub_sat)
+BINARY_VEC_DECL(ulong, ulong, sub_sat)
+
+_CLC_OVERLOAD _CLC_DECL short upsample(char x, uchar y) ; /* prototypes only: result is twice as wide as x; y is always unsigned */
+_CLC_OVERLOAD _CLC_DECL ushort upsample(uchar x, uchar y) ;
+_CLC_OVERLOAD _CLC_DECL int upsample(short x, ushort y) ;
+_CLC_OVERLOAD _CLC_DECL uint upsample(ushort x, ushort y) ;
+_CLC_OVERLOAD _CLC_DECL long upsample(int x, uint y) ;
+_CLC_OVERLOAD _CLC_DECL ulong upsample(uint x, uint y) ;
+/* Vector forms via BINARY_VEC_DECL_ALT (macro defined outside this header; argument order not visible here) */
+BINARY_VEC_DECL_ALT(char, short, uchar, upsample)
+BINARY_VEC_DECL_ALT(uchar, ushort, uchar, upsample)
+BINARY_VEC_DECL_ALT(short, int, ushort, upsample)
+BINARY_VEC_DECL_ALT(ushort, uint, ushort, upsample)
+BINARY_VEC_DECL_ALT(int, long, uint, upsample)
+BINARY_VEC_DECL_ALT(uint, ulong, uint, upsample)
+
+_CLC_OVERLOAD _CLC_DECL char mad_sat(char a, char b, char c); /* scalar mad_sat: prototypes only */
+_CLC_OVERLOAD _CLC_DECL uchar mad_sat(uchar a, uchar b, uchar c);
+_CLC_OVERLOAD _CLC_DECL short mad_sat(short a, short b, short c);
+_CLC_OVERLOAD _CLC_DECL ushort mad_sat(ushort a, ushort b, ushort c);
+_CLC_OVERLOAD _CLC_DECL int mad_sat(int a, int b, int c);
+_CLC_OVERLOAD _CLC_DECL uint mad_sat(uint a, uint b, uint c);
+_CLC_OVERLOAD _CLC_DECL long mad_sat(long a, long b, long c);
+_CLC_OVERLOAD _CLC_DECL ulong mad_sat(ulong a, ulong b, ulong c);
+/* Vector forms for each integer type via TERNARY_VEC_DECL (macro defined outside this header) */
+TERNARY_VEC_DECL(char, char, mad_sat)
+TERNARY_VEC_DECL(uchar, uchar, mad_sat)
+TERNARY_VEC_DECL(short, short, mad_sat)
+TERNARY_VEC_DECL(ushort, ushort, mad_sat)
+TERNARY_VEC_DECL(int, int, mad_sat)
+TERNARY_VEC_DECL(uint, uint, mad_sat)
+TERNARY_VEC_DECL(long, long, mad_sat)
+TERNARY_VEC_DECL(ulong, ulong, mad_sat)
+
+int printf(const char* _format, ...); /* plain prototype, not marked _CLC_OVERLOAD */
+void *memcpy(void *dst, const void * src, uint size); /* note: size is declared as uint here, not size_t */
+
+_CLC_DECL size_t get_local_id (uint dim); /* work-item queries: prototypes only, not overloaded */
+_CLC_DECL uint get_work_dim (void) ;
+_CLC_DECL size_t get_global_size (uint dim) ;
+_CLC_DECL size_t get_local_size (uint dim) ;
+_CLC_DECL size_t get_global_offset(uint dim) ;
+_CLC_DECL size_t __get_global_first(uint dim) ; /* NOTE(review): implementation-internal name; semantics not visible here - confirm */
+_CLC_DECL size_t get_num_groups (uint dim) ;
+_CLC_DECL size_t get_global_id (uint dim) ;
+_CLC_DECL size_t get_group_id (uint dim) ;
+
+#endif //_CPU_CLC_H_
+
diff --git a/include/dsp.h b/include/dsp.h
new file mode 100644
index 0000000..b4fe9d5
--- /dev/null
+++ b/include/dsp.h
@@ -0,0 +1,490 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef _DSP_CLC_H_
+#define _DSP_CLC_H_
+
+#include "clc.h"
+
+/* Low-level primitive used by every prefetch() overload below.  Declared here
+ * only: it takes a __global byte pointer and a 32-bit byte count. */
+void __touch(const __global char *p, uint32_t size);
+
+/*-----------------------------------------------------------------------------
+* PREFETCH_VECTORIZE(PRIM_TYPE) defines the prefetch() overloads for PRIM_TYPE
+* and for its 2, 3, 4, 8 and 16 element vector forms.  Each overload forwards
+* to __touch with num_gentypes * sizeof(*p) narrowed to uint32_t.
+*----------------------------------------------------------------------------*/
+#define PREFETCH_VECTORIZE(PRIM_TYPE) \
+ _CLC_OVERLOAD _CLC_INLINE void prefetch(const __global PRIM_TYPE *p, size_t num_gentypes) \
+ { __touch((const __global char*)p, (uint32_t)(num_gentypes * sizeof(*p))); } \
+ _CLC_OVERLOAD _CLC_INLINE void prefetch(const __global PRIM_TYPE##2 *p, size_t num_gentypes) \
+ { __touch((const __global char*)p, (uint32_t)(num_gentypes * sizeof(*p))); } \
+ _CLC_OVERLOAD _CLC_INLINE void prefetch(const __global PRIM_TYPE##3 *p, size_t num_gentypes) \
+ { __touch((const __global char*)p, (uint32_t)(num_gentypes * sizeof(*p))); } \
+ _CLC_OVERLOAD _CLC_INLINE void prefetch(const __global PRIM_TYPE##4 *p, size_t num_gentypes) \
+ { __touch((const __global char*)p, (uint32_t)(num_gentypes * sizeof(*p))); } \
+ _CLC_OVERLOAD _CLC_INLINE void prefetch(const __global PRIM_TYPE##8 *p, size_t num_gentypes) \
+ { __touch((const __global char*)p, (uint32_t)(num_gentypes * sizeof(*p))); } \
+ _CLC_OVERLOAD _CLC_INLINE void prefetch(const __global PRIM_TYPE##16 *p, size_t num_gentypes) \
+ { __touch((const __global char*)p, (uint32_t)(num_gentypes * sizeof(*p))); } \
+
+/*-----------------------------------------------------------------------------
+* PREFETCH_TYPES() applies PREFETCH_VECTORIZE to each of the ten scalar element
+* types listed; the single expansion below emits all the overloads.
+*----------------------------------------------------------------------------*/
+#define PREFETCH_TYPES() \
+ PREFETCH_VECTORIZE(char) \
+ PREFETCH_VECTORIZE(uchar) \
+ PREFETCH_VECTORIZE(short) \
+ PREFETCH_VECTORIZE(ushort) \
+ PREFETCH_VECTORIZE(int) \
+ PREFETCH_VECTORIZE(uint) \
+ PREFETCH_VECTORIZE(long) \
+ PREFETCH_VECTORIZE(ulong) \
+ PREFETCH_VECTORIZE(float) \
+ PREFETCH_VECTORIZE(double) \
+
+PREFETCH_TYPES()
+
+_CLC_DECL size_t get_local_id (uint dim);
+
+void *memcpy(void *dst, const void * src, uint size);
+
+/*-----------------------------------------------------------------------------
+* This can be empty since our copy routines are currently synchronous. When
+* the copy routines are improved to be asynchronous, then this function will
+* need a real implementation.
+*----------------------------------------------------------------------------*/
+#define wait_group_events(num_events, event_list)
+
+/*-----------------------------------------------------------------------------
+* Helper macros for stamping out one definition per gentype:
+*   VEC_TYPE(type,sz)  pastes an element type and a vector width together.
+*   CROSS_SIZES(type)  applies TEMPLATE to the scalar and its 2, 3, 4, 8 and 16
+*                      element vector forms.
+*   CROSS_TYPES()      applies CROSS_SIZES to each of the ten element types.
+* TEMPLATE is (re)defined before each CROSS_TYPES() expansion.
+*----------------------------------------------------------------------------*/
+#define VEC_TYPE(type,sz) type##sz
+
+#define CROSS_SIZES(type) \
+ TEMPLATE(type) \
+ TEMPLATE(VEC_TYPE(type,2)) \
+ TEMPLATE(VEC_TYPE(type,3)) \
+ TEMPLATE(VEC_TYPE(type,4)) \
+ TEMPLATE(VEC_TYPE(type,8)) \
+ TEMPLATE(VEC_TYPE(type,16)) \
+
+#define CROSS_TYPES() \
+ CROSS_SIZES(char) \
+ CROSS_SIZES(uchar) \
+ CROSS_SIZES(short) \
+ CROSS_SIZES(ushort) \
+ CROSS_SIZES(int) \
+ CROSS_SIZES(uint) \
+ CROSS_SIZES(long) \
+ CROSS_SIZES(ulong) \
+ CROSS_SIZES(float) \
+ CROSS_SIZES(double) \
+
+/*-----------------------------------------------------------------------------
+* async_work_group_copy, three overloads per gentype: global->local,
+* local->global and global->global.  The copy is a plain memcpy of
+* num_gentypes * sizeof(gentype) bytes, executed only by the work-item whose
+* local id is 0 in all three dimensions.  The incoming event argument is not
+* used and the returned event is always 0.
+*----------------------------------------------------------------------------*/
+#define TEMPLATE(gentype) \
+_CLC_OVERLOAD _CLC_INLINE event_t async_work_group_copy(local gentype *dst, const global gentype *src, \
+ size_t num_gentypes, event_t event) \
+{ \
+ if ((get_local_id(0) | get_local_id(1) | get_local_id(2)) == 0) \
+ memcpy((char*)dst, (const char*) src, num_gentypes * sizeof(gentype)); \
+ return 0; \
+} \
+_CLC_OVERLOAD _CLC_INLINE event_t async_work_group_copy(global gentype *dst, const local gentype *src, \
+ size_t num_gentypes, event_t event) \
+{ \
+ if ((get_local_id(0) | get_local_id(1) | get_local_id(2)) == 0) \
+ memcpy((char*)dst, (const char*) src, num_gentypes * sizeof(gentype)); \
+ return 0; \
+} \
+_CLC_OVERLOAD _CLC_INLINE event_t async_work_group_copy(global gentype *dst, const global gentype *src, \
+ size_t num_gentypes, event_t event) \
+{ \
+ if ((get_local_id(0) | get_local_id(1) | get_local_id(2)) == 0) \
+ memcpy((char*)dst, (const char*) src, num_gentypes * sizeof(gentype)); \
+ return 0; \
+} \
+
+CROSS_TYPES()
+
+#undef TEMPLATE
+/*-----------------------------------------------------------------------------
+* async_work_group_strided_copy, two overloads per gentype:
+*   global->local gathers src[i*src_stride] into dst[i];
+*   local->global scatters src[i] into dst[i*dst_stride].
+* Only the work-item whose local id is 0 in all three dimensions copies.  The
+* incoming event is not used and the returned event is always 0.  The loop
+* index is size_t so that it has the same type as num_gentypes and the stride.
+*----------------------------------------------------------------------------*/
+#define TEMPLATE(gentype) \
+_CLC_OVERLOAD _CLC_INLINE event_t async_work_group_strided_copy(local gentype *dst, const global gentype *src, \
+    size_t num_gentypes, size_t src_stride, event_t event) \
+{ size_t i; \
+    if ((get_local_id(0) | get_local_id(1) | get_local_id(2)) == 0) \
+        for (i=0; i < num_gentypes; ++i) dst[i] = src[i*src_stride]; \
+    return 0; \
+} \
+_CLC_OVERLOAD _CLC_INLINE event_t async_work_group_strided_copy(global gentype *dst, const local gentype *src, \
+    size_t num_gentypes, size_t dst_stride, event_t event) \
+{ size_t i; \
+    if ((get_local_id(0) | get_local_id(1) | get_local_id(2)) == 0) \
+        for (i=0; i < num_gentypes; ++i) dst[i*dst_stride] = src[i]; \
+    return 0; \
+} \
+
+CROSS_TYPES()
+
+#undef VEC_TYPE
+#undef CROSS_SIZES
+#undef CROSS_TYPES
+#undef TEMPLATE
+
+
+/*-----------------------------------------------------------------------------
+* rotate(v, i).  The int and uint forms are defined inline on top of the _rotl
+* intrinsic; the 8-, 16- and 64-bit scalar forms are only declared here.
+* BINARY_VEC_DECL is defined outside this section; presumably it declares the
+* vector overloads for each element type - confirm against its definition.
+*----------------------------------------------------------------------------*/
+_CLC_OVERLOAD _CLC_DECL char rotate(char v, char i);
+_CLC_OVERLOAD _CLC_DECL uchar rotate(uchar v, uchar i);
+_CLC_OVERLOAD _CLC_DECL short rotate(short v, short i);
+_CLC_OVERLOAD _CLC_DECL ushort rotate(ushort v, ushort i);
+_CLC_OVERLOAD _CLC_INLINE int rotate(int v, int i) { return _rotl(v,i); }
+_CLC_OVERLOAD _CLC_INLINE uint rotate(uint v, uint i) { return _rotl(v,i); }
+_CLC_OVERLOAD _CLC_DECL long rotate(long v, long i);
+_CLC_OVERLOAD _CLC_DECL ulong rotate(ulong v, ulong i);
+
+BINARY_VEC_DECL(char, char, rotate)
+BINARY_VEC_DECL(uchar, uchar, rotate)
+BINARY_VEC_DECL(short, short, rotate)
+BINARY_VEC_DECL(ushort, ushort, rotate)
+BINARY_VEC_DECL(int, int, rotate)
+BINARY_VEC_DECL(uint, uint, rotate)
+BINARY_VEC_DECL(long, long, rotate)
+BINARY_VEC_DECL(ulong, ulong, rotate)
+
+/*-----------------------------------------------------------------------------
+* clz (count leading zeros), built on the _lmbd intrinsic called with 1 as its
+* first argument.  The 8- and 16-bit forms subtract 24 and 16 from the result,
+* which matches _lmbd counting over a 32-bit word (assumption - confirm against
+* the intrinsic's documentation).  Negative char/short inputs return 0 without
+* calling _lmbd.
+*----------------------------------------------------------------------------*/
+_CLC_OVERLOAD _CLC_INLINE char clz(char v) { return v<0?0: _lmbd(1,v)-24; }
+_CLC_OVERLOAD _CLC_INLINE uchar clz(uchar v) { return _lmbd(1, v) - 24; }
+_CLC_OVERLOAD _CLC_INLINE short clz(short v) { return v<0?0: _lmbd(1,v)-16; }
+_CLC_OVERLOAD _CLC_INLINE ushort clz(ushort v) { return _lmbd(1, v) - 16; }
+_CLC_OVERLOAD _CLC_INLINE int clz(int v) { return _lmbd(1, v); }
+_CLC_OVERLOAD _CLC_INLINE uint clz(uint v) { return _lmbd(1, v); }
+
+/* 64-bit forms: reinterpret the value as two 32-bit halves.  If the .hi half
+ * is non-zero its count is the answer; otherwise the count of the .lo half
+ * plus 32. */
+_CLC_OVERLOAD _CLC_INLINE long clz(long v)
+{
+ uint2 tmp = as_uint2(v);
+ return tmp.hi ? _lmbd(1, tmp.hi) : _lmbd(1, tmp.lo) + 32;
+}
+
+_CLC_OVERLOAD _CLC_INLINE ulong clz(ulong v)
+{
+ uint2 tmp = as_uint2(v);
+ return tmp.hi ? _lmbd(1, tmp.hi) : _lmbd(1, tmp.lo) + 32;
+}
+
+UNARY_VEC_DECL(char, char, clz)
+UNARY_VEC_DECL(uchar, uchar, clz)
+UNARY_VEC_DECL(short, short, clz)
+UNARY_VEC_DECL(ushort, ushort, clz)
+UNARY_VEC_DECL(int, int, clz)
+UNARY_VEC_DECL(uint, uint, clz)
+UNARY_VEC_DECL(long, long, clz)
+UNARY_VEC_DECL(ulong, ulong, clz)
+
+
+/*-----------------------------------------------------------------------------
+* ABS for signed types returns the magnitude in the matching unsigned type.
+* char and short are promoted to int before reaching _abs, so _abs never sees
+* the most negative int for them.  The int and long forms negate in unsigned
+* arithmetic instead: the most negative input then yields its true magnitude
+* (0x80000000 / 0x8000000000000000) rather than a saturated value (_abs) or a
+* signed overflow (-x).
+*----------------------------------------------------------------------------*/
+_CLC_OVERLOAD _CLC_INLINE uchar abs(char x) { return _abs(x); }
+_CLC_OVERLOAD _CLC_INLINE ushort abs(short x) { return _abs(x); }
+_CLC_OVERLOAD _CLC_INLINE uint abs(int x) { return x < 0 ? (uint)0 - (uint)x : (uint)x; }
+_CLC_OVERLOAD _CLC_INLINE ulong abs(long x) { return x < 0 ? (ulong)0 - (ulong)x : (ulong)x; }
+
+_CLC_OVERLOAD _CLC_INLINE uchar abs(uchar x) { return x; }
+_CLC_OVERLOAD _CLC_INLINE ushort abs(ushort x) { return x; }
+_CLC_OVERLOAD _CLC_INLINE uint abs(uint x) { return x; }
+_CLC_OVERLOAD _CLC_INLINE ulong abs(ulong x) { return x; }
+
+UNARY_VEC_DECL(char, uchar, abs)
+UNARY_VEC_DECL(short, ushort, abs)
+UNARY_VEC_DECL(int, uint, abs)
+UNARY_VEC_DECL(long, ulong, abs)
+
+/*-----------------------------------------------------------------------------
+* ABS for unsigned types is straightforward: the value is returned unchanged.
+* DEFINE(type, utype) emits the 2, 3, 4, 8 and 16 element vector overloads for
+* utype.  Its first parameter (type) is not used in the expansion.
+*----------------------------------------------------------------------------*/
+#define DEFINE(type, utype) \
+ _CLC_OVERLOAD _CLC_INLINE _VEC_TYPE(utype,2) abs(_VEC_TYPE(utype,2) x) {return x;}\
+ _CLC_OVERLOAD _CLC_INLINE _VEC_TYPE(utype,3) abs(_VEC_TYPE(utype,3) x) {return x;}\
+ _CLC_OVERLOAD _CLC_INLINE _VEC_TYPE(utype,4) abs(_VEC_TYPE(utype,4) x) {return x;}\
+ _CLC_OVERLOAD _CLC_INLINE _VEC_TYPE(utype,8) abs(_VEC_TYPE(utype,8) x) {return x;}\
+ _CLC_OVERLOAD _CLC_INLINE _VEC_TYPE(utype,16) abs(_VEC_TYPE(utype,16) x) {return x;}\
+
+DEFINE(uchar, uchar)
+DEFINE(ushort, ushort)
+DEFINE(uint, uint)
+DEFINE(ulong, ulong)
+
+#undef DEFINE
+
+/*-----------------------------------------------------------------------------
+* mul_hi(x, y): upper half of the full-width product x * y.
+* The 64-bit forms are only declared here.  The 8- and 16-bit forms take the
+* product from the _mpy / _mpyu intrinsics and shift it right by the operand
+* width; the 32-bit forms widen to 64 bits, multiply, and shift right by 32.
+*----------------------------------------------------------------------------*/
+_CLC_OVERLOAD _CLC_DECL long mul_hi(long x, long y);
+_CLC_OVERLOAD _CLC_DECL ulong mul_hi(ulong x, ulong y);
+
+_CLC_OVERLOAD _CLC_INLINE char mul_hi(char x, char y)
+{ return _mpy(x,y) >> 8; }
+
+_CLC_OVERLOAD _CLC_INLINE uchar mul_hi(uchar x, uchar y)
+{ return _mpyu(x,y) >> 8; }
+
+_CLC_OVERLOAD _CLC_INLINE short mul_hi(short x, short y)
+{ return _mpy(x,y) >> 16; }
+
+_CLC_OVERLOAD _CLC_INLINE ushort mul_hi(ushort x, ushort y)
+{ return _mpyu(x,y) >> 16; }
+
+_CLC_OVERLOAD _CLC_INLINE int mul_hi(int x, int y)
+{ return ((long)x * (long)y) >> 32; }
+
+_CLC_OVERLOAD _CLC_INLINE uint mul_hi(uint x, uint y)
+{ return ((ulong)x * (ulong)y) >> 32; }
+
+BINARY_VEC_DECL(char, char, mul_hi)
+BINARY_VEC_DECL(uchar, uchar, mul_hi)
+BINARY_VEC_DECL(short, short, mul_hi)
+BINARY_VEC_DECL(ushort, ushort, mul_hi)
+BINARY_VEC_DECL(int, int, mul_hi)
+BINARY_VEC_DECL(uint, uint, mul_hi)
+BINARY_VEC_DECL(long, long, mul_hi)
+BINARY_VEC_DECL(ulong, ulong, mul_hi)
+
+/*-----------------------------------------------------------------------------
+* add_sat(x, y): x + y clamped to the range of the operand type.
+*----------------------------------------------------------------------------*/
+
+/* char: operate in the top byte of a 32-bit word so _sadd saturates at the
+ * 8-bit limits, then shift the result back down. */
+_CLC_OVERLOAD _CLC_INLINE char add_sat(char x, char y)
+{
+    return _sadd(x<<24, y<<24)>>24;
+}
+
+/* uchar: delegated to the _saddu4 intrinsic. */
+_CLC_OVERLOAD _CLC_INLINE uchar add_sat(uchar x, uchar y)
+{
+    return _saddu4(x,y);
+}
+
+/* short: delegated to the _sadd2 intrinsic. */
+_CLC_OVERLOAD _CLC_INLINE short add_sat(short x, short y)
+{
+    return _sadd2(x,y);
+}
+
+/* ushort: add in int; any bit at or above bit 16 means the sum overflowed. */
+_CLC_OVERLOAD _CLC_INLINE ushort add_sat(ushort x, ushort y)
+{
+    int sum = x + y;
+    return (sum >> 16) ? USHRT_MAX : sum;
+}
+
+/* int: delegated to the _sadd intrinsic. */
+_CLC_OVERLOAD _CLC_INLINE int add_sat(int x, int y)
+{
+    return _sadd(x,y);
+}
+
+/* uint: add in 64 bits; any bit at or above bit 32 means the sum overflowed. */
+_CLC_OVERLOAD _CLC_INLINE uint add_sat(uint x, uint y)
+{
+    ulong wide = (ulong)x + (ulong)y;
+    return (wide >> 32) ? UINT_MAX : wide;
+}
+
+/* long: compare y against the remaining headroom before adding, so the
+ * addition itself never overflows. */
+_CLC_OVERLOAD _CLC_INLINE long add_sat(long x, long y)
+{
+    if (x > 0)
+    {
+        if (y > (LONG_MAX-x)) return LONG_MAX;
+    }
+    else if (x < 0)
+    {
+        if (y < (LONG_MIN-x)) return LONG_MIN;
+    }
+    return x + y;
+}
+
+/* ulong: same headroom test against ULONG_MAX. */
+_CLC_OVERLOAD _CLC_INLINE ulong add_sat(ulong x, ulong y)
+{
+    return (y > (ULONG_MAX-x)) ? ULONG_MAX : x + y;
+}
+
+BINARY_VEC_DECL(char, char, add_sat)
+BINARY_VEC_DECL(uchar, uchar, add_sat)
+BINARY_VEC_DECL(short, short, add_sat)
+BINARY_VEC_DECL(ushort, ushort, add_sat)
+BINARY_VEC_DECL(int, int, add_sat)
+BINARY_VEC_DECL(uint, uint, add_sat)
+BINARY_VEC_DECL(long, long, add_sat)
+BINARY_VEC_DECL(ulong, ulong, add_sat)
+
+
+/*-----------------------------------------------------------------------------
+* sub_sat(x, y): x - y clamped to the range of the operand type.
+* Signed char/short/int use the _ssub / _ssub2 intrinsics.  Unsigned forms
+* return 0 whenever y > x.
+*----------------------------------------------------------------------------*/
+_CLC_OVERLOAD _CLC_INLINE char sub_sat(char x, char y)
+{ return _ssub(x<<24, y<<24)>>24; }
+
+_CLC_OVERLOAD _CLC_INLINE uchar sub_sat(uchar x, uchar y)
+{
+    if (y > x) return 0;
+    return x-y;
+}
+
+_CLC_OVERLOAD _CLC_INLINE short sub_sat(short x, short y)
+{ return _ssub2(x,y); }
+
+_CLC_OVERLOAD _CLC_INLINE ushort sub_sat(ushort x, ushort y)
+{
+    if (y > x) return 0;
+    return x-y;
+}
+
+_CLC_OVERLOAD _CLC_INLINE int sub_sat(int x, int y)
+{ return _ssub(x,y); }
+
+_CLC_OVERLOAD _CLC_INLINE uint sub_sat(uint x, uint y)
+{
+    if (y > x) return 0;
+    return x-y;
+}
+
+/*-----------------------------------------------------------------------------
+* 64-bit signed form.  The range test is phrased on the sign of y so that no
+* intermediate can overflow: LONG_MAX + y is safe when y < 0 and LONG_MIN + y
+* is safe when y > 0.  Negating y is avoided because -LONG_MIN overflows, and
+* x == 0 is covered (0 - LONG_MIN must saturate to LONG_MAX).
+*----------------------------------------------------------------------------*/
+_CLC_OVERLOAD _CLC_INLINE long sub_sat(long x, long y)
+{
+    if (y < 0 && x > (LONG_MAX+y)) return LONG_MAX;
+    if (y > 0 && x < (LONG_MIN+y)) return LONG_MIN;
+
+    return x - y;
+}
+
+_CLC_OVERLOAD _CLC_INLINE ulong sub_sat(ulong x, ulong y)
+{
+    if (y > x) return 0;
+    return x-y;
+}
+
+BINARY_VEC_DECL(char, char, sub_sat)
+BINARY_VEC_DECL(uchar, uchar, sub_sat)
+BINARY_VEC_DECL(short, short, sub_sat)
+BINARY_VEC_DECL(ushort, ushort, sub_sat)
+BINARY_VEC_DECL(int, int, sub_sat)
+BINARY_VEC_DECL(uint, uint, sub_sat)
+BINARY_VEC_DECL(long, long, sub_sat)
+BINARY_VEC_DECL(ulong, ulong, sub_sat)
+
+
+/*-----------------------------------------------------------------------------
+* upsample(x, y): build a double-width value with x as the upper half and y
+* as the lower half.  The 8-bit forms shift and OR; the 16-bit forms use the
+* _pack2 intrinsic and the 32-bit forms use _itoll.
+*----------------------------------------------------------------------------*/
+_CLC_OVERLOAD _CLC_INLINE short upsample(char x, uchar y)
+{ return (short)x << 8 | y; }
+
+_CLC_OVERLOAD _CLC_INLINE ushort upsample(uchar x, uchar y)
+{ return (ushort)x << 8 | y; }
+
+_CLC_OVERLOAD _CLC_INLINE int upsample(short x, ushort y)
+{ return (int) _pack2(x,y); }
+
+_CLC_OVERLOAD _CLC_INLINE uint upsample(ushort x, ushort y)
+{ return (uint) _pack2(x,y); }
+
+_CLC_OVERLOAD _CLC_INLINE long upsample(int x, uint y)
+{ return (long) _itoll(x,y); }
+
+_CLC_OVERLOAD _CLC_INLINE ulong upsample(uint x, uint y)
+{ return (ulong) _itoll(x,y); }
+
+BINARY_VEC_DECL_ALT(char, short, uchar, upsample)
+BINARY_VEC_DECL_ALT(uchar, ushort, uchar, upsample)
+BINARY_VEC_DECL_ALT(short, int, ushort, upsample)
+BINARY_VEC_DECL_ALT(ushort, uint, ushort, upsample)
+BINARY_VEC_DECL_ALT(int, long, uint, upsample)
+BINARY_VEC_DECL_ALT(uint, ulong, uint, upsample)
+
+
+/*-----------------------------------------------------------------------------
+* mad_sat(a, b, c) for 8-, 16- and 32-bit types: a * b + c is evaluated in a
+* wider accumulator and then clamped to the limits of the operand type.
+*----------------------------------------------------------------------------*/
+_CLC_OVERLOAD _CLC_INLINE char mad_sat(char a, char b, char c)
+{
+    int acc = _mpy32(a,b) + c;
+
+    return acc > (int)CHAR_MAX ? CHAR_MAX
+         : acc < (int)CHAR_MIN ? CHAR_MIN
+         : acc;
+}
+
+_CLC_OVERLOAD _CLC_INLINE uchar mad_sat(uchar a, uchar b, uchar c)
+{
+    uint acc = (uint)_mpy32u(a,b) + c;
+
+    return acc > (uint)UCHAR_MAX ? UCHAR_MAX : acc;
+}
+
+_CLC_OVERLOAD _CLC_INLINE short mad_sat(short a, short b, short c)
+{
+    int acc = _mpy32(a,b) + c;
+
+    return acc > (int)SHRT_MAX ? SHRT_MAX
+         : acc < (int)SHRT_MIN ? SHRT_MIN
+         : acc;
+}
+
+_CLC_OVERLOAD _CLC_INLINE ushort mad_sat(ushort a, ushort b, ushort c)
+{
+    uint acc = (uint)_mpy32u(a,b) + c;
+
+    return acc > (uint)USHRT_MAX ? USHRT_MAX : acc;
+}
+
+/* 32-bit forms accumulate in 64 bits, which holds the product plus c exactly. */
+_CLC_OVERLOAD _CLC_INLINE int mad_sat(int a, int b, int c)
+{
+    long acc = (long)a * (long)b + (long)c;
+
+    if (acc > (long)INT_MAX) return INT_MAX;
+    return acc < (long)INT_MIN ? INT_MIN : acc;
+}
+
+_CLC_OVERLOAD _CLC_INLINE uint mad_sat(uint a, uint b, uint c)
+{
+    ulong acc = (ulong)_mpy32u(a,b) + c;
+
+    return acc > (ulong)UINT_MAX ? UINT_MAX : acc;
+}
+
+/*-----------------------------------------------------------------------------
+* 64-bit signed mad_sat.  No wider type is used, so overflow of a * b is
+* detected up front with one division-based test per sign combination of a
+* and b; when a test fires the product's saturated limit is returned at once.
+* Otherwise the exact product is combined with c through add_sat.
+* NOTE(review): when a * b overflows, c is never consulted, so a c of the
+* opposite sign that would bring a*b+c back into range is ignored - confirm
+* whether that matches the intended mad_sat semantics.
+*----------------------------------------------------------------------------*/
+_CLC_OVERLOAD _CLC_INLINE long mad_sat(long a, long b, long c)
+{
+ if (a > 0 && b > 0 && a > (LONG_MAX/b)) return LONG_MAX;
+ if (a > 0 && b < 0 && b < (LONG_MIN/a)) return LONG_MIN;
+ if (a < 0 && b > 0 && a < (LONG_MIN/b)) return LONG_MIN;
+ if (a < 0 && b < 0 && b < (LONG_MAX/a)) return LONG_MAX;
+
+ return add_sat(a*b, c);
+}
+
+/*-----------------------------------------------------------------------------
+* 64-bit unsigned mad_sat.  a * b overflows exactly when a > ULONG_MAX / b;
+* the test is skipped for b == 0 (product is 0) so the division can never be
+* a divide-by-zero.  The in-range product is then combined with c via add_sat.
+*----------------------------------------------------------------------------*/
+_CLC_OVERLOAD _CLC_INLINE ulong mad_sat(ulong a, ulong b, ulong c)
+{
+    if (b != 0 && a > (ULONG_MAX/b)) return ULONG_MAX;
+    return add_sat(a*b, c);
+}
+
+TERNARY_VEC_DECL(char, char, mad_sat)
+TERNARY_VEC_DECL(uchar, uchar, mad_sat)
+TERNARY_VEC_DECL(short, short, mad_sat)
+TERNARY_VEC_DECL(ushort, ushort, mad_sat)
+TERNARY_VEC_DECL(int, int, mad_sat)
+TERNARY_VEC_DECL(uint, uint, mad_sat)
+TERNARY_VEC_DECL(long, long, mad_sat)
+TERNARY_VEC_DECL(ulong, ulong, mad_sat)
+
+
+int printf(const char* _format, ...);
+
+uint32_t __core_num (void);
+uint32_t __clock (void);
+uint64_t __clock64 (void);
+void __cycle_delay (uint64_t cyclesToDelay);
+void __mfence (void);
+
+void __ocl_cache_l1d_off (void);
+void __ocl_cache_l1d_std (void);
+void __ocl_cache_l1d_half (void);
+void __ocl_cache_l1d_wbinv_all (void);
+
+/*-----------------------------------------------------------------------------
+* Kernel launch configuration table, defined elsewhere.  Slots as read by the
+* accessors below:
+*    [0]       work dimension count
+*    [1..3]    global size per dimension
+*    [4..6]    local size per dimension
+*    [7..9]    global offset per dimension
+*    [10..12]  value returned by __get_global_first per dimension
+* Each per-dimension group holds three entries, so dim is checked against 3
+* before indexing.  An out-of-range dim yields the value the OpenCL work-item
+* functions specify for invalid dimensions (1 for sizes and group counts,
+* 0 for ids and offsets) instead of reading a neighbouring slot.
+*----------------------------------------------------------------------------*/
+extern constant const uint kernel_config_l2[32];
+
+_CLC_DECL size_t get_local_id (uint dim);
+
+_CLC_INLINE uint get_work_dim (void) { return kernel_config_l2[0]; }
+_CLC_INLINE size_t get_global_size (uint dim) { return dim < 3 ? kernel_config_l2[1+dim] : 1; }
+_CLC_INLINE size_t get_local_size (uint dim) { return dim < 3 ? kernel_config_l2[4+dim] : 1; }
+_CLC_INLINE size_t get_global_offset(uint dim) { return dim < 3 ? kernel_config_l2[7+dim] : 0; }
+_CLC_INLINE size_t __get_global_first(uint dim) { return dim < 3 ? kernel_config_l2[10+dim] : 0; }
+_CLC_INLINE size_t get_num_groups (uint dim) { return get_global_size(dim) / get_local_size(dim); }
+_CLC_INLINE size_t get_global_id (uint dim)
+    { return dim < 3 ? __get_global_first(dim) + get_local_id(dim) : 0; }
+_CLC_INLINE size_t get_group_id (uint dim)
+    { return (__get_global_first(dim) - get_global_offset(dim)) / get_local_size(dim); }
+
+#endif //_DSP_CLC_H_
+
diff --git a/init/Makefile b/init/Makefile
new file mode 100644
index 0000000..3d29b1e
--- /dev/null
+++ b/init/Makefile
@@ -0,0 +1,25 @@
+PDK= /opt/ti/pdk_C6678_1_1_2_5
+SDK= /opt/ti/desktop-linux-sdk_01_00_00_07
+CC= $(CG)/bin/cl6x -g -k -mv6600 --abi=eabi --mem_model:const=data
+
+INCLUDES= -i $(PDK)/packages/ti/platform \
+ -i $(PDK)/packages \
+ -i $(CG)/include \
+ -i $(SDK)
+
+LIBS = $(CG)/lib/libc.a init.cmd
+
+all: init_dspc8681.out init_dspc8682.out
+
+init_dspc8681.out: main.obj
+ @$(CC) -z -w -x --rom_model $^ $(PDK)/packages/ti/platform/dspc8681/platform_lib/lib/debug/ti.platform.dspc8681.lite.lib $(LIBS) -o $@ -m init_dspc8681.map
+
+init_dspc8682.out: main.obj
+ @$(CC) -z -w -x --rom_model $^ $(PDK)/packages/ti/platform/dspc8682/platform_lib/lib/debug/ti.platform.dspc8682.lite.lib $(LIBS) -o $@ -m init_dspc8682.map
+
+%.obj : %.c
+ @$(CC) $(INCLUDES) -c $<
+
+# all and clean name actions, not files; mark both phony so that a stray file
+# called "all" or "clean" cannot suppress them.
+.PHONY: all clean
+clean:
+ @rm -fr *.obj *.out *.map
diff --git a/init/init.cmd b/init/init.cmd
new file mode 100644
index 0000000..6ca2178
--- /dev/null
+++ b/init/init.cmd
@@ -0,0 +1,83 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014 Texas Instruments Incorporated - http://www.ti.com
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * Neither the name of Texas Instruments Incorporated nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *****************************************************************************/
+-cr
+-heap 0x4000
+-stack 0x4000
+
+MEMORY
+{
+ L2SRAM (RWX) : org = 0x0860000, len = 0x10000
+
+ /*-------------------------------------------------------------------------
+ *------------------------------------------------------------------------*/
+ //L2SRAM_INIT(RWX) : org = 0x086FF00, len = 0x100
+
+ /*-------------------------------------------------------------------------
+ * reserved for the boot magic address. Ensures nothing near it so that
+ * any false sharing is avoided
+ *------------------------------------------------------------------------*/
+ L2SRAM_BOOT(RWX) : org = 0x087FF00, len = 0x100, fill = 0xFFFFFFFF
+}
+
+SECTIONS
+{
+ .c_int00 > 0x860000
+
+ /*-------------------------------------------------------------------------
+ * Boot configuration area. Host must know address.
+ *------------------------------------------------------------------------*/
+ .init_config > 0x86FF00, type NOINIT, palign(0x100)
+
+ .csl_vect > L2SRAM
+ .version > L2SRAM
+ platform_lib > L2SRAM
+ .text > L2SRAM
+
+ GROUP (NEAR_DP)
+ {
+ .neardata
+ .rodata
+ .bss
+ } load > L2SRAM
+
+ .stack > L2SRAM
+ .cinit > L2SRAM
+ .cio > L2SRAM
+ .const > L2SRAM
+ .data > L2SRAM
+ .switch > L2SRAM
+ .sysmem > L2SRAM
+ .far > L2SRAM
+ .testMem > L2SRAM
+ .fardata > L2SRAM
+}
diff --git a/init/main.c b/init/main.c
new file mode 100644
index 0000000..f53f9c4
--- /dev/null
+++ b/init/main.c
@@ -0,0 +1,485 @@
+/*
+ *
+ * Copyright (C) 2012-2014 Texas Instruments Incorporated - http://www.ti.com/
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * Neither the name of Texas Instruments Incorporated nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <platform.h>
+
+#include <ti/csl/csl_xmcAux.h>
+#include <ti/csl/csl_cpsgmiiAux.h>
+#include <ti/csl/cslr_cpsgmii.h>
+#include <ti/csl/csl_cpsw.h>
+#include <ti/csl/csl_cacheAux.h>
+
+#include "sdk/inc/initcfg.h"
+
+#define BOOT_MAGIC_ADDR 0x87FFFC // boot add - end of L2
+#define BOOT_MAGIC_CONTENTS (*((unsigned int *)BOOT_MAGIC_ADDR))
+#define DEF_INIT_CONFIG_UART_BAUDRATE 115200
+#define BOOTROM_RESET_VECTOR_LOCATION 0x20b0fc00
+#define DEVICE_INTCTL_BASE 0x1800000
+
+/* The mux registers are indexed from 1, not 0 */
+#define INTCTL_REG_MUX(x) (0x104 + (4*((x)-1)))
+
+/* 32-bit volatile register write / read at address x.  Both expansions are
+ * fully parenthesized so they behave as a single expression wherever used. */
+#define DEVICE_REG32_W(x,y) (*(volatile uint32_t *)(x)=(y))
+#define DEVICE_REG32_R(x) (*(volatile uint32_t *)(x))
+
+/****************************************************************************
+ * Generic bit extraction macros
+ *
+ * BOOTBITMASK(x,y)            mask with bits y..x (inclusive, x >= y) set.
+ * BOOT_READ_BITFIELD(z,x,y)   value of bits y..x of z, shifted down to bit 0.
+ * BOOT_SET_BITFIELD(z,f,x,y)  z with bits y..x replaced by the low bits of f.
+ *
+ * Every argument and every expansion is parenthesized so the macros can be
+ * used with arbitrary expressions and inside larger expressions.  A field
+ * covering all 32 bits (x=31, y=0) is not supported: the mask would need a
+ * shift by 32.
+ ****************************************************************************/
+#define BOOTBITMASK(x,y) ((((uint32_t)1 << \
+    ((uint32_t)(x) - (uint32_t)(y) + (uint32_t)1)) - (uint32_t)1) << (uint32_t)(y))
+
+#define BOOT_READ_BITFIELD(z,x,y) ((((uint32_t)(z)) & BOOTBITMASK(x,y)) >> (y))
+#define BOOT_SET_BITFIELD(z,f,x,y) ((((uint32_t)(z)) & ~BOOTBITMASK(x,y)) | \
+    ((((uint32_t)(f)) << (y)) & BOOTBITMASK(x,y)))
+
+#define DEVICE_INT_IPC 91
+#define DEVICE_CACHE_BASE 0x1840000
+#define CACHE_REG_L2WBIV 0x5004
+#define CACHE_REG_L1PINV 0x5028
+#define CACHE_REG_L1DWBIV 0x5044
+
+extern volatile unsigned int cregister ICR;
+extern volatile unsigned int cregister IFR;
+extern volatile unsigned int cregister ISTP;
+extern volatile unsigned int cregister CSR;
+extern volatile unsigned int cregister IER;
+extern volatile unsigned int cregister DNUM;
+extern volatile unsigned int cregister FADCR;
+extern volatile unsigned int cregister FMCR;
+
+#define TI667X_IRQ_EOI 0x21800050 // End of Interrupt Register
+#define LEGACY_A_IRQ_STATUS_RAW 0x21800180 // Raw Interrupt Status Register
+#define LEGACY_A_IRQ_STATUS 0x21800184 // Interrupt Enabled Status Register
+
+/* Addresses of the x-th IPCGR / IPCAR registers: 4 bytes apart, starting at
+ * the bases below.  The argument is parenthesized so expressions such as
+ * IPCGR(n + 1) index correctly. */
+#define IPCGR(x) (0x02620240 + (x)*4)
+#define IPCAR(x) (0x02620280 + (x)*4)
+
+/* Boot time init configuration */
+#pragma DATA_SECTION(init_config,".init_config");
+dsp_init_cfg_t init_config;
+
+/** ============================================================================
+ * @n@b Init_Switch
+ *
+ * @b Description
+ * @n This API sets up the ethernet switch subsystem and its Address Lookup
+ * Engine (ALE) in "Switch" mode: port 0 is enabled with VLAN awareness off,
+ * statistics are enabled on all ports, the ALE is enabled and cleared, and
+ * ALE ports 0, 1 and 2 are put into the forwarding state.
+ *
+ * @param[in]
+ * @n mtu Maximum Frame length to configure on the switch. The receive
+ * limit actually programmed is mtu + 14 + 4.
+ *
+ * @return
+ * @n None
+ * =============================================================================
+ */
+void Init_Switch (uint32_t mtu)
+{
+ CSL_CPSW_3GF_PORTSTAT portStatCfg;
+ uint32_t rx_max_len = mtu + 14 + 4; /* 4 bytes of FCS; the 14 is presumably the Ethernet header - confirm */
+ CSL_CPSW_3GF_ALE_PORTCONTROL alePortControlCfg;
+
+ /* Enable the CPPI port, i.e., port 0 that does all
+ * the data streaming in/out of EMAC.
+ */
+ CSL_CPSW_3GF_enablePort0 ();
+ CSL_CPSW_3GF_disableVlanAware ();
+ CSL_CPSW_3GF_setPort0VlanReg (0, 0, 0);
+ CSL_CPSW_3GF_setPort0RxMaxLen (rx_max_len);
+
+ /* Enable statistics on both the port groups:
+ *
+ * MAC Sliver ports - Port 1, Port 2
+ * CPPI Port - Port 0
+ */
+ portStatCfg.p0AStatEnable = 1;
+ portStatCfg.p0BStatEnable = 1;
+ portStatCfg.p1StatEnable = 1;
+ portStatCfg.p2StatEnable = 1;
+ CSL_CPSW_3GF_setPortStatsEnableReg (&portStatCfg);
+
+ /* Setup the Address Lookup Engine (ALE) Configuration:
+ * (1) Enable ALE.
+ * (2) Clear stale ALE entries.
+ * (3) Disable VLAN Aware lookups in ALE since
+ * we are not using VLANs by default.
+ * (4) Disable the ALE transmit rate limit.
+ * NOTE(review): an earlier revision of this comment also listed
+ * "configure the Unknown VLAN processing properties"; no call below
+ * does that.
+ */
+ CSL_CPSW_3GF_enableAle ();
+ CSL_CPSW_3GF_clearAleTable ();
+ CSL_CPSW_3GF_disableAleVlanAware ();
+ CSL_CPSW_3GF_disableAleTxRateLimit ();
+
+ /* Apply the same receive length limit to both MAC sliver ports */
+ CSL_CPGMAC_SL_setRxMaxLen(0, rx_max_len);
+ CSL_CPGMAC_SL_setRxMaxLen(1, rx_max_len);
+
+
+ /* Put ALE ports 0, 1 and 2 in the "Forward" state with learning left
+ * enabled, untagged frames accepted, no VID ingress check and no
+ * multicast/broadcast limits */
+ alePortControlCfg.portState = ALE_PORTSTATE_FORWARD;
+ alePortControlCfg.dropUntaggedEnable = 0;
+ alePortControlCfg.vidIngressCheckEnable = 0;
+ alePortControlCfg.noLearnModeEnable = 0;
+ alePortControlCfg.mcastLimit = 0;
+ alePortControlCfg.bcastLimit = 0;
+
+ CSL_CPSW_3GF_setAlePortControlReg (0, &alePortControlCfg);
+ CSL_CPSW_3GF_setAlePortControlReg (1, &alePortControlCfg);
+ CSL_CPSW_3GF_setAlePortControlReg (2, &alePortControlCfg);
+
+#ifdef SIMULATOR_SUPPORT
+ /* Simulator builds additionally enable ALE bypass */
+ CSL_CPSW_3GF_enableAleBypass();
+#endif
+ /* Done with switch configuration */
+ return;
+}
+
+/******************************************************************************
+* Init_MAC
+*
+* Resets the MAC sliver port macPortNum, busy-waits until the reset is
+* reported done, then enables full duplex, GMII, gigabit and external control
+* on that port.
+******************************************************************************/
+void Init_MAC(uint32_t macPortNum)
+{
+ CSL_CPGMAC_SL_resetMac (macPortNum);
+ /* Spin until the reset completes; there is no timeout */
+ while (CSL_CPGMAC_SL_isMACResetDone (macPortNum) != TRUE);
+
+ /* Setup the MAC Control Register for this port:
+ * (1) Enable Full duplex
+ * (2) Enable GMII
+ * (3) Enable Gigabit
+ * (4) Enable External Configuration. This enables
+ * the "Full duplex" and "Gigabit" settings to be
+ * controlled externally from SGMII
+ * Control/error frame and short frame settings are not touched here:
+ * no call below changes them.
+ */
+ CSL_CPGMAC_SL_enableFullDuplex (macPortNum);
+ CSL_CPGMAC_SL_enableGMII (macPortNum);
+ CSL_CPGMAC_SL_enableGigabit (macPortNum);
+ CSL_CPGMAC_SL_enableExtControl (macPortNum);
+}
+
+/******************************************************************************
+* flushCache
+*
+* With interrupts disabled: invalidates the XMC prefetch buffer, starts a
+* write-back/invalidate of all of L2 without waiting, then issues a memory
+* fence followed by 16 NOP cycles before restoring the interrupt state.
+******************************************************************************/
+static void flushCache (void)
+{
+ uint32_t key;
+
+ /* Disable Interrupts */
+ key = _disable_interrupts();
+ CSL_XMC_invalidatePrefetchBuffer();
+
+ /*-------------------------------------------------------------------------
+ * Also flushes L1P and L1D.
+ *------------------------------------------------------------------------*/
+ CACHE_wbInvAllL2(CACHE_NOWAIT);
+
+ /* The cache operation was started with CACHE_NOWAIT; the fence and NOPs
+ * below presumably give it time to complete - confirm against the CSL
+ * cache documentation */
+ _mfence();
+ asm(" NOP 9");
+ asm(" NOP 7");
+
+ /* Reenable Interrupts. */
+ _restore_interrupts(key);
+}
+
+/******************************************************************************
+* hwWbInvL1DInline
+*
+* Writes 1 to the register at DEVICE_CACHE_BASE + CACHE_REG_L1DWBIV. Going by
+* the register name this requests an L1D write-back/invalidate; the function
+* does not wait for completion.
+******************************************************************************/
+void inline hwWbInvL1DInline(void)
+{
+ DEVICE_REG32_W (DEVICE_CACHE_BASE + CACHE_REG_L1DWBIV, 1);
+}
+
+/*******************************************************************************
+ * FUNCTION PURPOSE: Setup a mux value
+ *******************************************************************************
+ * DESCRIPTION: Configures one of the 12 interrupt mux values, i.e. routes
+ *              event eventNum to interrupt vector 4-15. The event
+ *              previously routed to that vector is lost. Vectors
+ *              outside 4-15 have no mux field and are ignored.
+ ******************************************************************************/
+void hwIntctlRoute (uint32_t vector, uint32_t eventNum)
+{
+    uint32_t muxp;
+    uint32_t muxv;
+    uint32_t base;
+
+    /* Reject vectors 0-3 as well as 16 and above; only 4-15 are routable */
+    if (vector < 4 || vector > 15)
+        return; /* Invalid vector */
+
+    /* Vectors 4-7 live in mux register 1, 8-11 in 2, 12-15 in 3 */
+    muxp = vector / 4;
+
+    /* Which of the four events in each register (0-3) is determined by the
+     * two lsbs of the vector number. The least significant bit of the mux
+     * value (0, 8, 16, or 24) is found. */
+    base = (vector & 0x3) * 8;
+
+    /* Read the active mux, overwrite the event num with the desired value */
+    muxv = DEVICE_REG32_R (DEVICE_INTCTL_BASE + INTCTL_REG_MUX(muxp));
+    muxv = BOOT_SET_BITFIELD (muxv, eventNum, base+7, base);
+    DEVICE_REG32_W (DEVICE_INTCTL_BASE + INTCTL_REG_MUX(muxp), muxv);
+
+} /* hwIntctlRoute */
+
+
+
+
+/******************************************************************************
+ * FUNCTION PURPOSE: Idle
+ ******************************************************************************
+ * DESCRIPTION: The IPC interrupt is routed to the core, and idle is executed.
+ *              After wakeup, interrupts are globally disabled again,
+ *              vector 4 is masked and the pending bits are cleared.
+ ******************************************************************************/
+void idle_till_wakeup (void)
+{
+ uint32_t csrReg;
+
+ /* Clear any pending interrupts and route the IPC interrupt to vector 4.
+ * 0x0013 has bits 0, 1 and 4 set. */
+ ICR = 0x0013;
+ hwIntctlRoute (4, DEVICE_INT_IPC);
+
+ /* Enable vector 4 together with bits 0 and 1 */
+ IER = (1 << 4) | 3;
+
+ /* Point the ISTP to the ROM vector table */
+ ISTP = (uint32_t)BOOTROM_RESET_VECTOR_LOCATION;
+
+ /* Write back/invalidate L1D. Use inline since the magic address
+ * may live on the same cache line as the current stack */
+ hwWbInvL1DInline();
+
+ /* Globally enable interrupts (CSR bit 0) */
+ csrReg = CSR | 1;
+ CSR = csrReg;
+
+ asm(" NOP 4 ");
+ asm(" IDLE ");
+ asm(" NOP 4 ");
+
+ /* On wakeup disable interrupts and restore the system state */
+ csrReg = CSR & ~1;
+ CSR = csrReg;
+ IER &= (~(1 << 4));
+ ICR = 0x0013;
+}
+
+/******************************************************************************
+* wait_for_interrupt
+*
+* Core 0 (DNUM == 0) clears PCIe legacy interrupt A, polls the raw status
+* register until it reads exactly 0x1, then clears the interrupt again.
+* Every other core idles in idle_till_wakeup().
+*
+* All register accesses go through the volatile DEVICE_REG32_* macros so the
+* compiler must re-read the status register on every poll iteration and may
+* not drop or merge the writes.
+******************************************************************************/
+void wait_for_interrupt()
+{
+    if (DNUM == 0)
+    {
+        /* clear PCIe interrupt A */
+        DEVICE_REG32_W (LEGACY_A_IRQ_STATUS, 0x1);
+        DEVICE_REG32_W (TI667X_IRQ_EOI, 0x0);
+
+        /* busy-wait for the host to raise the interrupt; no timeout */
+        while (DEVICE_REG32_R (LEGACY_A_IRQ_STATUS_RAW) != 0x1);
+
+        /* clear PCIe interrupt A */
+        DEVICE_REG32_W (LEGACY_A_IRQ_STATUS, 0x1);
+        DEVICE_REG32_W (TI667X_IRQ_EOI, 0x0);
+    }
+    else idle_till_wakeup();
+}
+
+
+
+/*-----------------------------------------------------------------------------
+* Allocate memory for the system stack. The section will be sized by the linker.
+*
+* _stack itself is declared with only 8 bytes; it is 8-byte aligned and placed
+* in the ".stack" section. The symbols __TI_STACK_END and __TI_STATIC_BASE are
+* made global here and are referenced by the inline assembly in _c_int00 that
+* loads SP and DP.
+*----------------------------------------------------------------------------*/
+__asm("\t.global __TI_STACK_END");
+__asm("\t.global __TI_STATIC_BASE");
+
+#pragma DATA_ALIGN (_stack, 8);
+#pragma DATA_SECTION (_stack, ".stack");
+char _stack[8];
+
+/*-----------------------------------------------------------------------------
+* We will place the c_int00 entry point into its own section so that a linker
+* command file can guarantee its placement.
+*----------------------------------------------------------------------------*/
+#pragma CODE_SECTION (_c_int00, ".c_int00");
+
+/*****************************************************************************/
+/* C_INT00() - C ENVIRONMENT ENTRY POINT
+ *
+ * Takes no arguments. The body ends in an unconditional while (1) loop and
+ * contains no return statement.
+ *
+ * Sequence performed below, in order:
+ * 1. Configure L2 with CACHE_0KCACHE and L1D/L1P with CACHE_L1_32KCACHE.
+ * 2. Load SP from __TI_STACK_END - 4 (then align to 8) and DP from
+ * __TI_STATIC_BASE.
+ * 3. Zero fixed address ranges with memset (the 0x0bc00600 range only when
+ * DNUM == 0).
+ * 4. Clear FADCR and FMCR.
+ * 5. When DNUM == 0: call platform_init twice with different flags, set up
+ * the uart, both MACs and the switch.
+ * 6. Write 0 to BOOT_MAGIC_CONTENTS, call wait_for_interrupt(), then poll
+ * BOOT_MAGIC_CONTENTS and call through it whenever it is non-zero.
+ */
+/*****************************************************************************/
+extern void __interrupt _c_int00()
+{
+ /*-------------------------------------------------------------------------
+ * After a reset, the device should have invalidated caches. The caches will
+ * still be configured as they were prior to the reset. Since this code was
+ * loaded into L2, we will ensure that L2 is configured as all sram.
+ *------------------------------------------------------------------------*/
+ CACHE_setL2Size (CACHE_0KCACHE);
+ CACHE_setL1DSize(CACHE_L1_32KCACHE);
+ CACHE_setL1PSize(CACHE_L1_32KCACHE);
+
+ /*-------------------------------------------------------------------------
+ * Set up the stack pointer in b15.
+ * The stack pointer points 1 word past the top of the stack, so subtract
+ * 1 word from the size. also the sp must be aligned on an 8-byte boundary
+ *
+ * NOTE(review): the CACHE_set* calls above execute before SP is loaded
+ * here; presumably they are inlined and need no stack - confirm.
+ *------------------------------------------------------------------------*/
+ __asm("\t MVKL\t\t __TI_STACK_END - 4, SP");
+ __asm("\t MVKH\t\t __TI_STACK_END - 4, SP");
+ __asm("\t AND\t\t ~7,SP,SP");
+
+ /*-------------------------------------------------------------------------
+ * Set up the global data page pointer in b14.
+ *------------------------------------------------------------------------*/
+ __asm("\t MVKL\t\t __TI_STATIC_BASE,DP");
+ __asm("\t MVKH\t\t __TI_STATIC_BASE,DP");
+
+ /*-------------------------------------------------------------------------
+ * disable cache for all addrs over 0x1000:0000
+ * (zeroes 960 bytes of registers starting at 0x01848040)
+ *------------------------------------------------------------------------*/
+ memset((void*)0x01848040, 0, 960);
+
+ /*-------------------------------------------------------------------------
+ * disable mpax registers 3 and above
+ * (zeroes 104 bytes starting at offset 0x18 from 0x08000000)
+ *------------------------------------------------------------------------*/
+ memset((void*)0x08000018, 0, 104);
+
+ /*-------------------------------------------------------------------------
+ * disable msmc ses mpax registers except for the first one at each pri lev
+ *
+ * For each of 16 blocks of 0x40 bytes starting at 0x0bc00600, the first 8
+ * bytes are left untouched and the remaining 0x38 bytes are zeroed. Only
+ * done when DNUM == 0.
+ *------------------------------------------------------------------------*/
+ if (DNUM == 0)
+ {
+ int i;
+ for (i=0; i < 16; i++)
+ memset((void*)(0x0bc00600 + (i * 0x40) + 8), 0, 0x38);
+ }
+
+ /*-------------------------------------------------------------------------
+ * Set up floating point registers
+ *------------------------------------------------------------------------*/
+ FADCR = 0; FMCR = 0;
+
+ /*-------------------------------------------------------------------------
+ * Setup platform specifics, i.e. uarts, ethernet, etc.
+ *------------------------------------------------------------------------*/
+ if (DNUM == 0)
+ {
+ /*---------------------------------------------------------------------
+ * Check if Boot time init configuration is loaded
+ *
+ * This code is reading l2 memory written by the host. The values were
+ * written before this code began running, so the cache invalidate at
+ * reset should ensure that when we read these values we will miss l1 and
+ * read directly from l2.
+ *
+ * init_config.dsp_pll_multiplier is only used when init_config.magic_number
+ * equals 0xBABEFACE; otherwise pllm stays 0.
+ *
+ * NOTE(review): for the first platform_init call only config.pllm and the
+ * five flags fields assigned below are written; config and flags are not
+ * zeroed until after that call. If either struct has further members they
+ * are uninitialized at the first call - confirm against the struct
+ * definitions.
+ *--------------------------------------------------------------------*/
+ platform_init_config config;
+ config.pllm = 0; // Original configuration : default 0 -> 1 GHz
+ if (init_config.magic_number == 0xBABEFACE)
+ config.pllm = init_config.dsp_pll_multiplier;
+
+ /* Platform initialization */
+ platform_init_flags flags;
+ flags.pll = 0x1;
+ flags.ddr = 0x1;
+ flags.tcsl = 0x1;
+ flags.phy = 0x0;
+ flags.ecc = 0x1;
+
+ platform_init(&flags, &config);
+ platform_uart_init();
+ platform_uart_set_baudrate(DEF_INIT_CONFIG_UART_BAUDRATE);
+
+ memset((void*)&flags, 0 , sizeof(platform_init_flags)); /* second pass starts from all-zero flags and config */
+ memset((void*)&config, 0, sizeof(platform_init_config));
+
+ flags.pll = 0;
+ flags.ddr = 0;
+ flags.tcsl = 1;
+ flags.phy = 1;
+ flags.ecc = 0;
+ platform_init(&flags, &config); /* second call: only tcsl and phy are set */
+
+ Init_MAC(0);
+ Init_MAC(1);
+
+ Init_Switch(1506); /* NOTE(review): 1506 is presumably a maximum frame length - confirm */
+ }
+
+ /*-------------------------------------------------------------------------
+ * Once we write 0 to the boot magic addr, the host can proceed with the
+ * loading of another program that will subsequently run. We should ensure
+ * that the caches are written back and clean at this point so that a
+ * subsequent writeback operation will not clobber the program loaded
+ * from the host. It is also clearly important that the loaded program
+ * did not write over the L2 area containing the remainder of this code.
+ *
+ * The write back/invalidate covers the 64-byte aligned region that
+ * contains BOOT_MAGIC_ADDR.
+ *------------------------------------------------------------------------*/
+ BOOT_MAGIC_CONTENTS = 0;
+ CACHE_wbInvL1d((void*)(BOOT_MAGIC_ADDR & ~0x3f), 64, CACHE_WAIT);
+
+ wait_for_interrupt();
+
+ /*-------------------------------------------------------------------------
+ * We will now wait until an external entity writes the address to which we
+ * should jump
+ *------------------------------------------------------------------------*/
+ while (1)
+ {
+ /*---------------------------------------------------------------------
+ * invalidate the address so that we pick up the actual memory written by
+ * the external entity.
+ *--------------------------------------------------------------------*/
+ CACHE_invL1d((void*)(BOOT_MAGIC_ADDR & ~0x3f), 64, CACHE_WAIT);
+ void (*entry)() = (void (*)())(BOOT_MAGIC_CONTENTS);
+
+ /*---------------------------------------------------------------------
+ * If we have a non null pointer then we will branch to it. This
+ * essentially marks the end of this routine and the start of another,
+ * so we should ensure that the caches are written back and clean at
+ * this point so that the following program starts with a clean cache
+ * system.
+ *
+ * It is also clearly important that the loaded program did not write
+ * over the L2 area containing this reset code. Since this reset code
+ * is entirely resident in the last 1/4 of L2, this area should not
+ * contain initialized data or code in the following program.
+ *
+ * The entry is invoked as an ordinary call; if it ever returns, the
+ * loop simply polls the magic address again.
+ *--------------------------------------------------------------------*/
+ if (entry)
+ {
+ flushCache();
+ (*entry)();
+ }
+ }
+}
diff --git a/init_global_shared_mem/Makefile b/init_global_shared_mem/Makefile
new file mode 100644
index 0000000..da37749
--- /dev/null
+++ b/init_global_shared_mem/Makefile
@@ -0,0 +1,29 @@
+# Builds the init_global_shared_mem host utility as a static, 32-bit binary.
+# ${SDK} must be set by the caller; headers and archives come from ${SDK}/sdk.
+
+# Compiler name (with path, if not in path)
+CC=gcc -m32
+
+LIBS=${SDK}/sdk/dnldmgr/lib/dnldmgr.a \
+ ${SDK}/sdk/pciedrv/lib/pciedrv.a \
+ ${SDK}/sdk/cmem/lib/cmem_drv.a \
+ ${SDK}/sdk/sync/lib/sync.a
+
+# Mainly used for include paths
+INCLUDES=-I ${SDK}/sdk\
+ -I ${SDK}/sdk/pciedrv\
+ -I ${SDK}/sdk/dnldmgr\
+ -I inc\
+ -I ${SDK}/sdk/cmem
+
+# System libraries; kept last on the link line, after the objects and archives
+LDFLAGS=-lpciaccess -lpthread -lbfd
+
+# Object list is derived from the source list so the two cannot drift apart
+SOURCES= init_global_shared_mem.c
+OBJECTS= $(SOURCES:.c=.o)
+
+init_global_shared_mem: $(OBJECTS) $(LIBS)
+ $(CC) $(OBJECTS) $(LIBS) -static -o $@ $(LDFLAGS)
+
+%.o : %.c
+ $(CC) $(INCLUDES) $(DEFINES) -c $< -o $@
+
+# clean produces no file named "clean"; mark it phony so it always runs
+.PHONY: clean
+clean:
+ @rm -fr *.o init_global_shared_mem
+
diff --git a/init_global_shared_mem/README b/init_global_shared_mem/README
new file mode 100644
index 0000000..cc147c7
--- /dev/null
+++ b/init_global_shared_mem/README
@@ -0,0 +1 @@
+Must be built on a 32-bit machine, statically linked, and installed in the bin directory
diff --git a/init_global_shared_mem/init_global_shared_mem.c b/init_global_shared_mem/init_global_shared_mem.c
new file mode 100644
index 0000000..b511afd
--- /dev/null
+++ b/init_global_shared_mem/init_global_shared_mem.c
@@ -0,0 +1,73 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <string.h>
+#include <stdio.h>
+#include "stdint.h"
+#include "pciedrv.h"
+#include "cmem_drv.h"
+
+/*-----------------------------------------------------------------------------
+ * Requests persistent contiguous host memory from the cmem driver, opens the
+ * PCIe driver with a matching outbound block size, allocates a DSP memory
+ * range and maps the host buffer descriptors into it, then closes both
+ * drivers.
+ *
+ * Returns 0 when every driver call succeeds. On the first failing call an
+ * error message is printed and -1 is returned immediately.
+ *
+ * buf_desc and dsp_start_addr are zero-initialised so the driver calls never
+ * see indeterminate stack contents, and the success path ends in an explicit
+ * return so the exit status is defined in every C dialect.
+ *----------------------------------------------------------------------------*/
+int main(void)
+{
+ cmem_host_buf_desc_t buf_desc[32] = {0};
+ uint32_t dsp_start_addr = 0;
+ int size_of_buffer = 0x400000;
+
+ int ret = cmem_drv_open();
+ if (ret) { printf("\nERROR: dma mem driver open failed \n"); return(-1); }
+
+ ret = cmem_drv_alloc(0, size_of_buffer, HOST_BUF_TYPE_PERSISTENT, buf_desc);
+ if (ret) { printf("ERROR: contiguous memory alloc failed\n"); return -1; }
+
+ /*-------------------------------------------------------------------------
+ * configuration for pcie driver
+ *------------------------------------------------------------------------*/
+ pciedrv_open_config_t pciedrv_open_config;
+ memset(&pciedrv_open_config, 0 , sizeof(pciedrv_open_config_t));
+ pciedrv_open_config.dsp_outbound_block_size = size_of_buffer;
+
+ ret = pciedrv_open(&pciedrv_open_config);
+ if (ret) { printf("ERROR: pciedrv could not opened\n"); return -1; }
+
+ /*-------------------------------------------------------------------------
+ * Allocate dsp memory range
+ *------------------------------------------------------------------------*/
+ ret = pciedrv_dsp_memrange_alloc(0, 0, &dsp_start_addr);
+ if (ret) { printf("ERROR: memrange alloc failed \n"); return -1; }
+
+ /*-------------------------------------------------------------------------
+ * Map host buffer to dsp memory range
+ *------------------------------------------------------------------------*/
+ ret = pciedrv_map_hostbufs_to_dsp_memrange(0, 0, buf_desc, dsp_start_addr);
+ if (ret) { printf("ERROR: map dsp mem range failed \n"); return -1; }
+
+ ret = cmem_drv_close();
+ if (ret) { printf("ERROR: dma mem driver could not closed\n"); return -1; }
+
+ ret = pciedrv_close();
+ if (ret) { printf("ERROR: pciedrv could not closed\n"); return -1; }
+
+ return 0;
+}
diff --git a/opencl-manifest.docx b/opencl-manifest.docx
new file mode 100644
index 0000000..1633bdb
--- /dev/null
+++ b/opencl-manifest.docx
Binary files differ
diff --git a/opencl-manifest.pdf b/opencl-manifest.pdf
new file mode 100755
index 0000000..f7acaca
--- /dev/null
+++ b/opencl-manifest.pdf
Binary files differ
diff --git a/readme_shannon.txt b/readme_shannon.txt
new file mode 100644
index 0000000..565f27a
--- /dev/null
+++ b/readme_shannon.txt
@@ -0,0 +1,369 @@
+*-----------------------------------------------------------------------------
+Open CL(TM) 1.1 Product Version 0.4.0, from Texas Instruments, Inc.
+*-----------------------------------------------------------------------------
+
+*-----------------------------------------------------------------------------
+* INSTALLATION
+*-----------------------------------------------------------------------------
+1) This installation is not setup to coexist with other versions of this
+ product. This is due to environment variables that point into the
+ installation. Therefore, you should uninstall all previous versions of the
+ TI OpenCL product before installation of this version.
+
+2) The installation program modifies your shell's .rc file (e.g. .bashrc,
+ .tcshrc, .cshrc, etc) to create or append to three environment variables,
+ PATH, LD_LIBRARY_PATH, TI_OCL_INSTALL. You will need to re-source that .rc
+ file for the changes to take effect.
+
+3) The OpenCL product or some of the examples in the product are dependent on
+ Ubuntu packages that are not typically installed by default. This step will
+ install these packages and will require sudo privileges or an administrator
+ to execute. Execute the following commands:
+
+ sudo apt-get install libpciaccess-dev binutils-dev ia32-libs libsdl1.2-dev
+ sudo apt-get install mesa-common-dev
+
+Note 1) The OpenCL product is dependent on a kernel module that allows for
+ contiguous memory allocation on the Linux host. Kernel modules can be
+ specific to the Linux kernel version you are running. This package
+ contains the source for the module and is custom built for your linux
+ version as part of the installation process. The install (and uninstall)
+ package does require sudo privileges for portions of the install process
+ and will request an administrator password. The scripts
+ $TI_OCL_INSTALL/scripts/install.sh and $TI_OCL_INSTALL/scripts/uninstall.sh
+ can be inspected for details on the commands that are run and require sudo
+ privilege.
+
+
+*-----------------------------------------------------------------------------
+* SUMMARY OF DELTAS 0.1.6 from 0.1.5
+*-----------------------------------------------------------------------------
+
+*-----------------------------------------------------------------------------
+* SUMMARY OF DELTAS 0.1.5 from 0.1.4
+*-----------------------------------------------------------------------------
+- More reliable installation and uninstallation of the cmem module
+
+- Updated the C66 compiler tools to be based on version 7.5.0A13072
+
+*-----------------------------------------------------------------------------
+* SUMMARY OF DELTAS 0.1.4 from 0.1.3
+*-----------------------------------------------------------------------------
+- Updated to use the TI Desktop Linux SDK version 01.00.00.07
+
+- Updated internal LLVM usage from version 3.0 to 3.2 libraries
+
+- More accurate handling of the DSPC8682
+
+- Increased the OpenCL global buffer area from 992M to 1023M
+
+- Increased the OpenCL local buffer area from 128K to 256K
+
+- The default speed of the DSP was modified from 1.25 Ghz to 1.00 Ghz.
+ This was due to the fact that most of the devices on the Advantech cards
+ are qualified for only 1.00 Ghz and some instability was seen running at
+ 1.25 Ghz. See below for an environment variable you can set that will
+ change the DSP speed back to 1.25 GHz.
+
+- Added logic to reset certain persistent configurations of the DSP device
+ that could cause incorrect behavior when intermixing the run of an opencl
+ application with a non opencl application using the DSP devices in
+ a conflicting manner.
+
+- General bug fixes and stability improvements.
+
+*-----------------------------------------------------------------------------
+* SUMMARY OF DELTAS 0.1.3 from 0.1.2
+*-----------------------------------------------------------------------------
+- Updated to use all DSPs found in the PCIe subsystem. Previously it was
+ fixed to 4 dsps. This should allow multiple cards to be discovered and
+ used. It should also allow all 8 dsps on an octal card to be found.
+ Note: The octal card setup had not been tested.
+
+- Updated the mandelbrot demo to use an image size of 720 instead of 500. 720
+ was chosen because it is divisible by 4, 5, 8, 9, and 16 which allows for a
+ simple division of labor across a number of configurations. It was also
+ updated to print the names of all devices being used for pixel generation.
+
+*-----------------------------------------------------------------------------
+* SUMMARY OF DELTAS 0.1.2 from 0.1.1
+*-----------------------------------------------------------------------------
+- The DSP compiler would sometimes fail to compile OpenCL C code that included
+ vector types, ie float2, int3, etc. This bug has been resolved.
+
+- Certain OpenCL C kernels would cause a segmentation fault in the dynamic
+ loader in the OpenCL library. The gdb stack dump would list the function
+ process_rela_table.isra.7 as the faulting function. This bug has been
+ resolved.
+
+- Version 0.1.1 would not allow local (__local) address qualified variables to
+ be defined in function scope. This bug has been resolved.
+
+- The OpenCL C as_<type> builtins have been added.
+
+- Version 0.1.2 will introduce a 32bit version of the library.
+
+*-----------------------------------------------------------------------------
+* SUMMARY OF DELTAS 0.1.1 from 0.1.0
+*-----------------------------------------------------------------------------
+- Stability improvements.
+- Install and uninstall improvements. The product is no longer dependent on a
+ specific linux kernel version. It does still depend on a kernel module, but
+ the source for that kernel module is shipped with the installation packages
+ and is made on the users' machine.
+- Added example simple.
+
+*-----------------------------------------------------------------------------
+* PRODUCT DESCRIPTION
+*-----------------------------------------------------------------------------
+This product is an OpenCL 1.1 implementation. The OpenCL specification
+defines a platform model with a HOST and COMPUTE DEVICES. For this
+implementation the HOST is a 64-bit x86 Linux machine and the COMPUTE DEVICES
+are 4 Texas Instruments' TMS320C6678 DSP's resident on a PCIe card installed
+in the Linux machine. The x86_64 cpu is also exposed as a fifth compute device
+in this implementation.
+
+*-----------------------------------------------------------------------------
+* HARDWARE AND OS REQUIREMENTS
+*-----------------------------------------------------------------------------
+- Ubuntu 12.04 installation running on an x86 machine.
+- An installed Advantech DSPC8681 quad DSP PCIe card configured for little
+ endian operation.
+
+*-----------------------------------------------------------------------------
+* OPENCL DOCUMENTATION
+*-----------------------------------------------------------------------------
+The OpenCL 1.1 specification and the 1.1 C++ bindings specification from
+Khronos are included in $(TI_OCL_INSTALL)/doc.
+
+Additional OpenCL resources can be found on the web. Some links are provided
+below.
+
+The OpenCL 1.1 on-line manual pages can be found at:
+ http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/
+
+The following page contains links to other OpenCL resources, including books
+that may be helpful to you:
+ http://www.khronos.org/opencl/resources
+
+*-----------------------------------------------------------------------------
+* LIMITATIONS
+*-----------------------------------------------------------------------------
+
+- This is an early alpha version of this product. It is complete enough to be
+ useful under some circumstances and as such we would like to get feedback
+ from a select number of early adopters. However, it is by no means a
+ complete or compliant implementation. Taking an existing OpenCL application
+ and linking it against this implementation will not likely produce expected
+ results. Some of the major incomplete areas of the implementation are listed
+ below. The below list is not a complete list of limitations.
+
+- OpenCL C is not yet fully supported. In particular,
+ - Kernel arguments less than 32 bits in size cannot be passed to kernels.
+ - There is a limit of 10 arguments that may be passed to kernels.
+ - Structures may not be passed as arguments to kernels.
+ - Only a few OpenCL C built-in functions are supported.
+ - The math fcns that are also part of the std C library are supported.
+ - The work group identification functions are supported, i.e.
+ get_global_id(), get_local_id(), get_global_size(), etc.
+ - The barrier and synchronization functions are not supported.
+
+- OpenCL Out of order Queues (OOOQs) are not yet supported. OOOQs allow
+ enqueued kernels to be serviced before a prior kernel is completed. This
+ behavior is particularly beneficial if you enqueue tasks rather than
+ NDRange kernels. When OOOQs are supported up to 8 enqueued tasks can be in
+ flight simultaneously per DSP device. OOOQs will also allow overlapped I/O
+ and compute operations allowing a double buffering or pipeline operation.
+ For this alpha, only one operation at a time is active within a Queue.
+
+- An OpenCL ICD (Installable Client Driver) is provided with this product, but
+ it will not discover the TI OpenCL implementation. The OpenCL ICD is a
+ standard OpenCL library that will discover all installed OpenCL
+ implementations on a system and will allow the application to choose a
+ platform and dispatch through that platform's implementation. The TI OpenCL
+ implementation is not yet ICD compatible and therefore will not be
+ discovered. The ICD library can however be used to discover and dispatch to
+ other vendor implementations.
+
+- The clCreateBuffer flags CL_MEM_USE_HOST_PTR, CL_MEM_ALLOC_HOST_PTR and
+ CL_MEM_COPY_HOST_PTR are not yet implemented and will simply be
+ ignored.
+
+- The OpenCL clEnqueueMapBuffer and clEnqueueUnmapMemObject operations are not
+ yet supported.
+
+- OpenCL Images and Samplers are optional features for non GPU devices and are
+ not supported for the DSP devices.
+
+- The OpenCL api allows for either on-line or off-line compilation of OpenCL C
+ kernels. This release only supports the on-line compilation mode for OpenCL C
+ code. As a result, clCreateProgramWithBinary is not supported yet, nor is
+ querying OpenCL for the binaries associated with a Program object.
+
+ - Even though off-line compilation for OpenCL C code is not yet
+ supported, OpenCL C code can call standard C code functions and the
+ standard C code functions can be compiled off-line. An example
+ illustrating this flow is included in the examples sub-directory.
+ The standard C code functions that are called should not include
+ code that: resets the device, allocates memory blocks that may
+ conflict with the OpenCL runtime, change the cache configuration,
+ etc. OpenCL C code calling C++ code is not supported.
+
+ - Also, compilations of OpenCL C code are cached on the system. If you
+ run an OpenCL application that on-line compiles some OpenCL C code,
+ the resultant binaries are cached on the system and the next time
+ you run the opencl application, the compilation step is skipped and
+ the cached binaries are used. The caching only uses the OpenCL C
+ code and the compile options as a hash, so an example where the
+ OpenCL C code is calling a C function in a linked object file or
+ library and the object file or library is modified will result in an
+ execution of the OpenCL C linked against the older version of the
+ object. In this case you will need to clear the OpenCL C compile
+ cache, which can be accomplished with the command
+ "rm -f /tmp/opencl*".
+
+*-----------------------------------------------------------------------------
+* EXAMPLE OPERATION
+*-----------------------------------------------------------------------------
+
+There are several OpenCL examples shipped with the product. I'll explain the
+motivation behind each and the steps needed for execution.
+
+IMPORTANT NOTE: For any of these examples or any OpenCL code you write,
+execution of the code will sometimes appear to hang. This is due to a known
+issue in the first communication between the Host and the DSP. It occurs
+intermittently and will be fixed in later releases. There is a description in
+the LIMITATIONS section of this readme describing workarounds for this problem.
+
+PLATFORM EXAMPLE
+----------------
+The platform example uses the OpenCL C++ bindings to discover key platform and
+device information from the OpenCL implementation and print it to the screen.
+
+To print the information from the TI OpenCL implementation:
+
+ 1. cd $TI_OCL_INSTALL/examples/platform
+ 2. make
+ 3. ./platform
+
+To print the information from any other vendor's OpenCL implementation
+installed on the system:
+
+ 1. cd $TI_OCL_INSTALL/examples/platform
+ 2. make icd
+ 3. ./platform_icd
+
+The Makefile in this example directory also illustrates the difference between
+linking for the TI implementation of OpenCL and the ICD.
+
+SIMPLE EXAMPLE
+-------------
+This example simply illustrates the minimum steps needed to dispatch a kernel
+to one DSP device and read a buffer of data back.
+
+To run this example:
+ 1. cd $TI_OCL_INSTALL/examples/simple
+ 2. make
+ 3. ./simple
+
+
+MANDELBROT EXAMPLE
+------------------
+The mandelbrot example is a nicely visual OpenCL demo that uses OpenCL to
+generate the pixels of a mandelbrot set image. This example also uses the C++
+OpenCL binding. The OpenCL kernels are called repeatedly, each call generating
+an image zoomed in from the previous one. This repeats until the zoom factor
+reaches 1E15 or essentially the resolution of a double floating point value.
+
+This example illustrates several key OpenCL features:
+ - It illustrates 4 OpenCL Q's tied to each of the 4 DSPs and a dispatch
+ structure that allows the 4 DSPs to cooperatively generate pixel data.
+ - It also illustrates the event wait feature of OpenCL.
+ - It illustrates the separation of the one-time setup of OpenCL from the
+ repetitive enqueuing of kernels.
+ - It also illustrates the ease with which kernels can be shifted from one
+ device type to another.
+
+To run this demo:
+ 1. cd $TI_OCL_INSTALL/examples/mandelbrot
+ 2. make
+ 3. ./mandelbrot dsp
+ 4. ./mandelbrot cpu
+ 5. ./mandelbrot all
+
+Step 3 will run the pixel generating kernels on the DSPs.
+Step 4 will run the pixel generating kernels on all the CPU cores in the
+system.
+Step 5 will use both the DSPs and the CPU cores to generate the pixels.
+
+The makefile in this example is also ICD enabled. You can run:
+
+ 1. cd $TI_OCL_INSTALL/examples/mandelbrot
+ 2. make icd
+ 3. ./mandelbrot intel "If an Intel OpenCL implementation exists"
+ 4. ./mandelbrot nvidia "If an Nvidia OpenCL implementation exists"
+
+
+CCODE EXAMPLE
+-------------
+This example illustrates the TI extension to OpenCL that allows OpenCL C code
+to call standard C code that has been compiled off-line into an object file or
+static library. This mechanism can be used to allow optimized C or C callable
+assembly routines to be called from OpenCL C code. It can also be used to
+essentially dispatch a standard C function, by wrapping it with an OpenCL C
+wrapper. Calling C++ routines from OpenCL C is not yet supported. You should
+also ensure that the Standard C function and the call tree resulting from the
+standard C function do not allocate device memory, change the cache structure,
+or use any resources already being used by the OpenCL runtime.
+
+To run this example:
+ 1. cd $TI_OCL_INSTALL/examples/ccode
+ 2. make
+ 3. ./ccode
+
+*-----------------------------------------------------------------------------
+* ENVIRONMENT VARIABLES
+*-----------------------------------------------------------------------------
+TI_OCL_DSP_1_25GHZ: If this environment variable is set, then the DSPs will be
+ configured to run at 1.25Ghz instead of the standard 1.00
+ Ghz.
+
+TI_OCL_KEEP_FILES: When OpenCL C kernels are compiled for DSPs, they are
+ compiled to a binary .out file in the /tmp sub-directory.
+ They are then subsequently available for download to the
+ DSPs for running. The process of compiling generates
+ several intermediate files for each source file. The
+ OpenCL typically removes these temporary files. However,
+ it can sometimes be useful to inspect these files. This
+ environment variable can be set to instruct the runtime to
+ leave the temporary files in /tmp. This can be useful to
+ inspect the assembly file associated with the out file, to
+ see how well your code was optimized.
+
+TI_OCL_DEBUG_KERNEL: The TI IDE and debugger Code Composer Studio (CCS) is not
+ required for running OpenCL applications with this
+ product, but if you do have CCS installed and an emulator
+ connected to your PCIe card, you can set this environment
+ variable to enable assembly statement level debug of your
+ kernel. When set, this environment variable will instruct
+ the OpenCL runtime to pause before dispatch of a kernel.
+ While paused the runtime will display data to the user
+ indicating that a kernel dispatch is pending. It will
+ instruct the user to connect to the board through an
+ emulator and will display the appropriate breakpoint
+ address to use for the start of the kernel code. Having
+ CCS and the emulator insert itself into a running OpenCL
+ application can cause instability in the system in this
+ release and may require a power cycle to the board. Debug
+ capability has not been a focus for this alpha release and
+ will definitely improve in later releases. Setting up the
+ emulator and CCS is outside the scope of this readme. If
+ you do have those products, consult the documentation
+ specific to those products.
+
+*-----------------------------------------------------------------------------
+* NOTICES
+*-----------------------------------------------------------------------------
+
+* Product is based on a published Khronos Specification, and is expected to
+ pass the Khronos Conformance Testing Process. Current conformance status can
+ be found at www.khronos.org/conformance.
diff --git a/scripts/20-c6678.rules b/scripts/20-c6678.rules
new file mode 100644
index 0000000..39f0e69
--- /dev/null
+++ b/scripts/20-c6678.rules
@@ -0,0 +1,6 @@
+# Skip every rule below when the event is a device removal.
+ACTION=="remove", GOTO="c6678_end"
+
+# For pci subsystem events, run c6678_udev.sh with the device's sysfs path.
+# The script itself checks the vendor/device ids before changing anything.
+SUBSYSTEM=="pci", RUN+="/bin/sh /etc/udev/rules.d/c6678_udev.sh /sys/$env{DEVPATH}"
+# For cmem subsystem events, make /dev/cmem readable and writable by all users.
+SUBSYSTEM=="cmem", RUN+="/bin/chmod ugo+rw /dev/cmem"
+
+LABEL="c6678_end"
diff --git a/scripts/c6678_udev.sh b/scripts/c6678_udev.sh
new file mode 100644
index 0000000..e0d49a7
--- /dev/null
+++ b/scripts/c6678_udev.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+read vendor_value < $1/vendor
+read device_value < $1/device
+if [ "$vendor_value" = "0x104c" -a "$device_value" = "0xb005" ]
+then
+read switch_vendor_value < $1/../vendor
+read switch_device_value < $1/../device
+/usr/bin/find $1 -maxdepth 1 -name "resource*" -exec /bin/chmod ugo+rw {} +
+
+/bin/date >> /var/log/c6678_udev.log
+/bin/echo $1 : vendor:$vendor_value device:$device_value switch_vendor:$switch_vendor_value switch_device:$switch_device_value >>/var/log/c6678_udev.log
+fi
diff --git a/scripts/install.sh b/scripts/install.sh
new file mode 100755
index 0000000..3681ef0
--- /dev/null
+++ b/scripts/install.sh
@@ -0,0 +1,77 @@
+#!/bin/sh
+# Installs the cmem kernel module and the C6678 udev files.  Run it from the
+# product's scripts directory: the installation path is taken to be the parent
+# of the current working directory.  Each step that changes the system is
+# recorded in $ocl_path/.install_log.
+sudo -v
+
+#---------------------------------------------------------------------
+# Set variable to the installation path of the product
+#---------------------------------------------------------------------
+cd ..
+ocl_path=$(pwd)
+cd -
+
+#-----------------------------------------------------------------------------
+# Remember which steps this installation was first to perform (starting
+# with cmem), so that uninstall removes only those
+#-----------------------------------------------------------------------------
+rm -f "$ocl_path/.install_log"
+touch "$ocl_path/.install_log"
+sudo chmod ugo+rw "$ocl_path/.install_log"
+
+#---------------------------------------------------------------------
+# Load driver if not already installed
+#---------------------------------------------------------------------
+if [ ! -e /dev/cmem ]
+then
+    sudo insmod "$ocl_path/cmem/cmem_dev.ko"
+    echo "INSMOD $ocl_path/cmem/cmem_dev.ko" >> "$ocl_path/.install_log"
+fi
+
+#---------------------------------------------------------------------
+# Set permissions
+#---------------------------------------------------------------------
+sudo chmod ugo+rw /dev/cmem
+
+#---------------------------------------------------------------------
+# Copy cmem driver to kernel driver directory
+#---------------------------------------------------------------------
+kernel_name=$(uname -r)
+cmem_dir="/lib/modules/$kernel_name/kernel/drivers/cmem"
+if [ ! -e "$cmem_dir" ]
+then
+    sudo mkdir "$cmem_dir"
+    echo "MKDIR $cmem_dir" >> "$ocl_path/.install_log"
+fi
+
+if [ ! -e "$cmem_dir/cmem_dev.ko" ]
+then
+    sudo cp "$ocl_path/cmem/cmem_dev.ko" "$cmem_dir"
+    echo "CP $ocl_path/cmem/cmem_dev.ko $cmem_dir" >> "$ocl_path/.install_log"
+fi
+
+#---------------------------------------------------------------------
+# List the module in /etc/modules unless it is already there.  grep's
+# exit status is tested (rather than its count output) so the check
+# stays well-formed even when grep prints nothing.
+#---------------------------------------------------------------------
+if [ -f /etc/modules ] && ! sudo grep -q "cmem_dev" /etc/modules
+then
+    sudo sed -i '$ a cmem_dev' /etc/modules
+    echo "MODULE ADD cmem_dev to /etc/modules" >> "$ocl_path/.install_log"
+fi
+
+#---------------------------------------------------------------------
+# Refresh the module dependency information
+#---------------------------------------------------------------------
+sudo depmod -a
+
+#---------------------------------------------------------------------
+# Set pcie window permissions
+#---------------------------------------------------------------------
+"$ocl_path/bin/init_global_shared_mem"
+
+#---------------------------------------------------------------------
+#copy files to udev area
+#---------------------------------------------------------------------
+if [ ! -e /etc/udev/rules.d/c6678_udev.sh ]
+then
+    echo "UDEV files copied to /etc/udev/rules.d" >> "$ocl_path/.install_log"
+    sudo cp "$ocl_path/scripts/c6678_udev.sh" /etc/udev/rules.d/.
+    sudo cp "$ocl_path/scripts/20-c6678.rules" /etc/udev/rules.d/.
+    sudo touch /var/log/c6678_udev.log
+    sudo chmod ugo+rw /etc/udev/rules.d/20-c6678.rules
+    sudo chmod ugo+x /etc/udev/rules.d/c6678_udev.sh
+    sudo chmod ugo+rw /var/log/c6678_udev.log
+fi
diff --git a/scripts/uninstall.sh b/scripts/uninstall.sh
new file mode 100755
index 0000000..863e9c3
--- /dev/null
+++ b/scripts/uninstall.sh
@@ -0,0 +1,43 @@
+#!/bin/sh
+# Reverts the steps recorded in $ocl_path/.install_log, then deletes the log.
+# Run it from the product's scripts directory: the installation path is taken
+# to be the parent of the current working directory.
+sudo -v
+
+cd ..
+ocl_path=$(pwd)
+cd -
+
+initial_path=$(pwd)
+kernel_name=$(uname -r)
+install_log="$ocl_path/.install_log"
+
+# Without a log there is no record of what to undo; report failure.
+if [ ! -e "$install_log" ]
+then
+    echo "Install log not found"
+    exit 1
+fi
+
+# Unload the module only if the logged installation loaded it.
+if sudo grep -q "INSMOD" "$install_log"
+then
+    sudo rmmod cmem_dev
+fi
+
+# Remove the driver directory only if the logged installation created it.
+if sudo grep -q "MKDIR /lib/modules" "$install_log"
+then
+    sudo rm -rf "/lib/modules/$kernel_name/kernel/drivers/cmem"
+fi
+
+# Drop the /etc/modules entry only if the logged installation added it.
+if sudo grep -q "MODULE ADD" "$install_log"
+then
+    sudo sed -i '/cmem_dev/d' /etc/modules
+fi
+
+# Remove the udev files only if the logged installation copied them.
+if sudo grep -q "UDEV" "$install_log"
+then
+    sudo rm -f /etc/udev/rules.d/20-c6678.rules
+    sudo rm -f /etc/udev/rules.d/c6678_udev.sh
+fi
+
+cd "$initial_path"
+rm "$install_log"
diff --git a/src/.gitignore b/src/.gitignore
new file mode 100644
index 0000000..ec5d309
--- /dev/null
+++ b/src/.gitignore
@@ -0,0 +1,3 @@
+*.o
+CMakeFiles/
+cmake_install.cmake
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..7b60902
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,241 @@
+
+# Platform-specific header search paths; TARGET_INCLUDES is appended to the
+# include_directories() list below.
+# SHANNON builds add the init directory and the SDK component directories.
+if (SHANNON_BUILD)
+ SET (TARGET_INCLUDES
+ ${PROJECT_SOURCE_DIR}/init
+ ${SDK}/sdk
+ ${SDK}/sdk/config
+ ${SDK}/sdk/pciedrv
+ ${SDK}/sdk/cmem
+ ${SDK}/sdk/bufmgr
+ ${SDK}/sdk/mailBox
+ ${SDK}/sdk/dnldmgr )
+# Cross-compiling needs additional paths to find target OS headers
+# and non-system headers found on the host (BOOST,GL)
+elseif (HAWKING_CROSS_COMPILE)
+ SET (TARGET_INCLUDES
+ ${CMAKE_FIND_ROOT_PATH}
+ ${HOST_USR_INCLUDE_PATH} )
+endif()
+
+
+# Header search paths: project headers, LLVM/Clang, the build directory
+# (which holds generated files), the DSP loader, llvmopencl and the
+# platform-specific paths chosen above.
+include_directories (
+ ${PROJECT_SOURCE_DIR}/include
+ ${PROJECT_SOURCE_DIR}/src
+ ${LLVM_INCLUDE_DIR}
+ ${CLANG_INCLUDE_DIRS}
+ ${CMAKE_CURRENT_BINARY_DIR}
+ ${PROJECT_SOURCE_DIR}/src/core/dsp/ocl_load/DLOAD_API
+ ${PROJECT_SOURCE_DIR}/src/core/dsp/ocl_load/DLOAD
+ ${PROJECT_SOURCE_DIR}/src/llvmopencl
+ ${TARGET_INCLUDES}
+ )
+
+# C++ flags common to every configuration.
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpermissive")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_FILE_OFFSET_BITS=64")
+
+# bfd.h has a check to ensure that config.h is included
+# We don't require config.h (autotools) so we bypass this check by defining
+# PACKAGE, and PACKAGE_VERSION
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DPACKAGE=${PROJECT_NAME} -DPACKAGE_VERSION=${${PROJECT_NAME}_VERSION}")
+
+# Toggle below if wanting to build with debug
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-inline -g")
+#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -fno-inline -g")
+#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+
+# Temporary to work around hyperlink problem
+# Append to CMAKE_C_FLAGS, the variable CMake actually reads for C sources,
+# so that C flags set earlier (toolchain file, command line) are kept.
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__ARMv7 -DDEVICE_K2H")
+
+if (SHANNON_BUILD)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDSPC868X")
+endif()
+
+# Generate core/config.h in the build directory from its template.
+configure_file(core/config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/core/config.h)
+
+# Sources of the OpenCL library used in every configuration: the API entry
+# points (api/), the core runtime (core/) and the CPU device (core/cpu/).
+set(COAL_SRC_FILES
+ api/api_command.cpp
+ api/api_device.cpp
+ api/api_event.cpp
+ api/api_kernel.cpp
+ api/api_platform.cpp
+ api/api_program.cpp
+ api/api_context.cpp
+ api/api_enqueue.cpp
+ api/api_flush.cpp
+ api/api_memory.cpp
+ api/api_profiling.cpp
+ api/api_sampler.cpp
+ api/api_gl.cpp
+
+ core/context.cpp
+ core/commandqueue.cpp
+ core/memobject.cpp
+ core/events.cpp
+ core/program.cpp
+ core/compiler.cpp
+ core/kernel.cpp
+ core/sampler.cpp
+ core/object.cpp
+ core/platform.cpp
+ core/icd.cpp
+ core/util.cpp
+
+ core/cpu/buffer.cpp
+ core/cpu/device.cpp
+ core/cpu/kernel.cpp
+ core/cpu/program.cpp
+ core/cpu/worker.cpp
+ core/cpu/builtins.cpp
+ core/cpu/sampler.cpp
+
+ # Headers located in the build directory under runtime/.
+ ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib.h.embed.h
+ ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib.c.bc.embed.h
+ ${CMAKE_CURRENT_BINARY_DIR}/runtime/builtins_impl.h
+ ${CMAKE_CURRENT_BINARY_DIR}/runtime/builtins_def.h
+ ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib_impl.h
+ ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib_def.h
+)
+
+# Unless SHAMROCK_BUILD is set, also compile the DSP device (core/dsp/) and
+# the llvmopencl sources.
+if (NOT SHAMROCK_BUILD)
+list (APPEND COAL_SRC_FILES
+ core/dsp/genfile_cache.cpp
+ core/dsp/program.cpp
+ core/dsp/wga.cpp
+ core/dsp/driver.cpp
+ core/dsp/buffer.cpp
+ core/dsp/device.cpp
+ core/dsp/kernel.cpp
+ core/dsp/worker.cpp
+
+ llvmopencl/AllocasToEntry.cc
+ llvmopencl/BarrierBlock.cc
+ llvmopencl/BarrierTailReplication.cc
+ llvmopencl/BreakConstantGEPs.cpp
+ llvmopencl/CanonicalizeBarriers.cc
+ llvmopencl/Flatten.cc
+ llvmopencl/GenerateHeader.cc
+ llvmopencl/ImplicitLoopBarriers.cc
+ llvmopencl/IsolateRegions.cc
+ llvmopencl/Kernel.cc
+ llvmopencl/LLVMUtils.cc
+ llvmopencl/LoopBarriers.cc
+ llvmopencl/ParallelRegion.cc
+ llvmopencl/PHIsToAllocas.cc
+ llvmopencl/TargetAddressSpaces.cc
+ llvmopencl/VariableUniformityAnalysis.cc
+ llvmopencl/WIVectorize.cc
+ llvmopencl/Workgroup.cc
+ llvmopencl/WorkItemAliasAnalysis.cc
+ llvmopencl/WorkitemHandler.cc
+ llvmopencl/WorkitemHandlerChooser.cc
+ llvmopencl/WorkitemLoops.cc
+ llvmopencl/WorkitemReplication.cc
+)
+endif(NOT SHAMROCK_BUILD)
+
+# The builtins subdirectory is only part of SHAMROCK builds; runtime is
+# always built.
+if (SHAMROCK_BUILD)
+add_subdirectory(builtins)
+endif()
+add_subdirectory(runtime)
+
+# Build the shared OpenCL library from the collected sources, linked with
+# -Bsymbolic.
+set(CMAKE_SHARED_LINKER_FLAGS "-Wl,-Bsymbolic")
+add_library(OpenCL SHARED ${COAL_SRC_FILES})
+
+# These headers do not exist at configure time; mark them GENERATED so CMake
+# does not require them to be present yet.
+set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib.h.embed.h
+ PROPERTIES GENERATED 1)
+set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib.c.bc.embed.h
+ PROPERTIES GENERATED 1)
+set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/runtime/builtins_impl.h
+ PROPERTIES GENERATED 1)
+set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/runtime/builtins_def.h
+ PROPERTIES GENERATED 1)
+set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib_impl.h
+ PROPERTIES GENERATED 1)
+set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib_def.h
+ PROPERTIES GENERATED 1)
+
+# Target-level build ordering: the listed targets are built before OpenCL.
+add_dependencies(OpenCL generate_stdlib_c)
+
+if (NOT SHAMROCK_BUILD)
+add_dependencies(OpenCL generate_builtins)
+add_dependencies(OpenCL oclload)
+add_dependencies(OpenCL generate_dsp_builtins)
+add_subdirectory(core/dsp/ocl_load)
+else()
+# SHAMROCK builds chain bc files -> builtin lib -> stdlib generation.
+add_dependencies(generate_builtin_lib generate_bc_files)
+add_dependencies(generate_stdlib_c generate_builtin_lib)
+endif (NOT SHAMROCK_BUILD)
+
+if (HAWKING_BUILD)
+ add_dependencies(OpenCL arm_clocl)
+endif()
+
+if (HAWKING_CROSS_COMPILE OR SHANNON_BUILD)
+ add_dependencies(OpenCL x86_clocl)
+endif()
+
+# Place built libraries in <build dir>/lib.
+SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
+
+# Version and soname version of the shared library.
+SET_TARGET_PROPERTIES(OpenCL PROPERTIES
+ VERSION ${${PROJECT_NAME}_VERSION}
+ SOVERSION ${${PROJECT_NAME}_SOVERSION}
+)
+
+# Compile every library source with the LLVM compile flags.  The value is
+# quoted so that the property/value pair stays well-formed even when
+# LLVM_COMPILE_FLAGS is empty.
+set_source_files_properties(${COAL_SRC_FILES}
+ PROPERTIES COMPILE_FLAGS "${LLVM_COMPILE_FLAGS}")
+
+set_target_properties(OpenCL PROPERTIES
+ LINK_FLAGS "${LLVM_LDFLAGS}"
+ LINK_INTERFACE_LIBRARIES "")
+
+# Libraries linked into the OpenCL library in every configuration.
+set (LIBS
+ ${CLANG_LIBS}
+ ${LLVM_LIBS_CORE}
+ ${LLVM_LIBS_JIT}
+ pthread
+ rt
+ dl
+ z
+ tinfo
+ m
+)
+
+# Platform-specific libraries: SDK static libraries for SHANNON, devkit
+# shared libraries for HAWKING.
+if (SHANNON_BUILD)
+ LIST (APPEND LIBS
+ ${PROJECT_BINARY_DIR}/lib/liboclload.a
+ ${SDK}/sdk/pciedrv/lib/pciedrv.a
+ ${SDK}/sdk/dnldmgr/lib/dnldmgr.a
+ ${SDK}/sdk/cmem/lib/cmem_drv.a
+ ${SDK}/sdk/bufmgr/lib/bufmgr.a
+ ${SDK}/sdk/mailBox/host/lib/mailBox.a
+ pciaccess
+ )
+elseif(HAWKING_BUILD)
+ LIST (APPEND LIBS
+ ${PROJECT_BINARY_DIR}/lib/liboclload.a
+ ${LINUX_DEVKIT_ROOT}/usr/lib/libmpmmailbox.so
+ ${LINUX_DEVKIT_ROOT}/usr/lib/libmpmtransport.so
+ ${LINUX_DEVKIT_ROOT}/usr/lib/libmpmclient.so
+ ${LINUX_DEVKIT_ROOT}/usr/lib/libticmem.so
+ ${LINUX_DEVKIT_ROOT}/usr/lib/libkeystonemmap.so
+ # We don't really depend on libhyplnk but link against it
+ # to work around an mscsk issue.
+ ${LINUX_DEVKIT_ROOT}/usr/lib/libhyplnk_k2h.so)
+endif()
+
+# ffi, bfd and sqlite3 are only linked when SHAMROCK_BUILD is not set.  When
+# cross compiling they are taken from the devkit tree; otherwise CMake
+# searches for them.
+if (NOT SHAMROCK_BUILD)
+if (HAWKING_CROSS_COMPILE)
+ SET(FFI_LIB ${LINUX_DEVKIT_ROOT}/usr/lib/libffi.so.6)
+ SET(BFD_LIB ${LINUX_DEVKIT_ROOT}/usr/lib/libbfd.so)
+ SET(SQLITE3_LIB ${LINUX_DEVKIT_ROOT}/usr/lib/libsqlite3.so.0)
+else()
+ find_library(FFI_LIB ffi)
+ find_library(BFD_LIB bfd)
+ find_library(SQLITE3_LIB sqlite3)
+endif()
+
+LIST (APPEND LIBS ${FFI_LIB} ${BFD_LIB} ${SQLITE3_LIB})
+endif (NOT SHAMROCK_BUILD)
+
+# Link the library against everything collected in LIBS and install it
+# under lib.
+TARGET_LINK_LIBRARIES(OpenCL ${LIBS})
+install(TARGETS OpenCL LIBRARY DESTINATION lib ${OCL_FPERMS})
diff --git a/src/api/api_command.cpp b/src/api/api_command.cpp
new file mode 100644
index 0000000..e9972c6
--- /dev/null
+++ b/src/api/api_command.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_command.cpp
+ * \brief Command queues
+ */
+
+#include <core/commandqueue.h>
+#include <core/deviceinterface.h>
+#include <core/context.h>
+
+#include <CL/cl.h>
+
+// Command Queue APIs
+/**
+ * \brief Create a command queue for \p device inside \p context
+ * \param context context the queue belongs to
+ * \param device device the queue is created for
+ * \param properties properties forwarded to the queue constructor
+ * \param errcode_ret receives the status code, may be null
+ * \return the new queue, or 0 when a handle is invalid or construction failed
+ */
+cl_command_queue
+clCreateCommandQueue(cl_context context,
+                     cl_device_id device,
+                     cl_command_queue_properties properties,
+                     cl_int * errcode_ret)
+{
+    // A null errcode_ret is allowed: report into a throw-away local instead.
+    cl_int ignored_status;
+    cl_int *status = errcode_ret ? errcode_ret : &ignored_status;
+
+    if (!device->isA(Coal::Object::T_Device))
+    {
+        *status = CL_INVALID_DEVICE;
+        return 0;
+    }
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *status = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    *status = CL_SUCCESS;
+
+    Coal::Context *ctx = (Coal::Context *)context;
+    Coal::DeviceInterface *dev = (Coal::DeviceInterface *)device;
+    Coal::CommandQueue *queue = new Coal::CommandQueue(ctx, dev, properties,
+                                                       status);
+
+    if (*status == CL_SUCCESS)
+        return (_cl_command_queue *)queue;
+
+    // The constructor reported an error: destroy the queue again.
+    delete queue;
+    return 0;
+}
+
+/**
+ * \brief Add a reference to a command queue
+ * \return CL_SUCCESS, or CL_INVALID_COMMAND_QUEUE for any other handle
+ */
+cl_int
+clRetainCommandQueue(cl_command_queue command_queue)
+{
+    if (command_queue->isA(Coal::Object::T_CommandQueue))
+    {
+        command_queue->reference();
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_COMMAND_QUEUE;
+}
+
+/**
+ * \brief Flush a command queue and drop one reference to it
+ * \return CL_SUCCESS, or CL_INVALID_COMMAND_QUEUE for any other handle
+ */
+cl_int
+clReleaseCommandQueue(cl_command_queue command_queue)
+{
+    if (command_queue->isA(Coal::Object::T_CommandQueue))
+    {
+        // Flush before giving up the reference.
+        command_queue->flush();
+
+        // The queue is deleted when dereference() says so.
+        if (command_queue->dereference())
+            delete command_queue;
+
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_COMMAND_QUEUE;
+}
+
+/**
+ * \brief Query information about a command queue
+ * \return the result of the queue's info(), or CL_INVALID_COMMAND_QUEUE when
+ *         \p command_queue is not a command queue
+ */
+cl_int
+clGetCommandQueueInfo(cl_command_queue command_queue,
+                      cl_command_queue_info param_name,
+                      size_t param_value_size,
+                      void * param_value,
+                      size_t * param_value_size_ret)
+{
+    if (command_queue->isA(Coal::Object::T_CommandQueue))
+    {
+        // Valid handle: the queue object answers the query itself.
+        return command_queue->info(param_name, param_value_size, param_value,
+                                   param_value_size_ret);
+    }
+
+    return CL_INVALID_COMMAND_QUEUE;
+}
+
+/**
+ * \brief Change properties of a command queue
+ * \return the result of the queue's setProperty(), or
+ *         CL_INVALID_COMMAND_QUEUE when \p command_queue is not a queue
+ */
+cl_int
+clSetCommandQueueProperty(cl_command_queue command_queue,
+                          cl_command_queue_properties properties,
+                          cl_bool enable,
+                          cl_command_queue_properties * old_properties)
+{
+    if (command_queue->isA(Coal::Object::T_CommandQueue))
+        return command_queue->setProperty(properties, enable, old_properties);
+
+    return CL_INVALID_COMMAND_QUEUE;
+}
diff --git a/src/api/api_context.cpp b/src/api/api_context.cpp
new file mode 100644
index 0000000..abe7be6
--- /dev/null
+++ b/src/api/api_context.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_context.cpp
+ * \brief Contexts
+ */
+
+#include <CL/cl.h>
+#include <core/context.h>
+#include <core/platform.h>
+#include <stdlib.h>
+
+// Context APIs
+
+/**
+ * \brief Create a context for an explicit list of devices
+ * \param properties properties forwarded to the context constructor
+ * \param num_devices number of entries in \p devices, must not be 0
+ * \param devices device list, must not be null
+ * \param pfn_notify optional notification callback
+ * \param user_data callback argument, only allowed with \p pfn_notify
+ * \param errcode_ret receives the status code, may be null
+ * \return the new context, or 0 on failure
+ */
+cl_context
+clCreateContext(const cl_context_properties *properties,
+                cl_uint num_devices,
+                const cl_device_id * devices,
+                void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
+                void * user_data,
+                cl_int * errcode_ret)
+{
+    // A null errcode_ret is allowed: report into a throw-away local instead.
+    cl_int ignored_status;
+    cl_int *status = errcode_ret ? errcode_ret : &ignored_status;
+
+    const bool have_devices = devices && num_devices;
+    const bool orphan_user_data = user_data && !pfn_notify;
+
+    if (!have_devices || orphan_user_data)
+    {
+        *status = CL_INVALID_VALUE;
+        return 0;
+    }
+
+    *status = CL_SUCCESS;
+    Coal::Context *created = new Coal::Context(properties, num_devices,
+                                               devices, pfn_notify, user_data,
+                                               status);
+
+    if (*status == CL_SUCCESS)
+        return (_cl_context *)created;
+
+    // The constructor reported an error: destroy the context again.
+    delete created;
+    return 0;
+}
+
+/**
+ * \brief Create a context holding every device of \p device_type
+ *
+ * The devices are looked up on the single platform with clGetDeviceIDs() and
+ * handed to clCreateContext().
+ *
+ * \param properties properties forwarded to clCreateContext()
+ * \param device_type type of the devices to put in the context
+ * \param pfn_notify optional notification callback
+ * \param user_data callback argument
+ * \param errcode_ret receives the status code, may be null
+ * \return the new context, or NULL on failure. An error from the device
+ *         lookup is reported unchanged, so a type without devices yields
+ *         CL_DEVICE_NOT_FOUND.
+ */
+cl_context
+clCreateContextFromType(const cl_context_properties *properties,
+                        cl_device_type device_type,
+                        void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
+                        void * user_data,
+                        cl_int * errcode_ret)
+{
+    // Everything is initialised up front: all error paths jump to bail.
+    cl_device_id *devices = NULL;
+    cl_uint num_devices = 0;
+    cl_int local_error = CL_SUCCESS;
+    cl_context result = NULL;
+
+    // First call only counts the matching devices.
+    local_error = clGetDeviceIDs(&the_platform, device_type, 0, NULL,
+                                 &num_devices);
+    if (local_error != CL_SUCCESS) goto bail;
+    if (!num_devices) { local_error = CL_DEVICE_NOT_FOUND; goto bail; }
+
+    devices = (cl_device_id*) malloc(num_devices * sizeof(cl_device_id));
+    if (!devices) { local_error = CL_OUT_OF_HOST_MEMORY; goto bail; }
+
+    // Second call fills the list.
+    local_error = clGetDeviceIDs(&the_platform, device_type, num_devices,
+                                 devices, 0);
+
+    if (local_error != CL_SUCCESS) { free (devices); goto bail; }
+
+    result = clCreateContext(properties, num_devices, devices, pfn_notify, user_data,
+                             &local_error);
+
+    // The temporary list is released once clCreateContext() has returned.
+    free (devices);
+
+bail:
+    if (errcode_ret)
+        *errcode_ret = local_error;
+
+    return result;
+}
+
+/**
+ * \brief Add a reference to a context
+ * \return CL_SUCCESS, or CL_INVALID_CONTEXT for any other handle
+ */
+cl_int
+clRetainContext(cl_context context)
+{
+    if (context->isA(Coal::Object::T_Context))
+    {
+        context->reference();
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_CONTEXT;
+}
+
+/**
+ * \brief Drop one reference to a context
+ * \return CL_SUCCESS, or CL_INVALID_CONTEXT for any other handle
+ */
+cl_int
+clReleaseContext(cl_context context)
+{
+    if (context->isA(Coal::Object::T_Context))
+    {
+        // The context is deleted when dereference() says so.
+        if (context->dereference())
+            delete context;
+
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_CONTEXT;
+}
+
+/**
+ * \brief Query information about a context
+ * \return the result of the context's info(), or CL_INVALID_CONTEXT when
+ *         \p context is not a context
+ */
+cl_int
+clGetContextInfo(cl_context context,
+                 cl_context_info param_name,
+                 size_t param_value_size,
+                 void * param_value,
+                 size_t * param_value_size_ret)
+{
+    if (context->isA(Coal::Object::T_Context))
+    {
+        // Valid handle: the context object answers the query itself.
+        return context->info(param_name, param_value_size, param_value,
+                             param_value_size_ret);
+    }
+
+    return CL_INVALID_CONTEXT;
+}
diff --git a/src/api/api_device.cpp b/src/api/api_device.cpp
new file mode 100644
index 0000000..052f0b4
--- /dev/null
+++ b/src/api/api_device.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_device.cpp
+ * \brief Devices
+ */
+
+#include "CL/cl.h"
+#include <core/platform.h>
+#include <core/deviceinterface.h>
+
+/**
+ * \brief List the devices of a platform that match \p device_type
+ * \param platform platform to query, null selects the only platform
+ * \param device_type type of the wanted devices
+ * \param num_entries number of entries available in \p devices
+ * \param devices receives the devices, may be null
+ * \param num_devices receives the count returned by getDevices(), may be null
+ * \return CL_SUCCESS, CL_INVALID_PLATFORM, CL_INVALID_VALUE or
+ *         CL_DEVICE_NOT_FOUND
+ */
+cl_int
+clGetDeviceIDs(cl_platform_id platform,
+               cl_device_type device_type,
+               cl_uint num_entries,
+               cl_device_id * devices,
+               cl_uint * num_devices)
+{
+    // Only one platform is implemented; a null handle means that platform.
+    if (!platform)
+        platform = &the_platform;
+
+    if (platform != &the_platform)
+        return CL_INVALID_PLATFORM;
+
+    // Reject a device array without room, and a call with no output at all.
+    const bool array_without_room = (devices != 0) && (num_entries == 0);
+    const bool nothing_requested = (devices == 0) && (num_devices == 0);
+
+    if (array_without_room || nothing_requested)
+        return CL_INVALID_VALUE;
+
+    int found = platform->getDevices(device_type, num_entries, devices);
+
+    if (num_devices)
+        *num_devices = found;
+
+    return (found == 0) ? CL_DEVICE_NOT_FOUND : CL_SUCCESS;
+}
+
+/**
+ * \brief Query information about a device
+ * \return the result of the device's info(), or CL_INVALID_DEVICE when
+ *         \p device is not a device
+ */
+cl_int
+clGetDeviceInfo(cl_device_id device,
+                cl_device_info param_name,
+                size_t param_value_size,
+                void * param_value,
+                size_t * param_value_size_ret)
+{
+    if (device->isA(Coal::Object::T_Device))
+    {
+        // The query is answered through the DeviceInterface view of the handle.
+        Coal::DeviceInterface *dev = (Coal::DeviceInterface *)device;
+        return dev->info(param_name, param_value_size, param_value,
+                         param_value_size_ret);
+    }
+
+    return CL_INVALID_DEVICE;
+}
diff --git a/src/api/api_enqueue.cpp b/src/api/api_enqueue.cpp
new file mode 100644
index 0000000..5ed3b1a
--- /dev/null
+++ b/src/api/api_enqueue.cpp
@@ -0,0 +1,823 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_enqueue.cpp
+ * \brief Events
+ */
+
+#include <CL/cl.h>
+
+#include <core/events.h>
+#include <core/memobject.h>
+
+#include <cstdlib>
+#include <stdio.h>
+
+/**
+ * \brief Push \p command on \p queue, optionally returning it and waiting
+ * \param queue queue receiving the command
+ * \param command command to queue
+ * \param event when not null, receives \p command as a cl_event after one
+ *        reference has been added for the caller
+ * \param blocking when true, wait for the command with clWaitForEvents()
+ * \return CL_SUCCESS, or the error returned by the queue or by the wait
+ *
+ * NOTE(review): on both failure paths \p command is deleted while \p *event,
+ * if requested, still holds its address - confirm callers never use it.
+ */
+static inline cl_int queueEvent(Coal::CommandQueue *queue,
+ Coal::Event *command,
+ cl_event *event,
+ cl_bool blocking)
+{
+ cl_int rs;
+ // Only used by the disabled (#if 0) code below.
+ Coal::Event *old_event = NULL;
+
+ if (event)
+ {
+#if 0
+ /*---------------------------------------------------------------------
+ * It is up to the user to release events for reuse. If they do not
+ * they will have a memory leak for old events. This can impact
+ * memory performance since the old event memory is likely already warm
+ * in cache.
+ *--------------------------------------------------------------------*/
+ /*---------------------------------------------------------------------
+ * We should also reduce the reference count of the old event, because
+ * user_app_event is now interested in a different event.
+ *--------------------------------------------------------------------*/
+ old_event = *event;
+ if (old_event != NULL && old_event->isA(Coal::Object::T_Event))
+ clReleaseEvent((cl_event)old_event);
+
+#endif
+ /*---------------------------------------------------------------------
+ * We need to increase reference count before queue->queueEvent(command)
+ * because a user_app_event is interested in the status of command.
+ * Otherwise, if worker thread runs too fast, command becomes COMPLETE
+ * before we get here, command would have been cleaned from queue and
+ * deleted!!! Thus we will be left with a dangling pointer.
+ *--------------------------------------------------------------------*/
+ *event = (cl_event)command;
+ command->reference();
+ }
+
+ /*------------------------------------------------------------------------
+ * Same reason as above. We need to retain command for clWaitForEvents().
+ *-----------------------------------------------------------------------*/
+ if (blocking) command->reference();
+
+ rs = queue->queueEvent(command);
+
+ // The queue rejected the command: destroy it and report the error.
+ if (rs != CL_SUCCESS)
+ {
+ delete command;
+ return rs;
+ }
+
+ if (blocking)
+ {
+ rs = clWaitForEvents(1, (cl_event *)&command);
+
+ // NOTE(review): the command was accepted by the queue above; verify
+ // that deleting it here cannot race with the queue's own cleanup.
+ if (rs != CL_SUCCESS)
+ {
+ delete command;
+ return rs;
+ }
+ // Give back the reference taken for the wait.
+ command->dereference();
+ }
+
+ return CL_SUCCESS;
+}
+
+// Enqueued Commands APIs
+/**
+ * \brief Enqueue a read of \p cb bytes at \p offset of \p buffer into \p ptr
+ * \return CL_INVALID_COMMAND_QUEUE, the error reported while building the
+ *         command, or the result of queueEvent()
+ */
+cl_int
+clEnqueueReadBuffer(cl_command_queue command_queue,
+                    cl_mem buffer,
+                    cl_bool blocking_read,
+                    size_t offset,
+                    size_t cb,
+                    void * ptr,
+                    cl_uint num_events_in_wait_list,
+                    const cl_event * event_wait_list,
+                    cl_event * event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    Coal::CommandQueue *queue = (Coal::CommandQueue *)command_queue;
+    Coal::MemObject *mem = (Coal::MemObject *)buffer;
+    const Coal::Event **waits = (const Coal::Event **)event_wait_list;
+
+    // The constructor reports any error through status.
+    Coal::ReadBufferEvent *read_cmd = new Coal::ReadBufferEvent(
+        queue, mem, offset, cb, ptr, num_events_in_wait_list, waits, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, read_cmd, event, blocking_read);
+
+    delete read_cmd;
+    return status;
+}
+
+/**
+ * \brief Enqueue a write of \p cb bytes from \p ptr at \p offset of \p buffer
+ * \return CL_INVALID_COMMAND_QUEUE, the error reported while building the
+ *         command, or the result of queueEvent()
+ */
+cl_int
+clEnqueueWriteBuffer(cl_command_queue command_queue,
+                     cl_mem buffer,
+                     cl_bool blocking_write,
+                     size_t offset,
+                     size_t cb,
+                     const void * ptr,
+                     cl_uint num_events_in_wait_list,
+                     const cl_event * event_wait_list,
+                     cl_event * event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    Coal::CommandQueue *queue = (Coal::CommandQueue *)command_queue;
+    Coal::MemObject *mem = (Coal::MemObject *)buffer;
+    const Coal::Event **waits = (const Coal::Event **)event_wait_list;
+
+    // The event takes a non-const pointer, hence the cast on ptr.
+    Coal::WriteBufferEvent *write_cmd = new Coal::WriteBufferEvent(
+        queue, mem, offset, cb, (void *)ptr,
+        num_events_in_wait_list, waits, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, write_cmd, event, blocking_write);
+
+    delete write_cmd;
+    return status;
+}
+
+/**
+ * \brief Enqueue a read of a rectangular region of \p buffer into \p ptr
+ * \return CL_INVALID_COMMAND_QUEUE, the error reported while building the
+ *         command, or the result of queueEvent()
+ */
+cl_int
+clEnqueueReadBufferRect(cl_command_queue command_queue,
+                        cl_mem buffer,
+                        cl_bool blocking_read,
+                        const size_t * buffer_origin,
+                        const size_t * host_origin,
+                        const size_t * region,
+                        size_t buffer_row_pitch,
+                        size_t buffer_slice_pitch,
+                        size_t host_row_pitch,
+                        size_t host_slice_pitch,
+                        void * ptr,
+                        cl_uint num_events_in_wait_list,
+                        const cl_event * event_wait_list,
+                        cl_event * event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    Coal::CommandQueue *queue = (Coal::CommandQueue *)command_queue;
+    Coal::MemObject *mem = (Coal::MemObject *)buffer;
+    const Coal::Event **waits = (const Coal::Event **)event_wait_list;
+
+    // The constructor reports any error through status.
+    Coal::ReadBufferRectEvent *read_cmd = new Coal::ReadBufferRectEvent(
+        queue, mem,
+        buffer_origin, host_origin, region,
+        buffer_row_pitch, buffer_slice_pitch,
+        host_row_pitch, host_slice_pitch, ptr,
+        num_events_in_wait_list, waits, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, read_cmd, event, blocking_read);
+
+    delete read_cmd;
+    return status;
+}
+
+/**
+ * \brief Enqueue a write from \p ptr into a rectangular region of \p buffer
+ * \return CL_INVALID_COMMAND_QUEUE, the error reported while building the
+ *         command, or the result of queueEvent()
+ */
+cl_int
+clEnqueueWriteBufferRect(cl_command_queue command_queue,
+                         cl_mem buffer,
+                         cl_bool blocking_write,
+                         const size_t * buffer_origin,
+                         const size_t * host_origin,
+                         const size_t * region,
+                         size_t buffer_row_pitch,
+                         size_t buffer_slice_pitch,
+                         size_t host_row_pitch,
+                         size_t host_slice_pitch,
+                         const void * ptr,
+                         cl_uint num_events_in_wait_list,
+                         const cl_event * event_wait_list,
+                         cl_event * event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    Coal::CommandQueue *queue = (Coal::CommandQueue *)command_queue;
+    Coal::MemObject *mem = (Coal::MemObject *)buffer;
+    const Coal::Event **waits = (const Coal::Event **)event_wait_list;
+
+    // The event takes a non-const pointer, hence the cast on ptr.
+    Coal::WriteBufferRectEvent *write_cmd = new Coal::WriteBufferRectEvent(
+        queue, mem,
+        buffer_origin, host_origin, region,
+        buffer_row_pitch, buffer_slice_pitch,
+        host_row_pitch, host_slice_pitch, (void *)ptr,
+        num_events_in_wait_list, waits, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, write_cmd, event, blocking_write);
+
+    delete write_cmd;
+    return status;
+}
+
+/**
+ * \brief Enqueue a copy of a rectangular region between two buffers
+ * \return CL_INVALID_COMMAND_QUEUE, the error reported while building the
+ *         command, or the result of queueEvent()
+ */
+cl_int
+clEnqueueCopyBufferRect(cl_command_queue command_queue,
+                        cl_mem src_buffer,
+                        cl_mem dst_buffer,
+                        const size_t * src_origin,
+                        const size_t * dst_origin,
+                        const size_t * region,
+                        size_t src_row_pitch,
+                        size_t src_slice_pitch,
+                        size_t dst_row_pitch,
+                        size_t dst_slice_pitch,
+                        cl_uint num_events_in_wait_list,
+                        const cl_event * event_wait_list,
+                        cl_event * event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    Coal::CommandQueue *queue = (Coal::CommandQueue *)command_queue;
+    Coal::MemObject *src = (Coal::MemObject *)src_buffer;
+    Coal::MemObject *dst = (Coal::MemObject *)dst_buffer;
+    const Coal::Event **waits = (const Coal::Event **)event_wait_list;
+
+    // The literal 1 after the pitches is passed through to the constructor
+    // unchanged; its meaning is defined by CopyBufferRectEvent.
+    Coal::CopyBufferRectEvent *copy_cmd = new Coal::CopyBufferRectEvent(
+        queue, src, dst,
+        src_origin, dst_origin, region,
+        src_row_pitch, src_slice_pitch,
+        dst_row_pitch, dst_slice_pitch, 1,
+        num_events_in_wait_list, waits, &status);
+
+    // Copies are always queued without waiting.
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, copy_cmd, event, false);
+
+    delete copy_cmd;
+    return status;
+}
+
+/**
+ * \brief Enqueue a copy of \p cb bytes from \p src_buffer to \p dst_buffer
+ * \return CL_INVALID_COMMAND_QUEUE, the error reported while building the
+ *         command, or the result of queueEvent()
+ */
+cl_int
+clEnqueueCopyBuffer(cl_command_queue command_queue,
+                    cl_mem src_buffer,
+                    cl_mem dst_buffer,
+                    size_t src_offset,
+                    size_t dst_offset,
+                    size_t cb,
+                    cl_uint num_events_in_wait_list,
+                    const cl_event * event_wait_list,
+                    cl_event * event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+    Coal::CommandQueue *queue = (Coal::CommandQueue *)command_queue;
+    Coal::MemObject *src = (Coal::MemObject *)src_buffer;
+    Coal::MemObject *dst = (Coal::MemObject *)dst_buffer;
+    const Coal::Event **waits = (const Coal::Event **)event_wait_list;
+
+    // The constructor reports any error through status.
+    Coal::CopyBufferEvent *copy_cmd = new Coal::CopyBufferEvent(
+        queue, src, dst, src_offset, dst_offset, cb,
+        num_events_in_wait_list, waits, &status);
+
+    // Copies are always queued without waiting.
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, copy_cmd, event, false);
+
+    delete copy_cmd;
+    return status;
+}
+
+/**
+ * \brief Enqueue a read from an image into host memory at \p ptr.
+ *
+ * Rejects a null \p image or one whose type() is neither Image2D nor Image3D,
+ * then builds a Coal::ReadImageEvent and submits it through queueEvent(),
+ * forwarding \p blocking_read.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE, CL_INVALID_MEM_OBJECT, the constructor's
+ *         status when it is not CL_SUCCESS, or the result of queueEvent().
+ */
+cl_int
+clEnqueueReadImage(cl_command_queue     command_queue,
+                   cl_mem               image,
+                   cl_bool              blocking_read,
+                   const size_t *       origin,
+                   const size_t *       region,
+                   size_t               row_pitch,
+                   size_t               slice_pitch,
+                   void *               ptr,
+                   cl_uint              num_events_in_wait_list,
+                   const cl_event *     event_wait_list,
+                   cl_event *           event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    if (!image)
+        return CL_INVALID_MEM_OBJECT;
+
+    // Only 2D and 3D images are accepted here
+    if (!(image->type() == Coal::MemObject::Image2D ||
+          image->type() == Coal::MemObject::Image3D))
+        return CL_INVALID_MEM_OBJECT;
+
+    cl_int status = CL_SUCCESS;
+
+    Coal::ReadImageEvent *read_evt = new Coal::ReadImageEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Image2D *)image,
+        origin, region, row_pitch, slice_pitch, (void *)ptr,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &status
+    );
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, read_evt, event, blocking_read);
+
+    // The constructor reported an error: discard the event and propagate it
+    delete read_evt;
+    return status;
+}
+
+/**
+ * \brief Enqueue a write from host memory at \p ptr into an image.
+ *
+ * Builds a Coal::WriteImageEvent and submits it through queueEvent(),
+ * forwarding \p blocking_write.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE when \p command_queue is not a command
+ *         queue, the constructor's status when it is not CL_SUCCESS,
+ *         otherwise the result of queueEvent().
+ */
+cl_int
+clEnqueueWriteImage(cl_command_queue    command_queue,
+                    cl_mem              image,
+                    cl_bool             blocking_write,
+                    const size_t *      origin,
+                    const size_t *      region,
+                    size_t              row_pitch,
+                    size_t              slice_pitch,
+                    const void *        ptr,
+                    cl_uint             num_events_in_wait_list,
+                    const cl_event *    event_wait_list,
+                    cl_event *          event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+
+    // The const qualifier on ptr is cast away because the event takes void *
+    Coal::WriteImageEvent *write_evt = new Coal::WriteImageEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Image2D *)image,
+        origin, region, row_pitch, slice_pitch, (void *)ptr,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &status
+    );
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, write_evt, event, blocking_write);
+
+    // The constructor reported an error: discard the event and propagate it
+    delete write_evt;
+    return status;
+}
+
+/**
+ * \brief Enqueue an image-to-image copy.
+ *
+ * Builds a Coal::CopyImageEvent and submits it through queueEvent() with
+ * \c false as the last argument.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE when \p command_queue is not a command
+ *         queue, the constructor's status when it is not CL_SUCCESS,
+ *         otherwise the result of queueEvent().
+ */
+cl_int
+clEnqueueCopyImage(cl_command_queue     command_queue,
+                   cl_mem               src_image,
+                   cl_mem               dst_image,
+                   const size_t *       src_origin,
+                   const size_t *       dst_origin,
+                   const size_t *       region,
+                   cl_uint              num_events_in_wait_list,
+                   const cl_event *     event_wait_list,
+                   cl_event *           event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+
+    Coal::CopyImageEvent *copy_evt = new Coal::CopyImageEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Image2D *)src_image, (Coal::Image2D *)dst_image,
+        src_origin, dst_origin, region,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &status
+    );
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, copy_evt, event, false);
+
+    // The constructor reported an error: discard the event and propagate it
+    delete copy_evt;
+    return status;
+}
+
+/**
+ * \brief Enqueue a copy from an image region into a buffer.
+ *
+ * Builds a Coal::CopyImageToBufferEvent and submits it through queueEvent()
+ * with \c false as the last argument.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE when \p command_queue is not a command
+ *         queue, the constructor's status when it is not CL_SUCCESS,
+ *         otherwise the result of queueEvent().
+ */
+cl_int
+clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
+                           cl_mem           src_image,
+                           cl_mem           dst_buffer,
+                           const size_t *   src_origin,
+                           const size_t *   region,
+                           size_t           dst_offset,
+                           cl_uint          num_events_in_wait_list,
+                           const cl_event * event_wait_list,
+                           cl_event *       event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+
+    Coal::CopyImageToBufferEvent *copy_evt = new Coal::CopyImageToBufferEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Image2D *)src_image, (Coal::MemObject *)dst_buffer,
+        src_origin, region, dst_offset,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &status
+    );
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, copy_evt, event, false);
+
+    // The constructor reported an error: discard the event and propagate it
+    delete copy_evt;
+    return status;
+}
+
+/**
+ * \brief Enqueue a copy from a buffer into an image region.
+ *
+ * Builds a Coal::CopyBufferToImageEvent and submits it through queueEvent()
+ * with \c false as the last argument.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE when \p command_queue is not a command
+ *         queue, the constructor's status when it is not CL_SUCCESS,
+ *         otherwise the result of queueEvent().
+ */
+cl_int
+clEnqueueCopyBufferToImage(cl_command_queue command_queue,
+                           cl_mem           src_buffer,
+                           cl_mem           dst_image,
+                           size_t           src_offset,
+                           const size_t *   dst_origin,
+                           const size_t *   region,
+                           cl_uint          num_events_in_wait_list,
+                           const cl_event * event_wait_list,
+                           cl_event *       event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+
+    Coal::CopyBufferToImageEvent *copy_evt = new Coal::CopyBufferToImageEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::MemObject *)src_buffer, (Coal::Image2D *)dst_image,
+        src_offset, dst_origin, region,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &status
+    );
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, copy_evt, event, false);
+
+    // The constructor reported an error: discard the event and propagate it
+    delete copy_evt;
+    return status;
+}
+
+/**
+ * \brief Enqueue a buffer mapping and return the mapped pointer.
+ *
+ * Builds a Coal::MapBufferEvent, queues it through queueEvent() forwarding
+ * \p blocking_map, and returns the value of the event's ptr().
+ *
+ * \param errcode_ret optional; when NULL a local dummy receives the status.
+ * \return the pointer reported by the event, or 0 with \p *errcode_ret set
+ *         when the queue is invalid, the constructor fails, or queueEvent()
+ *         fails.
+ */
+void *
+clEnqueueMapBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_map,
+ cl_map_flags map_flags,
+ size_t offset,
+ size_t cb,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event,
+ cl_int * errcode_ret)
+{
+ cl_int dummy_errcode;
+
+ // Let the rest of the function write the status unconditionally
+ if (!errcode_ret)
+ errcode_ret = &dummy_errcode;
+
+ *errcode_ret = CL_SUCCESS;
+
+ if (!command_queue->isA(Coal::Object::T_CommandQueue))
+ {
+ *errcode_ret = CL_INVALID_COMMAND_QUEUE;
+ return 0;
+ }
+
+ Coal::MapBufferEvent *command = new Coal::MapBufferEvent(
+ (Coal::CommandQueue *)command_queue,
+ (Coal::MemObject *)buffer,
+ offset, cb, map_flags,
+ num_events_in_wait_list, (const Coal::Event **)event_wait_list, errcode_ret
+ );
+
+ // The constructor reports validation failures through errcode_ret
+ if (*errcode_ret != CL_SUCCESS)
+ {
+ delete command;
+ return 0;
+ }
+
+ // We need command to be valid after queueEvent, so don't let the command
+ // queue handle it like a fire-and-forget event. Fixes a crash when event
+ // is NULL : the event gets deleted by clReleaseEvent called from
+ // CPUDevice's worker() and we then try to read it in command->ptr();
+ command->reference();
+
+ *errcode_ret = queueEvent(command_queue, command, event, blocking_map);
+
+ if (*errcode_ret != CL_SUCCESS)
+ {
+ delete command;
+ return 0;
+ }
+ else
+ {
+ // Read the mapped address before dropping the reference taken above
+ void *rs = command->ptr();
+
+ clReleaseEvent((cl_event)command);
+
+ return rs;
+ }
+}
+
+/**
+ * \brief Enqueue an image mapping and return the mapped pointer.
+ *
+ * Builds a Coal::MapImageEvent, queues it through queueEvent() forwarding
+ * \p blocking_map, then copies the event's row_pitch() (and slice_pitch()
+ * when \p image_slice_pitch is non-NULL) to the caller and returns ptr().
+ *
+ * \param errcode_ret optional; when NULL a local variable receives the status.
+ * \return the pointer reported by the event, or 0 with \p *errcode_ret set.
+ *         CL_INVALID_VALUE is reported when \p image_row_pitch is NULL, or
+ *         when the image is an Image3D and \p image_slice_pitch is NULL.
+ */
+void *
+clEnqueueMapImage(cl_command_queue command_queue,
+ cl_mem image,
+ cl_bool blocking_map,
+ cl_map_flags map_flags,
+ const size_t * origin,
+ const size_t * region,
+ size_t * image_row_pitch,
+ size_t * image_slice_pitch,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event,
+ cl_int * errcode_ret)
+{
+ // Fallback storage for the status when the caller passes no errcode_ret
+ cl_int rs;
+
+ if (!errcode_ret)
+ errcode_ret = &rs;
+
+ *errcode_ret = CL_SUCCESS;
+
+ if (!command_queue->isA(Coal::Object::T_CommandQueue))
+ {
+ *errcode_ret = CL_INVALID_COMMAND_QUEUE;
+ return 0;
+ }
+
+ Coal::MapImageEvent *command = new Coal::MapImageEvent(
+ (Coal::CommandQueue *)command_queue,
+ (Coal::Image2D *)image,
+ map_flags, origin, region,
+ num_events_in_wait_list, (const Coal::Event **)event_wait_list, errcode_ret
+ );
+
+ if (*errcode_ret != CL_SUCCESS)
+ {
+ delete command;
+ return 0;
+ }
+
+ // The pitch outputs are checked only after the event was built, so image
+ // is dereferenced here only once the constructor has reported success
+ if (!image_row_pitch ||
+ (image->type() == Coal::MemObject::Image3D && !image_slice_pitch))
+ {
+ *errcode_ret = CL_INVALID_VALUE;
+ delete command;
+ return 0;
+ }
+
+ command->reference(); // See clEnqueueMapBuffer for explanation.
+ *errcode_ret = queueEvent(command_queue, command, event, blocking_map);
+
+ if (*errcode_ret != CL_SUCCESS)
+ {
+ delete command;
+ return 0;
+ }
+ else
+ {
+ *image_row_pitch = command->row_pitch();
+
+ if (image_slice_pitch)
+ *image_slice_pitch = command->slice_pitch();
+
+ // This rs shadows the cl_int rs declared at the top of the function
+ void *rs = command->ptr();
+
+ clReleaseEvent((cl_event)command);
+
+ return rs;
+ }
+}
+
+/**
+ * \brief Enqueue the unmapping of a previously mapped memory object.
+ *
+ * Builds a Coal::UnmapBufferEvent for \p mapped_ptr and submits it through
+ * queueEvent() with \c false as the last argument.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE when \p command_queue is not a command
+ *         queue, the constructor's status when it is not CL_SUCCESS,
+ *         otherwise the result of queueEvent().
+ */
+cl_int
+clEnqueueUnmapMemObject(cl_command_queue    command_queue,
+                        cl_mem              memobj,
+                        void *              mapped_ptr,
+                        cl_uint             num_events_in_wait_list,
+                        const cl_event *    event_wait_list,
+                        cl_event *          event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+
+    Coal::UnmapBufferEvent *unmap_evt = new Coal::UnmapBufferEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::MemObject *)memobj,
+        mapped_ptr,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &status
+    );
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, unmap_evt, event, false);
+
+    // The constructor reported an error: discard the event and propagate it
+    delete unmap_evt;
+    return status;
+}
+
+/**
+ * \brief Enqueue the execution of a kernel over an N-dimensional range.
+ *
+ * Builds a Coal::KernelEvent from the work dimensions and sizes and submits
+ * it through queueEvent() with \c false as the last argument.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE when \p command_queue is not a command
+ *         queue, the constructor's status when it is not CL_SUCCESS,
+ *         otherwise the result of queueEvent().
+ */
+cl_int
+clEnqueueNDRangeKernel(cl_command_queue command_queue,
+                       cl_kernel        kernel,
+                       cl_uint          work_dim,
+                       const size_t *   global_work_offset,
+                       const size_t *   global_work_size,
+                       const size_t *   local_work_size,
+                       cl_uint          num_events_in_wait_list,
+                       const cl_event * event_wait_list,
+                       cl_event *       event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+
+    Coal::KernelEvent *kernel_evt = new Coal::KernelEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Kernel *)kernel,
+        work_dim, global_work_offset, global_work_size, local_work_size,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &status
+    );
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, kernel_evt, event, false);
+
+    // The constructor reported an error: discard the event and propagate it
+    delete kernel_evt;
+    return status;
+}
+
+/**
+ * \brief Enqueue the execution of a kernel as a task.
+ *
+ * Builds a Coal::TaskEvent and submits it through queueEvent() with
+ * \c false as the last argument.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE when \p command_queue is not a command
+ *         queue, the constructor's status when it is not CL_SUCCESS,
+ *         otherwise the result of queueEvent().
+ */
+cl_int
+clEnqueueTask(cl_command_queue  command_queue,
+              cl_kernel         kernel,
+              cl_uint           num_events_in_wait_list,
+              const cl_event *  event_wait_list,
+              cl_event *        event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+
+    Coal::TaskEvent *task_evt = new Coal::TaskEvent(
+        (Coal::CommandQueue *)command_queue,
+        (Coal::Kernel *)kernel,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &status
+    );
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, task_evt, event, false);
+
+    // The constructor reported an error: discard the event and propagate it
+    delete task_evt;
+    return status;
+}
+
+/**
+ * \brief Enqueue the execution of a native host function.
+ *
+ * Builds a Coal::NativeKernelEvent from \p user_func, its argument block and
+ * the memory-object list, then submits it through queueEvent() with
+ * \c false as the last argument.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE when \p command_queue is not a command
+ *         queue, the constructor's status when it is not CL_SUCCESS,
+ *         otherwise the result of queueEvent().
+ */
+cl_int
+clEnqueueNativeKernel(cl_command_queue  command_queue,
+                      void (*user_func)(void *),
+                      void *            args,
+                      size_t            cb_args,
+                      cl_uint           num_mem_objects,
+                      const cl_mem *    mem_list,
+                      const void **     args_mem_loc,
+                      cl_uint           num_events_in_wait_list,
+                      const cl_event *  event_wait_list,
+                      cl_event *        event)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+
+    Coal::NativeKernelEvent *native_evt = new Coal::NativeKernelEvent(
+        (Coal::CommandQueue *)command_queue,
+        user_func, args, cb_args, num_mem_objects,
+        (const Coal::MemObject **)mem_list, args_mem_loc,
+        num_events_in_wait_list, (const Coal::Event **)event_wait_list, &status
+    );
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, native_evt, event, false);
+
+    // The constructor reported an error: discard the event and propagate it
+    delete native_evt;
+    return status;
+}
+
+/**
+ * \brief Enqueue a marker that waits on the events currently in the queue.
+ *
+ * Takes a snapshot of the queue's events with command_queue->events(),
+ * builds a Coal::MarkerEvent using that snapshot as its wait list, and
+ * submits it through queueEvent() with \c false as the last argument.
+ *
+ * The snapshot is released (each event dereferenced, array freed) right
+ * after the MarkerEvent is constructed, whether or not construction
+ * succeeded. Previously the error path returned without releasing it, which
+ * leaked the array and left the events' reference counts raised.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE when \p command_queue is not a command
+ *         queue, CL_INVALID_VALUE when \p event is NULL, the constructor's
+ *         status when it is not CL_SUCCESS, otherwise the result of
+ *         queueEvent().
+ */
+cl_int
+clEnqueueMarker(cl_command_queue    command_queue,
+                cl_event *          event)
+{
+    cl_int rs = CL_SUCCESS;
+
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    if (!event)
+        return CL_INVALID_VALUE;
+
+    // Get the events in command_queue
+    unsigned int count;
+    Coal::Event **events = command_queue->events(count, false);
+
+    Coal::MarkerEvent *command = new Coal::MarkerEvent(
+        (Coal::CommandQueue *)command_queue,
+        count, count == 0 ? NULL : (const Coal::Event **)events, &rs);
+
+    // Free events, they were memcpyed by Coal::Event. This must happen on
+    // the failure path too, so it is done before rs is inspected.
+    for (unsigned int i=0; i<count; ++i)
+    {
+        events[i]->dereference();
+    }
+
+    std::free(events);  // free(NULL) is a no-op, no guard needed
+
+    if (rs != CL_SUCCESS)
+    {
+        delete command;
+        return rs;
+    }
+
+    return queueEvent(command_queue, command, event, false);
+}
+
+/**
+ * \brief Enqueue a wait on the given list of events.
+ *
+ * Builds a Coal::WaitForEventsEvent and submits it through queueEvent()
+ * with no output event (0) and \c false as the last argument.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE when \p command_queue is not a command
+ *         queue, the constructor's status when it is not CL_SUCCESS,
+ *         otherwise the result of queueEvent().
+ */
+cl_int
+clEnqueueWaitForEvents(cl_command_queue command_queue,
+                       cl_uint          num_events,
+                       const cl_event * event_list)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+
+    Coal::WaitForEventsEvent *wait_evt = new Coal::WaitForEventsEvent(
+        (Coal::CommandQueue *)command_queue,
+        num_events, (const Coal::Event **)event_list, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, wait_evt, 0, false);
+
+    // The constructor reported an error: discard the event and propagate it
+    delete wait_evt;
+    return status;
+}
+
+/**
+ * \brief Enqueue a barrier on the command queue.
+ *
+ * Builds a Coal::BarrierEvent and submits it through queueEvent() with no
+ * output event (0) and \c false as the last argument.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE when \p command_queue is not a command
+ *         queue, the constructor's status when it is not CL_SUCCESS,
+ *         otherwise the result of queueEvent().
+ */
+cl_int
+clEnqueueBarrier(cl_command_queue command_queue)
+{
+    if (!command_queue->isA(Coal::Object::T_CommandQueue))
+        return CL_INVALID_COMMAND_QUEUE;
+
+    cl_int status = CL_SUCCESS;
+
+    Coal::BarrierEvent *barrier_evt = new Coal::BarrierEvent(
+        (Coal::CommandQueue *)command_queue, &status);
+
+    if (status == CL_SUCCESS)
+        return queueEvent(command_queue, barrier_evt, 0, false);
+
+    // The constructor reported an error: discard the event and propagate it
+    delete barrier_evt;
+    return status;
+}
diff --git a/src/api/api_event.cpp b/src/api/api_event.cpp
new file mode 100644
index 0000000..1e882bf
--- /dev/null
+++ b/src/api/api_event.cpp
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_event.cpp
+ * \brief Special events and event management
+ */
+
+#include <CL/cl.h>
+
+#include <core/commandqueue.h>
+#include <core/events.h>
+#include <core/context.h>
+#include <stdio.h>
+
+// Event Object APIs
+/**
+ * \brief Block until every event in \p event_list reaches Complete.
+ *
+ * A first pass validates each entry and checks that all events resolve to
+ * the same context; a second pass calls waitForStatus() on each of them.
+ *
+ * \return CL_INVALID_VALUE for an empty or NULL list, CL_INVALID_EVENT for an
+ *         entry that is not an event,
+ *         CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST when an event already
+ *         has a negative status, CL_INVALID_CONTEXT when the events do not
+ *         share one context, otherwise CL_SUCCESS.
+ */
+cl_int
+clWaitForEvents(cl_uint             num_events,
+                const cl_event *    event_list)
+{
+    if (!num_events || !event_list)
+        return CL_INVALID_VALUE;
+
+    // First pass: validate the events and compare their contexts
+    cl_context shared_ctx = 0;
+
+    for (cl_uint idx = 0; idx < num_events; ++idx)
+    {
+        if (!event_list[idx]->isA(Coal::Object::T_Event))
+            return CL_INVALID_EVENT;
+
+        if (event_list[idx]->status() < 0)
+            return CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+
+        // NOTE(review): assumes parent() is a command queue whose parent is
+        // the context - confirm this also holds for user events.
+        cl_context evt_ctx = (cl_context)event_list[idx]->parent()->parent();
+
+#if 0 // YUAN: no need to wait for queue to be flushed
+        cl_command_queue evt_queue = (cl_command_queue)event_list[idx]->parent();
+        // Flush the queue
+        evt_queue->flush();
+#endif
+
+        if (shared_ctx != 0 && shared_ctx != evt_ctx)
+            return CL_INVALID_CONTEXT;
+
+        shared_ctx = evt_ctx;
+    }
+
+    // Second pass: wait for each event in list order
+    for (cl_uint idx = 0; idx < num_events; ++idx)
+        event_list[idx]->waitForStatus(Coal::Event::Complete);
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Query information about an event.
+ *
+ * \return CL_INVALID_EVENT when \p event is not an event, otherwise the
+ *         result of the event's info() method.
+ */
+cl_int
+clGetEventInfo(cl_event         event,
+               cl_event_info    param_name,
+               size_t           param_value_size,
+               void *           param_value,
+               size_t *         param_value_size_ret)
+{
+    if (event->isA(Coal::Object::T_Event))
+    {
+        // The query itself is handled entirely by the event object
+        return event->info(param_name, param_value_size, param_value,
+                           param_value_size_ret);
+    }
+
+    return CL_INVALID_EVENT;
+}
+
+/**
+ * \brief Register a callback on an event.
+ *
+ * Only the CL_COMPLETE callback type is accepted.
+ *
+ * \return CL_INVALID_EVENT when \p event is not an event, CL_INVALID_VALUE
+ *         when \p pfn_event_notify is NULL or the callback type is not
+ *         CL_COMPLETE, otherwise CL_SUCCESS.
+ */
+cl_int
+clSetEventCallback(cl_event     event,
+                   cl_int       command_exec_callback_type,
+                   void (CL_CALLBACK *pfn_event_notify)(cl_event event,
+                                                        cl_int exec_status,
+                                                        void *user_data),
+                   void *user_data)
+{
+    if (!event->isA(Coal::Object::T_Event))
+        return CL_INVALID_EVENT;
+
+    if (!pfn_event_notify)
+        return CL_INVALID_VALUE;
+
+    if (command_exec_callback_type != CL_COMPLETE)
+        return CL_INVALID_VALUE;
+
+    event->setCallback(command_exec_callback_type, pfn_event_notify, user_data);
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Take an additional reference on an event.
+ *
+ * \return CL_INVALID_EVENT when \p event is not an event, else CL_SUCCESS.
+ */
+cl_int
+clRetainEvent(cl_event event)
+{
+    if (event->isA(Coal::Object::T_Event))
+    {
+        event->reference();
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_EVENT;
+}
+
+/**
+ * \brief Drop a reference on an event, destroying it when dereference()
+ *        reports that it should be deleted.
+ *
+ * \return CL_INVALID_EVENT when \p event is not an event, else CL_SUCCESS.
+ */
+cl_int
+clReleaseEvent(cl_event event)
+{
+    if (!event->isA(Coal::Object::T_Event))
+        return CL_INVALID_EVENT;
+
+    // Nothing more to do while dereference() returns false
+    if (!event->dereference())
+        return CL_SUCCESS;
+
+    // Device data is freed explicitly before the event itself is deleted
+    event->freeDeviceData();
+    delete event;
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Create a user event attached to \p context.
+ *
+ * \param errcode_ret optional; when NULL a local dummy receives the status.
+ * \return the new event, or 0 with \p *errcode_ret set to CL_INVALID_CONTEXT
+ *         or to the status reported by the Coal::UserEvent constructor.
+ */
+cl_event
+clCreateUserEvent(cl_context    context,
+                  cl_int *      errcode_ret)
+{
+    cl_int unused_status;
+
+    // Let the rest of the function write the status unconditionally
+    if (!errcode_ret)
+        errcode_ret = &unused_status;
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *errcode_ret = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    *errcode_ret = CL_SUCCESS;
+
+    Coal::UserEvent *user_evt = new Coal::UserEvent(
+        (Coal::Context *)context, errcode_ret
+    );
+
+    if (*errcode_ret == CL_SUCCESS)
+        return (cl_event)user_evt;
+
+    // The constructor reported an error: discard the half-built event
+    delete user_evt;
+    return 0;
+}
+
+/**
+ * \brief Set the execution status of a user event.
+ *
+ * \return CL_INVALID_EVENT when \p event is not an event or its type() is not
+ *         User, CL_INVALID_VALUE when \p execution_status is not CL_COMPLETE,
+ *         CL_INVALID_OPERATION when the event's current status is not
+ *         CL_SUBMITTED, otherwise CL_SUCCESS.
+ */
+cl_int
+clSetUserEventStatus(cl_event   event,
+                     cl_int     execution_status)
+{
+    Coal::Event *user_evt = (Coal::Event *)event;
+
+    if (!user_evt->isA(Coal::Object::T_Event))
+        return CL_INVALID_EVENT;
+
+    if (user_evt->type() != Coal::Event::User)
+        return CL_INVALID_EVENT;
+
+    // NOTE(review): only CL_COMPLETE is accepted here; the OpenCL
+    // specification also permits a negative error status - confirm whether
+    // Coal::Event::setStatus() can handle one before relaxing this check.
+    if (execution_status != CL_COMPLETE)
+        return CL_INVALID_VALUE;
+
+    // The status may only be changed while the event is still CL_SUBMITTED
+    if (user_evt->status() != CL_SUBMITTED)
+        return CL_INVALID_OPERATION;
+
+    user_evt->setStatus((Coal::Event::Status)execution_status);
+    return CL_SUCCESS;
+}
diff --git a/src/api/api_flush.cpp b/src/api/api_flush.cpp
new file mode 100644
index 0000000..c0e93a7
--- /dev/null
+++ b/src/api/api_flush.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_flush.cpp
+ * \brief clFlush and clFinish
+ */
+
+#include "CL/cl.h"
+#include "core/commandqueue.h"
+
+// Flush and Finish APIs
+/**
+ * \brief Flush a command queue by calling its flush() method.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE when \p command_queue is not a command
+ *         queue, otherwise CL_SUCCESS.
+ */
+cl_int
+clFlush(cl_command_queue command_queue)
+{
+    if (command_queue->isA(Coal::Object::T_CommandQueue))
+    {
+        command_queue->flush();
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_COMMAND_QUEUE;
+}
+
+/**
+ * \brief Finish a command queue by calling its finish() method.
+ *
+ * \return CL_INVALID_COMMAND_QUEUE when \p command_queue is not a command
+ *         queue, otherwise CL_SUCCESS.
+ */
+cl_int
+clFinish(cl_command_queue command_queue)
+{
+    if (command_queue->isA(Coal::Object::T_CommandQueue))
+    {
+        command_queue->finish();
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_COMMAND_QUEUE;
+}
diff --git a/src/api/api_gl.cpp b/src/api/api_gl.cpp
new file mode 100644
index 0000000..0f06499
--- /dev/null
+++ b/src/api/api_gl.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_gl.cpp
+ * \brief OpenGL bindings (unimplemented)
+ */
+
+#define GL_GLEXT_PROTOTYPES
+#include "GL/gl.h"
+#include "GL/glext.h"
+
+#include "CL/cl.h"
+#include "CL/cl_gl.h"
+
+/**
+ * \brief Unimplemented stub: create a CL buffer from an OpenGL buffer.
+ *
+ * Always fails. The stub used to return 0 without writing \p errcode_ret,
+ * leaving the caller with an indeterminate error code.
+ *
+ * \param errcode_ret optional; receives CL_INVALID_CONTEXT when non-NULL.
+ *        NOTE(review): chosen on the assumption that, with GL sharing
+ *        unimplemented, no context was created from a GL context - confirm.
+ * \return always 0.
+ */
+cl_mem
+clCreateFromGLBuffer(cl_context     context,
+                     cl_mem_flags   flags,
+                     GLuint         bufobj,
+                     int *          errcode_ret)
+{
+    if (errcode_ret)
+        *errcode_ret = CL_INVALID_CONTEXT;
+
+    return 0;
+}
+
+/**
+ * \brief Unimplemented stub: create a CL image from an OpenGL 2D texture.
+ *
+ * Always fails. The stub used to return 0 without writing \p errcode_ret,
+ * leaving the caller with an indeterminate error code.
+ *
+ * \param errcode_ret optional; receives CL_INVALID_CONTEXT when non-NULL.
+ *        NOTE(review): chosen on the assumption that, with GL sharing
+ *        unimplemented, no context was created from a GL context - confirm.
+ * \return always 0.
+ */
+cl_mem
+clCreateFromGLTexture2D(cl_context      context,
+                        cl_mem_flags    flags,
+                        GLenum          target,
+                        GLint           miplevel,
+                        GLuint          texture,
+                        int *           errcode_ret)
+{
+    if (errcode_ret)
+        *errcode_ret = CL_INVALID_CONTEXT;
+
+    return 0;
+}
+
+/**
+ * \brief Unimplemented stub: create a CL image from an OpenGL 3D texture.
+ *
+ * Always fails. The stub used to return 0 without writing \p errcode_ret,
+ * leaving the caller with an indeterminate error code.
+ *
+ * \param errcode_ret optional; receives CL_INVALID_CONTEXT when non-NULL.
+ *        NOTE(review): chosen on the assumption that, with GL sharing
+ *        unimplemented, no context was created from a GL context - confirm.
+ * \return always 0.
+ */
+cl_mem
+clCreateFromGLTexture3D(cl_context      context,
+                        cl_mem_flags    flags,
+                        GLenum          target,
+                        GLint           miplevel,
+                        GLuint          texture,
+                        int *           errcode_ret)
+{
+    if (errcode_ret)
+        *errcode_ret = CL_INVALID_CONTEXT;
+
+    return 0;
+}
+
+/**
+ * \brief Unimplemented stub: create a CL image from an OpenGL renderbuffer.
+ *
+ * Always fails. The stub used to return 0 without writing \p errcode_ret,
+ * leaving the caller with an indeterminate error code.
+ *
+ * \param errcode_ret optional; receives CL_INVALID_CONTEXT when non-NULL.
+ *        NOTE(review): chosen on the assumption that, with GL sharing
+ *        unimplemented, no context was created from a GL context - confirm.
+ * \return always 0.
+ */
+cl_mem
+clCreateFromGLRenderbuffer(cl_context   context,
+                           cl_mem_flags flags,
+                           GLuint       renderbuffer,
+                           int *        errcode_ret)
+{
+    if (errcode_ret)
+        *errcode_ret = CL_INVALID_CONTEXT;
+
+    return 0;
+}
+
+/**
+ * \brief Unimplemented stub: query the OpenGL object behind a memory object.
+ *
+ * The stub used to return 0 (CL_SUCCESS) without writing either output, so a
+ * caller trusting the status would read indeterminate values. It now reports
+ * failure and still leaves the outputs untouched.
+ *
+ * \return always CL_INVALID_GL_OBJECT.
+ *         NOTE(review): chosen on the assumption that no memory object can
+ *         have a GL object associated while GL sharing is unimplemented.
+ */
+cl_int
+clGetGLObjectInfo(cl_mem                memobj,
+                  cl_gl_object_type *   gl_object_type,
+                  GLuint *              gl_object_name)
+{
+    return CL_INVALID_GL_OBJECT;
+}
+
+/**
+ * \brief Unimplemented stub: query OpenGL texture info for a memory object.
+ *
+ * The stub used to return 0 (CL_SUCCESS) without writing \p param_value or
+ * \p param_value_size_ret, so a caller trusting the status would read
+ * indeterminate values. It now reports failure and still leaves the outputs
+ * untouched.
+ *
+ * \return always CL_INVALID_GL_OBJECT.
+ *         NOTE(review): chosen on the assumption that no memory object can
+ *         have a GL texture associated while GL sharing is unimplemented.
+ */
+cl_int
+clGetGLTextureInfo(cl_mem               memobj,
+                   cl_gl_texture_info   param_name,
+                   size_t               param_value_size,
+                   void *               param_value,
+                   size_t *             param_value_size_ret)
+{
+    return CL_INVALID_GL_OBJECT;
+}
+
+/**
+ * \brief Unimplemented stub: acquire OpenGL-backed memory objects.
+ *
+ * Does nothing with its arguments; \p event is never written.
+ *
+ * \return always CL_SUCCESS.
+ */
+cl_int
+clEnqueueAcquireGLObjects(cl_command_queue  command_queue,
+                          cl_uint           num_objects,
+                          const cl_mem *    mem_objects,
+                          cl_uint           num_events_in_wait_list,
+                          const cl_event *  event_wait_list,
+                          cl_event *        event)
+{
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Unimplemented stub: release OpenGL-backed memory objects.
+ *
+ * Does nothing with its arguments; \p event is never written.
+ *
+ * \return always CL_SUCCESS.
+ */
+cl_int
+clEnqueueReleaseGLObjects(cl_command_queue  command_queue,
+                          cl_uint           num_objects,
+                          const cl_mem *    mem_objects,
+                          cl_uint           num_events_in_wait_list,
+                          const cl_event *  event_wait_list,
+                          cl_event *        event)
+{
+    return CL_SUCCESS;
+}
diff --git a/src/api/api_kernel.cpp b/src/api/api_kernel.cpp
new file mode 100644
index 0000000..abc492b
--- /dev/null
+++ b/src/api/api_kernel.cpp
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_kernel.cpp
+ * \brief Kernels
+ */
+
+#include "CL/cl.h"
+
+#include <core/program.h>
+#include <core/kernel.h>
+
+// Kernel Object APIs
+/**
+ * \brief Create a kernel object for the function named \p kernel_name.
+ *
+ * \param errcode_ret optional; when NULL a local dummy receives the status.
+ * \return the kernel, or 0 with \p *errcode_ret set to CL_INVALID_VALUE
+ *         (NULL name), CL_INVALID_PROGRAM (not a program),
+ *         CL_INVALID_PROGRAM_EXECUTABLE (program not in the Built state) or
+ *         the status reported by
+ *         Coal::Program::createKernelsAndReturnKernel().
+ */
+cl_kernel
+clCreateKernel(cl_program      program,
+               const char *    kernel_name,
+               cl_int *        errcode_ret)
+{
+    cl_int dummy_errcode;
+
+    if (!errcode_ret)
+        errcode_ret = &dummy_errcode;
+
+    if (!kernel_name)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return 0;
+    }
+
+    if (!program->isA(Coal::Object::T_Program))
+    {
+        *errcode_ret = CL_INVALID_PROGRAM;
+        return 0;
+    }
+
+    if (program->state() != Coal::Program::Built)
+    {
+        *errcode_ret = CL_INVALID_PROGRAM_EXECUTABLE;
+        return 0;
+    }
+
+    // Start from a known status: the check below reads *errcode_ret, which
+    // would otherwise be indeterminate if the callee only writes on failure.
+    *errcode_ret = CL_SUCCESS;
+
+    Coal::Kernel *kernel = program->createKernelsAndReturnKernel(kernel_name, errcode_ret);
+
+    if (*errcode_ret != CL_SUCCESS)
+    {
+        delete kernel;
+        return 0;
+    }
+
+    return (cl_kernel)kernel;
+}
+
+/**
+ * \brief Create kernel objects for the kernels of a built program.
+ *
+ * Calls Coal::Program::createKernels(), reports the number of kernels through
+ * \p num_kernels_ret when it is non-NULL, and copies the kernel handles into
+ * \p kernels when that array is provided.
+ *
+ * \return CL_INVALID_PROGRAM when \p program is not a program,
+ *         CL_INVALID_PROGRAM_EXECUTABLE when it is not in the Built state,
+ *         the status reported by createKernels() when it is not CL_SUCCESS,
+ *         CL_INVALID_VALUE when \p kernels is given but \p num_kernels is
+ *         smaller than the number of kernels, otherwise CL_SUCCESS.
+ */
+cl_int
+clCreateKernelsInProgram(cl_program program,
+ cl_uint num_kernels,
+ cl_kernel * kernels,
+ cl_uint * num_kernels_ret)
+{
+ cl_int rs = CL_SUCCESS;
+
+ if (!program->isA(Coal::Object::T_Program))
+ return CL_INVALID_PROGRAM;
+
+ if (program->state() != Coal::Program::Built)
+ return CL_INVALID_PROGRAM_EXECUTABLE;
+
+ std::vector<Coal::Kernel *> ks = program->createKernels(&rs);
+
+ // On failure, destroy whatever kernels were returned before bailing out
+ if (rs != CL_SUCCESS)
+ {
+ while (ks.size())
+ {
+ delete ks.back();
+ ks.pop_back();
+ }
+
+ return rs;
+ }
+
+ // Check that the kernels will fit in the array, if needed
+ if (num_kernels_ret)
+ *num_kernels_ret = ks.size();
+
+ // Note: num_kernels_ret has already been written when this error returns
+ if (kernels && num_kernels < ks.size())
+ {
+ while (ks.size())
+ {
+ delete ks.back();
+ ks.pop_back();
+ }
+
+ return CL_INVALID_VALUE;
+ }
+
+ if (!kernels)
+ {
+ // We don't need the kernels in fact
+ // NOTE(review): the deletion below is disabled, so the kernels are not
+ // destroyed on this path - presumably the program keeps track of them;
+ // confirm against Coal::Program::createKernels().
+ /* while (ks.size())
+ {
+ delete ks.back();
+ ks.pop_back();
+ } */
+ }
+ else
+ {
+ // Copy the kernels
+ for (size_t i=0; i<ks.size(); ++i)
+ {
+ kernels[i] = (cl_kernel)ks[i];
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+/**
+ * \brief Take an additional reference on a kernel.
+ *
+ * \return CL_INVALID_KERNEL when \p kernel is not a kernel, else CL_SUCCESS.
+ */
+cl_int
+clRetainKernel(cl_kernel kernel)
+{
+    if (kernel->isA(Coal::Object::T_Kernel))
+    {
+        kernel->reference();
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_KERNEL;
+}
+
+/**
+ * \brief Drop a reference on a kernel.
+ *
+ * When dereference() reports the last reference is gone, every entry of the
+ * parent program's kernelList whose p_name equals this kernel's p_name is
+ * moved to the program's kernelReleasedList. The kernel object itself is
+ * not deleted (see the BUG note in the body).
+ *
+ * The loop no longer advances its index after an erase: erase() shifts the
+ * following element into slot i, and the old unconditional i++ skipped it.
+ *
+ * \return CL_INVALID_KERNEL when \p kernel is not a kernel, else CL_SUCCESS.
+ */
+cl_int
+clReleaseKernel(cl_kernel kernel)
+{
+    if (!kernel->isA(Coal::Object::T_Kernel))
+        return CL_INVALID_KERNEL;
+
+    if (kernel->dereference())
+    {
+        Coal::Program *p = (Coal::Program *) kernel->parent();
+
+        size_t i = 0;
+
+        while (i < p->kernelList.size())
+        {
+            if (p->kernelList[i]->p_name.compare(kernel->p_name) == 0)
+            {
+                p->kernelReleasedList.push_back(p->kernelList[i]);
+                // After the erase, slot i holds the next element: stay on it
+                p->kernelList.erase(p->kernelList.begin() + i);
+                // BUG: TAG
+                // For some odd reason, deleting the kernel here corrupts the
+                // inside of some kernel objects
+                //delete kernel;
+            }
+            else
+            {
+                ++i;
+            }
+        }
+    }
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Set the value of one kernel argument.
+ *
+ * \return CL_INVALID_KERNEL when \p kernel is not a kernel, otherwise the
+ *         result of the kernel's setArg() method.
+ */
+cl_int
+clSetKernelArg(cl_kernel    kernel,
+               cl_uint      arg_indx,
+               size_t       arg_size,
+               const void * arg_value)
+{
+    if (kernel->isA(Coal::Object::T_Kernel))
+        return kernel->setArg(arg_indx, arg_size, arg_value);
+
+    return CL_INVALID_KERNEL;
+}
+
+/**
+ * \brief Query information about a kernel.
+ *
+ * \return CL_INVALID_KERNEL when \p kernel is not a kernel, otherwise the
+ *         result of the kernel's info() method.
+ */
+cl_int
+clGetKernelInfo(cl_kernel       kernel,
+                cl_kernel_info  param_name,
+                size_t          param_value_size,
+                void *          param_value,
+                size_t *        param_value_size_ret)
+{
+    if (kernel->isA(Coal::Object::T_Kernel))
+    {
+        // The query itself is handled entirely by the kernel object
+        return kernel->info(param_name, param_value_size, param_value,
+                            param_value_size_ret);
+    }
+
+    return CL_INVALID_KERNEL;
+}
+
+/**
+ * \brief Query work-group information about a kernel for a given device.
+ *
+ * \p device is forwarded as a Coal::DeviceInterface pointer without being
+ * validated here.
+ *
+ * \return CL_INVALID_KERNEL when \p kernel is not a kernel, otherwise the
+ *         result of the kernel's workGroupInfo() method.
+ */
+cl_int
+clGetKernelWorkGroupInfo(cl_kernel                  kernel,
+                         cl_device_id               device,
+                         cl_kernel_work_group_info  param_name,
+                         size_t                     param_value_size,
+                         void *                     param_value,
+                         size_t *                   param_value_size_ret)
+{
+    if (kernel->isA(Coal::Object::T_Kernel))
+    {
+        Coal::DeviceInterface *target = (Coal::DeviceInterface *)device;
+
+        return kernel->workGroupInfo(target, param_name,
+                                     param_value_size, param_value,
+                                     param_value_size_ret);
+    }
+
+    return CL_INVALID_KERNEL;
+}
diff --git a/src/api/api_memory.cpp b/src/api/api_memory.cpp
new file mode 100644
index 0000000..18e6bab
--- /dev/null
+++ b/src/api/api_memory.cpp
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_memory.cpp
+ * \brief Memory objects
+ */
+
+#include "CL/cl.h"
+#include <core/memobject.h>
+#include <core/context.h>
+
+#include <cstring>
+
+// Memory Object APIs
+/**
+ * \brief Create a Coal::Buffer in \a context and hand it back as a cl_mem.
+ *
+ * The status is written through \a errcode_ret when it is non-NULL. On any
+ * failure (constructor or init()) the buffer is deleted and 0 is returned.
+ */
+cl_mem
+clCreateBuffer(cl_context context,
+               cl_mem_flags flags,
+               size_t size,
+               void *host_ptr,
+               cl_int *errcode_ret)
+{
+    // Lets the rest of the function write a status unconditionally
+    cl_int local_status;
+    cl_int *status = errcode_ret ? errcode_ret : &local_status;
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *status = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    *status = CL_SUCCESS;
+
+    Coal::Buffer *created = new Coal::Buffer(context, size, host_ptr, flags,
+                                             status);
+
+    // init() only runs when the constructor left the status at CL_SUCCESS
+    if (*status == CL_SUCCESS)
+        *status = created->init();
+
+    if (*status != CL_SUCCESS)
+    {
+        delete created;
+        return 0;
+    }
+
+    return (cl_mem)created;
+}
+
+/**
+ * \brief Create a Coal::SubBuffer covering a region of \a buffer.
+ *
+ * \a buffer_create_info is read as a cl_buffer_region (origin, size); only
+ * CL_BUFFER_CREATE_TYPE_REGION is accepted. The status is written through
+ * \a errcode_ret when it is non-NULL; 0 is returned on any failure.
+ */
+cl_mem
+clCreateSubBuffer(cl_mem buffer,
+                  cl_mem_flags flags,
+                  cl_buffer_create_type buffer_create_type,
+                  const void *buffer_create_info,
+                  cl_int *errcode_ret)
+{
+    cl_int local_status;
+    cl_int *status = errcode_ret ? errcode_ret : &local_status;
+
+    if (!buffer->isA(Coal::Object::T_MemObject))
+    {
+        *status = CL_INVALID_MEM_OBJECT;
+        return 0;
+    }
+
+    Coal::MemObject *parent = (Coal::MemObject *)buffer;
+
+    // NOTE: Is it right ? Couldn't we create SubBuffers of images ?
+    if (parent->type() != Coal::MemObject::Buffer)
+    {
+        *status = CL_INVALID_MEM_OBJECT;
+        return 0;
+    }
+
+    // An unsupported create type and a missing region are both invalid values
+    if (buffer_create_type != CL_BUFFER_CREATE_TYPE_REGION ||
+        !buffer_create_info)
+    {
+        *status = CL_INVALID_VALUE;
+        return 0;
+    }
+
+    const cl_buffer_region *area = (const cl_buffer_region *)buffer_create_info;
+
+    *status = CL_SUCCESS;
+
+    Coal::SubBuffer *created = new Coal::SubBuffer((Coal::Buffer *)buffer,
+                                                   area->origin, area->size,
+                                                   flags, status);
+
+    // init() only runs when the constructor left the status at CL_SUCCESS
+    if (*status == CL_SUCCESS)
+        *status = created->init();
+
+    if (*status != CL_SUCCESS)
+    {
+        delete created;
+        return 0;
+    }
+
+    return (cl_mem)created;
+}
+
+/**
+ * \brief Create a Coal::Image2D in \a context and hand it back as a cl_mem.
+ *
+ * The status is written through \a errcode_ret when it is non-NULL. On any
+ * failure (constructor or init()) the image is deleted and 0 is returned.
+ */
+cl_mem
+clCreateImage2D(cl_context context,
+                cl_mem_flags flags,
+                const cl_image_format *image_format,
+                size_t image_width,
+                size_t image_height,
+                size_t image_row_pitch,
+                void *host_ptr,
+                cl_int *errcode_ret)
+{
+    cl_int local_status;
+    cl_int *status = errcode_ret ? errcode_ret : &local_status;
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *status = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    *status = CL_SUCCESS;
+
+    Coal::Image2D *created = new Coal::Image2D(context,
+                                               image_width, image_height,
+                                               image_row_pitch, image_format,
+                                               host_ptr, flags, status);
+
+    // init() only runs when the constructor left the status at CL_SUCCESS
+    if (*status == CL_SUCCESS)
+        *status = created->init();
+
+    if (*status != CL_SUCCESS)
+    {
+        delete created;
+        return 0;
+    }
+
+    return (cl_mem)created;
+}
+
+/**
+ * \brief Create a Coal::Image3D in \a context and hand it back as a cl_mem.
+ *
+ * The status is written through \a errcode_ret when it is non-NULL. On any
+ * failure (constructor or init()) the image is deleted and 0 is returned.
+ */
+cl_mem
+clCreateImage3D(cl_context context,
+                cl_mem_flags flags,
+                const cl_image_format *image_format,
+                size_t image_width,
+                size_t image_height,
+                size_t image_depth,
+                size_t image_row_pitch,
+                size_t image_slice_pitch,
+                void *host_ptr,
+                cl_int *errcode_ret)
+{
+    cl_int local_status;
+    cl_int *status = errcode_ret ? errcode_ret : &local_status;
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *status = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    *status = CL_SUCCESS;
+
+    Coal::Image3D *created = new Coal::Image3D(context,
+                                               image_width, image_height,
+                                               image_depth, image_row_pitch,
+                                               image_slice_pitch, image_format,
+                                               host_ptr, flags, status);
+
+    // init() only runs when the constructor left the status at CL_SUCCESS
+    if (*status == CL_SUCCESS)
+        *status = created->init();
+
+    if (*status != CL_SUCCESS)
+    {
+        delete created;
+        return 0;
+    }
+
+    return (cl_mem)created;
+}
+
+/**
+ * \brief Take an additional reference on \a memobj via reference().
+ * \return CL_INVALID_MEM_OBJECT when the isA() check fails, CL_SUCCESS
+ *         otherwise.
+ */
+cl_int
+clRetainMemObject(cl_mem memobj)
+{
+    if (memobj->isA(Coal::Object::T_MemObject))
+    {
+        memobj->reference();
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_MEM_OBJECT;
+}
+
+/**
+ * \brief Drop one reference on \a memobj and delete it when dereference()
+ *        reports true.
+ * \return CL_INVALID_MEM_OBJECT when the isA() check fails, CL_SUCCESS
+ *         otherwise.
+ */
+cl_int
+clReleaseMemObject(cl_mem memobj)
+{
+    if (!memobj->isA(Coal::Object::T_MemObject))
+        return CL_INVALID_MEM_OBJECT;
+
+    const bool last_reference = memobj->dereference();
+
+    if (last_reference)
+    {
+        delete memobj;
+    }
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Image formats reported by clGetSupportedImageFormats().
+ *
+ * Each entry is a { channel order, channel data type } pair. The table is
+ * only ever read (copied out with memcpy and measured with sizeof), so it is
+ * declared const.
+ */
+static const cl_image_format supported_formats[] = {
+    { CL_RGBA, CL_UNORM_INT8 },
+    { CL_RGBA, CL_UNORM_INT16 },
+    { CL_RGBA, CL_SNORM_INT8 },
+    { CL_RGBA, CL_SNORM_INT16 },
+    { CL_RGBA, CL_SIGNED_INT8 },
+    { CL_RGBA, CL_SIGNED_INT16 },
+    { CL_RGBA, CL_SIGNED_INT32 },
+    { CL_RGBA, CL_UNSIGNED_INT8 },
+    { CL_RGBA, CL_UNSIGNED_INT16 },
+    { CL_RGBA, CL_UNSIGNED_INT32 },
+    { CL_RGBA, CL_FLOAT },
+
+    { CL_ARGB, CL_UNORM_INT8 },
+    { CL_ARGB, CL_SNORM_INT8 },
+    { CL_ARGB, CL_SIGNED_INT8 },
+    { CL_ARGB, CL_UNSIGNED_INT8 },
+
+    { CL_BGRA, CL_UNORM_INT8 },
+    { CL_BGRA, CL_SNORM_INT8 },
+    { CL_BGRA, CL_SIGNED_INT8 },
+    { CL_BGRA, CL_UNSIGNED_INT8 },
+
+    { CL_RGB, CL_UNORM_SHORT_565 },
+    { CL_RGB, CL_UNORM_SHORT_555 },
+    { CL_RGB, CL_UNORM_INT_101010 },
+
+    { CL_RGBx, CL_UNORM_SHORT_565 },
+    { CL_RGBx, CL_UNORM_SHORT_555 },
+    { CL_RGBx, CL_UNORM_INT_101010 },
+
+    { CL_RG, CL_UNORM_INT8 },
+    { CL_RG, CL_UNORM_INT16 },
+    { CL_RG, CL_SNORM_INT8 },
+    { CL_RG, CL_SNORM_INT16 },
+    { CL_RG, CL_SIGNED_INT8 },
+    { CL_RG, CL_SIGNED_INT16 },
+    { CL_RG, CL_SIGNED_INT32 },
+    { CL_RG, CL_UNSIGNED_INT8 },
+    { CL_RG, CL_UNSIGNED_INT16 },
+    { CL_RG, CL_UNSIGNED_INT32 },
+    { CL_RG, CL_FLOAT },
+
+    { CL_RGx, CL_UNORM_INT8 },
+    { CL_RGx, CL_UNORM_INT16 },
+    { CL_RGx, CL_SNORM_INT8 },
+    { CL_RGx, CL_SNORM_INT16 },
+    { CL_RGx, CL_SIGNED_INT8 },
+    { CL_RGx, CL_SIGNED_INT16 },
+    { CL_RGx, CL_SIGNED_INT32 },
+    { CL_RGx, CL_UNSIGNED_INT8 },
+    { CL_RGx, CL_UNSIGNED_INT16 },
+    { CL_RGx, CL_UNSIGNED_INT32 },
+    { CL_RGx, CL_FLOAT },
+
+    { CL_RA, CL_UNORM_INT8 },
+    { CL_RA, CL_UNORM_INT16 },
+    { CL_RA, CL_SNORM_INT8 },
+    { CL_RA, CL_SNORM_INT16 },
+    { CL_RA, CL_SIGNED_INT8 },
+    { CL_RA, CL_SIGNED_INT16 },
+    { CL_RA, CL_SIGNED_INT32 },
+    { CL_RA, CL_UNSIGNED_INT8 },
+    { CL_RA, CL_UNSIGNED_INT16 },
+    { CL_RA, CL_UNSIGNED_INT32 },
+    { CL_RA, CL_FLOAT },
+
+    { CL_R, CL_UNORM_INT8 },
+    { CL_R, CL_UNORM_INT16 },
+    { CL_R, CL_SNORM_INT8 },
+    { CL_R, CL_SNORM_INT16 },
+    { CL_R, CL_SIGNED_INT8 },
+    { CL_R, CL_SIGNED_INT16 },
+    { CL_R, CL_SIGNED_INT32 },
+    { CL_R, CL_UNSIGNED_INT8 },
+    { CL_R, CL_UNSIGNED_INT16 },
+    { CL_R, CL_UNSIGNED_INT32 },
+    { CL_R, CL_FLOAT },
+
+    { CL_Rx, CL_UNORM_INT8 },
+    { CL_Rx, CL_UNORM_INT16 },
+    { CL_Rx, CL_SNORM_INT8 },
+    { CL_Rx, CL_SNORM_INT16 },
+    { CL_Rx, CL_SIGNED_INT8 },
+    { CL_Rx, CL_SIGNED_INT16 },
+    { CL_Rx, CL_SIGNED_INT32 },
+    { CL_Rx, CL_UNSIGNED_INT8 },
+    { CL_Rx, CL_UNSIGNED_INT16 },
+    { CL_Rx, CL_UNSIGNED_INT32 },
+    { CL_Rx, CL_FLOAT },
+
+    { CL_A, CL_UNORM_INT8 },
+    { CL_A, CL_UNORM_INT16 },
+    { CL_A, CL_SNORM_INT8 },
+    { CL_A, CL_SNORM_INT16 },
+    { CL_A, CL_SIGNED_INT8 },
+    { CL_A, CL_SIGNED_INT16 },
+    { CL_A, CL_SIGNED_INT32 },
+    { CL_A, CL_UNSIGNED_INT8 },
+    { CL_A, CL_UNSIGNED_INT16 },
+    { CL_A, CL_UNSIGNED_INT32 },
+    { CL_A, CL_FLOAT },
+
+    { CL_LUMINANCE, CL_UNORM_INT8 },
+    { CL_LUMINANCE, CL_UNORM_INT16 },
+    { CL_LUMINANCE, CL_SNORM_INT8 },
+    { CL_LUMINANCE, CL_SNORM_INT16 },
+    { CL_LUMINANCE, CL_FLOAT },
+
+    { CL_INTENSITY, CL_UNORM_INT8 },
+    { CL_INTENSITY, CL_UNORM_INT16 },
+    { CL_INTENSITY, CL_SNORM_INT8 },
+    { CL_INTENSITY, CL_SNORM_INT16 },
+    { CL_INTENSITY, CL_FLOAT }
+};
+
+/* Smaller of two values; note that each argument is evaluated twice. */
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+/**
+ * \brief Report the contents of the supported_formats table.
+ *
+ * \a flags and \a image_type are ignored: the same table is reported for
+ * every combination. At most \a num_entries formats are copied into
+ * \a image_formats, and the full table length is stored in
+ * \a num_image_formats when that pointer is non-NULL.
+ *
+ * \return CL_INVALID_CONTEXT when the isA() check fails, CL_INVALID_VALUE
+ *         when \a image_formats is given with \a num_entries == 0,
+ *         CL_SUCCESS otherwise.
+ */
+cl_int
+clGetSupportedImageFormats(cl_context context,
+                           cl_mem_flags flags,
+                           cl_mem_object_type image_type,
+                           cl_uint num_entries,
+                           cl_image_format *image_formats,
+                           cl_uint *num_image_formats)
+{
+    (void) flags;
+    (void) image_type;
+
+    if (!context->isA(Coal::Object::T_Context))
+        return CL_INVALID_CONTEXT;
+
+    if (image_formats)
+    {
+        if (num_entries == 0)
+            return CL_INVALID_VALUE;
+
+        // Copy no more than the caller has room for, and no more than exists
+        const size_t room = num_entries * sizeof(cl_image_format);
+        const size_t available = sizeof(supported_formats);
+
+        std::memcpy(image_formats, supported_formats, MIN(room, available));
+    }
+
+    if (num_image_formats)
+    {
+        *num_image_formats =
+            sizeof(supported_formats) / sizeof(cl_image_format);
+    }
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Validate \a memobj and forward the query to memobj->info().
+ * \return CL_INVALID_MEM_OBJECT when the isA() check fails, otherwise
+ *         whatever info() returns.
+ */
+cl_int
+clGetMemObjectInfo(cl_mem memobj,
+                   cl_mem_info param_name,
+                   size_t param_value_size,
+                   void *param_value,
+                   size_t *param_value_size_ret)
+{
+    if (memobj->isA(Coal::Object::T_MemObject))
+    {
+        return memobj->info(param_name,
+                            param_value_size,
+                            param_value,
+                            param_value_size_ret);
+    }
+
+    return CL_INVALID_MEM_OBJECT;
+}
+
+/**
+ * \brief Validate that \a image is a 2D or 3D image and forward the query to
+ *        imageInfo().
+ *
+ * Both image kinds are accessed through a Coal::Image2D pointer.
+ *
+ * \return CL_INVALID_MEM_OBJECT when \a image fails the isA() check or is
+ *         neither Image2D nor Image3D, otherwise whatever imageInfo()
+ *         returns.
+ */
+cl_int
+clGetImageInfo(cl_mem image,
+               cl_image_info param_name,
+               size_t param_value_size,
+               void *param_value,
+               size_t *param_value_size_ret)
+{
+    if (!image->isA(Coal::Object::T_MemObject))
+        return CL_INVALID_MEM_OBJECT;
+
+    const bool is_image = image->type() == Coal::MemObject::Image2D ||
+                          image->type() == Coal::MemObject::Image3D;
+
+    if (!is_image)
+        return CL_INVALID_MEM_OBJECT;
+
+    Coal::Image2D *as_image = (Coal::Image2D *)image;
+
+    return as_image->imageInfo(param_name, param_value_size, param_value,
+                               param_value_size_ret);
+}
+
+/**
+ * \brief Register \a pfn_notify (with \a user_data) as a destructor callback
+ *        on \a memobj via setDestructorCallback().
+ *
+ * \return CL_INVALID_MEM_OBJECT when the isA() check fails,
+ *         CL_INVALID_VALUE when \a pfn_notify is NULL (as the OpenCL
+ *         specification requires), CL_SUCCESS otherwise.
+ */
+cl_int
+clSetMemObjectDestructorCallback(cl_mem memobj,
+                                 void (CL_CALLBACK *pfn_notify)(cl_mem memobj,
+                                                                void *user_data),
+                                 void *user_data)
+{
+    if (!memobj->isA(Coal::Object::T_MemObject))
+        return CL_INVALID_MEM_OBJECT;
+
+    // A NULL callback must be rejected rather than stored
+    if (!pfn_notify)
+        return CL_INVALID_VALUE;
+
+    memobj->setDestructorCallback(pfn_notify, user_data);
+
+    return CL_SUCCESS;
+}
+
diff --git a/src/api/api_platform.cpp b/src/api/api_platform.cpp
new file mode 100644
index 0000000..cf064ef
--- /dev/null
+++ b/src/api/api_platform.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_platform.cpp
+ * \brief Platform
+ */
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+#include <core/platform.h>
+#include <core/config.h>
+#include <cstring>
+
+// Platform API
+
+/**
+ * \brief Report the single platform exposed by this implementation.
+ *
+ * \a num_platforms, when non-NULL, is set to 1 before the remaining
+ * arguments are validated. \a platforms, when non-NULL, receives the
+ * address of the_platform.
+ *
+ * \return CL_INVALID_VALUE when both output pointers are NULL or when
+ *         \a platforms is given with \a num_entries == 0, CL_SUCCESS
+ *         otherwise.
+ */
+cl_int CL_API_CALL
+clGetPlatformIDs(cl_uint num_entries,
+                 cl_platform_id *platforms,
+                 cl_uint *num_platforms)
+{
+    if (num_platforms)
+    {
+        *num_platforms = 1;
+    }
+    else if (!platforms)
+    {
+        return CL_INVALID_VALUE;
+    }
+
+    if (platforms)
+    {
+        if (num_entries == 0)
+            return CL_INVALID_VALUE;
+
+        /*---------------------------------------------------------------------
+         * Only one "default" platform
+         *--------------------------------------------------------------------*/
+        *platforms = &the_platform;
+    }
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Validate \a platform and forward the query to platform->info().
+ * \return CL_INVALID_PLATFORM when \a platform is not the_platform,
+ *         otherwise whatever info() returns.
+ */
+cl_int CL_API_CALL
+clGetPlatformInfo(cl_platform_id platform,
+                  cl_platform_info param_name,
+                  size_t param_value_size,
+                  void *param_value,
+                  size_t *param_value_size_ret)
+{
+    /*-------------------------------------------------------------------------
+     * Only the platform handed out by clGetPlatformIDs is accepted; any other
+     * handle, including NULL, is rejected.
+     *------------------------------------------------------------------------*/
+    if (platform != &the_platform) return CL_INVALID_PLATFORM;
+
+    return platform->info(param_name, param_value_size, param_value,
+                          param_value_size_ret);
+}
+
+/******************************************************************************
+* Return a pointer to any supported extension functions.
+*
+* Only "clIcdGetPlatformIDsKHR" is recognised (it maps to clGetPlatformIDs).
+* NULL is returned for every other name and for a NULL funcname.
+******************************************************************************/
+void * clGetExtensionFunctionAddress(const char *funcname)
+{
+    // strcmp() on a NULL pointer is undefined behaviour
+    if (!funcname)
+        return NULL;
+
+    if (strcmp(funcname, "clIcdGetPlatformIDsKHR") == 0)
+        return (void*)clGetPlatformIDs;
+
+    return NULL;
+}
+
diff --git a/src/api/api_profiling.cpp b/src/api/api_profiling.cpp
new file mode 100644
index 0000000..0abec66
--- /dev/null
+++ b/src/api/api_profiling.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_profiling.cpp
+ * \brief Profiling of events
+ */
+
+#include "CL/cl.h"
+#include <core/commandqueue.h>
+
+// Profiling APIs
+/**
+ * \brief Validate \a event and forward the query to event->profilingInfo().
+ * \return CL_INVALID_EVENT when the isA() check fails, otherwise whatever
+ *         profilingInfo() returns.
+ */
+cl_int
+clGetEventProfilingInfo(cl_event event,
+                        cl_profiling_info param_name,
+                        size_t param_value_size,
+                        void *param_value,
+                        size_t *param_value_size_ret)
+{
+    if (event->isA(Coal::Object::T_Event))
+    {
+        return event->profilingInfo(param_name,
+                                    param_value_size,
+                                    param_value,
+                                    param_value_size_ret);
+    }
+
+    return CL_INVALID_EVENT;
+}
+
diff --git a/src/api/api_program.cpp b/src/api/api_program.cpp
new file mode 100644
index 0000000..af30510
--- /dev/null
+++ b/src/api/api_program.cpp
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_program.cpp
+ * \brief Programs
+ */
+
+#include "CL/cl.h"
+#include <core/program.h>
+#include <core/context.h>
+
+#include <cstdlib>
+
+// Program Object APIs
+/**
+ * \brief Create a Coal::Program in \a context and load \a count source
+ *        strings into it with loadSources().
+ *
+ * The status is written through \a errcode_ret when it is non-NULL. When
+ * loadSources() fails the program is deleted and 0 is returned.
+ */
+cl_program
+clCreateProgramWithSource(cl_context context,
+                          cl_uint count,
+                          const char **strings,
+                          const size_t *lengths,
+                          cl_int *errcode_ret)
+{
+    cl_int local_status;
+    cl_int *status = errcode_ret ? errcode_ret : &local_status;
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *status = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    if (count == 0 || strings == 0)
+    {
+        *status = CL_INVALID_VALUE;
+        return 0;
+    }
+
+    Coal::Program *created = new Coal::Program(context);
+
+    *status = created->loadSources(count, strings, lengths);
+
+    if (*status == CL_SUCCESS)
+        return (cl_program)created;
+
+    delete created;
+    return 0;
+}
+
+/**
+ * \brief Create a Coal::Program in \a context from per-device binaries.
+ *
+ * Every entry of \a device_list must be one of the context's devices and
+ * must come with a non-NULL binary of non-zero length. The status is written
+ * through \a errcode_ret when it is non-NULL; 0 is returned on any failure.
+ * When \a binary_status is non-NULL, the slot of an offending binary is set
+ * to CL_INVALID_VALUE here; loadBinaries() receives the same array.
+ */
+cl_program
+clCreateProgramWithBinary(cl_context context,
+                          cl_uint num_devices,
+                          const cl_device_id *device_list,
+                          const size_t *lengths,
+                          const unsigned char **binaries,
+                          cl_int *binary_status,
+                          cl_int *errcode_ret)
+{
+    cl_int dummy_errcode;
+
+    if (!errcode_ret)
+        errcode_ret = &dummy_errcode;
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *errcode_ret = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    if (!num_devices || !device_list || !lengths || !binaries)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return 0;
+    }
+
+    // Fetch the context's device list to check the caller's devices against
+    cl_uint context_num_devices = 0;
+
+    *errcode_ret = context->info(CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint),
+                                 &context_num_devices, 0);
+
+    if (*errcode_ret != CL_SUCCESS)
+        return 0;
+
+    cl_device_id *context_devices =
+        (cl_device_id *)std::malloc(context_num_devices * sizeof(cl_device_id));
+
+    // malloc(0) may legitimately return NULL, so only a sized request failing
+    // counts as running out of memory
+    if (!context_devices && context_num_devices > 0)
+    {
+        *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+        return 0;
+    }
+
+    *errcode_ret = context->info(CL_CONTEXT_DEVICES,
+                                 context_num_devices * sizeof(cl_device_id),
+                                 context_devices, 0);
+
+    if (*errcode_ret != CL_SUCCESS)
+    {
+        std::free(context_devices);
+        return 0;
+    }
+
+    // Check the devices for compliance; the first problem ends the scan
+    for (cl_uint i=0; i<num_devices; ++i)
+    {
+        if (!lengths[i] || !binaries[i])
+        {
+            if (binary_status)
+                binary_status[i] = CL_INVALID_VALUE;
+
+            *errcode_ret = CL_INVALID_VALUE;
+            break;
+        }
+
+        bool found = false;
+
+        for (cl_uint j=0; j<context_num_devices; ++j)
+        {
+            if (device_list[i] == context_devices[j])
+            {
+                found = true;
+                break;
+            }
+        }
+
+        if (!found)
+        {
+            *errcode_ret = CL_INVALID_DEVICE;
+            break;
+        }
+    }
+
+    // The scratch copy is only needed for the validation above; release it on
+    // both the success and the failure path
+    std::free(context_devices);
+
+    if (*errcode_ret != CL_SUCCESS)
+        return 0;
+
+    // Create a program
+    Coal::Program *program = new Coal::Program(context);
+
+    // Init program
+    *errcode_ret = program->loadBinaries(binaries,
+                                         lengths, binary_status, num_devices,
+                                         (Coal::DeviceInterface * const*)device_list);
+
+    if (*errcode_ret != CL_SUCCESS)
+    {
+        delete program;
+        return 0;
+    }
+
+    return (cl_program)program;
+}
+
+/**
+ * \brief Take an additional reference on \a program via reference().
+ * \return CL_INVALID_PROGRAM when the isA() check fails, CL_SUCCESS
+ *         otherwise.
+ */
+cl_int
+clRetainProgram(cl_program program)
+{
+    if (program->isA(Coal::Object::T_Program))
+    {
+        program->reference();
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_PROGRAM;
+}
+
+/**
+ * \brief Drop one reference on \a program and delete it when dereference()
+ *        reports true.
+ * \return CL_INVALID_PROGRAM when the isA() check fails, CL_SUCCESS
+ *         otherwise.
+ */
+cl_int
+clReleaseProgram(cl_program program)
+{
+    if (!program->isA(Coal::Object::T_Program))
+        return CL_INVALID_PROGRAM;
+
+    const bool last_reference = program->dereference();
+
+    if (last_reference)
+    {
+        delete program;
+    }
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Build \a program for the given devices, or for every device of its
+ *        context when \a num_devices is 0.
+ *
+ * \return CL_INVALID_PROGRAM when the isA() check fails; CL_INVALID_VALUE
+ *         for inconsistent \a device_list / \a num_devices or \a user_data
+ *         without \a pfn_notify; CL_INVALID_DEVICE when a listed device is
+ *         not in the program's context; CL_INVALID_OPERATION when the
+ *         program is neither Loaded nor Built; CL_OUT_OF_HOST_MEMORY when
+ *         the scratch device list cannot be allocated; otherwise whatever
+ *         context->info() or program->build() returns.
+ */
+cl_int
+clBuildProgram(cl_program program,
+               cl_uint num_devices,
+               const cl_device_id *device_list,
+               const char *options,
+               void (*pfn_notify)(cl_program program, void *user_data),
+               void *user_data)
+{
+    if (!program->isA(Coal::Object::T_Program))
+        return CL_INVALID_PROGRAM;
+
+    if (!device_list && num_devices > 0)
+        return CL_INVALID_VALUE;
+
+    if (!num_devices && device_list)
+        return CL_INVALID_VALUE;
+
+    if (!pfn_notify && user_data)
+        return CL_INVALID_VALUE;
+
+    cl_uint context_num_devices = 0;
+    Coal::Context *context = (Coal::Context *)program->parent();
+    cl_int result;
+
+    result = context->info(CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint),
+                           &context_num_devices, 0);
+
+    if (result != CL_SUCCESS) return result;
+
+    cl_device_id *context_devices =
+        (cl_device_id *)std::malloc(context_num_devices * sizeof(cl_device_id));
+
+    // malloc(0) may legitimately return NULL, so only a sized request failing
+    // counts as running out of memory
+    if (!context_devices && context_num_devices > 0)
+        return CL_OUT_OF_HOST_MEMORY;
+
+    result = context->info(CL_CONTEXT_DEVICES,
+                           context_num_devices * sizeof(cl_device_id),
+                           context_devices, 0);
+
+    if (result == CL_SUCCESS)
+    {
+        if (num_devices)
+        {
+            // Check the devices for compliance
+            for (cl_uint i=0; i<num_devices && result == CL_SUCCESS; ++i)
+            {
+                bool found = false;
+
+                for (cl_uint j=0; j<context_num_devices; ++j)
+                {
+                    if (device_list[i] == context_devices[j])
+                    {
+                        found = true;
+                        break;
+                    }
+                }
+
+                if (!found)
+                    result = CL_INVALID_DEVICE;
+            }
+        }
+        else
+        {
+            // No explicit list: build for every device of the context
+            num_devices = context_num_devices;
+            device_list = context_devices;
+        }
+    }
+
+    // We cannot try to build a previously-failed program
+    if (result == CL_SUCCESS &&
+        !(program->state() == Coal::Program::Loaded ||
+          program->state() == Coal::Program::Built ))
+        result = CL_INVALID_OPERATION;
+
+    // Build program
+    if (result == CL_SUCCESS)
+        result = program->build(options, pfn_notify, user_data, num_devices,
+                                (Coal::DeviceInterface * const*)device_list);
+
+    // Release the scratch list on every path. NOTE(review): this assumes
+    // build() does not keep the device_list pointer after it returns, which
+    // it already cannot do safely for a caller-supplied list -- confirm.
+    std::free(context_devices);
+
+    return result;
+}
+
+/**
+ * \brief No-op in this implementation: nothing is unloaded.
+ * \return Always CL_SUCCESS.
+ */
+cl_int
+clUnloadCompiler(void)
+{
+ return CL_SUCCESS;
+}
+
+/**
+ * \brief Validate \a program and forward the query to program->info().
+ * \return CL_INVALID_PROGRAM when the isA() check fails, otherwise whatever
+ *         info() returns.
+ */
+cl_int
+clGetProgramInfo(cl_program program,
+                 cl_program_info param_name,
+                 size_t param_value_size,
+                 void *param_value,
+                 size_t *param_value_size_ret)
+{
+    if (program->isA(Coal::Object::T_Program))
+    {
+        return program->info(param_name,
+                             param_value_size,
+                             param_value,
+                             param_value_size_ret);
+    }
+
+    return CL_INVALID_PROGRAM;
+}
+
+/**
+ * \brief Validate \a program and \a device, then forward the query to
+ *        program->buildInfo().
+ * \return CL_INVALID_PROGRAM when the isA() check fails, CL_INVALID_DEVICE
+ *         when \a device is NULL, otherwise whatever buildInfo() returns.
+ */
+cl_int
+clGetProgramBuildInfo(cl_program program,
+                      cl_device_id device,
+                      cl_program_build_info param_name,
+                      size_t param_value_size,
+                      void *param_value,
+                      size_t *param_value_size_ret)
+{
+    if (!program->isA(Coal::Object::T_Program))
+        return CL_INVALID_PROGRAM;
+
+    if (device == 0)
+        return CL_INVALID_DEVICE;
+
+    Coal::DeviceInterface *target = (Coal::DeviceInterface *)device;
+
+    return program->buildInfo(target, param_name,
+                              param_value_size, param_value,
+                              param_value_size_ret);
+}
diff --git a/src/api/api_sampler.cpp b/src/api/api_sampler.cpp
new file mode 100644
index 0000000..9bd2dec
--- /dev/null
+++ b/src/api/api_sampler.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file api_sampler.cpp
+ * \brief Samplers
+ */
+
+#include "CL/cl.h"
+
+#include "core/sampler.h"
+#include "core/context.h"
+
+// Sampler APIs
+/**
+ * \brief Create a Coal::Sampler in \a context and hand it back as a
+ *        cl_sampler.
+ *
+ * The status is written through \a errcode_ret when it is non-NULL. When the
+ * constructor reports an error the sampler is deleted and 0 is returned.
+ */
+cl_sampler
+clCreateSampler(cl_context context,
+                cl_bool normalized_coords,
+                cl_addressing_mode addressing_mode,
+                cl_filter_mode filter_mode,
+                cl_int *errcode_ret)
+{
+    cl_int local_status;
+    cl_int *status = errcode_ret ? errcode_ret : &local_status;
+
+    if (!context->isA(Coal::Object::T_Context))
+    {
+        *status = CL_INVALID_CONTEXT;
+        return 0;
+    }
+
+    *status = CL_SUCCESS;
+
+    Coal::Sampler *created = new Coal::Sampler((Coal::Context *)context,
+                                               normalized_coords,
+                                               addressing_mode,
+                                               filter_mode,
+                                               status);
+
+    if (*status == CL_SUCCESS)
+        return (cl_sampler)created;
+
+    delete created;
+    return 0;
+}
+
+/**
+ * \brief Take an additional reference on \a sampler via reference().
+ * \return CL_INVALID_SAMPLER when the isA() check fails, CL_SUCCESS
+ *         otherwise.
+ */
+cl_int
+clRetainSampler(cl_sampler sampler)
+{
+    if (sampler->isA(Coal::Object::T_Sampler))
+    {
+        sampler->reference();
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_SAMPLER;
+}
+
+/**
+ * \brief Drop one reference on \a sampler and delete it when dereference()
+ *        reports true.
+ * \return CL_INVALID_SAMPLER when the isA() check fails, CL_SUCCESS
+ *         otherwise.
+ */
+cl_int
+clReleaseSampler(cl_sampler sampler)
+{
+    if (!sampler->isA(Coal::Object::T_Sampler))
+        return CL_INVALID_SAMPLER;
+
+    const bool last_reference = sampler->dereference();
+
+    if (last_reference)
+    {
+        delete sampler;
+    }
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Validate \a sampler and forward the query to sampler->info().
+ * \return CL_INVALID_SAMPLER when the isA() check fails, otherwise whatever
+ *         info() returns.
+ */
+cl_int
+clGetSamplerInfo(cl_sampler sampler,
+                 cl_sampler_info param_name,
+                 size_t param_value_size,
+                 void *param_value,
+                 size_t *param_value_size_ret)
+{
+    if (sampler->isA(Coal::Object::T_Sampler))
+    {
+        return sampler->info(param_name,
+                             param_value_size,
+                             param_value,
+                             param_value_size_ret);
+    }
+
+    return CL_INVALID_SAMPLER;
+}
diff --git a/src/builtins/CMakeLists.txt b/src/builtins/CMakeLists.txt
new file mode 100644
index 0000000..a83dfdf
--- /dev/null
+++ b/src/builtins/CMakeLists.txt
@@ -0,0 +1,33 @@
+# Builds the OpenCL builtins library: every *.cl file is compiled to LLVM
+# bitcode with clang, then all bitcode files are linked into builtins.lib.
+# Everything here is skipped unless SHAMROCK_BUILD is set.
+if (SHAMROCK_BUILD)
+
+# Common clang invocation used for every builtin source.
+# NOTE(review): "-Fvisibility=protected" is spelled with a capital F here and
+# in the Makefile next to this file -- confirm it is the intended spelling for
+# the clang in use.
+set(CUSTOM_COMMAND ${CLANG_EXECUTABLE} -cc1 -emit-llvm-bc -x cl -O2 -fno-builtin -nobuiltininc -Fvisibility=protected -cl-std=CL1.2 -ffp-contract=off )
+
+# Collect every OpenCL C source in the builtins directory.
+FILE(GLOB CL_SOURCES ${CLC_BUILTINS_DIR}/*.cl)
+#MESSAGE(STATUS "CL_SOURCES: ${CL_SOURCES}" )
+
+# One custom command per source: <name>.cl -> <binary dir>/<name>.bc
+# NOTE(review): sources are globbed from CLC_BUILTINS_DIR but the include path
+# uses OCL_BUILTINS_DIR -- confirm both variables are defined by the parent
+# CMakeLists.
+set(BC_SOURCES)
+foreach(f ${CL_SOURCES})
+ get_filename_component(fn ${f} NAME_WE)
+ #MESSAGE(STATUS "CL_SOURCE: ${f}" )
+ set(bc ${CMAKE_CURRENT_BINARY_DIR}/${fn}.bc)
+ add_custom_command(OUTPUT ${bc}
+ COMMAND ${CUSTOM_COMMAND}
+ -I${OCL_BUILTINS_DIR}/include
+ -o ${bc} ${f}
+ DEPENDS ${f}
+ COMMENT "Generating ${bc}")
+ list(APPEND BC_SOURCES ${bc})
+endforeach()
+#MESSAGE( STATUS "BC_SOURCES: ${BC_SOURCES}")
+
+# Target that depends on all per-file bitcode outputs.
+add_custom_target(generate_bc_files DEPENDS ${BC_SOURCES})
+
+# Link all bitcode files into a single builtins.lib with llvm-link.
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/builtins.lib
+ COMMAND llvm-link
+ -o ${CMAKE_CURRENT_BINARY_DIR}/builtins.lib ${BC_SOURCES}
+ DEPENDS ${BC_SOURCES} )
+
+# Target that depends on the linked library.
+add_custom_target(generate_builtin_lib DEPENDS
+ ${CMAKE_CURRENT_BINARY_DIR}/builtins.lib)
+
+endif(SHAMROCK_BUILD)
diff --git a/src/builtins/Makefile b/src/builtins/Makefile
new file mode 100644
index 0000000..1d3349b
--- /dev/null
+++ b/src/builtins/Makefile
@@ -0,0 +1,24 @@
+# Stand-alone build of the builtins library: compiles every *.cl file in this
+# directory to LLVM bitcode and links the results into builtins.lib.
+CLANG = clang
+# Flags passed to clang for every source; headers are taken from ../../include.
+CLANG_CFLAGS = -cc1 -emit-llvm-bc -x cl -O2 -fno-builtin -nobuiltininc
+CLANG_CFLAGS += -Fvisibility=protected -cl-std=CL1.2 -ffp-contract=off
+CLANG_CFLAGS += -I../../include
+
+# One .bc output per .cl source found in this directory.
+CL_FILES = $(wildcard *.cl)
+BYTECODE := ${CL_FILES:.cl=.bc}
+
+all: builtins.lib
+
+# Link all bitcode modules into the library.
+builtins.lib: $(BYTECODE)
+ @echo $@ Linking bytecode modules
+ llvm-link -o $@ $^
+
+# Compile one OpenCL C source to bitcode.
+%.bc: %.cl
+ @echo $< Parsing
+ @$(CLANG) $(CLANG_CFLAGS) $< -o $@
+
+# Disassemble a bitcode file with llvm-dis, for inspection.
+%.ll: %.bc
+ @echo $< Disassembling
+ llvm-dis $<
+
+# Removes the .bc and .ll files only; builtins.lib is left in place.
+clean:
+ @rm -rf *.bc *.ll
diff --git a/src/builtins/README.txt b/src/builtins/README.txt
new file mode 100644
index 0000000..5e16118
--- /dev/null
+++ b/src/builtins/README.txt
@@ -0,0 +1,13 @@
+This directory (builtins) is intended to supersede src/runtime as a means
+to provide a builtins library for OpenCL kernels.
+
+Note: some of the files here do not compile due to an address space casting
+error, and are suffixed *.cl.broken.
+
+Files here were imported from the TI opencl_builtins private repository and
+repurposed for CPU device (from DSP device).
+
+This library appears to have been adapted from libclc.llvm.org.
+
+The Makefile here is not used, but available for illustration purposes and
+to allow disassembly of the bc files for inspection.
diff --git a/src/builtins/abs.cl b/src/builtins/abs.cl
new file mode 100644
index 0000000..71dcf75
--- /dev/null
+++ b/src/builtins/abs.cl
@@ -0,0 +1,33 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+/* Instantiate abs for each signed integer type, paired with its unsigned
+ * counterpart. UNARY_VEC_DEF is not defined in this file (presumably it comes
+ * from cpu.h and expands to the vector-width overloads -- TODO confirm). */
+UNARY_VEC_DEF(char, uchar, abs, abs)
+UNARY_VEC_DEF(short, ushort, abs, abs)
+UNARY_VEC_DEF(int, uint, abs, abs)
+UNARY_VEC_DEF(long, ulong, abs, abs)
diff --git a/src/builtins/abs_diff.cl b/src/builtins/abs_diff.cl
new file mode 100644
index 0000000..ecc8e37
--- /dev/null
+++ b/src/builtins/abs_diff.cl
@@ -0,0 +1,72 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+#define EXPAND_SIZES(elem, uelem) /* one TEMPLATE() instantiation per vector width: 2, 3, 4, 8, 16 */ \
+    TEMPLATE(_VEC_TYPE(elem,2),  _VEC_TYPE(uelem,2))  \
+    TEMPLATE(_VEC_TYPE(elem,3),  _VEC_TYPE(uelem,3))  \
+    TEMPLATE(_VEC_TYPE(elem,4),  _VEC_TYPE(uelem,4))  \
+    TEMPLATE(_VEC_TYPE(elem,8),  _VEC_TYPE(uelem,8))  \
+    TEMPLATE(_VEC_TYPE(elem,16), _VEC_TYPE(uelem,16))
+
+#define TEMPLATE(T, UT) /* abs_diff for T vectors: larger-minus-smaller per lane, bits reinterpreted as UT */ \
+    _CLC_OVERLOAD _CLC_DEF UT abs_diff(T x, T y) \
+    { return __builtin_astype((x > y) ? (x - y) : (y - x), UT); }
+
+EXPAND_SIZES(uchar,  uchar)
+EXPAND_SIZES(ushort, ushort)
+EXPAND_SIZES(uint,   uint)
+EXPAND_SIZES(ulong,  ulong)
+EXPAND_SIZES(char,   uchar)
+EXPAND_SIZES(short,  ushort)
+
+#undef TEMPLATE
+
+#define TEMPLATE(T, UT, signbit) /* abs_diff for int/long; signbit is the index of the sign bit of T */ \
+_CLC_OVERLOAD _CLC_DEF UT abs_diff(T x, T y) \
+{ \
+    T opposite = (x ^ y) >> (T)signbit; /* nonzero exactly when x and y differ in sign */ \
+    return opposite ? abs(x) + abs(y) /* opposite signs: |x - y| equals |x| + |y| */ \
+                    : __builtin_astype((x > y) ? (x - y) : (y - x), UT); \
+}
+
+TEMPLATE(int,               uint,               31)
+TEMPLATE(_VEC_TYPE(int,2),  _VEC_TYPE(uint,2),  31)
+TEMPLATE(_VEC_TYPE(int,3),  _VEC_TYPE(uint,3),  31)
+TEMPLATE(_VEC_TYPE(int,4),  _VEC_TYPE(uint,4),  31)
+TEMPLATE(_VEC_TYPE(int,8),  _VEC_TYPE(uint,8),  31)
+TEMPLATE(_VEC_TYPE(int,16), _VEC_TYPE(uint,16), 31)
+
+TEMPLATE(long,               ulong,               63)
+TEMPLATE(_VEC_TYPE(long,2),  _VEC_TYPE(ulong,2),  63)
+TEMPLATE(_VEC_TYPE(long,3),  _VEC_TYPE(ulong,3),  63)
+TEMPLATE(_VEC_TYPE(long,4),  _VEC_TYPE(ulong,4),  63)
+TEMPLATE(_VEC_TYPE(long,8),  _VEC_TYPE(ulong,8),  63)
+TEMPLATE(_VEC_TYPE(long,16), _VEC_TYPE(ulong,16), 63)
+
+#undef TEMPLATE
diff --git a/src/builtins/add_sat.cl b/src/builtins/add_sat.cl
new file mode 100644
index 0000000..e70b3fb
--- /dev/null
+++ b/src/builtins/add_sat.cl
@@ -0,0 +1,37 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+BINARY_VEC_DEF(char, char, add_sat, add_sat) /* BINARY_VEC_DEF is defined in cpu.h (not visible here); presumably emits the vector overloads of add_sat for the listed element type -- confirm */
+BINARY_VEC_DEF(uchar, uchar, add_sat, add_sat) /* argument and result element types are identical on every line */
+BINARY_VEC_DEF(short, short, add_sat, add_sat)
+BINARY_VEC_DEF(ushort, ushort, add_sat, add_sat)
+BINARY_VEC_DEF(int, int, add_sat, add_sat)
+BINARY_VEC_DEF(uint, uint, add_sat, add_sat)
+BINARY_VEC_DEF(long, long, add_sat, add_sat)
+BINARY_VEC_DEF(ulong, ulong, add_sat, add_sat)
diff --git a/src/builtins/all.cl b/src/builtins/all.cl
new file mode 100644
index 0000000..96a9ee2
--- /dev/null
+++ b/src/builtins/all.cl
@@ -0,0 +1,43 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+#define TEMPLATE(T) /* all(): 1 when the AND of all lanes is negative, i.e. every lane has its sign bit set; widths 3, 4, 8, 16 */ \
+_CLC_OVERLOAD _CLC_DEF int all(T##3 x)  { return 0 > (x.s0 & x.s1 & x.s2); } \
+_CLC_OVERLOAD _CLC_DEF int all(T##4 x)  { return 0 > (x.s0 & x.s1 & x.s2 & x.s3); } \
+_CLC_OVERLOAD _CLC_DEF int all(T##8 x)  { return 0 > (x.s0 & x.s1 & x.s2 & x.s3 & \
+                                                      x.s4 & x.s5 & x.s6 & x.s7); } \
+_CLC_OVERLOAD _CLC_DEF int all(T##16 x) { return 0 > (x.s0 & x.s1 & x.s2 & x.s3 & \
+                                                      x.s4 & x.s5 & x.s6 & x.s7 & \
+                                                      x.s8 & x.s9 & x.sa & x.sb & \
+                                                      x.sc & x.sd & x.se & x.sf); }
+
+TEMPLATE(char)
+TEMPLATE(short)
+TEMPLATE(int)
+TEMPLATE(long)
diff --git a/src/builtins/any.cl b/src/builtins/any.cl
new file mode 100644
index 0000000..57c4419
--- /dev/null
+++ b/src/builtins/any.cl
@@ -0,0 +1,43 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+#define TEMPLATE(T) /* any(): 1 when the OR of all lanes is negative, i.e. at least one lane has its sign bit set; widths 3, 4, 8, 16 */ \
+_CLC_OVERLOAD _CLC_DEF int any(T##3 x)  { return 0 > (x.s0 | x.s1 | x.s2); } \
+_CLC_OVERLOAD _CLC_DEF int any(T##4 x)  { return 0 > (x.s0 | x.s1 | x.s2 | x.s3); } \
+_CLC_OVERLOAD _CLC_DEF int any(T##8 x)  { return 0 > (x.s0 | x.s1 | x.s2 | x.s3 | \
+                                                      x.s4 | x.s5 | x.s6 | x.s7); } \
+_CLC_OVERLOAD _CLC_DEF int any(T##16 x) { return 0 > (x.s0 | x.s1 | x.s2 | x.s3 | \
+                                                      x.s4 | x.s5 | x.s6 | x.s7 | \
+                                                      x.s8 | x.s9 | x.sa | x.sb | \
+                                                      x.sc | x.sd | x.se | x.sf); }
+
+TEMPLATE(char)
+TEMPLATE(short)
+TEMPLATE(int)
+TEMPLATE(long)
diff --git a/src/builtins/atomics.cl.broken b/src/builtins/atomics.cl.broken
new file mode 100644
index 0000000..ed46888
--- /dev/null
+++ b/src/builtins/atomics.cl.broken
@@ -0,0 +1,558 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+void __sem_lock(int); /* declared only; the definition is not in this file */
+void __sem_unlock(int); /* declared only; the definition is not in this file */
+void __inv(char*, int); /* presumably a cache invalidate over (address, byte count) -- TODO confirm against its definition */
+
+#define LOCK_GLOBAL __sem_lock(1) /* every global-memory atomic passes the same argument, 1 */
+#define UNLOCK_GLOBAL __sem_unlock(1) /* counterpart of LOCK_GLOBAL, same argument */
+#define INV_GLOBAL(p, sz) __inv((char*)(p), (sz)) /* NOTE(review): the cast drops the volatile and address-space qualifiers of p -- confirm this is intended */
+#define WB_GLOBAL(p, sz) /* expands to nothing: no write-back is issued after the store -- NOTE(review): confirm this is intended */
+
+#define LOCK_LOCAL /* all four *_LOCAL hooks expand to nothing, so local-memory atomics run as plain code */
+#define UNLOCK_LOCAL
+#define INV_LOCAL(p, sz)
+#define WB_LOCAL(p, sz)
+
+_CLC_OVERLOAD _CLC_DEF int atomic_add(volatile global int* p, int val) /* global int: stores *p + val, returns the value *p held before */
+{
+ INV_GLOBAL(p, sizeof(*p)); /* __inv() over the target word; NOTE(review): issued before the lock is held -- confirm the ordering is safe */
+ LOCK_GLOBAL; /* __sem_lock(1) */
+ int old = *p; /* previous value, returned to the caller */
+ *p = old + val;
+ WB_GLOBAL(p, sizeof(*p)); /* WB_GLOBAL is defined empty in this file */
+ UNLOCK_GLOBAL; /* __sem_unlock(1) */
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_add(volatile global uint* p, uint val) /* global uint: same sequence as the int overload */
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ *p = old + val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_add(volatile local int* p, int val) /* local int: same update; the *_LOCAL hooks all expand to nothing */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL; /* NOTE(review): no lock is taken here, so the read-modify-write below is unprotected -- confirm that is acceptable for local memory */
+ int old = *p;
+ *p = old + val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_add(volatile local uint* p, uint val) /* local uint: same as the local int overload */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ *p = old + val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+
+_CLC_OVERLOAD _CLC_DEF int atomic_sub(volatile global int* p, int val) /* global int: stores *p - val, returns the value *p held before */
+{
+ INV_GLOBAL(p, sizeof(*p)); /* __inv() over the target word; NOTE(review): issued before the lock is held -- confirm the ordering is safe */
+ LOCK_GLOBAL; /* __sem_lock(1) */
+ int old = *p; /* previous value, returned to the caller */
+ *p = old - val;
+ WB_GLOBAL(p, sizeof(*p)); /* WB_GLOBAL is defined empty in this file */
+ UNLOCK_GLOBAL; /* __sem_unlock(1) */
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_sub(volatile global uint* p, uint val) /* global uint: same sequence as the int overload */
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ *p = old - val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_sub(volatile local int* p, int val) /* local int: same update; the *_LOCAL hooks all expand to nothing */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL; /* NOTE(review): no lock is taken here, so the read-modify-write below is unprotected -- confirm that is acceptable for local memory */
+ int old = *p;
+ *p = old - val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_sub(volatile local uint* p, uint val) /* local uint: same as the local int overload */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ *p = old - val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+
+_CLC_OVERLOAD _CLC_DEF int atomic_xchg(volatile global int* p, int val) /* global int: stores val unconditionally, returns the value *p held before */
+{
+ INV_GLOBAL(p, sizeof(*p)); /* __inv() over the target word; NOTE(review): issued before the lock is held -- confirm the ordering is safe */
+ LOCK_GLOBAL; /* __sem_lock(1) */
+ int old = *p; /* previous value, returned to the caller */
+ *p = val;
+ WB_GLOBAL(p, sizeof(*p)); /* WB_GLOBAL is defined empty in this file */
+ UNLOCK_GLOBAL; /* __sem_unlock(1) */
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_xchg(volatile global uint* p, uint val) /* global uint: same sequence as the int overload */
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ *p = val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF float atomic_xchg(volatile global float* p, float val) /* global float: same sequence; xchg is the only operation in this file with float overloads */
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ float old = *p;
+ *p = val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_xchg(volatile local int* p, int val) /* local int: same exchange; the *_LOCAL hooks all expand to nothing */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL; /* NOTE(review): no lock is taken here, so the read and the store below are unprotected -- confirm that is acceptable for local memory */
+ int old = *p;
+ *p = val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_xchg(volatile local uint* p, uint val) /* local uint: same as the local int overload */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ *p = val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF float atomic_xchg(volatile local float* p, float val) /* local float: same as the local int overload */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ float old = *p;
+ *p = val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+
+_CLC_OVERLOAD _CLC_DEF int atomic_inc(volatile global int* p) /* global int: stores *p + 1, returns the value *p held before */
+{
+ INV_GLOBAL(p, sizeof(*p)); /* __inv() over the target word; NOTE(review): issued before the lock is held -- confirm the ordering is safe */
+ LOCK_GLOBAL; /* __sem_lock(1) */
+ int old = *p; /* previous value, returned to the caller */
+ *p = old + 1;
+ WB_GLOBAL(p, sizeof(*p)); /* WB_GLOBAL is defined empty in this file */
+ UNLOCK_GLOBAL; /* __sem_unlock(1) */
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_inc(volatile global uint* p) /* global uint: same sequence as the int overload */
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ *p = old + 1;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_inc(volatile local int* p) /* local int: same update; the *_LOCAL hooks all expand to nothing */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL; /* NOTE(review): no lock is taken here, so the read-modify-write below is unprotected -- confirm that is acceptable for local memory */
+ int old = *p;
+ *p = old + 1;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_inc(volatile local uint* p) /* local uint: same as the local int overload */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ *p = old + 1;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+
+_CLC_OVERLOAD _CLC_DEF int atomic_dec(volatile global int* p) /* global int: stores *p - 1, returns the value *p held before */
+{
+ INV_GLOBAL(p, sizeof(*p)); /* __inv() over the target word; NOTE(review): issued before the lock is held -- confirm the ordering is safe */
+ LOCK_GLOBAL; /* __sem_lock(1) */
+ int old = *p; /* previous value, returned to the caller */
+ *p = old - 1;
+ WB_GLOBAL(p, sizeof(*p)); /* WB_GLOBAL is defined empty in this file */
+ UNLOCK_GLOBAL; /* __sem_unlock(1) */
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_dec(volatile global uint* p) /* global uint: same sequence as the int overload */
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ *p = old - 1;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_dec(volatile local int* p) /* local int: same update; the *_LOCAL hooks all expand to nothing */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL; /* NOTE(review): no lock is taken here, so the read-modify-write below is unprotected -- confirm that is acceptable for local memory */
+ int old = *p;
+ *p = old - 1;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_dec(volatile local uint* p) /* local uint: same as the local int overload */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ *p = old - 1;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+
+_CLC_OVERLOAD _CLC_DEF int atomic_cmpxchg(volatile global int* p, int cmp, int val) /* global int: stores val only if *p equals cmp; always returns the value *p held before */
+{
+ INV_GLOBAL(p, sizeof(*p)); /* __inv() over the target word; NOTE(review): issued before the lock is held -- confirm the ordering is safe */
+ LOCK_GLOBAL; /* __sem_lock(1) */
+ int old = *p; /* previous value, returned whether or not the store happens */
+ if (old == cmp) *p = val;
+ WB_GLOBAL(p, sizeof(*p)); /* WB_GLOBAL is defined empty in this file */
+ UNLOCK_GLOBAL; /* __sem_unlock(1) */
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_cmpxchg(volatile global uint* p, uint cmp, uint val) /* global uint: same sequence as the int overload */
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ if (old == cmp) *p = val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_cmpxchg(volatile local int* p, int cmp, int val) /* local int: same compare-and-store; the *_LOCAL hooks all expand to nothing */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL; /* NOTE(review): no lock is taken here, so the compare and the store below are unprotected -- confirm that is acceptable for local memory */
+ int old = *p;
+ if (old == cmp) *p = val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_cmpxchg(volatile local uint* p, uint cmp, uint val) /* local uint: same as the local int overload */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ if (old == cmp) *p = val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+
+_CLC_OVERLOAD _CLC_DEF int atomic_min(volatile global int* p, int val) /* global int: stores val only if it is smaller than *p; always returns the value *p held before */
+{
+ INV_GLOBAL(p, sizeof(*p)); /* __inv() over the target word; NOTE(review): issued before the lock is held -- confirm the ordering is safe */
+ LOCK_GLOBAL; /* __sem_lock(1) */
+ int old = *p; /* previous value, returned whether or not the store happens */
+ if (val < old) *p = val; /* signed comparison in this overload */
+ WB_GLOBAL(p, sizeof(*p)); /* WB_GLOBAL is defined empty in this file */
+ UNLOCK_GLOBAL; /* __sem_unlock(1) */
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_min(volatile global uint* p, uint val) /* global uint: same sequence with an unsigned comparison */
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ if (val < old) *p = val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_min(volatile local int* p, int val) /* local int: same update; the *_LOCAL hooks all expand to nothing */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL; /* NOTE(review): no lock is taken here, so the compare and the store below are unprotected -- confirm that is acceptable for local memory */
+ int old = *p;
+ if (val < old) *p = val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_min(volatile local uint* p, uint val) /* local uint: same as the local int overload, unsigned comparison */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ if (val < old) *p = val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_max(volatile global int* p, int val) /* global int: stores val only if it is larger than *p; always returns the value *p held before */
+{
+ INV_GLOBAL(p, sizeof(*p)); /* __inv() over the target word; NOTE(review): issued before the lock is held -- confirm the ordering is safe */
+ LOCK_GLOBAL; /* __sem_lock(1) */
+ int old = *p; /* previous value, returned whether or not the store happens */
+ if (val > old) *p = val; /* signed comparison in this overload */
+ WB_GLOBAL(p, sizeof(*p)); /* WB_GLOBAL is defined empty in this file */
+ UNLOCK_GLOBAL; /* __sem_unlock(1) */
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_max(volatile global uint* p, uint val) /* global uint: same sequence with an unsigned comparison */
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ if (val > old) *p = val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_max(volatile local int* p, int val) /* local int: same update; the *_LOCAL hooks all expand to nothing */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL; /* NOTE(review): no lock is taken here, so the compare and the store below are unprotected -- confirm that is acceptable for local memory */
+ int old = *p;
+ if (val > old) *p = val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_max(volatile local uint* p, uint val) /* local uint: same as the local int overload, unsigned comparison */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ if (val > old) *p = val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+
+_CLC_OVERLOAD _CLC_DEF int atomic_and(volatile global int* p, int val) /* global int: stores *p & val, returns the value *p held before */
+{
+ INV_GLOBAL(p, sizeof(*p)); /* __inv() over the target word; NOTE(review): issued before the lock is held -- confirm the ordering is safe */
+ LOCK_GLOBAL; /* __sem_lock(1) */
+ int old = *p; /* previous value, returned to the caller */
+ *p = old & val;
+ WB_GLOBAL(p, sizeof(*p)); /* WB_GLOBAL is defined empty in this file */
+ UNLOCK_GLOBAL; /* __sem_unlock(1) */
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_and(volatile global uint* p, uint val) /* global uint: same sequence as the int overload */
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ *p = old & val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_and(volatile local int* p, int val) /* local int: same update; the *_LOCAL hooks all expand to nothing */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL; /* NOTE(review): no lock is taken here, so the read-modify-write below is unprotected -- confirm that is acceptable for local memory */
+ int old = *p;
+ *p = old & val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_and(volatile local uint* p, uint val) /* local uint: same as the local int overload */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ *p = old & val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+
+_CLC_OVERLOAD _CLC_DEF int atomic_or(volatile global int* p, int val) /* global int: stores *p | val, returns the value *p held before */
+{
+ INV_GLOBAL(p, sizeof(*p)); /* __inv() over the target word; NOTE(review): issued before the lock is held -- confirm the ordering is safe */
+ LOCK_GLOBAL; /* __sem_lock(1) */
+ int old = *p; /* previous value, returned to the caller */
+ *p = old | val;
+ WB_GLOBAL(p, sizeof(*p)); /* WB_GLOBAL is defined empty in this file */
+ UNLOCK_GLOBAL; /* __sem_unlock(1) */
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_or(volatile global uint* p, uint val) /* global uint: same sequence as the int overload */
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ *p = old | val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_or(volatile local int* p, int val) /* local int: same update; the *_LOCAL hooks all expand to nothing */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL; /* NOTE(review): no lock is taken here, so the read-modify-write below is unprotected -- confirm that is acceptable for local memory */
+ int old = *p;
+ *p = old | val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_or(volatile local uint* p, uint val) /* local uint: same as the local int overload */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ *p = old | val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+
+_CLC_OVERLOAD _CLC_DEF int atomic_xor(volatile global int* p, int val) /* global int: stores *p ^ val, returns the value *p held before */
+{
+ INV_GLOBAL(p, sizeof(*p)); /* __inv() over the target word; NOTE(review): issued before the lock is held -- confirm the ordering is safe */
+ LOCK_GLOBAL; /* __sem_lock(1) */
+ int old = *p; /* previous value, returned to the caller */
+ *p = old ^ val;
+ WB_GLOBAL(p, sizeof(*p)); /* WB_GLOBAL is defined empty in this file */
+ UNLOCK_GLOBAL; /* __sem_unlock(1) */
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_xor(volatile global uint* p, uint val) /* global uint: same sequence as the int overload */
+{
+ INV_GLOBAL(p, sizeof(*p));
+ LOCK_GLOBAL;
+ uint old = *p;
+ *p = old ^ val;
+ WB_GLOBAL(p, sizeof(*p));
+ UNLOCK_GLOBAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF int atomic_xor(volatile local int* p, int val) /* local int: same update; the *_LOCAL hooks all expand to nothing */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL; /* NOTE(review): no lock is taken here, so the read-modify-write below is unprotected -- confirm that is acceptable for local memory */
+ int old = *p;
+ *p = old ^ val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint atomic_xor(volatile local uint* p, uint val) /* local uint: same as the local int overload */
+{
+ INV_LOCAL(p, sizeof(*p));
+ LOCK_LOCAL;
+ uint old = *p;
+ *p = old ^ val;
+ WB_LOCAL(p, sizeof(*p));
+ UNLOCK_LOCAL;
+ return old;
+}
+
diff --git a/src/builtins/bitselect.cl b/src/builtins/bitselect.cl
new file mode 100644
index 0000000..bf93a47
--- /dev/null
+++ b/src/builtins/bitselect.cl
@@ -0,0 +1,92 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+#define DEFN(T) /* bitselect for integer vector type T: each result bit comes from b where c has a 1 bit, from a otherwise */ \
+_CLC_OVERLOAD _CLC_DEF T bitselect(T a, T b, T c) { T differing = b ^ a; return a ^ (c & differing); }
+
+DEFN(char2) /* 2-wide; NOTE(review): short2/ushort2/int2/uint2 are not instantiated here -- presumably provided elsewhere (clc.h?), confirm */
+DEFN(uchar2)
+DEFN(long2)
+DEFN(ulong2)
+
+DEFN(char3) /* 3-wide: all eight integer element types are instantiated */
+DEFN(uchar3)
+DEFN(short3)
+DEFN(ushort3)
+DEFN(int3)
+DEFN(uint3)
+DEFN(long3)
+DEFN(ulong3)
+
+DEFN(int4) /* 4-wide; NOTE(review): char4/uchar4/short4/ushort4 are not instantiated here -- presumably provided elsewhere, confirm */
+DEFN(uint4)
+DEFN(long4)
+DEFN(ulong4)
+
+DEFN(short8) /* 8-wide; NOTE(review): char8/uchar8 are not instantiated here -- presumably provided elsewhere, confirm */
+DEFN(ushort8)
+DEFN(int8)
+DEFN(uint8)
+DEFN(long8)
+DEFN(ulong8)
+
+DEFN(char16) /* 16-wide: all eight integer element types are instantiated */
+DEFN(uchar16)
+DEFN(short16)
+DEFN(ushort16)
+DEFN(int16)
+DEFN(uint16)
+DEFN(long16)
+DEFN(ulong16)
+
+_CLC_OVERLOAD _CLC_DEF float bitselect (float a, float b, float c) /* bit pattern reinterpreted as int, selected bitwise (b where c is 1, else a), reinterpreted back */
+{ int ia = __builtin_astype(a,int), ib = __builtin_astype(b,int), ic = __builtin_astype(c,int); return __builtin_astype(ia^(ic&(ib^ia)), float); }
+_CLC_OVERLOAD _CLC_DEF float2 bitselect (float2 a, float2 b, float2 c) /* same selection through int2 */
+{ int2 ia = __builtin_astype(a,int2), ib = __builtin_astype(b,int2), ic = __builtin_astype(c,int2); return __builtin_astype(ia^(ic&(ib^ia)), float2); }
+_CLC_OVERLOAD _CLC_DEF float3 bitselect (float3 a, float3 b, float3 c) /* same selection through int3 */
+{ int3 ia = __builtin_astype(a,int3), ib = __builtin_astype(b,int3), ic = __builtin_astype(c,int3); return __builtin_astype(ia^(ic&(ib^ia)), float3); }
+_CLC_OVERLOAD _CLC_DEF float4 bitselect (float4 a, float4 b, float4 c) /* same selection through int4 */
+{ int4 ia = __builtin_astype(a,int4), ib = __builtin_astype(b,int4), ic = __builtin_astype(c,int4); return __builtin_astype(ia^(ic&(ib^ia)), float4); }
+_CLC_OVERLOAD _CLC_DEF float8 bitselect (float8 a, float8 b, float8 c) /* same selection through int8 */
+{ int8 ia = __builtin_astype(a,int8), ib = __builtin_astype(b,int8), ic = __builtin_astype(c,int8); return __builtin_astype(ia^(ic&(ib^ia)), float8); }
+_CLC_OVERLOAD _CLC_DEF float16 bitselect (float16 a, float16 b, float16 c) /* same selection through int16 */
+{ int16 ia = __builtin_astype(a,int16), ib = __builtin_astype(b,int16), ic = __builtin_astype(c,int16); return __builtin_astype(ia^(ic&(ib^ia)), float16); }
+
+_CLC_OVERLOAD _CLC_DEF double bitselect (double a, double b, double c) /* bit pattern reinterpreted as long, selected bitwise (b where c is 1, else a), reinterpreted back */
+{ long la = __builtin_astype(a,long), lb = __builtin_astype(b,long), lc = __builtin_astype(c,long); return __builtin_astype(la^(lc&(lb^la)), double); }
+_CLC_OVERLOAD _CLC_DEF double2 bitselect (double2 a, double2 b, double2 c) /* same selection through long2 */
+{ long2 la = __builtin_astype(a,long2), lb = __builtin_astype(b,long2), lc = __builtin_astype(c,long2); return __builtin_astype(la^(lc&(lb^la)), double2); }
+_CLC_OVERLOAD _CLC_DEF double3 bitselect (double3 a, double3 b, double3 c) /* same selection through long3 */
+{ long3 la = __builtin_astype(a,long3), lb = __builtin_astype(b,long3), lc = __builtin_astype(c,long3); return __builtin_astype(la^(lc&(lb^la)), double3); }
+_CLC_OVERLOAD _CLC_DEF double4 bitselect (double4 a, double4 b, double4 c) /* same selection through long4 */
+{ long4 la = __builtin_astype(a,long4), lb = __builtin_astype(b,long4), lc = __builtin_astype(c,long4); return __builtin_astype(la^(lc&(lb^la)), double4); }
+_CLC_OVERLOAD _CLC_DEF double8 bitselect (double8 a, double8 b, double8 c) /* same selection through long8 */
+{ long8 la = __builtin_astype(a,long8), lb = __builtin_astype(b,long8), lc = __builtin_astype(c,long8); return __builtin_astype(la^(lc&(lb^la)), double8); }
+_CLC_OVERLOAD _CLC_DEF double16 bitselect (double16 a, double16 b, double16 c) /* same selection through long16 */
+{ long16 la = __builtin_astype(a,long16), lb = __builtin_astype(b,long16), lc = __builtin_astype(c,long16); return __builtin_astype(la^(lc&(lb^la)), double16); }
diff --git a/src/builtins/clamp.cl b/src/builtins/clamp.cl
new file mode 100644
index 0000000..78a29fb
--- /dev/null
+++ b/src/builtins/clamp.cl
@@ -0,0 +1,43 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+#define EXPAND_SIZES(elem) /* one IMPLEMENTATION() per vector width (2, 3, 4, 8, 16), paired with the scalar element type */ \
+    IMPLEMENTATION (_VEC_TYPE(elem,2),  elem) \
+    IMPLEMENTATION (_VEC_TYPE(elem,3),  elem) \
+    IMPLEMENTATION (_VEC_TYPE(elem,4),  elem) \
+    IMPLEMENTATION (_VEC_TYPE(elem,8),  elem) \
+    IMPLEMENTATION (_VEC_TYPE(elem,16), elem)
+
+#define IMPLEMENTATION(VT, ST) /* clamp overloads for vector type VT; the upper bound is tested before the lower bound */ \
+_CLC_OVERLOAD _CLC_DEF VT clamp(VT x, VT minval, VT maxval) \
+    { return (x > maxval) ? maxval : ((x < minval) ? minval : x); } \
+_CLC_OVERLOAD _CLC_DEF VT clamp(VT x, ST minval, ST maxval) /* scalar bounds are converted to VT first */ \
+    { VT vmin = (VT)minval, vmax = (VT)maxval; return (x > vmax) ? vmax : ((x < vmin) ? vmin : x); }
+
+_EXPAND_TYPES()
diff --git a/src/builtins/clz.cl b/src/builtins/clz.cl
new file mode 100644
index 0000000..ac06119
--- /dev/null
+++ b/src/builtins/clz.cl
@@ -0,0 +1,37 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+/* Instantiate UNARY_VEC_DEF for clz once per integer element type: char,
+ * uchar, short, ushort, int, uint, long and ulong. In every instantiation
+ * the first two arguments name the same type and the last two are both clz.
+ * UNARY_VEC_DEF is not defined in this file (presumably in cpu.h); it
+ * presumably generates the vector overloads from the scalar function --
+ * TODO confirm. */
+UNARY_VEC_DEF(char, char, clz, clz)
+UNARY_VEC_DEF(uchar, uchar, clz, clz)
+UNARY_VEC_DEF(short, short, clz, clz)
+UNARY_VEC_DEF(ushort, ushort,clz, clz)
+UNARY_VEC_DEF(int, int, clz, clz)
+UNARY_VEC_DEF(uint, uint, clz, clz)
+UNARY_VEC_DEF(long, long, clz, clz)
+UNARY_VEC_DEF(ulong, ulong, clz, clz)
diff --git a/src/builtins/convert.cl b/src/builtins/convert.cl
new file mode 100644
index 0000000..2f47c2d
--- /dev/null
+++ b/src/builtins/convert.cl
@@ -0,0 +1,36122 @@
+/* !!!! AUTOGENERATED FILE generated by convert_type.py !!!!!
+
+ DON'T CHANGE THIS FILE. MAKE YOUR CHANGES TO convert_type.py AND RUN:
+ $ ./generate-conversion-type-cl.sh
+
+ OpenCL type conversion functions
+
+ Copyright (c) 2013 Victor Oliveira <victormatheus@gmail.com>
+ Copyright (c) 2013 Jesse Towner <jessetowner@lavabit.com>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include "clc.h"
+
+/* Turn on the cl_khr_fp64 extension only when the macro of that name is
+ * defined; the double conversions in this file are guarded by the same macro. */
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+/* Defined unconditionally here, so every `#ifdef cles_khr_int64` section in
+ * this file (the long/ulong conversions) is compiled. */
+#define cles_khr_int64
+
+/* char -> char: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(char x)
+{ return (char)x; }
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(char2 x)
+{ return (char2)(convert_char(x.s0), convert_char(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(char4 x)
+{ return (char4)(convert_char2(x.s01), convert_char2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(char8 x)
+{ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(char16 x)
+{ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(char3 x)
+{ return (char3)(convert_char2(x.xy), convert_char(x.z)); }
+/* char -> uchar: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(char x)
+{ return (uchar)x; }
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(char2 x)
+{ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(char4 x)
+{ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(char8 x)
+{ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(char16 x)
+{ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(char3 x)
+{ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z)); }
+/* char -> short: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(char x)
+{ return (short)x; }
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(char2 x)
+{ return (short2)(convert_short(x.s0), convert_short(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(char4 x)
+{ return (short4)(convert_short2(x.s01), convert_short2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(char8 x)
+{ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(char16 x)
+{ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(char3 x)
+{ return (short3)(convert_short2(x.xy), convert_short(x.z)); }
+/* char -> ushort: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(char x)
+{ return (ushort)x; }
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(char2 x)
+{ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(char4 x)
+{ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(char8 x)
+{ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(char16 x)
+{ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(char3 x)
+{ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z)); }
+/* char -> int: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD int convert_int(char x)
+{ return (int)x; }
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2(char2 x)
+{ return (int2)(convert_int(x.s0), convert_int(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4(char4 x)
+{ return (int4)(convert_int2(x.s01), convert_int2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8(char8 x)
+{ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16(char16 x)
+{ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3(char3 x)
+{ return (int3)(convert_int2(x.xy), convert_int(x.z)); }
+/* char -> uint: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint(char x)
+{ return (uint)x; }
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2(char2 x)
+{ return (uint2)(convert_uint(x.s0), convert_uint(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4(char4 x)
+{ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8(char8 x)
+{ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16(char16 x)
+{ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3(char3 x)
+{ return (uint3)(convert_uint2(x.xy), convert_uint(x.z)); }
+#if defined(cles_khr_int64)
+/* char -> long: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD long convert_long(char x)
+{ return (long)x; }
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2(char2 x)
+{ return (long2)(convert_long(x.s0), convert_long(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4(char4 x)
+{ return (long4)(convert_long2(x.s01), convert_long2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8(char8 x)
+{ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16(char16 x)
+{ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3(char3 x)
+{ return (long3)(convert_long2(x.xy), convert_long(x.z)); }
+#endif /* cles_khr_int64 */
+#if defined(cles_khr_int64)
+/* char -> ulong: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong(char x)
+{ return (ulong)x; }
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2(char2 x)
+{ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4(char4 x)
+{ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8(char8 x)
+{ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16(char16 x)
+{ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3(char3 x)
+{ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z)); }
+#endif /* cles_khr_int64 */
+/* char -> float: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD float convert_float(char x)
+{ return (float)x; }
+
+_CLC_DEF _CLC_OVERLOAD float2 convert_float2(char2 x)
+{ return (float2)(convert_float(x.s0), convert_float(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD float4 convert_float4(char4 x)
+{ return (float4)(convert_float2(x.s01), convert_float2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD float8 convert_float8(char8 x)
+{ return (float8)(convert_float4(x.s0123), convert_float4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD float16 convert_float16(char16 x)
+{ return (float16)(convert_float8(x.s01234567), convert_float8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD float3 convert_float3(char3 x)
+{ return (float3)(convert_float2(x.xy), convert_float(x.z)); }
+#if defined(cl_khr_fp64)
+/* char -> double: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD double convert_double(char x)
+{ return (double)x; }
+
+_CLC_DEF _CLC_OVERLOAD double2 convert_double2(char2 x)
+{ return (double2)(convert_double(x.s0), convert_double(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD double4 convert_double4(char4 x)
+{ return (double4)(convert_double2(x.s01), convert_double2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD double8 convert_double8(char8 x)
+{ return (double8)(convert_double4(x.s0123), convert_double4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD double16 convert_double16(char16 x)
+{ return (double16)(convert_double8(x.s01234567), convert_double8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD double3 convert_double3(char3 x)
+{ return (double3)(convert_double2(x.xy), convert_double(x.z)); }
+#endif /* cl_khr_fp64 */
+/* uchar -> char: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(uchar x)
+{ return (char)x; }
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(uchar2 x)
+{ return (char2)(convert_char(x.s0), convert_char(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(uchar4 x)
+{ return (char4)(convert_char2(x.s01), convert_char2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(uchar8 x)
+{ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(uchar16 x)
+{ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(uchar3 x)
+{ return (char3)(convert_char2(x.xy), convert_char(x.z)); }
+/* uchar -> uchar: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(uchar x)
+{ return (uchar)x; }
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(uchar2 x)
+{ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(uchar4 x)
+{ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(uchar8 x)
+{ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(uchar16 x)
+{ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(uchar3 x)
+{ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z)); }
+/* uchar -> short: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(uchar x)
+{ return (short)x; }
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(uchar2 x)
+{ return (short2)(convert_short(x.s0), convert_short(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(uchar4 x)
+{ return (short4)(convert_short2(x.s01), convert_short2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(uchar8 x)
+{ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(uchar16 x)
+{ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(uchar3 x)
+{ return (short3)(convert_short2(x.xy), convert_short(x.z)); }
+/* uchar -> ushort: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(uchar x)
+{ return (ushort)x; }
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(uchar2 x)
+{ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(uchar4 x)
+{ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(uchar8 x)
+{ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(uchar16 x)
+{ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(uchar3 x)
+{ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z)); }
+/* uchar -> int: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD int convert_int(uchar x)
+{ return (int)x; }
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2(uchar2 x)
+{ return (int2)(convert_int(x.s0), convert_int(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4(uchar4 x)
+{ return (int4)(convert_int2(x.s01), convert_int2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8(uchar8 x)
+{ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16(uchar16 x)
+{ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3(uchar3 x)
+{ return (int3)(convert_int2(x.xy), convert_int(x.z)); }
+/* uchar -> uint: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint(uchar x)
+{ return (uint)x; }
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2(uchar2 x)
+{ return (uint2)(convert_uint(x.s0), convert_uint(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4(uchar4 x)
+{ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8(uchar8 x)
+{ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16(uchar16 x)
+{ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3(uchar3 x)
+{ return (uint3)(convert_uint2(x.xy), convert_uint(x.z)); }
+#if defined(cles_khr_int64)
+/* uchar -> long: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD long convert_long(uchar x)
+{ return (long)x; }
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2(uchar2 x)
+{ return (long2)(convert_long(x.s0), convert_long(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4(uchar4 x)
+{ return (long4)(convert_long2(x.s01), convert_long2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8(uchar8 x)
+{ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16(uchar16 x)
+{ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3(uchar3 x)
+{ return (long3)(convert_long2(x.xy), convert_long(x.z)); }
+#endif /* cles_khr_int64 */
+#if defined(cles_khr_int64)
+/* uchar -> ulong: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong(uchar x)
+{ return (ulong)x; }
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2(uchar2 x)
+{ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4(uchar4 x)
+{ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8(uchar8 x)
+{ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16(uchar16 x)
+{ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3(uchar3 x)
+{ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z)); }
+#endif /* cles_khr_int64 */
+/* uchar -> float: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD float convert_float(uchar x)
+{ return (float)x; }
+
+_CLC_DEF _CLC_OVERLOAD float2 convert_float2(uchar2 x)
+{ return (float2)(convert_float(x.s0), convert_float(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD float4 convert_float4(uchar4 x)
+{ return (float4)(convert_float2(x.s01), convert_float2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD float8 convert_float8(uchar8 x)
+{ return (float8)(convert_float4(x.s0123), convert_float4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD float16 convert_float16(uchar16 x)
+{ return (float16)(convert_float8(x.s01234567), convert_float8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD float3 convert_float3(uchar3 x)
+{ return (float3)(convert_float2(x.xy), convert_float(x.z)); }
+#if defined(cl_khr_fp64)
+/* uchar -> double: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD double convert_double(uchar x)
+{ return (double)x; }
+
+_CLC_DEF _CLC_OVERLOAD double2 convert_double2(uchar2 x)
+{ return (double2)(convert_double(x.s0), convert_double(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD double4 convert_double4(uchar4 x)
+{ return (double4)(convert_double2(x.s01), convert_double2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD double8 convert_double8(uchar8 x)
+{ return (double8)(convert_double4(x.s0123), convert_double4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD double16 convert_double16(uchar16 x)
+{ return (double16)(convert_double8(x.s01234567), convert_double8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD double3 convert_double3(uchar3 x)
+{ return (double3)(convert_double2(x.xy), convert_double(x.z)); }
+#endif /* cl_khr_fp64 */
+/* short -> char: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(short x)
+{ return (char)x; }
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(short2 x)
+{ return (char2)(convert_char(x.s0), convert_char(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(short4 x)
+{ return (char4)(convert_char2(x.s01), convert_char2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(short8 x)
+{ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(short16 x)
+{ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(short3 x)
+{ return (char3)(convert_char2(x.xy), convert_char(x.z)); }
+/* short -> uchar: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(short x)
+{ return (uchar)x; }
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(short2 x)
+{ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(short4 x)
+{ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(short8 x)
+{ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(short16 x)
+{ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(short3 x)
+{ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z)); }
+/* short -> short: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(short x)
+{ return (short)x; }
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(short2 x)
+{ return (short2)(convert_short(x.s0), convert_short(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(short4 x)
+{ return (short4)(convert_short2(x.s01), convert_short2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(short8 x)
+{ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(short16 x)
+{ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(short3 x)
+{ return (short3)(convert_short2(x.xy), convert_short(x.z)); }
+/* short -> ushort: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(short x)
+{ return (ushort)x; }
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(short2 x)
+{ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(short4 x)
+{ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(short8 x)
+{ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(short16 x)
+{ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(short3 x)
+{ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z)); }
+/* short -> int: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD int convert_int(short x)
+{ return (int)x; }
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2(short2 x)
+{ return (int2)(convert_int(x.s0), convert_int(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4(short4 x)
+{ return (int4)(convert_int2(x.s01), convert_int2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8(short8 x)
+{ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16(short16 x)
+{ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3(short3 x)
+{ return (int3)(convert_int2(x.xy), convert_int(x.z)); }
+/* short -> uint: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint(short x)
+{ return (uint)x; }
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2(short2 x)
+{ return (uint2)(convert_uint(x.s0), convert_uint(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4(short4 x)
+{ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8(short8 x)
+{ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16(short16 x)
+{ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3(short3 x)
+{ return (uint3)(convert_uint2(x.xy), convert_uint(x.z)); }
+#if defined(cles_khr_int64)
+/* short -> long: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD long convert_long(short x)
+{ return (long)x; }
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2(short2 x)
+{ return (long2)(convert_long(x.s0), convert_long(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4(short4 x)
+{ return (long4)(convert_long2(x.s01), convert_long2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8(short8 x)
+{ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16(short16 x)
+{ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3(short3 x)
+{ return (long3)(convert_long2(x.xy), convert_long(x.z)); }
+#endif /* cles_khr_int64 */
+#if defined(cles_khr_int64)
+/* short -> ulong: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong(short x)
+{ return (ulong)x; }
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2(short2 x)
+{ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4(short4 x)
+{ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8(short8 x)
+{ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16(short16 x)
+{ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3(short3 x)
+{ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z)); }
+#endif /* cles_khr_int64 */
+/* short -> float: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD float convert_float(short x)
+{ return (float)x; }
+
+_CLC_DEF _CLC_OVERLOAD float2 convert_float2(short2 x)
+{ return (float2)(convert_float(x.s0), convert_float(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD float4 convert_float4(short4 x)
+{ return (float4)(convert_float2(x.s01), convert_float2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD float8 convert_float8(short8 x)
+{ return (float8)(convert_float4(x.s0123), convert_float4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD float16 convert_float16(short16 x)
+{ return (float16)(convert_float8(x.s01234567), convert_float8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD float3 convert_float3(short3 x)
+{ return (float3)(convert_float2(x.xy), convert_float(x.z)); }
+#if defined(cl_khr_fp64)
+/* short -> double: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD double convert_double(short x)
+{ return (double)x; }
+
+_CLC_DEF _CLC_OVERLOAD double2 convert_double2(short2 x)
+{ return (double2)(convert_double(x.s0), convert_double(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD double4 convert_double4(short4 x)
+{ return (double4)(convert_double2(x.s01), convert_double2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD double8 convert_double8(short8 x)
+{ return (double8)(convert_double4(x.s0123), convert_double4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD double16 convert_double16(short16 x)
+{ return (double16)(convert_double8(x.s01234567), convert_double8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD double3 convert_double3(short3 x)
+{ return (double3)(convert_double2(x.xy), convert_double(x.z)); }
+#endif /* cl_khr_fp64 */
+/* ushort -> char: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(ushort x)
+{ return (char)x; }
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(ushort2 x)
+{ return (char2)(convert_char(x.s0), convert_char(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(ushort4 x)
+{ return (char4)(convert_char2(x.s01), convert_char2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(ushort8 x)
+{ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(ushort16 x)
+{ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(ushort3 x)
+{ return (char3)(convert_char2(x.xy), convert_char(x.z)); }
+/* ushort -> uchar: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(ushort x)
+{ return (uchar)x; }
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(ushort2 x)
+{ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(ushort4 x)
+{ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(ushort8 x)
+{ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(ushort16 x)
+{ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(ushort3 x)
+{ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z)); }
+/* ushort -> short: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(ushort x)
+{ return (short)x; }
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(ushort2 x)
+{ return (short2)(convert_short(x.s0), convert_short(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(ushort4 x)
+{ return (short4)(convert_short2(x.s01), convert_short2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(ushort8 x)
+{ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(ushort16 x)
+{ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(ushort3 x)
+{ return (short3)(convert_short2(x.xy), convert_short(x.z)); }
+/* ushort -> ushort: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(ushort x)
+{ return (ushort)x; }
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(ushort2 x)
+{ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(ushort4 x)
+{ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(ushort8 x)
+{ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(ushort16 x)
+{ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(ushort3 x)
+{ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z)); }
+/* ushort -> int: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD int convert_int(ushort x)
+{ return (int)x; }
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2(ushort2 x)
+{ return (int2)(convert_int(x.s0), convert_int(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4(ushort4 x)
+{ return (int4)(convert_int2(x.s01), convert_int2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8(ushort8 x)
+{ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16(ushort16 x)
+{ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3(ushort3 x)
+{ return (int3)(convert_int2(x.xy), convert_int(x.z)); }
+/* ushort -> uint: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint(ushort x)
+{ return (uint)x; }
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2(ushort2 x)
+{ return (uint2)(convert_uint(x.s0), convert_uint(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4(ushort4 x)
+{ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8(ushort8 x)
+{ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16(ushort16 x)
+{ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3(ushort3 x)
+{ return (uint3)(convert_uint2(x.xy), convert_uint(x.z)); }
+#if defined(cles_khr_int64)
+/* ushort -> long: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD long convert_long(ushort x)
+{ return (long)x; }
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2(ushort2 x)
+{ return (long2)(convert_long(x.s0), convert_long(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4(ushort4 x)
+{ return (long4)(convert_long2(x.s01), convert_long2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8(ushort8 x)
+{ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16(ushort16 x)
+{ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3(ushort3 x)
+{ return (long3)(convert_long2(x.xy), convert_long(x.z)); }
+#endif /* cles_khr_int64 */
+#if defined(cles_khr_int64)
+/* ushort -> ulong: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong(ushort x)
+{ return (ulong)x; }
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2(ushort2 x)
+{ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4(ushort4 x)
+{ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8(ushort8 x)
+{ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16(ushort16 x)
+{ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3(ushort3 x)
+{ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z)); }
+#endif /* cles_khr_int64 */
+/* ushort -> float: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD float convert_float(ushort x)
+{ return (float)x; }
+
+_CLC_DEF _CLC_OVERLOAD float2 convert_float2(ushort2 x)
+{ return (float2)(convert_float(x.s0), convert_float(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD float4 convert_float4(ushort4 x)
+{ return (float4)(convert_float2(x.s01), convert_float2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD float8 convert_float8(ushort8 x)
+{ return (float8)(convert_float4(x.s0123), convert_float4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD float16 convert_float16(ushort16 x)
+{ return (float16)(convert_float8(x.s01234567), convert_float8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD float3 convert_float3(ushort3 x)
+{ return (float3)(convert_float2(x.xy), convert_float(x.z)); }
+#if defined(cl_khr_fp64)
+/* ushort -> double: plain cast for the scalar; vectors are converted half by half. */
+_CLC_DEF _CLC_OVERLOAD double convert_double(ushort x)
+{ return (double)x; }
+
+_CLC_DEF _CLC_OVERLOAD double2 convert_double2(ushort2 x)
+{ return (double2)(convert_double(x.s0), convert_double(x.s1)); }
+
+_CLC_DEF _CLC_OVERLOAD double4 convert_double4(ushort4 x)
+{ return (double4)(convert_double2(x.s01), convert_double2(x.s23)); }
+
+_CLC_DEF _CLC_OVERLOAD double8 convert_double8(ushort8 x)
+{ return (double8)(convert_double4(x.s0123), convert_double4(x.s4567)); }
+
+_CLC_DEF _CLC_OVERLOAD double16 convert_double16(ushort16 x)
+{ return (double16)(convert_double8(x.s01234567), convert_double8(x.s89abcdef)); }
+
+_CLC_DEF _CLC_OVERLOAD double3 convert_double3(ushort3 x)
+{ return (double3)(convert_double2(x.xy), convert_double(x.z)); }
+#endif /* cl_khr_fp64 */
+/* int -> char: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(int x)
+{
+    return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(int2 x)
+{
+    return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(int4 x)
+{
+    return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(int8 x)
+{
+    return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(int16 x)
+{
+    return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(int3 x)
+{
+    return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* int -> uchar: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(int x)
+{
+    return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(int2 x)
+{
+    return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(int4 x)
+{
+    return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(int8 x)
+{
+    return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(int16 x)
+{
+    return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(int3 x)
+{
+    return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* int -> short: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(int x)
+{
+    return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(int2 x)
+{
+    return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(int4 x)
+{
+    return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(int8 x)
+{
+    return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(int16 x)
+{
+    return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(int3 x)
+{
+    return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* int -> ushort: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(int x)
+{
+    return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(int2 x)
+{
+    return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(int4 x)
+{
+    return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(int8 x)
+{
+    return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(int16 x)
+{
+    return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(int3 x)
+{
+    return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* int -> int (same-type conversion): the scalar overload is a plain cast;
+ * vector overloads convert their sub-vectors with the narrower overloads
+ * and reassemble. */
+_CLC_DEF _CLC_OVERLOAD int convert_int(int x)
+{
+    return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2(int2 x)
+{
+    return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4(int4 x)
+{
+    return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8(int8 x)
+{
+    return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16(int16 x)
+{
+    return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3(int3 x)
+{
+    return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* int -> uint: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint(int x)
+{
+    return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2(int2 x)
+{
+    return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4(int4 x)
+{
+    return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8(int8 x)
+{
+    return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16(int16 x)
+{
+    return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3(int3 x)
+{
+    return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#ifdef cles_khr_int64
+/* int -> long: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD long convert_long(int x)
+{
+    return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2(int2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4(int4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8(int8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16(int16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3(int3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* int -> ulong: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong(int x)
+{
+    return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2(int2 x)
+{
+    return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4(int4 x)
+{
+    return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8(int8 x)
+{
+    return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16(int16 x)
+{
+    return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3(int3 x)
+{
+    return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+/* int -> float: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD float convert_float(int x)
+{
+    return (float)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD float2 convert_float2(int2 x)
+{
+    return (float2)(convert_float(x.s0), convert_float(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD float4 convert_float4(int4 x)
+{
+    return (float4)(convert_float2(x.s01), convert_float2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD float8 convert_float8(int8 x)
+{
+    return (float8)(convert_float4(x.s0123), convert_float4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD float16 convert_float16(int16 x)
+{
+    return (float16)(convert_float8(x.s01234567), convert_float8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD float3 convert_float3(int3 x)
+{
+    return (float3)(convert_float2(x.xy), convert_float(x.z));
+}
+#ifdef cl_khr_fp64
+/* int -> double: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD double convert_double(int x)
+{
+    return (double)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD double2 convert_double2(int2 x)
+{
+    return (double2)(convert_double(x.s0), convert_double(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD double4 convert_double4(int4 x)
+{
+    return (double4)(convert_double2(x.s01), convert_double2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD double8 convert_double8(int8 x)
+{
+    return (double8)(convert_double4(x.s0123), convert_double4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD double16 convert_double16(int16 x)
+{
+    return (double16)(convert_double8(x.s01234567), convert_double8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD double3 convert_double3(int3 x)
+{
+    return (double3)(convert_double2(x.xy), convert_double(x.z));
+}
+#endif
+/* uint -> char: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(uint x)
+{
+    return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(uint2 x)
+{
+    return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(uint4 x)
+{
+    return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(uint8 x)
+{
+    return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(uint16 x)
+{
+    return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(uint3 x)
+{
+    return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* uint -> uchar: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(uint x)
+{
+    return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(uint2 x)
+{
+    return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(uint4 x)
+{
+    return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(uint8 x)
+{
+    return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(uint16 x)
+{
+    return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(uint3 x)
+{
+    return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* uint -> short: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(uint x)
+{
+    return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(uint2 x)
+{
+    return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(uint4 x)
+{
+    return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(uint8 x)
+{
+    return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(uint16 x)
+{
+    return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(uint3 x)
+{
+    return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* uint -> ushort: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(uint x)
+{
+    return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(uint2 x)
+{
+    return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(uint4 x)
+{
+    return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(uint8 x)
+{
+    return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(uint16 x)
+{
+    return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(uint3 x)
+{
+    return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* uint -> int: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD int convert_int(uint x)
+{
+    return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2(uint2 x)
+{
+    return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4(uint4 x)
+{
+    return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8(uint8 x)
+{
+    return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16(uint16 x)
+{
+    return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3(uint3 x)
+{
+    return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* uint -> uint (same-type conversion): the scalar overload is a plain cast;
+ * vector overloads convert their sub-vectors with the narrower overloads
+ * and reassemble. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint(uint x)
+{
+    return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2(uint2 x)
+{
+    return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4(uint4 x)
+{
+    return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8(uint8 x)
+{
+    return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16(uint16 x)
+{
+    return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3(uint3 x)
+{
+    return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#ifdef cles_khr_int64
+/* uint -> long: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD long convert_long(uint x)
+{
+    return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2(uint2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4(uint4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8(uint8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16(uint16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3(uint3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* uint -> ulong: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong(uint x)
+{
+    return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2(uint2 x)
+{
+    return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4(uint4 x)
+{
+    return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8(uint8 x)
+{
+    return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16(uint16 x)
+{
+    return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3(uint3 x)
+{
+    return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+/* uint -> float: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD float convert_float(uint x)
+{
+    return (float)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD float2 convert_float2(uint2 x)
+{
+    return (float2)(convert_float(x.s0), convert_float(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD float4 convert_float4(uint4 x)
+{
+    return (float4)(convert_float2(x.s01), convert_float2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD float8 convert_float8(uint8 x)
+{
+    return (float8)(convert_float4(x.s0123), convert_float4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD float16 convert_float16(uint16 x)
+{
+    return (float16)(convert_float8(x.s01234567), convert_float8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD float3 convert_float3(uint3 x)
+{
+    return (float3)(convert_float2(x.xy), convert_float(x.z));
+}
+#ifdef cl_khr_fp64
+/* uint -> double: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD double convert_double(uint x)
+{
+    return (double)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD double2 convert_double2(uint2 x)
+{
+    return (double2)(convert_double(x.s0), convert_double(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD double4 convert_double4(uint4 x)
+{
+    return (double4)(convert_double2(x.s01), convert_double2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD double8 convert_double8(uint8 x)
+{
+    return (double8)(convert_double4(x.s0123), convert_double4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD double16 convert_double16(uint16 x)
+{
+    return (double16)(convert_double8(x.s01234567), convert_double8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD double3 convert_double3(uint3 x)
+{
+    return (double3)(convert_double2(x.xy), convert_double(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> char: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(long x)
+{
+    return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(long2 x)
+{
+    return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(long4 x)
+{
+    return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(long8 x)
+{
+    return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(long16 x)
+{
+    return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(long3 x)
+{
+    return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> uchar: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(long x)
+{
+    return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(long2 x)
+{
+    return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(long4 x)
+{
+    return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(long8 x)
+{
+    return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(long16 x)
+{
+    return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(long3 x)
+{
+    return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> short: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(long x)
+{
+    return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(long2 x)
+{
+    return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(long4 x)
+{
+    return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(long8 x)
+{
+    return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(long16 x)
+{
+    return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(long3 x)
+{
+    return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> ushort: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(long x)
+{
+    return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(long2 x)
+{
+    return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(long4 x)
+{
+    return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(long8 x)
+{
+    return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(long16 x)
+{
+    return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(long3 x)
+{
+    return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> int: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD int convert_int(long x)
+{
+    return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2(long2 x)
+{
+    return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4(long4 x)
+{
+    return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8(long8 x)
+{
+    return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16(long16 x)
+{
+    return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3(long3 x)
+{
+    return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> uint: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint(long x)
+{
+    return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2(long2 x)
+{
+    return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4(long4 x)
+{
+    return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8(long8 x)
+{
+    return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16(long16 x)
+{
+    return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3(long3 x)
+{
+    return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> long (same-type conversion): the scalar overload is a plain cast;
+ * vector overloads convert their sub-vectors with the narrower overloads
+ * and reassemble. */
+_CLC_DEF _CLC_OVERLOAD long convert_long(long x)
+{
+    return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2(long2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4(long4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8(long8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16(long16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3(long3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> ulong: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong(long x)
+{
+    return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2(long2 x)
+{
+    return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4(long4 x)
+{
+    return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8(long8 x)
+{
+    return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16(long16 x)
+{
+    return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3(long3 x)
+{
+    return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> float: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD float convert_float(long x)
+{
+    return (float)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD float2 convert_float2(long2 x)
+{
+    return (float2)(convert_float(x.s0), convert_float(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD float4 convert_float4(long4 x)
+{
+    return (float4)(convert_float2(x.s01), convert_float2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD float8 convert_float8(long8 x)
+{
+    return (float8)(convert_float4(x.s0123), convert_float4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD float16 convert_float16(long16 x)
+{
+    return (float16)(convert_float8(x.s01234567), convert_float8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD float3 convert_float3(long3 x)
+{
+    return (float3)(convert_float2(x.xy), convert_float(x.z));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* long -> double: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD double convert_double(long x)
+{
+    return (double)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD double2 convert_double2(long2 x)
+{
+    return (double2)(convert_double(x.s0), convert_double(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD double4 convert_double4(long4 x)
+{
+    return (double4)(convert_double2(x.s01), convert_double2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD double8 convert_double8(long8 x)
+{
+    return (double8)(convert_double4(x.s0123), convert_double4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD double16 convert_double16(long16 x)
+{
+    return (double16)(convert_double8(x.s01234567), convert_double8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD double3 convert_double3(long3 x)
+{
+    return (double3)(convert_double2(x.xy), convert_double(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> char: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(ulong x)
+{
+    return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(ulong2 x)
+{
+    return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(ulong4 x)
+{
+    return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(ulong8 x)
+{
+    return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(ulong16 x)
+{
+    return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(ulong3 x)
+{
+    return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> uchar: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(ulong x)
+{
+    return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(ulong2 x)
+{
+    return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(ulong4 x)
+{
+    return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(ulong8 x)
+{
+    return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(ulong16 x)
+{
+    return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(ulong3 x)
+{
+    return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> short: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(ulong x)
+{
+    return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(ulong2 x)
+{
+    return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(ulong4 x)
+{
+    return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(ulong8 x)
+{
+    return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(ulong16 x)
+{
+    return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(ulong3 x)
+{
+    return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> ushort: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(ulong x)
+{
+    return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(ulong2 x)
+{
+    return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(ulong4 x)
+{
+    return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(ulong8 x)
+{
+    return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(ulong16 x)
+{
+    return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(ulong3 x)
+{
+    return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> int: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD int convert_int(ulong x)
+{
+    return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2(ulong2 x)
+{
+    return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4(ulong4 x)
+{
+    return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8(ulong8 x)
+{
+    return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16(ulong16 x)
+{
+    return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3(ulong3 x)
+{
+    return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> uint: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint(ulong x)
+{
+    return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2(ulong2 x)
+{
+    return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4(ulong4 x)
+{
+    return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8(ulong8 x)
+{
+    return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16(ulong16 x)
+{
+    return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3(ulong3 x)
+{
+    return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> long: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD long convert_long(ulong x)
+{
+    return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2(ulong2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4(ulong4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8(ulong8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16(ulong16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3(ulong3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> ulong (same-type conversion): the scalar overload is a plain
+ * cast; vector overloads convert their sub-vectors with the narrower
+ * overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong(ulong x)
+{
+    return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2(ulong2 x)
+{
+    return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4(ulong4 x)
+{
+    return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8(ulong8 x)
+{
+    return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16(ulong16 x)
+{
+    return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3(ulong3 x)
+{
+    return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> float: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD float convert_float(ulong x)
+{
+    return (float)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD float2 convert_float2(ulong2 x)
+{
+    return (float2)(convert_float(x.s0), convert_float(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD float4 convert_float4(ulong4 x)
+{
+    return (float4)(convert_float2(x.s01), convert_float2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD float8 convert_float8(ulong8 x)
+{
+    return (float8)(convert_float4(x.s0123), convert_float4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD float16 convert_float16(ulong16 x)
+{
+    return (float16)(convert_float8(x.s01234567), convert_float8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD float3 convert_float3(ulong3 x)
+{
+    return (float3)(convert_float2(x.xy), convert_float(x.z));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* ulong -> double: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD double convert_double(ulong x)
+{
+    return (double)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD double2 convert_double2(ulong2 x)
+{
+    return (double2)(convert_double(x.s0), convert_double(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD double4 convert_double4(ulong4 x)
+{
+    return (double4)(convert_double2(x.s01), convert_double2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD double8 convert_double8(ulong8 x)
+{
+    return (double8)(convert_double4(x.s0123), convert_double4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD double16 convert_double16(ulong16 x)
+{
+    return (double16)(convert_double8(x.s01234567), convert_double8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD double3 convert_double3(ulong3 x)
+{
+    return (double3)(convert_double2(x.xy), convert_double(x.z));
+}
+#endif
+/* float -> char: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD char convert_char(float x)
+{
+    return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2(float2 x)
+{
+    return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4(float4 x)
+{
+    return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8(float8 x)
+{
+    return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16(float16 x)
+{
+    return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3(float3 x)
+{
+    return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* float -> uchar: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar(float x)
+{
+    return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2(float2 x)
+{
+    return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4(float4 x)
+{
+    return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8(float8 x)
+{
+    return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16(float16 x)
+{
+    return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3(float3 x)
+{
+    return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* float -> short: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD short convert_short(float x)
+{
+    return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2(float2 x)
+{
+    return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4(float4 x)
+{
+    return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8(float8 x)
+{
+    return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16(float16 x)
+{
+    return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3(float3 x)
+{
+    return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* float -> ushort: the scalar overload is a plain cast; vector overloads
+ * convert their sub-vectors with the narrower overloads and reassemble. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort(float x)
+{
+    return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2(float2 x)
+{
+    return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4(float4 x)
+{
+    return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8(float8 x)
+{
+    return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16(float16 x)
+{
+    return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3(float3 x)
+{
+    return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* float -> int: the scalar overload is a direct cast; each vector overload converts its sub-vectors and stores them into the result. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int(float x)
+{
+    const int r = (int)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2(float2 x)
+{
+    int2 r;
+    r.lo = convert_int(x.lo);
+    r.hi = convert_int(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4(float4 x)
+{
+    int4 r;
+    r.lo = convert_int2(x.lo);
+    r.hi = convert_int2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8(float8 x)
+{
+    int8 r;
+    r.lo = convert_int4(x.lo);
+    r.hi = convert_int4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16(float16 x)
+{
+    int16 r;
+    r.lo = convert_int8(x.lo);
+    r.hi = convert_int8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3(float3 x)
+{
+    int3 r;
+    r.s01 = convert_int2(x.s01);
+    r.s2 = convert_int(x.s2);
+    return r;
+}
+/* float -> uint: the scalar overload is a direct cast; each vector overload converts its sub-vectors and stores them into the result. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint(float x)
+{
+    const uint r = (uint)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2(float2 x)
+{
+    uint2 r;
+    r.lo = convert_uint(x.lo);
+    r.hi = convert_uint(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4(float4 x)
+{
+    uint4 r;
+    r.lo = convert_uint2(x.lo);
+    r.hi = convert_uint2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8(float8 x)
+{
+    uint8 r;
+    r.lo = convert_uint4(x.lo);
+    r.hi = convert_uint4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16(float16 x)
+{
+    uint16 r;
+    r.lo = convert_uint8(x.lo);
+    r.hi = convert_uint8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3(float3 x)
+{
+    uint3 r;
+    r.s01 = convert_uint2(x.s01);
+    r.s2 = convert_uint(x.s2);
+    return r;
+}
+#ifdef cles_khr_int64
+/* float -> long (compiled only when cles_khr_int64 is defined): scalar is a direct cast; vector overloads convert their sub-vectors and store them into the result. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long(float x)
+{
+    const long r = (long)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2(float2 x)
+{
+    long2 r;
+    r.lo = convert_long(x.lo);
+    r.hi = convert_long(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4(float4 x)
+{
+    long4 r;
+    r.lo = convert_long2(x.lo);
+    r.hi = convert_long2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8(float8 x)
+{
+    long8 r;
+    r.lo = convert_long4(x.lo);
+    r.hi = convert_long4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16(float16 x)
+{
+    long16 r;
+    r.lo = convert_long8(x.lo);
+    r.hi = convert_long8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3(float3 x)
+{
+    long3 r;
+    r.s01 = convert_long2(x.s01);
+    r.s2 = convert_long(x.s2);
+    return r;
+}
+#endif
+#ifdef cles_khr_int64
+/* float -> ulong (compiled only when cles_khr_int64 is defined): scalar is a direct cast; vector overloads convert their sub-vectors and store them into the result. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong(float x)
+{
+    const ulong r = (ulong)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2(float2 x)
+{
+    ulong2 r;
+    r.lo = convert_ulong(x.lo);
+    r.hi = convert_ulong(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4(float4 x)
+{
+    ulong4 r;
+    r.lo = convert_ulong2(x.lo);
+    r.hi = convert_ulong2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8(float8 x)
+{
+    ulong8 r;
+    r.lo = convert_ulong4(x.lo);
+    r.hi = convert_ulong4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16(float16 x)
+{
+    ulong16 r;
+    r.lo = convert_ulong8(x.lo);
+    r.hi = convert_ulong8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3(float3 x)
+{
+    ulong3 r;
+    r.s01 = convert_ulong2(x.s01);
+    r.s2 = convert_ulong(x.s2);
+    return r;
+}
+#endif
+/* float -> float (same-type overloads): the scalar overload is a cast to float; each vector overload converts its sub-vectors and stores them into the result. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float(float x)
+{
+    const float r = (float)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2(float2 x)
+{
+    float2 r;
+    r.lo = convert_float(x.lo);
+    r.hi = convert_float(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4(float4 x)
+{
+    float4 r;
+    r.lo = convert_float2(x.lo);
+    r.hi = convert_float2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8(float8 x)
+{
+    float8 r;
+    r.lo = convert_float4(x.lo);
+    r.hi = convert_float4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16(float16 x)
+{
+    float16 r;
+    r.lo = convert_float8(x.lo);
+    r.hi = convert_float8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3(float3 x)
+{
+    float3 r;
+    r.s01 = convert_float2(x.s01);
+    r.s2 = convert_float(x.s2);
+    return r;
+}
+#ifdef cl_khr_fp64
+/* float -> double (compiled only when cl_khr_fp64 is defined): scalar is a direct cast; vector overloads convert their sub-vectors and store them into the result. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double(float x)
+{
+    const double r = (double)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2(float2 x)
+{
+    double2 r;
+    r.lo = convert_double(x.lo);
+    r.hi = convert_double(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4(float4 x)
+{
+    double4 r;
+    r.lo = convert_double2(x.lo);
+    r.hi = convert_double2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8(float8 x)
+{
+    double8 r;
+    r.lo = convert_double4(x.lo);
+    r.hi = convert_double4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16(float16 x)
+{
+    double16 r;
+    r.lo = convert_double8(x.lo);
+    r.hi = convert_double8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3(float3 x)
+{
+    double3 r;
+    r.s01 = convert_double2(x.s01);
+    r.s2 = convert_double(x.s2);
+    return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> char (compiled only when cl_khr_fp64 is defined): scalar is a direct cast; vector overloads convert their sub-vectors and store them into the result. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char(double x)
+{
+    const char r = (char)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2(double2 x)
+{
+    char2 r;
+    r.lo = convert_char(x.lo);
+    r.hi = convert_char(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4(double4 x)
+{
+    char4 r;
+    r.lo = convert_char2(x.lo);
+    r.hi = convert_char2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8(double8 x)
+{
+    char8 r;
+    r.lo = convert_char4(x.lo);
+    r.hi = convert_char4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16(double16 x)
+{
+    char16 r;
+    r.lo = convert_char8(x.lo);
+    r.hi = convert_char8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3(double3 x)
+{
+    char3 r;
+    r.s01 = convert_char2(x.s01);
+    r.s2 = convert_char(x.s2);
+    return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> uchar (compiled only when cl_khr_fp64 is defined): scalar is a direct cast; vector overloads convert their sub-vectors and store them into the result. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar(double x)
+{
+    const uchar r = (uchar)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2(double2 x)
+{
+    uchar2 r;
+    r.lo = convert_uchar(x.lo);
+    r.hi = convert_uchar(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4(double4 x)
+{
+    uchar4 r;
+    r.lo = convert_uchar2(x.lo);
+    r.hi = convert_uchar2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8(double8 x)
+{
+    uchar8 r;
+    r.lo = convert_uchar4(x.lo);
+    r.hi = convert_uchar4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16(double16 x)
+{
+    uchar16 r;
+    r.lo = convert_uchar8(x.lo);
+    r.hi = convert_uchar8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3(double3 x)
+{
+    uchar3 r;
+    r.s01 = convert_uchar2(x.s01);
+    r.s2 = convert_uchar(x.s2);
+    return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> short (compiled only when cl_khr_fp64 is defined): scalar is a direct cast; vector overloads convert their sub-vectors and store them into the result. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short(double x)
+{
+    const short r = (short)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2(double2 x)
+{
+    short2 r;
+    r.lo = convert_short(x.lo);
+    r.hi = convert_short(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4(double4 x)
+{
+    short4 r;
+    r.lo = convert_short2(x.lo);
+    r.hi = convert_short2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8(double8 x)
+{
+    short8 r;
+    r.lo = convert_short4(x.lo);
+    r.hi = convert_short4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16(double16 x)
+{
+    short16 r;
+    r.lo = convert_short8(x.lo);
+    r.hi = convert_short8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3(double3 x)
+{
+    short3 r;
+    r.s01 = convert_short2(x.s01);
+    r.s2 = convert_short(x.s2);
+    return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> ushort (compiled only when cl_khr_fp64 is defined): scalar is a direct cast; vector overloads convert their sub-vectors and store them into the result. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort(double x)
+{
+    const ushort r = (ushort)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2(double2 x)
+{
+    ushort2 r;
+    r.lo = convert_ushort(x.lo);
+    r.hi = convert_ushort(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4(double4 x)
+{
+    ushort4 r;
+    r.lo = convert_ushort2(x.lo);
+    r.hi = convert_ushort2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8(double8 x)
+{
+    ushort8 r;
+    r.lo = convert_ushort4(x.lo);
+    r.hi = convert_ushort4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16(double16 x)
+{
+    ushort16 r;
+    r.lo = convert_ushort8(x.lo);
+    r.hi = convert_ushort8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3(double3 x)
+{
+    ushort3 r;
+    r.s01 = convert_ushort2(x.s01);
+    r.s2 = convert_ushort(x.s2);
+    return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> int (compiled only when cl_khr_fp64 is defined): scalar is a direct cast; vector overloads convert their sub-vectors and store them into the result. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int(double x)
+{
+    const int r = (int)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2(double2 x)
+{
+    int2 r;
+    r.lo = convert_int(x.lo);
+    r.hi = convert_int(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4(double4 x)
+{
+    int4 r;
+    r.lo = convert_int2(x.lo);
+    r.hi = convert_int2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8(double8 x)
+{
+    int8 r;
+    r.lo = convert_int4(x.lo);
+    r.hi = convert_int4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16(double16 x)
+{
+    int16 r;
+    r.lo = convert_int8(x.lo);
+    r.hi = convert_int8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3(double3 x)
+{
+    int3 r;
+    r.s01 = convert_int2(x.s01);
+    r.s2 = convert_int(x.s2);
+    return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> uint (compiled only when cl_khr_fp64 is defined): scalar is a direct cast; vector overloads convert their sub-vectors and store them into the result. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint(double x)
+{
+    const uint r = (uint)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2(double2 x)
+{
+    uint2 r;
+    r.lo = convert_uint(x.lo);
+    r.hi = convert_uint(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4(double4 x)
+{
+    uint4 r;
+    r.lo = convert_uint2(x.lo);
+    r.hi = convert_uint2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8(double8 x)
+{
+    uint8 r;
+    r.lo = convert_uint4(x.lo);
+    r.hi = convert_uint4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16(double16 x)
+{
+    uint16 r;
+    r.lo = convert_uint8(x.lo);
+    r.hi = convert_uint8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3(double3 x)
+{
+    uint3 r;
+    r.s01 = convert_uint2(x.s01);
+    r.s2 = convert_uint(x.s2);
+    return r;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double -> long (compiled only when both cl_khr_fp64 and cles_khr_int64 are defined): scalar is a direct cast; vector overloads convert their sub-vectors and store them into the result. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long(double x)
+{
+    const long r = (long)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2(double2 x)
+{
+    long2 r;
+    r.lo = convert_long(x.lo);
+    r.hi = convert_long(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4(double4 x)
+{
+    long4 r;
+    r.lo = convert_long2(x.lo);
+    r.hi = convert_long2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8(double8 x)
+{
+    long8 r;
+    r.lo = convert_long4(x.lo);
+    r.hi = convert_long4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16(double16 x)
+{
+    long16 r;
+    r.lo = convert_long8(x.lo);
+    r.hi = convert_long8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3(double3 x)
+{
+    long3 r;
+    r.s01 = convert_long2(x.s01);
+    r.s2 = convert_long(x.s2);
+    return r;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double -> ulong (compiled only when both cl_khr_fp64 and cles_khr_int64 are defined): scalar is a direct cast; vector overloads convert their sub-vectors and store them into the result. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong(double x)
+{
+    const ulong r = (ulong)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2(double2 x)
+{
+    ulong2 r;
+    r.lo = convert_ulong(x.lo);
+    r.hi = convert_ulong(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4(double4 x)
+{
+    ulong4 r;
+    r.lo = convert_ulong2(x.lo);
+    r.hi = convert_ulong2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8(double8 x)
+{
+    ulong8 r;
+    r.lo = convert_ulong4(x.lo);
+    r.hi = convert_ulong4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16(double16 x)
+{
+    ulong16 r;
+    r.lo = convert_ulong8(x.lo);
+    r.hi = convert_ulong8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3(double3 x)
+{
+    ulong3 r;
+    r.s01 = convert_ulong2(x.s01);
+    r.s2 = convert_ulong(x.s2);
+    return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> float (compiled only when cl_khr_fp64 is defined): scalar is a direct cast; vector overloads convert their sub-vectors and store them into the result. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float(double x)
+{
+    const float r = (float)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2(double2 x)
+{
+    float2 r;
+    r.lo = convert_float(x.lo);
+    r.hi = convert_float(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4(double4 x)
+{
+    float4 r;
+    r.lo = convert_float2(x.lo);
+    r.hi = convert_float2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8(double8 x)
+{
+    float8 r;
+    r.lo = convert_float4(x.lo);
+    r.hi = convert_float4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16(double16 x)
+{
+    float16 r;
+    r.lo = convert_float8(x.lo);
+    r.hi = convert_float8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3(double3 x)
+{
+    float3 r;
+    r.s01 = convert_float2(x.s01);
+    r.s2 = convert_float(x.s2);
+    return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* double -> double, same-type overloads (compiled only when cl_khr_fp64 is defined): scalar is a cast to double; vector overloads convert their sub-vectors and store them into the result. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double(double x)
+{
+    const double r = (double)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2(double2 x)
+{
+    double2 r;
+    r.lo = convert_double(x.lo);
+    r.hi = convert_double(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4(double4 x)
+{
+    double4 r;
+    r.lo = convert_double2(x.lo);
+    r.hi = convert_double2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8(double8 x)
+{
+    double8 r;
+    r.lo = convert_double4(x.lo);
+    r.hi = convert_double4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16(double16 x)
+{
+    double16 r;
+    r.lo = convert_double8(x.lo);
+    r.hi = convert_double8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3(double3 x)
+{
+    double3 r;
+    r.s01 = convert_double2(x.s01);
+    r.s2 = convert_double(x.s2);
+    return r;
+}
+#endif
+
+
+#if 0 // ASW
+
+/* char -> char, _rtz entry points: scalar is a direct cast; vector forms call the unsuffixed convert_char* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtz(char x)
+{
+    const char r = (char)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtz(char2 x)
+{
+    char2 r;
+    r.lo = convert_char(x.lo);
+    r.hi = convert_char(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtz(char4 x)
+{
+    char4 r;
+    r.lo = convert_char2(x.lo);
+    r.hi = convert_char2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtz(char8 x)
+{
+    char8 r;
+    r.lo = convert_char4(x.lo);
+    r.hi = convert_char4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtz(char16 x)
+{
+    char16 r;
+    r.lo = convert_char8(x.lo);
+    r.hi = convert_char8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtz(char3 x)
+{
+    char3 r;
+    r.s01 = convert_char2(x.s01);
+    r.s2 = convert_char(x.s2);
+    return r;
+}
+/* char -> char, _rte entry points: scalar is a direct cast; vector forms call the unsuffixed convert_char* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rte(char x)
+{
+    const char r = (char)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rte(char2 x)
+{
+    char2 r;
+    r.lo = convert_char(x.lo);
+    r.hi = convert_char(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rte(char4 x)
+{
+    char4 r;
+    r.lo = convert_char2(x.lo);
+    r.hi = convert_char2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rte(char8 x)
+{
+    char8 r;
+    r.lo = convert_char4(x.lo);
+    r.hi = convert_char4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rte(char16 x)
+{
+    char16 r;
+    r.lo = convert_char8(x.lo);
+    r.hi = convert_char8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rte(char3 x)
+{
+    char3 r;
+    r.s01 = convert_char2(x.s01);
+    r.s2 = convert_char(x.s2);
+    return r;
+}
+/* char -> char, _rtp entry points: scalar is a direct cast; vector forms call the unsuffixed convert_char* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtp(char x)
+{
+    const char r = (char)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtp(char2 x)
+{
+    char2 r;
+    r.lo = convert_char(x.lo);
+    r.hi = convert_char(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtp(char4 x)
+{
+    char4 r;
+    r.lo = convert_char2(x.lo);
+    r.hi = convert_char2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtp(char8 x)
+{
+    char8 r;
+    r.lo = convert_char4(x.lo);
+    r.hi = convert_char4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtp(char16 x)
+{
+    char16 r;
+    r.lo = convert_char8(x.lo);
+    r.hi = convert_char8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtp(char3 x)
+{
+    char3 r;
+    r.s01 = convert_char2(x.s01);
+    r.s2 = convert_char(x.s2);
+    return r;
+}
+/* char -> char, _rtn entry points: scalar is a direct cast; vector forms call the unsuffixed convert_char* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtn(char x)
+{
+    const char r = (char)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtn(char2 x)
+{
+    char2 r;
+    r.lo = convert_char(x.lo);
+    r.hi = convert_char(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtn(char4 x)
+{
+    char4 r;
+    r.lo = convert_char2(x.lo);
+    r.hi = convert_char2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtn(char8 x)
+{
+    char8 r;
+    r.lo = convert_char4(x.lo);
+    r.hi = convert_char4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtn(char16 x)
+{
+    char16 r;
+    r.lo = convert_char8(x.lo);
+    r.hi = convert_char8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtn(char3 x)
+{
+    char3 r;
+    r.s01 = convert_char2(x.s01);
+    r.s2 = convert_char(x.s2);
+    return r;
+}
+/* char -> uchar, _rtz entry points: scalar is a direct cast; vector forms call the unsuffixed convert_uchar* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtz(char x)
+{
+    const uchar r = (uchar)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtz(char2 x)
+{
+    uchar2 r;
+    r.lo = convert_uchar(x.lo);
+    r.hi = convert_uchar(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtz(char4 x)
+{
+    uchar4 r;
+    r.lo = convert_uchar2(x.lo);
+    r.hi = convert_uchar2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtz(char8 x)
+{
+    uchar8 r;
+    r.lo = convert_uchar4(x.lo);
+    r.hi = convert_uchar4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtz(char16 x)
+{
+    uchar16 r;
+    r.lo = convert_uchar8(x.lo);
+    r.hi = convert_uchar8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtz(char3 x)
+{
+    uchar3 r;
+    r.s01 = convert_uchar2(x.s01);
+    r.s2 = convert_uchar(x.s2);
+    return r;
+}
+/* char -> uchar, _rte entry points: scalar is a direct cast; vector forms call the unsuffixed convert_uchar* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rte(char x)
+{
+    const uchar r = (uchar)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rte(char2 x)
+{
+    uchar2 r;
+    r.lo = convert_uchar(x.lo);
+    r.hi = convert_uchar(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rte(char4 x)
+{
+    uchar4 r;
+    r.lo = convert_uchar2(x.lo);
+    r.hi = convert_uchar2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rte(char8 x)
+{
+    uchar8 r;
+    r.lo = convert_uchar4(x.lo);
+    r.hi = convert_uchar4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rte(char16 x)
+{
+    uchar16 r;
+    r.lo = convert_uchar8(x.lo);
+    r.hi = convert_uchar8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rte(char3 x)
+{
+    uchar3 r;
+    r.s01 = convert_uchar2(x.s01);
+    r.s2 = convert_uchar(x.s2);
+    return r;
+}
+/* char -> uchar, _rtp entry points: scalar is a direct cast; vector forms call the unsuffixed convert_uchar* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtp(char x)
+{
+    const uchar r = (uchar)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtp(char2 x)
+{
+    uchar2 r;
+    r.lo = convert_uchar(x.lo);
+    r.hi = convert_uchar(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtp(char4 x)
+{
+    uchar4 r;
+    r.lo = convert_uchar2(x.lo);
+    r.hi = convert_uchar2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtp(char8 x)
+{
+    uchar8 r;
+    r.lo = convert_uchar4(x.lo);
+    r.hi = convert_uchar4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtp(char16 x)
+{
+    uchar16 r;
+    r.lo = convert_uchar8(x.lo);
+    r.hi = convert_uchar8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtp(char3 x)
+{
+    uchar3 r;
+    r.s01 = convert_uchar2(x.s01);
+    r.s2 = convert_uchar(x.s2);
+    return r;
+}
+/* char -> uchar, _rtn entry points: scalar is a direct cast; vector forms call the unsuffixed convert_uchar* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtn(char x)
+{
+    const uchar r = (uchar)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtn(char2 x)
+{
+    uchar2 r;
+    r.lo = convert_uchar(x.lo);
+    r.hi = convert_uchar(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtn(char4 x)
+{
+    uchar4 r;
+    r.lo = convert_uchar2(x.lo);
+    r.hi = convert_uchar2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtn(char8 x)
+{
+    uchar8 r;
+    r.lo = convert_uchar4(x.lo);
+    r.hi = convert_uchar4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtn(char16 x)
+{
+    uchar16 r;
+    r.lo = convert_uchar8(x.lo);
+    r.hi = convert_uchar8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtn(char3 x)
+{
+    uchar3 r;
+    r.s01 = convert_uchar2(x.s01);
+    r.s2 = convert_uchar(x.s2);
+    return r;
+}
+/* char -> short, _rtz entry points: scalar is a direct cast; vector forms call the unsuffixed convert_short* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtz(char x)
+{
+    const short r = (short)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtz(char2 x)
+{
+    short2 r;
+    r.lo = convert_short(x.lo);
+    r.hi = convert_short(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtz(char4 x)
+{
+    short4 r;
+    r.lo = convert_short2(x.lo);
+    r.hi = convert_short2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtz(char8 x)
+{
+    short8 r;
+    r.lo = convert_short4(x.lo);
+    r.hi = convert_short4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtz(char16 x)
+{
+    short16 r;
+    r.lo = convert_short8(x.lo);
+    r.hi = convert_short8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtz(char3 x)
+{
+    short3 r;
+    r.s01 = convert_short2(x.s01);
+    r.s2 = convert_short(x.s2);
+    return r;
+}
+/* char -> short, _rte entry points: scalar is a direct cast; vector forms call the unsuffixed convert_short* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rte(char x)
+{
+    const short r = (short)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rte(char2 x)
+{
+    short2 r;
+    r.lo = convert_short(x.lo);
+    r.hi = convert_short(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rte(char4 x)
+{
+    short4 r;
+    r.lo = convert_short2(x.lo);
+    r.hi = convert_short2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rte(char8 x)
+{
+    short8 r;
+    r.lo = convert_short4(x.lo);
+    r.hi = convert_short4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rte(char16 x)
+{
+    short16 r;
+    r.lo = convert_short8(x.lo);
+    r.hi = convert_short8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rte(char3 x)
+{
+    short3 r;
+    r.s01 = convert_short2(x.s01);
+    r.s2 = convert_short(x.s2);
+    return r;
+}
+/* char -> short, _rtp entry points: scalar is a direct cast; vector forms call the unsuffixed convert_short* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtp(char x)
+{
+    const short r = (short)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtp(char2 x)
+{
+    short2 r;
+    r.lo = convert_short(x.lo);
+    r.hi = convert_short(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtp(char4 x)
+{
+    short4 r;
+    r.lo = convert_short2(x.lo);
+    r.hi = convert_short2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtp(char8 x)
+{
+    short8 r;
+    r.lo = convert_short4(x.lo);
+    r.hi = convert_short4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtp(char16 x)
+{
+    short16 r;
+    r.lo = convert_short8(x.lo);
+    r.hi = convert_short8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtp(char3 x)
+{
+    short3 r;
+    r.s01 = convert_short2(x.s01);
+    r.s2 = convert_short(x.s2);
+    return r;
+}
+/* char -> short, _rtn entry points: scalar is a direct cast; vector forms call the unsuffixed convert_short* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtn(char x)
+{
+    const short r = (short)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtn(char2 x)
+{
+    short2 r;
+    r.lo = convert_short(x.lo);
+    r.hi = convert_short(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtn(char4 x)
+{
+    short4 r;
+    r.lo = convert_short2(x.lo);
+    r.hi = convert_short2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtn(char8 x)
+{
+    short8 r;
+    r.lo = convert_short4(x.lo);
+    r.hi = convert_short4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtn(char16 x)
+{
+    short16 r;
+    r.lo = convert_short8(x.lo);
+    r.hi = convert_short8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtn(char3 x)
+{
+    short3 r;
+    r.s01 = convert_short2(x.s01);
+    r.s2 = convert_short(x.s2);
+    return r;
+}
+/* char -> ushort, _rtz entry points: scalar is a direct cast; vector forms call the unsuffixed convert_ushort* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtz(char x)
+{
+    const ushort r = (ushort)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtz(char2 x)
+{
+    ushort2 r;
+    r.lo = convert_ushort(x.lo);
+    r.hi = convert_ushort(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtz(char4 x)
+{
+    ushort4 r;
+    r.lo = convert_ushort2(x.lo);
+    r.hi = convert_ushort2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtz(char8 x)
+{
+    ushort8 r;
+    r.lo = convert_ushort4(x.lo);
+    r.hi = convert_ushort4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtz(char16 x)
+{
+    ushort16 r;
+    r.lo = convert_ushort8(x.lo);
+    r.hi = convert_ushort8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtz(char3 x)
+{
+    ushort3 r;
+    r.s01 = convert_ushort2(x.s01);
+    r.s2 = convert_ushort(x.s2);
+    return r;
+}
+/* char -> ushort, _rte entry points: scalar is a direct cast; vector forms call the unsuffixed convert_ushort* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rte(char x)
+{
+    const ushort r = (ushort)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rte(char2 x)
+{
+    ushort2 r;
+    r.lo = convert_ushort(x.lo);
+    r.hi = convert_ushort(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rte(char4 x)
+{
+    ushort4 r;
+    r.lo = convert_ushort2(x.lo);
+    r.hi = convert_ushort2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rte(char8 x)
+{
+    ushort8 r;
+    r.lo = convert_ushort4(x.lo);
+    r.hi = convert_ushort4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rte(char16 x)
+{
+    ushort16 r;
+    r.lo = convert_ushort8(x.lo);
+    r.hi = convert_ushort8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rte(char3 x)
+{
+    ushort3 r;
+    r.s01 = convert_ushort2(x.s01);
+    r.s2 = convert_ushort(x.s2);
+    return r;
+}
+/* char -> ushort, _rtp entry points: scalar is a direct cast; vector forms call the unsuffixed convert_ushort* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtp(char x)
+{
+    const ushort r = (ushort)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtp(char2 x)
+{
+    ushort2 r;
+    r.lo = convert_ushort(x.lo);
+    r.hi = convert_ushort(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtp(char4 x)
+{
+    ushort4 r;
+    r.lo = convert_ushort2(x.lo);
+    r.hi = convert_ushort2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtp(char8 x)
+{
+    ushort8 r;
+    r.lo = convert_ushort4(x.lo);
+    r.hi = convert_ushort4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtp(char16 x)
+{
+    ushort16 r;
+    r.lo = convert_ushort8(x.lo);
+    r.hi = convert_ushort8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtp(char3 x)
+{
+    ushort3 r;
+    r.s01 = convert_ushort2(x.s01);
+    r.s2 = convert_ushort(x.s2);
+    return r;
+}
+/* char -> ushort, _rtn entry points: scalar is a direct cast; vector forms call the unsuffixed convert_ushort* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtn(char x)
+{
+    const ushort r = (ushort)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtn(char2 x)
+{
+    ushort2 r;
+    r.lo = convert_ushort(x.lo);
+    r.hi = convert_ushort(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtn(char4 x)
+{
+    ushort4 r;
+    r.lo = convert_ushort2(x.lo);
+    r.hi = convert_ushort2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtn(char8 x)
+{
+    ushort8 r;
+    r.lo = convert_ushort4(x.lo);
+    r.hi = convert_ushort4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtn(char16 x)
+{
+    ushort16 r;
+    r.lo = convert_ushort8(x.lo);
+    r.hi = convert_ushort8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtn(char3 x)
+{
+    ushort3 r;
+    r.s01 = convert_ushort2(x.s01);
+    r.s2 = convert_ushort(x.s2);
+    return r;
+}
+/* char -> int, _rtz entry points: scalar is a direct cast; vector forms call the unsuffixed convert_int* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtz(char x)
+{
+    const int r = (int)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtz(char2 x)
+{
+    int2 r;
+    r.lo = convert_int(x.lo);
+    r.hi = convert_int(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtz(char4 x)
+{
+    int4 r;
+    r.lo = convert_int2(x.lo);
+    r.hi = convert_int2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtz(char8 x)
+{
+    int8 r;
+    r.lo = convert_int4(x.lo);
+    r.hi = convert_int4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtz(char16 x)
+{
+    int16 r;
+    r.lo = convert_int8(x.lo);
+    r.hi = convert_int8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtz(char3 x)
+{
+    int3 r;
+    r.s01 = convert_int2(x.s01);
+    r.s2 = convert_int(x.s2);
+    return r;
+}
+/* char -> int, _rte entry points: scalar is a direct cast; vector forms call the unsuffixed convert_int* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rte(char x)
+{
+    const int r = (int)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rte(char2 x)
+{
+    int2 r;
+    r.lo = convert_int(x.lo);
+    r.hi = convert_int(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rte(char4 x)
+{
+    int4 r;
+    r.lo = convert_int2(x.lo);
+    r.hi = convert_int2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rte(char8 x)
+{
+    int8 r;
+    r.lo = convert_int4(x.lo);
+    r.hi = convert_int4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rte(char16 x)
+{
+    int16 r;
+    r.lo = convert_int8(x.lo);
+    r.hi = convert_int8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rte(char3 x)
+{
+    int3 r;
+    r.s01 = convert_int2(x.s01);
+    r.s2 = convert_int(x.s2);
+    return r;
+}
+/* char -> int, _rtp entry points: scalar is a direct cast; vector forms call the unsuffixed convert_int* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtp(char x)
+{
+    const int r = (int)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtp(char2 x)
+{
+    int2 r;
+    r.lo = convert_int(x.lo);
+    r.hi = convert_int(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtp(char4 x)
+{
+    int4 r;
+    r.lo = convert_int2(x.lo);
+    r.hi = convert_int2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtp(char8 x)
+{
+    int8 r;
+    r.lo = convert_int4(x.lo);
+    r.hi = convert_int4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtp(char16 x)
+{
+    int16 r;
+    r.lo = convert_int8(x.lo);
+    r.hi = convert_int8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtp(char3 x)
+{
+    int3 r;
+    r.s01 = convert_int2(x.s01);
+    r.s2 = convert_int(x.s2);
+    return r;
+}
+/* char -> int, _rtn entry points: scalar is a direct cast; vector forms call the unsuffixed convert_int* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtn(char x)
+{
+    const int r = (int)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtn(char2 x)
+{
+    int2 r;
+    r.lo = convert_int(x.lo);
+    r.hi = convert_int(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtn(char4 x)
+{
+    int4 r;
+    r.lo = convert_int2(x.lo);
+    r.hi = convert_int2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtn(char8 x)
+{
+    int8 r;
+    r.lo = convert_int4(x.lo);
+    r.hi = convert_int4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtn(char16 x)
+{
+    int16 r;
+    r.lo = convert_int8(x.lo);
+    r.hi = convert_int8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtn(char3 x)
+{
+    int3 r;
+    r.s01 = convert_int2(x.s01);
+    r.s2 = convert_int(x.s2);
+    return r;
+}
+/* char -> uint, _rtz entry points: scalar is a direct cast; vector forms call the unsuffixed convert_uint* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtz(char x)
+{
+    const uint r = (uint)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtz(char2 x)
+{
+    uint2 r;
+    r.lo = convert_uint(x.lo);
+    r.hi = convert_uint(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtz(char4 x)
+{
+    uint4 r;
+    r.lo = convert_uint2(x.lo);
+    r.hi = convert_uint2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtz(char8 x)
+{
+    uint8 r;
+    r.lo = convert_uint4(x.lo);
+    r.hi = convert_uint4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtz(char16 x)
+{
+    uint16 r;
+    r.lo = convert_uint8(x.lo);
+    r.hi = convert_uint8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtz(char3 x)
+{
+    uint3 r;
+    r.s01 = convert_uint2(x.s01);
+    r.s2 = convert_uint(x.s2);
+    return r;
+}
+/* char -> uint, _rte entry points: scalar is a direct cast; vector forms call the unsuffixed convert_uint* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rte(char x)
+{
+    const uint r = (uint)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rte(char2 x)
+{
+    uint2 r;
+    r.lo = convert_uint(x.lo);
+    r.hi = convert_uint(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rte(char4 x)
+{
+    uint4 r;
+    r.lo = convert_uint2(x.lo);
+    r.hi = convert_uint2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rte(char8 x)
+{
+    uint8 r;
+    r.lo = convert_uint4(x.lo);
+    r.hi = convert_uint4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rte(char16 x)
+{
+    uint16 r;
+    r.lo = convert_uint8(x.lo);
+    r.hi = convert_uint8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rte(char3 x)
+{
+    uint3 r;
+    r.s01 = convert_uint2(x.s01);
+    r.s2 = convert_uint(x.s2);
+    return r;
+}
+/* char -> uint, _rtp entry points: scalar is a direct cast; vector forms call the unsuffixed convert_uint* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtp(char x)
+{
+    const uint r = (uint)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtp(char2 x)
+{
+    uint2 r;
+    r.lo = convert_uint(x.lo);
+    r.hi = convert_uint(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtp(char4 x)
+{
+    uint4 r;
+    r.lo = convert_uint2(x.lo);
+    r.hi = convert_uint2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtp(char8 x)
+{
+    uint8 r;
+    r.lo = convert_uint4(x.lo);
+    r.hi = convert_uint4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtp(char16 x)
+{
+    uint16 r;
+    r.lo = convert_uint8(x.lo);
+    r.hi = convert_uint8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtp(char3 x)
+{
+    uint3 r;
+    r.s01 = convert_uint2(x.s01);
+    r.s2 = convert_uint(x.s2);
+    return r;
+}
+/* char -> uint, _rtn entry points: scalar is a direct cast; vector forms call the unsuffixed convert_uint* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtn(char x)
+{
+    const uint r = (uint)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtn(char2 x)
+{
+    uint2 r;
+    r.lo = convert_uint(x.lo);
+    r.hi = convert_uint(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtn(char4 x)
+{
+    uint4 r;
+    r.lo = convert_uint2(x.lo);
+    r.hi = convert_uint2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtn(char8 x)
+{
+    uint8 r;
+    r.lo = convert_uint4(x.lo);
+    r.hi = convert_uint4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtn(char16 x)
+{
+    uint16 r;
+    r.lo = convert_uint8(x.lo);
+    r.hi = convert_uint8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtn(char3 x)
+{
+    uint3 r;
+    r.s01 = convert_uint2(x.s01);
+    r.s2 = convert_uint(x.s2);
+    return r;
+}
+#ifdef cles_khr_int64
+/* char -> long, _rtz entry points (guarded by cles_khr_int64): scalar is a direct cast; vector forms call the unsuffixed convert_long* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtz(char x)
+{
+    const long r = (long)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtz(char2 x)
+{
+    long2 r;
+    r.lo = convert_long(x.lo);
+    r.hi = convert_long(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtz(char4 x)
+{
+    long4 r;
+    r.lo = convert_long2(x.lo);
+    r.hi = convert_long2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtz(char8 x)
+{
+    long8 r;
+    r.lo = convert_long4(x.lo);
+    r.hi = convert_long4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtz(char16 x)
+{
+    long16 r;
+    r.lo = convert_long8(x.lo);
+    r.hi = convert_long8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtz(char3 x)
+{
+    long3 r;
+    r.s01 = convert_long2(x.s01);
+    r.s2 = convert_long(x.s2);
+    return r;
+}
+#endif
+#ifdef cles_khr_int64
+/* char -> long, _rte entry points (guarded by cles_khr_int64): scalar is a direct cast; vector forms call the unsuffixed convert_long* helpers on their sub-vectors. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rte(char x)
+{
+    const long r = (long)x;
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rte(char2 x)
+{
+    long2 r;
+    r.lo = convert_long(x.lo);
+    r.hi = convert_long(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rte(char4 x)
+{
+    long4 r;
+    r.lo = convert_long2(x.lo);
+    r.hi = convert_long2(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rte(char8 x)
+{
+    long8 r;
+    r.lo = convert_long4(x.lo);
+    r.hi = convert_long4(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rte(char16 x)
+{
+    long16 r;
+    r.lo = convert_long8(x.lo);
+    r.hi = convert_long8(x.hi);
+    return r;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rte(char3 x)
+{
+    long3 r;
+    r.s01 = convert_long2(x.s01);
+    r.s2 = convert_long(x.s2);
+    return r;
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtp(char x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtp(char2 x)
+{
+ return (long2)(convert_long(x.lo), convert_long(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtp(char4 x)
+{
+ return (long4)(convert_long2(x.lo), convert_long2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtp(char8 x)
+{
+ return (long8)(convert_long4(x.lo), convert_long4(x.hi));
+}
+
+/* char16 to long16, one char8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtp(char16 x)
+{
+ long8 lo_part = convert_long8(x.lo);
+ long8 hi_part = convert_long8(x.hi);
+ return (long16)(lo_part, hi_part);
+}
+
+/* char3 to long3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtp(char3 x)
+{
+ long2 front = convert_long2(x.s01);
+ long back = convert_long(x.s2);
+ return (long3)(front, back);
+}
+#endif
+#ifdef cles_khr_int64
+/* Plain cast from char to long. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtn(char x)
+{
+ long result = (long)x;
+ return result;
+}
+
+/* Component-wise char2 to long2. */
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtn(char2 x)
+{
+ long lo_part = convert_long(x.lo);
+ long hi_part = convert_long(x.hi);
+ return (long2)(lo_part, hi_part);
+}
+
+/* char4 to long4, one char2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtn(char4 x)
+{
+ long2 lo_part = convert_long2(x.lo);
+ long2 hi_part = convert_long2(x.hi);
+ return (long4)(lo_part, hi_part);
+}
+
+/* char8 to long8, one char4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtn(char8 x)
+{
+ long4 lo_part = convert_long4(x.lo);
+ long4 hi_part = convert_long4(x.hi);
+ return (long8)(lo_part, hi_part);
+}
+
+/* char16 to long16, one char8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtn(char16 x)
+{
+ long8 lo_part = convert_long8(x.lo);
+ long8 hi_part = convert_long8(x.hi);
+ return (long16)(lo_part, hi_part);
+}
+
+/* char3 to long3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtn(char3 x)
+{
+ long2 front = convert_long2(x.s01);
+ long back = convert_long(x.s2);
+ return (long3)(front, back);
+}
+#endif
+#ifdef cles_khr_int64
+/* Plain cast from char to ulong. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtz(char x)
+{
+ ulong result = (ulong)x;
+ return result;
+}
+
+/* Component-wise char2 to ulong2. */
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtz(char2 x)
+{
+ ulong lo_part = convert_ulong(x.lo);
+ ulong hi_part = convert_ulong(x.hi);
+ return (ulong2)(lo_part, hi_part);
+}
+
+/* char4 to ulong4, one char2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtz(char4 x)
+{
+ ulong2 lo_part = convert_ulong2(x.lo);
+ ulong2 hi_part = convert_ulong2(x.hi);
+ return (ulong4)(lo_part, hi_part);
+}
+
+/* char8 to ulong8, one char4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtz(char8 x)
+{
+ ulong4 lo_part = convert_ulong4(x.lo);
+ ulong4 hi_part = convert_ulong4(x.hi);
+ return (ulong8)(lo_part, hi_part);
+}
+
+/* char16 to ulong16, one char8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtz(char16 x)
+{
+ ulong8 lo_part = convert_ulong8(x.lo);
+ ulong8 hi_part = convert_ulong8(x.hi);
+ return (ulong16)(lo_part, hi_part);
+}
+
+/* char3 to ulong3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtz(char3 x)
+{
+ ulong2 front = convert_ulong2(x.s01);
+ ulong back = convert_ulong(x.s2);
+ return (ulong3)(front, back);
+}
+#endif
+#ifdef cles_khr_int64
+/* Plain cast from char to ulong. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rte(char x)
+{
+ ulong result = (ulong)x;
+ return result;
+}
+
+/* Component-wise char2 to ulong2. */
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rte(char2 x)
+{
+ ulong lo_part = convert_ulong(x.lo);
+ ulong hi_part = convert_ulong(x.hi);
+ return (ulong2)(lo_part, hi_part);
+}
+
+/* char4 to ulong4, one char2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rte(char4 x)
+{
+ ulong2 lo_part = convert_ulong2(x.lo);
+ ulong2 hi_part = convert_ulong2(x.hi);
+ return (ulong4)(lo_part, hi_part);
+}
+
+/* char8 to ulong8, one char4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rte(char8 x)
+{
+ ulong4 lo_part = convert_ulong4(x.lo);
+ ulong4 hi_part = convert_ulong4(x.hi);
+ return (ulong8)(lo_part, hi_part);
+}
+
+/* char16 to ulong16, one char8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rte(char16 x)
+{
+ ulong8 lo_part = convert_ulong8(x.lo);
+ ulong8 hi_part = convert_ulong8(x.hi);
+ return (ulong16)(lo_part, hi_part);
+}
+
+/* char3 to ulong3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rte(char3 x)
+{
+ ulong2 front = convert_ulong2(x.s01);
+ ulong back = convert_ulong(x.s2);
+ return (ulong3)(front, back);
+}
+#endif
+#ifdef cles_khr_int64
+/* Plain cast from char to ulong. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtp(char x)
+{
+ ulong result = (ulong)x;
+ return result;
+}
+
+/* Component-wise char2 to ulong2. */
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtp(char2 x)
+{
+ ulong lo_part = convert_ulong(x.lo);
+ ulong hi_part = convert_ulong(x.hi);
+ return (ulong2)(lo_part, hi_part);
+}
+
+/* char4 to ulong4, one char2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtp(char4 x)
+{
+ ulong2 lo_part = convert_ulong2(x.lo);
+ ulong2 hi_part = convert_ulong2(x.hi);
+ return (ulong4)(lo_part, hi_part);
+}
+
+/* char8 to ulong8, one char4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtp(char8 x)
+{
+ ulong4 lo_part = convert_ulong4(x.lo);
+ ulong4 hi_part = convert_ulong4(x.hi);
+ return (ulong8)(lo_part, hi_part);
+}
+
+/* char16 to ulong16, one char8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtp(char16 x)
+{
+ ulong8 lo_part = convert_ulong8(x.lo);
+ ulong8 hi_part = convert_ulong8(x.hi);
+ return (ulong16)(lo_part, hi_part);
+}
+
+/* char3 to ulong3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtp(char3 x)
+{
+ ulong2 front = convert_ulong2(x.s01);
+ ulong back = convert_ulong(x.s2);
+ return (ulong3)(front, back);
+}
+#endif
+#ifdef cles_khr_int64
+/* Plain cast from char to ulong. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtn(char x)
+{
+ ulong result = (ulong)x;
+ return result;
+}
+
+/* Component-wise char2 to ulong2. */
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtn(char2 x)
+{
+ ulong lo_part = convert_ulong(x.lo);
+ ulong hi_part = convert_ulong(x.hi);
+ return (ulong2)(lo_part, hi_part);
+}
+
+/* char4 to ulong4, one char2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtn(char4 x)
+{
+ ulong2 lo_part = convert_ulong2(x.lo);
+ ulong2 hi_part = convert_ulong2(x.hi);
+ return (ulong4)(lo_part, hi_part);
+}
+
+/* char8 to ulong8, one char4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtn(char8 x)
+{
+ ulong4 lo_part = convert_ulong4(x.lo);
+ ulong4 hi_part = convert_ulong4(x.hi);
+ return (ulong8)(lo_part, hi_part);
+}
+
+/* char16 to ulong16, one char8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtn(char16 x)
+{
+ ulong8 lo_part = convert_ulong8(x.lo);
+ ulong8 hi_part = convert_ulong8(x.hi);
+ return (ulong16)(lo_part, hi_part);
+}
+
+/* char3 to ulong3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtn(char3 x)
+{
+ ulong2 front = convert_ulong2(x.s01);
+ ulong back = convert_ulong(x.s2);
+ return (ulong3)(front, back);
+}
+#endif
+/* Plain cast from uchar to char. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtz(uchar x)
+{
+ char result = (char)x;
+ return result;
+}
+
+/* Component-wise uchar2 to char2. */
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtz(uchar2 x)
+{
+ char lo_part = convert_char(x.lo);
+ char hi_part = convert_char(x.hi);
+ return (char2)(lo_part, hi_part);
+}
+
+/* uchar4 to char4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtz(uchar4 x)
+{
+ char2 lo_part = convert_char2(x.lo);
+ char2 hi_part = convert_char2(x.hi);
+ return (char4)(lo_part, hi_part);
+}
+
+/* uchar8 to char8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtz(uchar8 x)
+{
+ char4 lo_part = convert_char4(x.lo);
+ char4 hi_part = convert_char4(x.hi);
+ return (char8)(lo_part, hi_part);
+}
+
+/* uchar16 to char16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtz(uchar16 x)
+{
+ char8 lo_part = convert_char8(x.lo);
+ char8 hi_part = convert_char8(x.hi);
+ return (char16)(lo_part, hi_part);
+}
+
+/* uchar3 to char3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtz(uchar3 x)
+{
+ char2 front = convert_char2(x.s01);
+ char back = convert_char(x.s2);
+ return (char3)(front, back);
+}
+/* Plain cast from uchar to char. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rte(uchar x)
+{
+ char result = (char)x;
+ return result;
+}
+
+/* Component-wise uchar2 to char2. */
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rte(uchar2 x)
+{
+ char lo_part = convert_char(x.lo);
+ char hi_part = convert_char(x.hi);
+ return (char2)(lo_part, hi_part);
+}
+
+/* uchar4 to char4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rte(uchar4 x)
+{
+ char2 lo_part = convert_char2(x.lo);
+ char2 hi_part = convert_char2(x.hi);
+ return (char4)(lo_part, hi_part);
+}
+
+/* uchar8 to char8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rte(uchar8 x)
+{
+ char4 lo_part = convert_char4(x.lo);
+ char4 hi_part = convert_char4(x.hi);
+ return (char8)(lo_part, hi_part);
+}
+
+/* uchar16 to char16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rte(uchar16 x)
+{
+ char8 lo_part = convert_char8(x.lo);
+ char8 hi_part = convert_char8(x.hi);
+ return (char16)(lo_part, hi_part);
+}
+
+/* uchar3 to char3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rte(uchar3 x)
+{
+ char2 front = convert_char2(x.s01);
+ char back = convert_char(x.s2);
+ return (char3)(front, back);
+}
+/* Plain cast from uchar to char. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtp(uchar x)
+{
+ char result = (char)x;
+ return result;
+}
+
+/* Component-wise uchar2 to char2. */
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtp(uchar2 x)
+{
+ char lo_part = convert_char(x.lo);
+ char hi_part = convert_char(x.hi);
+ return (char2)(lo_part, hi_part);
+}
+
+/* uchar4 to char4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtp(uchar4 x)
+{
+ char2 lo_part = convert_char2(x.lo);
+ char2 hi_part = convert_char2(x.hi);
+ return (char4)(lo_part, hi_part);
+}
+
+/* uchar8 to char8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtp(uchar8 x)
+{
+ char4 lo_part = convert_char4(x.lo);
+ char4 hi_part = convert_char4(x.hi);
+ return (char8)(lo_part, hi_part);
+}
+
+/* uchar16 to char16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtp(uchar16 x)
+{
+ char8 lo_part = convert_char8(x.lo);
+ char8 hi_part = convert_char8(x.hi);
+ return (char16)(lo_part, hi_part);
+}
+
+/* uchar3 to char3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtp(uchar3 x)
+{
+ char2 front = convert_char2(x.s01);
+ char back = convert_char(x.s2);
+ return (char3)(front, back);
+}
+/* Plain cast from uchar to char. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtn(uchar x)
+{
+ char result = (char)x;
+ return result;
+}
+
+/* Component-wise uchar2 to char2. */
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtn(uchar2 x)
+{
+ char lo_part = convert_char(x.lo);
+ char hi_part = convert_char(x.hi);
+ return (char2)(lo_part, hi_part);
+}
+
+/* uchar4 to char4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtn(uchar4 x)
+{
+ char2 lo_part = convert_char2(x.lo);
+ char2 hi_part = convert_char2(x.hi);
+ return (char4)(lo_part, hi_part);
+}
+
+/* uchar8 to char8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtn(uchar8 x)
+{
+ char4 lo_part = convert_char4(x.lo);
+ char4 hi_part = convert_char4(x.hi);
+ return (char8)(lo_part, hi_part);
+}
+
+/* uchar16 to char16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtn(uchar16 x)
+{
+ char8 lo_part = convert_char8(x.lo);
+ char8 hi_part = convert_char8(x.hi);
+ return (char16)(lo_part, hi_part);
+}
+
+/* uchar3 to char3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtn(uchar3 x)
+{
+ char2 front = convert_char2(x.s01);
+ char back = convert_char(x.s2);
+ return (char3)(front, back);
+}
+/* Plain cast from uchar to uchar. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtz(uchar x)
+{
+ uchar result = (uchar)x;
+ return result;
+}
+
+/* Component-wise uchar2 to uchar2. */
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtz(uchar2 x)
+{
+ uchar lo_part = convert_uchar(x.lo);
+ uchar hi_part = convert_uchar(x.hi);
+ return (uchar2)(lo_part, hi_part);
+}
+
+/* uchar4 to uchar4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtz(uchar4 x)
+{
+ uchar2 lo_part = convert_uchar2(x.lo);
+ uchar2 hi_part = convert_uchar2(x.hi);
+ return (uchar4)(lo_part, hi_part);
+}
+
+/* uchar8 to uchar8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtz(uchar8 x)
+{
+ uchar4 lo_part = convert_uchar4(x.lo);
+ uchar4 hi_part = convert_uchar4(x.hi);
+ return (uchar8)(lo_part, hi_part);
+}
+
+/* uchar16 to uchar16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtz(uchar16 x)
+{
+ uchar8 lo_part = convert_uchar8(x.lo);
+ uchar8 hi_part = convert_uchar8(x.hi);
+ return (uchar16)(lo_part, hi_part);
+}
+
+/* uchar3 to uchar3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtz(uchar3 x)
+{
+ uchar2 front = convert_uchar2(x.s01);
+ uchar back = convert_uchar(x.s2);
+ return (uchar3)(front, back);
+}
+/* Plain cast from uchar to uchar. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rte(uchar x)
+{
+ uchar result = (uchar)x;
+ return result;
+}
+
+/* Component-wise uchar2 to uchar2. */
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rte(uchar2 x)
+{
+ uchar lo_part = convert_uchar(x.lo);
+ uchar hi_part = convert_uchar(x.hi);
+ return (uchar2)(lo_part, hi_part);
+}
+
+/* uchar4 to uchar4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rte(uchar4 x)
+{
+ uchar2 lo_part = convert_uchar2(x.lo);
+ uchar2 hi_part = convert_uchar2(x.hi);
+ return (uchar4)(lo_part, hi_part);
+}
+
+/* uchar8 to uchar8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rte(uchar8 x)
+{
+ uchar4 lo_part = convert_uchar4(x.lo);
+ uchar4 hi_part = convert_uchar4(x.hi);
+ return (uchar8)(lo_part, hi_part);
+}
+
+/* uchar16 to uchar16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rte(uchar16 x)
+{
+ uchar8 lo_part = convert_uchar8(x.lo);
+ uchar8 hi_part = convert_uchar8(x.hi);
+ return (uchar16)(lo_part, hi_part);
+}
+
+/* uchar3 to uchar3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rte(uchar3 x)
+{
+ uchar2 front = convert_uchar2(x.s01);
+ uchar back = convert_uchar(x.s2);
+ return (uchar3)(front, back);
+}
+/* Plain cast from uchar to uchar. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtp(uchar x)
+{
+ uchar result = (uchar)x;
+ return result;
+}
+
+/* Component-wise uchar2 to uchar2. */
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtp(uchar2 x)
+{
+ uchar lo_part = convert_uchar(x.lo);
+ uchar hi_part = convert_uchar(x.hi);
+ return (uchar2)(lo_part, hi_part);
+}
+
+/* uchar4 to uchar4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtp(uchar4 x)
+{
+ uchar2 lo_part = convert_uchar2(x.lo);
+ uchar2 hi_part = convert_uchar2(x.hi);
+ return (uchar4)(lo_part, hi_part);
+}
+
+/* uchar8 to uchar8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtp(uchar8 x)
+{
+ uchar4 lo_part = convert_uchar4(x.lo);
+ uchar4 hi_part = convert_uchar4(x.hi);
+ return (uchar8)(lo_part, hi_part);
+}
+
+/* uchar16 to uchar16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtp(uchar16 x)
+{
+ uchar8 lo_part = convert_uchar8(x.lo);
+ uchar8 hi_part = convert_uchar8(x.hi);
+ return (uchar16)(lo_part, hi_part);
+}
+
+/* uchar3 to uchar3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtp(uchar3 x)
+{
+ uchar2 front = convert_uchar2(x.s01);
+ uchar back = convert_uchar(x.s2);
+ return (uchar3)(front, back);
+}
+/* Plain cast from uchar to uchar. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtn(uchar x)
+{
+ uchar result = (uchar)x;
+ return result;
+}
+
+/* Component-wise uchar2 to uchar2. */
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtn(uchar2 x)
+{
+ uchar lo_part = convert_uchar(x.lo);
+ uchar hi_part = convert_uchar(x.hi);
+ return (uchar2)(lo_part, hi_part);
+}
+
+/* uchar4 to uchar4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtn(uchar4 x)
+{
+ uchar2 lo_part = convert_uchar2(x.lo);
+ uchar2 hi_part = convert_uchar2(x.hi);
+ return (uchar4)(lo_part, hi_part);
+}
+
+/* uchar8 to uchar8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtn(uchar8 x)
+{
+ uchar4 lo_part = convert_uchar4(x.lo);
+ uchar4 hi_part = convert_uchar4(x.hi);
+ return (uchar8)(lo_part, hi_part);
+}
+
+/* uchar16 to uchar16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtn(uchar16 x)
+{
+ uchar8 lo_part = convert_uchar8(x.lo);
+ uchar8 hi_part = convert_uchar8(x.hi);
+ return (uchar16)(lo_part, hi_part);
+}
+
+/* uchar3 to uchar3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtn(uchar3 x)
+{
+ uchar2 front = convert_uchar2(x.s01);
+ uchar back = convert_uchar(x.s2);
+ return (uchar3)(front, back);
+}
+/* Plain cast from uchar to short. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtz(uchar x)
+{
+ short result = (short)x;
+ return result;
+}
+
+/* Component-wise uchar2 to short2. */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtz(uchar2 x)
+{
+ short lo_part = convert_short(x.lo);
+ short hi_part = convert_short(x.hi);
+ return (short2)(lo_part, hi_part);
+}
+
+/* uchar4 to short4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtz(uchar4 x)
+{
+ short2 lo_part = convert_short2(x.lo);
+ short2 hi_part = convert_short2(x.hi);
+ return (short4)(lo_part, hi_part);
+}
+
+/* uchar8 to short8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtz(uchar8 x)
+{
+ short4 lo_part = convert_short4(x.lo);
+ short4 hi_part = convert_short4(x.hi);
+ return (short8)(lo_part, hi_part);
+}
+
+/* uchar16 to short16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtz(uchar16 x)
+{
+ short8 lo_part = convert_short8(x.lo);
+ short8 hi_part = convert_short8(x.hi);
+ return (short16)(lo_part, hi_part);
+}
+
+/* uchar3 to short3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtz(uchar3 x)
+{
+ short2 front = convert_short2(x.s01);
+ short back = convert_short(x.s2);
+ return (short3)(front, back);
+}
+/* Plain cast from uchar to short. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rte(uchar x)
+{
+ short result = (short)x;
+ return result;
+}
+
+/* Component-wise uchar2 to short2. */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rte(uchar2 x)
+{
+ short lo_part = convert_short(x.lo);
+ short hi_part = convert_short(x.hi);
+ return (short2)(lo_part, hi_part);
+}
+
+/* uchar4 to short4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rte(uchar4 x)
+{
+ short2 lo_part = convert_short2(x.lo);
+ short2 hi_part = convert_short2(x.hi);
+ return (short4)(lo_part, hi_part);
+}
+
+/* uchar8 to short8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rte(uchar8 x)
+{
+ short4 lo_part = convert_short4(x.lo);
+ short4 hi_part = convert_short4(x.hi);
+ return (short8)(lo_part, hi_part);
+}
+
+/* uchar16 to short16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rte(uchar16 x)
+{
+ short8 lo_part = convert_short8(x.lo);
+ short8 hi_part = convert_short8(x.hi);
+ return (short16)(lo_part, hi_part);
+}
+
+/* uchar3 to short3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rte(uchar3 x)
+{
+ short2 front = convert_short2(x.s01);
+ short back = convert_short(x.s2);
+ return (short3)(front, back);
+}
+/* Plain cast from uchar to short. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtp(uchar x)
+{
+ short result = (short)x;
+ return result;
+}
+
+/* Component-wise uchar2 to short2. */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtp(uchar2 x)
+{
+ short lo_part = convert_short(x.lo);
+ short hi_part = convert_short(x.hi);
+ return (short2)(lo_part, hi_part);
+}
+
+/* uchar4 to short4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtp(uchar4 x)
+{
+ short2 lo_part = convert_short2(x.lo);
+ short2 hi_part = convert_short2(x.hi);
+ return (short4)(lo_part, hi_part);
+}
+
+/* uchar8 to short8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtp(uchar8 x)
+{
+ short4 lo_part = convert_short4(x.lo);
+ short4 hi_part = convert_short4(x.hi);
+ return (short8)(lo_part, hi_part);
+}
+
+/* uchar16 to short16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtp(uchar16 x)
+{
+ short8 lo_part = convert_short8(x.lo);
+ short8 hi_part = convert_short8(x.hi);
+ return (short16)(lo_part, hi_part);
+}
+
+/* uchar3 to short3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtp(uchar3 x)
+{
+ short2 front = convert_short2(x.s01);
+ short back = convert_short(x.s2);
+ return (short3)(front, back);
+}
+/* Plain cast from uchar to short. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_rtn(uchar x)
+{
+ short result = (short)x;
+ return result;
+}
+
+/* Component-wise uchar2 to short2. */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_rtn(uchar2 x)
+{
+ short lo_part = convert_short(x.lo);
+ short hi_part = convert_short(x.hi);
+ return (short2)(lo_part, hi_part);
+}
+
+/* uchar4 to short4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_rtn(uchar4 x)
+{
+ short2 lo_part = convert_short2(x.lo);
+ short2 hi_part = convert_short2(x.hi);
+ return (short4)(lo_part, hi_part);
+}
+
+/* uchar8 to short8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_rtn(uchar8 x)
+{
+ short4 lo_part = convert_short4(x.lo);
+ short4 hi_part = convert_short4(x.hi);
+ return (short8)(lo_part, hi_part);
+}
+
+/* uchar16 to short16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_rtn(uchar16 x)
+{
+ short8 lo_part = convert_short8(x.lo);
+ short8 hi_part = convert_short8(x.hi);
+ return (short16)(lo_part, hi_part);
+}
+
+/* uchar3 to short3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_rtn(uchar3 x)
+{
+ short2 front = convert_short2(x.s01);
+ short back = convert_short(x.s2);
+ return (short3)(front, back);
+}
+/* Plain cast from uchar to ushort. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtz(uchar x)
+{
+ ushort result = (ushort)x;
+ return result;
+}
+
+/* Component-wise uchar2 to ushort2. */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtz(uchar2 x)
+{
+ ushort lo_part = convert_ushort(x.lo);
+ ushort hi_part = convert_ushort(x.hi);
+ return (ushort2)(lo_part, hi_part);
+}
+
+/* uchar4 to ushort4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtz(uchar4 x)
+{
+ ushort2 lo_part = convert_ushort2(x.lo);
+ ushort2 hi_part = convert_ushort2(x.hi);
+ return (ushort4)(lo_part, hi_part);
+}
+
+/* uchar8 to ushort8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtz(uchar8 x)
+{
+ ushort4 lo_part = convert_ushort4(x.lo);
+ ushort4 hi_part = convert_ushort4(x.hi);
+ return (ushort8)(lo_part, hi_part);
+}
+
+/* uchar16 to ushort16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtz(uchar16 x)
+{
+ ushort8 lo_part = convert_ushort8(x.lo);
+ ushort8 hi_part = convert_ushort8(x.hi);
+ return (ushort16)(lo_part, hi_part);
+}
+
+/* uchar3 to ushort3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtz(uchar3 x)
+{
+ ushort2 front = convert_ushort2(x.s01);
+ ushort back = convert_ushort(x.s2);
+ return (ushort3)(front, back);
+}
+/* Plain cast from uchar to ushort. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rte(uchar x)
+{
+ ushort result = (ushort)x;
+ return result;
+}
+
+/* Component-wise uchar2 to ushort2. */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rte(uchar2 x)
+{
+ ushort lo_part = convert_ushort(x.lo);
+ ushort hi_part = convert_ushort(x.hi);
+ return (ushort2)(lo_part, hi_part);
+}
+
+/* uchar4 to ushort4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rte(uchar4 x)
+{
+ ushort2 lo_part = convert_ushort2(x.lo);
+ ushort2 hi_part = convert_ushort2(x.hi);
+ return (ushort4)(lo_part, hi_part);
+}
+
+/* uchar8 to ushort8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rte(uchar8 x)
+{
+ ushort4 lo_part = convert_ushort4(x.lo);
+ ushort4 hi_part = convert_ushort4(x.hi);
+ return (ushort8)(lo_part, hi_part);
+}
+
+/* uchar16 to ushort16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rte(uchar16 x)
+{
+ ushort8 lo_part = convert_ushort8(x.lo);
+ ushort8 hi_part = convert_ushort8(x.hi);
+ return (ushort16)(lo_part, hi_part);
+}
+
+/* uchar3 to ushort3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rte(uchar3 x)
+{
+ ushort2 front = convert_ushort2(x.s01);
+ ushort back = convert_ushort(x.s2);
+ return (ushort3)(front, back);
+}
+/* Plain cast from uchar to ushort. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtp(uchar x)
+{
+ ushort result = (ushort)x;
+ return result;
+}
+
+/* Component-wise uchar2 to ushort2. */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtp(uchar2 x)
+{
+ ushort lo_part = convert_ushort(x.lo);
+ ushort hi_part = convert_ushort(x.hi);
+ return (ushort2)(lo_part, hi_part);
+}
+
+/* uchar4 to ushort4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtp(uchar4 x)
+{
+ ushort2 lo_part = convert_ushort2(x.lo);
+ ushort2 hi_part = convert_ushort2(x.hi);
+ return (ushort4)(lo_part, hi_part);
+}
+
+/* uchar8 to ushort8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtp(uchar8 x)
+{
+ ushort4 lo_part = convert_ushort4(x.lo);
+ ushort4 hi_part = convert_ushort4(x.hi);
+ return (ushort8)(lo_part, hi_part);
+}
+
+/* uchar16 to ushort16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtp(uchar16 x)
+{
+ ushort8 lo_part = convert_ushort8(x.lo);
+ ushort8 hi_part = convert_ushort8(x.hi);
+ return (ushort16)(lo_part, hi_part);
+}
+
+/* uchar3 to ushort3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtp(uchar3 x)
+{
+ ushort2 front = convert_ushort2(x.s01);
+ ushort back = convert_ushort(x.s2);
+ return (ushort3)(front, back);
+}
+/* Plain cast from uchar to ushort. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_rtn(uchar x)
+{
+ ushort result = (ushort)x;
+ return result;
+}
+
+/* Component-wise uchar2 to ushort2. */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_rtn(uchar2 x)
+{
+ ushort lo_part = convert_ushort(x.lo);
+ ushort hi_part = convert_ushort(x.hi);
+ return (ushort2)(lo_part, hi_part);
+}
+
+/* uchar4 to ushort4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_rtn(uchar4 x)
+{
+ ushort2 lo_part = convert_ushort2(x.lo);
+ ushort2 hi_part = convert_ushort2(x.hi);
+ return (ushort4)(lo_part, hi_part);
+}
+
+/* uchar8 to ushort8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_rtn(uchar8 x)
+{
+ ushort4 lo_part = convert_ushort4(x.lo);
+ ushort4 hi_part = convert_ushort4(x.hi);
+ return (ushort8)(lo_part, hi_part);
+}
+
+/* uchar16 to ushort16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_rtn(uchar16 x)
+{
+ ushort8 lo_part = convert_ushort8(x.lo);
+ ushort8 hi_part = convert_ushort8(x.hi);
+ return (ushort16)(lo_part, hi_part);
+}
+
+/* uchar3 to ushort3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_rtn(uchar3 x)
+{
+ ushort2 front = convert_ushort2(x.s01);
+ ushort back = convert_ushort(x.s2);
+ return (ushort3)(front, back);
+}
+/* Plain cast from uchar to int. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtz(uchar x)
+{
+ int result = (int)x;
+ return result;
+}
+
+/* Component-wise uchar2 to int2. */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtz(uchar2 x)
+{
+ int lo_part = convert_int(x.lo);
+ int hi_part = convert_int(x.hi);
+ return (int2)(lo_part, hi_part);
+}
+
+/* uchar4 to int4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtz(uchar4 x)
+{
+ int2 lo_part = convert_int2(x.lo);
+ int2 hi_part = convert_int2(x.hi);
+ return (int4)(lo_part, hi_part);
+}
+
+/* uchar8 to int8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtz(uchar8 x)
+{
+ int4 lo_part = convert_int4(x.lo);
+ int4 hi_part = convert_int4(x.hi);
+ return (int8)(lo_part, hi_part);
+}
+
+/* uchar16 to int16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtz(uchar16 x)
+{
+ int8 lo_part = convert_int8(x.lo);
+ int8 hi_part = convert_int8(x.hi);
+ return (int16)(lo_part, hi_part);
+}
+
+/* uchar3 to int3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtz(uchar3 x)
+{
+ int2 front = convert_int2(x.s01);
+ int back = convert_int(x.s2);
+ return (int3)(front, back);
+}
+/* Plain cast from uchar to int. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rte(uchar x)
+{
+ int result = (int)x;
+ return result;
+}
+
+/* Component-wise uchar2 to int2. */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rte(uchar2 x)
+{
+ int lo_part = convert_int(x.lo);
+ int hi_part = convert_int(x.hi);
+ return (int2)(lo_part, hi_part);
+}
+
+/* uchar4 to int4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rte(uchar4 x)
+{
+ int2 lo_part = convert_int2(x.lo);
+ int2 hi_part = convert_int2(x.hi);
+ return (int4)(lo_part, hi_part);
+}
+
+/* uchar8 to int8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rte(uchar8 x)
+{
+ int4 lo_part = convert_int4(x.lo);
+ int4 hi_part = convert_int4(x.hi);
+ return (int8)(lo_part, hi_part);
+}
+
+/* uchar16 to int16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rte(uchar16 x)
+{
+ int8 lo_part = convert_int8(x.lo);
+ int8 hi_part = convert_int8(x.hi);
+ return (int16)(lo_part, hi_part);
+}
+
+/* uchar3 to int3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rte(uchar3 x)
+{
+ int2 front = convert_int2(x.s01);
+ int back = convert_int(x.s2);
+ return (int3)(front, back);
+}
+/* Plain cast from uchar to int. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtp(uchar x)
+{
+ int result = (int)x;
+ return result;
+}
+
+/* Component-wise uchar2 to int2. */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtp(uchar2 x)
+{
+ int lo_part = convert_int(x.lo);
+ int hi_part = convert_int(x.hi);
+ return (int2)(lo_part, hi_part);
+}
+
+/* uchar4 to int4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtp(uchar4 x)
+{
+ int2 lo_part = convert_int2(x.lo);
+ int2 hi_part = convert_int2(x.hi);
+ return (int4)(lo_part, hi_part);
+}
+
+/* uchar8 to int8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtp(uchar8 x)
+{
+ int4 lo_part = convert_int4(x.lo);
+ int4 hi_part = convert_int4(x.hi);
+ return (int8)(lo_part, hi_part);
+}
+
+/* uchar16 to int16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtp(uchar16 x)
+{
+ int8 lo_part = convert_int8(x.lo);
+ int8 hi_part = convert_int8(x.hi);
+ return (int16)(lo_part, hi_part);
+}
+
+/* uchar3 to int3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtp(uchar3 x)
+{
+ int2 front = convert_int2(x.s01);
+ int back = convert_int(x.s2);
+ return (int3)(front, back);
+}
+/* Plain cast from uchar to int. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_rtn(uchar x)
+{
+ int result = (int)x;
+ return result;
+}
+
+/* Component-wise uchar2 to int2. */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_rtn(uchar2 x)
+{
+ int lo_part = convert_int(x.lo);
+ int hi_part = convert_int(x.hi);
+ return (int2)(lo_part, hi_part);
+}
+
+/* uchar4 to int4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtn(uchar4 x)
+{
+ int2 lo_part = convert_int2(x.lo);
+ int2 hi_part = convert_int2(x.hi);
+ return (int4)(lo_part, hi_part);
+}
+
+/* uchar8 to int8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_rtn(uchar8 x)
+{
+ int4 lo_part = convert_int4(x.lo);
+ int4 hi_part = convert_int4(x.hi);
+ return (int8)(lo_part, hi_part);
+}
+
+/* uchar16 to int16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_rtn(uchar16 x)
+{
+ int8 lo_part = convert_int8(x.lo);
+ int8 hi_part = convert_int8(x.hi);
+ return (int16)(lo_part, hi_part);
+}
+
+/* uchar3 to int3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_rtn(uchar3 x)
+{
+ int2 front = convert_int2(x.s01);
+ int back = convert_int(x.s2);
+ return (int3)(front, back);
+}
+/* Plain cast from uchar to uint. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtz(uchar x)
+{
+ uint result = (uint)x;
+ return result;
+}
+
+/* Component-wise uchar2 to uint2. */
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtz(uchar2 x)
+{
+ uint lo_part = convert_uint(x.lo);
+ uint hi_part = convert_uint(x.hi);
+ return (uint2)(lo_part, hi_part);
+}
+
+/* uchar4 to uint4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtz(uchar4 x)
+{
+ uint2 lo_part = convert_uint2(x.lo);
+ uint2 hi_part = convert_uint2(x.hi);
+ return (uint4)(lo_part, hi_part);
+}
+
+/* uchar8 to uint8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtz(uchar8 x)
+{
+ uint4 lo_part = convert_uint4(x.lo);
+ uint4 hi_part = convert_uint4(x.hi);
+ return (uint8)(lo_part, hi_part);
+}
+
+/* uchar16 to uint16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtz(uchar16 x)
+{
+ uint8 lo_part = convert_uint8(x.lo);
+ uint8 hi_part = convert_uint8(x.hi);
+ return (uint16)(lo_part, hi_part);
+}
+
+/* uchar3 to uint3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtz(uchar3 x)
+{
+ uint2 front = convert_uint2(x.s01);
+ uint back = convert_uint(x.s2);
+ return (uint3)(front, back);
+}
+/* Plain cast from uchar to uint. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rte(uchar x)
+{
+ uint result = (uint)x;
+ return result;
+}
+
+/* Component-wise uchar2 to uint2. */
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rte(uchar2 x)
+{
+ uint lo_part = convert_uint(x.lo);
+ uint hi_part = convert_uint(x.hi);
+ return (uint2)(lo_part, hi_part);
+}
+
+/* uchar4 to uint4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rte(uchar4 x)
+{
+ uint2 lo_part = convert_uint2(x.lo);
+ uint2 hi_part = convert_uint2(x.hi);
+ return (uint4)(lo_part, hi_part);
+}
+
+/* uchar8 to uint8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rte(uchar8 x)
+{
+ uint4 lo_part = convert_uint4(x.lo);
+ uint4 hi_part = convert_uint4(x.hi);
+ return (uint8)(lo_part, hi_part);
+}
+
+/* uchar16 to uint16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rte(uchar16 x)
+{
+ uint8 lo_part = convert_uint8(x.lo);
+ uint8 hi_part = convert_uint8(x.hi);
+ return (uint16)(lo_part, hi_part);
+}
+
+/* uchar3 to uint3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rte(uchar3 x)
+{
+ uint2 front = convert_uint2(x.s01);
+ uint back = convert_uint(x.s2);
+ return (uint3)(front, back);
+}
+/* Plain cast from uchar to uint. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtp(uchar x)
+{
+ uint result = (uint)x;
+ return result;
+}
+
+/* Component-wise uchar2 to uint2. */
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtp(uchar2 x)
+{
+ uint lo_part = convert_uint(x.lo);
+ uint hi_part = convert_uint(x.hi);
+ return (uint2)(lo_part, hi_part);
+}
+
+/* uchar4 to uint4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtp(uchar4 x)
+{
+ uint2 lo_part = convert_uint2(x.lo);
+ uint2 hi_part = convert_uint2(x.hi);
+ return (uint4)(lo_part, hi_part);
+}
+
+/* uchar8 to uint8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtp(uchar8 x)
+{
+ uint4 lo_part = convert_uint4(x.lo);
+ uint4 hi_part = convert_uint4(x.hi);
+ return (uint8)(lo_part, hi_part);
+}
+
+/* uchar16 to uint16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtp(uchar16 x)
+{
+ uint8 lo_part = convert_uint8(x.lo);
+ uint8 hi_part = convert_uint8(x.hi);
+ return (uint16)(lo_part, hi_part);
+}
+
+/* uchar3 to uint3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtp(uchar3 x)
+{
+ uint2 front = convert_uint2(x.s01);
+ uint back = convert_uint(x.s2);
+ return (uint3)(front, back);
+}
+/* Plain cast from uchar to uint. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_rtn(uchar x)
+{
+ uint result = (uint)x;
+ return result;
+}
+
+/* Component-wise uchar2 to uint2. */
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_rtn(uchar2 x)
+{
+ uint lo_part = convert_uint(x.lo);
+ uint hi_part = convert_uint(x.hi);
+ return (uint2)(lo_part, hi_part);
+}
+
+/* uchar4 to uint4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_rtn(uchar4 x)
+{
+ uint2 lo_part = convert_uint2(x.lo);
+ uint2 hi_part = convert_uint2(x.hi);
+ return (uint4)(lo_part, hi_part);
+}
+
+/* uchar8 to uint8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_rtn(uchar8 x)
+{
+ uint4 lo_part = convert_uint4(x.lo);
+ uint4 hi_part = convert_uint4(x.hi);
+ return (uint8)(lo_part, hi_part);
+}
+
+/* uchar16 to uint16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_rtn(uchar16 x)
+{
+ uint8 lo_part = convert_uint8(x.lo);
+ uint8 hi_part = convert_uint8(x.hi);
+ return (uint16)(lo_part, hi_part);
+}
+
+/* uchar3 to uint3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_rtn(uchar3 x)
+{
+ uint2 front = convert_uint2(x.s01);
+ uint back = convert_uint(x.s2);
+ return (uint3)(front, back);
+}
+#ifdef cles_khr_int64
+/* Plain cast from uchar to long. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtz(uchar x)
+{
+ long result = (long)x;
+ return result;
+}
+
+/* Component-wise uchar2 to long2. */
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtz(uchar2 x)
+{
+ long lo_part = convert_long(x.lo);
+ long hi_part = convert_long(x.hi);
+ return (long2)(lo_part, hi_part);
+}
+
+/* uchar4 to long4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtz(uchar4 x)
+{
+ long2 lo_part = convert_long2(x.lo);
+ long2 hi_part = convert_long2(x.hi);
+ return (long4)(lo_part, hi_part);
+}
+
+/* uchar8 to long8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtz(uchar8 x)
+{
+ long4 lo_part = convert_long4(x.lo);
+ long4 hi_part = convert_long4(x.hi);
+ return (long8)(lo_part, hi_part);
+}
+
+/* uchar16 to long16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtz(uchar16 x)
+{
+ long8 lo_part = convert_long8(x.lo);
+ long8 hi_part = convert_long8(x.hi);
+ return (long16)(lo_part, hi_part);
+}
+
+/* uchar3 to long3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtz(uchar3 x)
+{
+ long2 front = convert_long2(x.s01);
+ long back = convert_long(x.s2);
+ return (long3)(front, back);
+}
+#endif
+#ifdef cles_khr_int64
+/* Plain cast from uchar to long. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rte(uchar x)
+{
+ long result = (long)x;
+ return result;
+}
+
+/* Component-wise uchar2 to long2. */
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rte(uchar2 x)
+{
+ long lo_part = convert_long(x.lo);
+ long hi_part = convert_long(x.hi);
+ return (long2)(lo_part, hi_part);
+}
+
+/* uchar4 to long4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rte(uchar4 x)
+{
+ long2 lo_part = convert_long2(x.lo);
+ long2 hi_part = convert_long2(x.hi);
+ return (long4)(lo_part, hi_part);
+}
+
+/* uchar8 to long8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rte(uchar8 x)
+{
+ long4 lo_part = convert_long4(x.lo);
+ long4 hi_part = convert_long4(x.hi);
+ return (long8)(lo_part, hi_part);
+}
+
+/* uchar16 to long16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rte(uchar16 x)
+{
+ long8 lo_part = convert_long8(x.lo);
+ long8 hi_part = convert_long8(x.hi);
+ return (long16)(lo_part, hi_part);
+}
+
+/* uchar3 to long3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rte(uchar3 x)
+{
+ long2 front = convert_long2(x.s01);
+ long back = convert_long(x.s2);
+ return (long3)(front, back);
+}
+#endif
+#ifdef cles_khr_int64
+/* Plain cast from uchar to long. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtp(uchar x)
+{
+ long result = (long)x;
+ return result;
+}
+
+/* Component-wise uchar2 to long2. */
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtp(uchar2 x)
+{
+ long lo_part = convert_long(x.lo);
+ long hi_part = convert_long(x.hi);
+ return (long2)(lo_part, hi_part);
+}
+
+/* uchar4 to long4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtp(uchar4 x)
+{
+ long2 lo_part = convert_long2(x.lo);
+ long2 hi_part = convert_long2(x.hi);
+ return (long4)(lo_part, hi_part);
+}
+
+/* uchar8 to long8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtp(uchar8 x)
+{
+ long4 lo_part = convert_long4(x.lo);
+ long4 hi_part = convert_long4(x.hi);
+ return (long8)(lo_part, hi_part);
+}
+
+/* uchar16 to long16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtp(uchar16 x)
+{
+ long8 lo_part = convert_long8(x.lo);
+ long8 hi_part = convert_long8(x.hi);
+ return (long16)(lo_part, hi_part);
+}
+
+/* uchar3 to long3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtp(uchar3 x)
+{
+ long2 front = convert_long2(x.s01);
+ long back = convert_long(x.s2);
+ return (long3)(front, back);
+}
+#endif
+#ifdef cles_khr_int64
+/* Plain cast from uchar to long. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_rtn(uchar x)
+{
+ long result = (long)x;
+ return result;
+}
+
+/* Component-wise uchar2 to long2. */
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_rtn(uchar2 x)
+{
+ long lo_part = convert_long(x.lo);
+ long hi_part = convert_long(x.hi);
+ return (long2)(lo_part, hi_part);
+}
+
+/* uchar4 to long4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtn(uchar4 x)
+{
+ long2 lo_part = convert_long2(x.lo);
+ long2 hi_part = convert_long2(x.hi);
+ return (long4)(lo_part, hi_part);
+}
+
+/* uchar8 to long8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtn(uchar8 x)
+{
+ long4 lo_part = convert_long4(x.lo);
+ long4 hi_part = convert_long4(x.hi);
+ return (long8)(lo_part, hi_part);
+}
+
+/* uchar16 to long16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtn(uchar16 x)
+{
+ long8 lo_part = convert_long8(x.lo);
+ long8 hi_part = convert_long8(x.hi);
+ return (long16)(lo_part, hi_part);
+}
+
+/* uchar3 to long3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtn(uchar3 x)
+{
+ long2 front = convert_long2(x.s01);
+ long back = convert_long(x.s2);
+ return (long3)(front, back);
+}
+#endif
+#ifdef cles_khr_int64
+/* Plain cast from uchar to ulong. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtz(uchar x)
+{
+ ulong result = (ulong)x;
+ return result;
+}
+
+/* Component-wise uchar2 to ulong2. */
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtz(uchar2 x)
+{
+ ulong lo_part = convert_ulong(x.lo);
+ ulong hi_part = convert_ulong(x.hi);
+ return (ulong2)(lo_part, hi_part);
+}
+
+/* uchar4 to ulong4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtz(uchar4 x)
+{
+ ulong2 lo_part = convert_ulong2(x.lo);
+ ulong2 hi_part = convert_ulong2(x.hi);
+ return (ulong4)(lo_part, hi_part);
+}
+
+/* uchar8 to ulong8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtz(uchar8 x)
+{
+ ulong4 lo_part = convert_ulong4(x.lo);
+ ulong4 hi_part = convert_ulong4(x.hi);
+ return (ulong8)(lo_part, hi_part);
+}
+
+/* uchar16 to ulong16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtz(uchar16 x)
+{
+ ulong8 lo_part = convert_ulong8(x.lo);
+ ulong8 hi_part = convert_ulong8(x.hi);
+ return (ulong16)(lo_part, hi_part);
+}
+
+/* uchar3 to ulong3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtz(uchar3 x)
+{
+ ulong2 front = convert_ulong2(x.s01);
+ ulong back = convert_ulong(x.s2);
+ return (ulong3)(front, back);
+}
+#endif
+#ifdef cles_khr_int64
+/* Plain cast from uchar to ulong. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rte(uchar x)
+{
+ ulong result = (ulong)x;
+ return result;
+}
+
+/* Component-wise uchar2 to ulong2. */
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rte(uchar2 x)
+{
+ ulong lo_part = convert_ulong(x.lo);
+ ulong hi_part = convert_ulong(x.hi);
+ return (ulong2)(lo_part, hi_part);
+}
+
+/* uchar4 to ulong4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rte(uchar4 x)
+{
+ ulong2 lo_part = convert_ulong2(x.lo);
+ ulong2 hi_part = convert_ulong2(x.hi);
+ return (ulong4)(lo_part, hi_part);
+}
+
+/* uchar8 to ulong8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rte(uchar8 x)
+{
+ ulong4 lo_part = convert_ulong4(x.lo);
+ ulong4 hi_part = convert_ulong4(x.hi);
+ return (ulong8)(lo_part, hi_part);
+}
+
+/* uchar16 to ulong16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rte(uchar16 x)
+{
+ ulong8 lo_part = convert_ulong8(x.lo);
+ ulong8 hi_part = convert_ulong8(x.hi);
+ return (ulong16)(lo_part, hi_part);
+}
+
+/* uchar3 to ulong3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rte(uchar3 x)
+{
+ ulong2 front = convert_ulong2(x.s01);
+ ulong back = convert_ulong(x.s2);
+ return (ulong3)(front, back);
+}
+#endif
+#ifdef cles_khr_int64
+/* Plain cast from uchar to ulong. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtp(uchar x)
+{
+ ulong result = (ulong)x;
+ return result;
+}
+
+/* Component-wise uchar2 to ulong2. */
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtp(uchar2 x)
+{
+ ulong lo_part = convert_ulong(x.lo);
+ ulong hi_part = convert_ulong(x.hi);
+ return (ulong2)(lo_part, hi_part);
+}
+
+/* uchar4 to ulong4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtp(uchar4 x)
+{
+ ulong2 lo_part = convert_ulong2(x.lo);
+ ulong2 hi_part = convert_ulong2(x.hi);
+ return (ulong4)(lo_part, hi_part);
+}
+
+/* uchar8 to ulong8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtp(uchar8 x)
+{
+ ulong4 lo_part = convert_ulong4(x.lo);
+ ulong4 hi_part = convert_ulong4(x.hi);
+ return (ulong8)(lo_part, hi_part);
+}
+
+/* uchar16 to ulong16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtp(uchar16 x)
+{
+ ulong8 lo_part = convert_ulong8(x.lo);
+ ulong8 hi_part = convert_ulong8(x.hi);
+ return (ulong16)(lo_part, hi_part);
+}
+
+/* uchar3 to ulong3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtp(uchar3 x)
+{
+ ulong2 front = convert_ulong2(x.s01);
+ ulong back = convert_ulong(x.s2);
+ return (ulong3)(front, back);
+}
+#endif
+#ifdef cles_khr_int64
+/* Plain cast from uchar to ulong. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtn(uchar x)
+{
+ ulong result = (ulong)x;
+ return result;
+}
+
+/* Component-wise uchar2 to ulong2. */
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtn(uchar2 x)
+{
+ ulong lo_part = convert_ulong(x.lo);
+ ulong hi_part = convert_ulong(x.hi);
+ return (ulong2)(lo_part, hi_part);
+}
+
+/* uchar4 to ulong4, one uchar2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtn(uchar4 x)
+{
+ ulong2 lo_part = convert_ulong2(x.lo);
+ ulong2 hi_part = convert_ulong2(x.hi);
+ return (ulong4)(lo_part, hi_part);
+}
+
+/* uchar8 to ulong8, one uchar4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtn(uchar8 x)
+{
+ ulong4 lo_part = convert_ulong4(x.lo);
+ ulong4 hi_part = convert_ulong4(x.hi);
+ return (ulong8)(lo_part, hi_part);
+}
+
+/* uchar16 to ulong16, one uchar8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtn(uchar16 x)
+{
+ ulong8 lo_part = convert_ulong8(x.lo);
+ ulong8 hi_part = convert_ulong8(x.hi);
+ return (ulong16)(lo_part, hi_part);
+}
+
+/* uchar3 to ulong3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtn(uchar3 x)
+{
+ ulong2 front = convert_ulong2(x.s01);
+ ulong back = convert_ulong(x.s2);
+ return (ulong3)(front, back);
+}
+#endif
+/* Plain cast from short to char. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtz(short x)
+{
+ char result = (char)x;
+ return result;
+}
+
+/* Component-wise short2 to char2. */
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtz(short2 x)
+{
+ char lo_part = convert_char(x.lo);
+ char hi_part = convert_char(x.hi);
+ return (char2)(lo_part, hi_part);
+}
+
+/* short4 to char4, one short2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtz(short4 x)
+{
+ char2 lo_part = convert_char2(x.lo);
+ char2 hi_part = convert_char2(x.hi);
+ return (char4)(lo_part, hi_part);
+}
+
+/* short8 to char8, one short4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtz(short8 x)
+{
+ char4 lo_part = convert_char4(x.lo);
+ char4 hi_part = convert_char4(x.hi);
+ return (char8)(lo_part, hi_part);
+}
+
+/* short16 to char16, one short8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtz(short16 x)
+{
+ char8 lo_part = convert_char8(x.lo);
+ char8 hi_part = convert_char8(x.hi);
+ return (char16)(lo_part, hi_part);
+}
+
+/* short3 to char3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtz(short3 x)
+{
+ char2 front = convert_char2(x.s01);
+ char back = convert_char(x.s2);
+ return (char3)(front, back);
+}
+/* Plain cast from short to char. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rte(short x)
+{
+ char result = (char)x;
+ return result;
+}
+
+/* Component-wise short2 to char2. */
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rte(short2 x)
+{
+ char lo_part = convert_char(x.lo);
+ char hi_part = convert_char(x.hi);
+ return (char2)(lo_part, hi_part);
+}
+
+/* short4 to char4, one short2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rte(short4 x)
+{
+ char2 lo_part = convert_char2(x.lo);
+ char2 hi_part = convert_char2(x.hi);
+ return (char4)(lo_part, hi_part);
+}
+
+/* short8 to char8, one short4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rte(short8 x)
+{
+ char4 lo_part = convert_char4(x.lo);
+ char4 hi_part = convert_char4(x.hi);
+ return (char8)(lo_part, hi_part);
+}
+
+/* short16 to char16, one short8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rte(short16 x)
+{
+ char8 lo_part = convert_char8(x.lo);
+ char8 hi_part = convert_char8(x.hi);
+ return (char16)(lo_part, hi_part);
+}
+
+/* short3 to char3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rte(short3 x)
+{
+ char2 front = convert_char2(x.s01);
+ char back = convert_char(x.s2);
+ return (char3)(front, back);
+}
+/* Plain cast from short to char. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtp(short x)
+{
+ char result = (char)x;
+ return result;
+}
+
+/* Component-wise short2 to char2. */
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtp(short2 x)
+{
+ char lo_part = convert_char(x.lo);
+ char hi_part = convert_char(x.hi);
+ return (char2)(lo_part, hi_part);
+}
+
+/* short4 to char4, one short2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtp(short4 x)
+{
+ char2 lo_part = convert_char2(x.lo);
+ char2 hi_part = convert_char2(x.hi);
+ return (char4)(lo_part, hi_part);
+}
+
+/* short8 to char8, one short4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtp(short8 x)
+{
+ char4 lo_part = convert_char4(x.lo);
+ char4 hi_part = convert_char4(x.hi);
+ return (char8)(lo_part, hi_part);
+}
+
+/* short16 to char16, one short8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtp(short16 x)
+{
+ char8 lo_part = convert_char8(x.lo);
+ char8 hi_part = convert_char8(x.hi);
+ return (char16)(lo_part, hi_part);
+}
+
+/* short3 to char3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtp(short3 x)
+{
+ char2 front = convert_char2(x.s01);
+ char back = convert_char(x.s2);
+ return (char3)(front, back);
+}
+/* Plain cast from short to char. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_rtn(short x)
+{
+ char result = (char)x;
+ return result;
+}
+
+/* Component-wise short2 to char2. */
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_rtn(short2 x)
+{
+ char lo_part = convert_char(x.lo);
+ char hi_part = convert_char(x.hi);
+ return (char2)(lo_part, hi_part);
+}
+
+/* short4 to char4, one short2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_rtn(short4 x)
+{
+ char2 lo_part = convert_char2(x.lo);
+ char2 hi_part = convert_char2(x.hi);
+ return (char4)(lo_part, hi_part);
+}
+
+/* short8 to char8, one short4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_rtn(short8 x)
+{
+ char4 lo_part = convert_char4(x.lo);
+ char4 hi_part = convert_char4(x.hi);
+ return (char8)(lo_part, hi_part);
+}
+
+/* short16 to char16, one short8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_rtn(short16 x)
+{
+ char8 lo_part = convert_char8(x.lo);
+ char8 hi_part = convert_char8(x.hi);
+ return (char16)(lo_part, hi_part);
+}
+
+/* short3 to char3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_rtn(short3 x)
+{
+ char2 front = convert_char2(x.s01);
+ char back = convert_char(x.s2);
+ return (char3)(front, back);
+}
+/* Plain cast from short to uchar. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtz(short x)
+{
+ uchar result = (uchar)x;
+ return result;
+}
+
+/* Component-wise short2 to uchar2. */
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtz(short2 x)
+{
+ uchar lo_part = convert_uchar(x.lo);
+ uchar hi_part = convert_uchar(x.hi);
+ return (uchar2)(lo_part, hi_part);
+}
+
+/* short4 to uchar4, one short2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtz(short4 x)
+{
+ uchar2 lo_part = convert_uchar2(x.lo);
+ uchar2 hi_part = convert_uchar2(x.hi);
+ return (uchar4)(lo_part, hi_part);
+}
+
+/* short8 to uchar8, one short4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtz(short8 x)
+{
+ uchar4 lo_part = convert_uchar4(x.lo);
+ uchar4 hi_part = convert_uchar4(x.hi);
+ return (uchar8)(lo_part, hi_part);
+}
+
+/* short16 to uchar16, one short8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtz(short16 x)
+{
+ uchar8 lo_part = convert_uchar8(x.lo);
+ uchar8 hi_part = convert_uchar8(x.hi);
+ return (uchar16)(lo_part, hi_part);
+}
+
+/* short3 to uchar3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtz(short3 x)
+{
+ uchar2 front = convert_uchar2(x.s01);
+ uchar back = convert_uchar(x.s2);
+ return (uchar3)(front, back);
+}
+/* Plain cast from short to uchar. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rte(short x)
+{
+ uchar result = (uchar)x;
+ return result;
+}
+
+/* Component-wise short2 to uchar2. */
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rte(short2 x)
+{
+ uchar lo_part = convert_uchar(x.lo);
+ uchar hi_part = convert_uchar(x.hi);
+ return (uchar2)(lo_part, hi_part);
+}
+
+/* short4 to uchar4, one short2 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rte(short4 x)
+{
+ uchar2 lo_part = convert_uchar2(x.lo);
+ uchar2 hi_part = convert_uchar2(x.hi);
+ return (uchar4)(lo_part, hi_part);
+}
+
+/* short8 to uchar8, one short4 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rte(short8 x)
+{
+ uchar4 lo_part = convert_uchar4(x.lo);
+ uchar4 hi_part = convert_uchar4(x.hi);
+ return (uchar8)(lo_part, hi_part);
+}
+
+/* short16 to uchar16, one short8 half at a time. */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rte(short16 x)
+{
+ uchar8 lo_part = convert_uchar8(x.lo);
+ uchar8 hi_part = convert_uchar8(x.hi);
+ return (uchar16)(lo_part, hi_part);
+}
+
+/* short3 to uchar3: .s01 pair, then .s2. */
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rte(short3 x)
+{
+ uchar2 front = convert_uchar2(x.s01);
+ uchar back = convert_uchar(x.s2);
+ return (uchar3)(front, back);
+}
+/* Plain cast from short to uchar. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_rtp(short x)
+{
+ uchar result = (uchar)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_rtp(short2 x)
+{
+ return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_rtp(short4 x)
+{
+ return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtp(short8 x)
+{
+ return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_rtp(short16 x)
+{
+ return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_rtp(short3 x)
+{
+ return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
+}
+/* short -> uchar, _rtn variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_uchar*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtn(short x)
+{
+ return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtn(short2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtn(short4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtn(short8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtn(short16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtn(short3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* short -> short, _rtz variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_short*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtz(short x)
+{
+ return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtz(short2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtz(short4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtz(short8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtz(short16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtz(short3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* short -> short, _rte variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_short*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rte(short x)
+{
+ return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rte(short2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rte(short4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rte(short8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rte(short16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rte(short3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* short -> short, _rtp variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_short*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtp(short x)
+{
+ return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtp(short2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtp(short4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtp(short8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtp(short16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtp(short3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* short -> short, _rtn variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_short*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtn(short x)
+{
+ return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtn(short2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtn(short4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtn(short8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtn(short16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtn(short3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* short -> ushort, _rtz variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_ushort*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtz(short x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtz(short2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtz(short4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtz(short8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtz(short16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtz(short3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* short -> ushort, _rte variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_ushort*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rte(short x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rte(short2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rte(short4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rte(short8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rte(short16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rte(short3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* short -> ushort, _rtp variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_ushort*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtp(short x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtp(short2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtp(short4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtp(short8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtp(short16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtp(short3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* short -> ushort, _rtn variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_ushort*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtn(short x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtn(short2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtn(short4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtn(short8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtn(short16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtn(short3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* short -> int, _rtz variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_int*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtz(short x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtz(short2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtz(short4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtz(short8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtz(short16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtz(short3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* short -> int, _rte variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_int*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rte(short x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rte(short2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rte(short4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rte(short8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rte(short16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rte(short3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* short -> int, _rtp variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_int*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtp(short x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtp(short2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtp(short4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtp(short8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtp(short16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtp(short3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* short -> int, _rtn variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_int*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtn(short x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtn(short2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtn(short4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtn(short8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtn(short16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtn(short3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* short -> uint, _rtz variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_uint*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtz(short x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtz(short2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtz(short4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtz(short8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtz(short16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtz(short3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/* short -> uint, _rte variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_uint*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rte(short x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rte(short2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rte(short4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rte(short8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rte(short16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rte(short3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/* short -> uint, _rtp variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_uint*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtp(short x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtp(short2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtp(short4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtp(short8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtp(short16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtp(short3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/* short -> uint, _rtn variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_uint*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtn(short x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtn(short2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtn(short4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtn(short8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtn(short16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtn(short3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/* short -> long, _rtz variant, compiled only when cles_khr_int64 is defined:
+ * the scalar overload is a plain cast and every vector overload is assembled
+ * from unsuffixed convert_long*() calls on its parts. */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtz(short x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtz(short2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtz(short4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtz(short8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtz(short16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtz(short3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+/* short -> long, _rte variant, compiled only when cles_khr_int64 is defined:
+ * the scalar overload is a plain cast and every vector overload is assembled
+ * from unsuffixed convert_long*() calls on its parts. */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD long convert_long_rte(short x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rte(short2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rte(short4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rte(short8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rte(short16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rte(short3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+/* short -> long, _rtp variant, compiled only when cles_khr_int64 is defined:
+ * the scalar overload is a plain cast and every vector overload is assembled
+ * from unsuffixed convert_long*() calls on its parts. */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtp(short x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtp(short2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtp(short4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtp(short8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtp(short16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtp(short3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+/* short -> long, _rtn variant, compiled only when cles_khr_int64 is defined:
+ * the scalar overload is a plain cast and every vector overload is assembled
+ * from unsuffixed convert_long*() calls on its parts. */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtn(short x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtn(short2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtn(short4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtn(short8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtn(short16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtn(short3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+/* short -> ulong, _rtz variant, compiled only when cles_khr_int64 is defined:
+ * the scalar overload is a plain cast and every vector overload is assembled
+ * from unsuffixed convert_ulong*() calls on its parts. */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtz(short x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtz(short2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtz(short4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtz(short8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtz(short16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtz(short3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+/* short -> ulong, _rte variant, compiled only when cles_khr_int64 is defined:
+ * the scalar overload is a plain cast and every vector overload is assembled
+ * from unsuffixed convert_ulong*() calls on its parts. */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rte(short x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rte(short2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rte(short4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rte(short8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rte(short16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rte(short3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+/* short -> ulong, _rtp variant, compiled only when cles_khr_int64 is defined:
+ * the scalar overload is a plain cast and every vector overload is assembled
+ * from unsuffixed convert_ulong*() calls on its parts. */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtp(short x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtp(short2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtp(short4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtp(short8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtp(short16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtp(short3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+/* short -> ulong, _rtn variant, compiled only when cles_khr_int64 is defined:
+ * the scalar overload is a plain cast and every vector overload is assembled
+ * from unsuffixed convert_ulong*() calls on its parts. */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtn(short x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtn(short2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtn(short4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtn(short8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtn(short16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtn(short3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+/* ushort -> char, _rtz variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_char*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtz(ushort x)
+{
+ return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtz(ushort2 x)
+{
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtz(ushort4 x)
+{
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtz(ushort8 x)
+{
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtz(ushort16 x)
+{
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtz(ushort3 x)
+{
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* ushort -> char, _rte variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_char*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rte(ushort x)
+{
+ return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rte(ushort2 x)
+{
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rte(ushort4 x)
+{
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rte(ushort8 x)
+{
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rte(ushort16 x)
+{
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rte(ushort3 x)
+{
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* ushort -> char, _rtp variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_char*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtp(ushort x)
+{
+ return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtp(ushort2 x)
+{
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtp(ushort4 x)
+{
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtp(ushort8 x)
+{
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtp(ushort16 x)
+{
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtp(ushort3 x)
+{
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* ushort -> char, _rtn variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_char*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtn(ushort x)
+{
+ return (char)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtn(ushort2 x)
+{
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtn(ushort4 x)
+{
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtn(ushort8 x)
+{
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtn(ushort16 x)
+{
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtn(ushort3 x)
+{
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* ushort -> uchar, _rtz variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_uchar*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtz(ushort x)
+{
+ return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtz(ushort2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtz(ushort4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtz(ushort8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtz(ushort16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtz(ushort3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* ushort -> uchar, _rte variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_uchar*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rte(ushort x)
+{
+ return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rte(ushort2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rte(ushort4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rte(ushort8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rte(ushort16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rte(ushort3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* ushort -> uchar, _rtp variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_uchar*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtp(ushort x)
+{
+ return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtp(ushort2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtp(ushort4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtp(ushort8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtp(ushort16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtp(ushort3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* ushort -> uchar, _rtn variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_uchar*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtn(ushort x)
+{
+ return (uchar)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtn(ushort2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtn(ushort4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtn(ushort8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtn(ushort16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtn(ushort3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* ushort -> short, _rtz variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_short*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtz(ushort x)
+{
+ return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtz(ushort2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtz(ushort4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtz(ushort8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtz(ushort16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtz(ushort3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* ushort -> short, _rte variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_short*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rte(ushort x)
+{
+ return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rte(ushort2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rte(ushort4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rte(ushort8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rte(ushort16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rte(ushort3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* ushort -> short, _rtp variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_short*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtp(ushort x)
+{
+ return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtp(ushort2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtp(ushort4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtp(ushort8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtp(ushort16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtp(ushort3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* ushort -> short, _rtn variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_short*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtn(ushort x)
+{
+ return (short)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtn(ushort2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtn(ushort4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtn(ushort8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtn(ushort16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtn(ushort3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* ushort -> ushort, _rtz variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_ushort*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtz(ushort x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtz(ushort2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtz(ushort4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtz(ushort8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtz(ushort16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtz(ushort3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* ushort -> ushort, _rte variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_ushort*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rte(ushort x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rte(ushort2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rte(ushort4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rte(ushort8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rte(ushort16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rte(ushort3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* ushort -> ushort, _rtp variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_ushort*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtp(ushort x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtp(ushort2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtp(ushort4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtp(ushort8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtp(ushort16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtp(ushort3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* ushort -> ushort, _rtn variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_ushort*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtn(ushort x)
+{
+ return (ushort)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtn(ushort2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtn(ushort4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtn(ushort8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtn(ushort16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtn(ushort3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* ushort -> int, _rtz variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_int*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtz(ushort x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtz(ushort2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtz(ushort4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtz(ushort8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtz(ushort16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtz(ushort3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* ushort -> int, _rte variant: the scalar overload is a plain cast and every
+ * vector overload is assembled from unsuffixed convert_int*() calls on its parts. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rte(ushort x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rte(ushort2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rte(ushort4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rte(ushort8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rte(ushort16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rte(ushort3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* ushort -> int, _rtp variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_int*. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtp(ushort x)
+{
+    return (int)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtp(ushort2 x)
+{
+    return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtp(ushort4 x)
+{
+    return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtp(ushort8 x)
+{
+    return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtp(ushort16 x)
+{
+    return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtp(ushort3 x)
+{
+    return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* ushort -> int, _rtn variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_int*. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtn(ushort x)
+{
+    return (int)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtn(ushort2 x)
+{
+    return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtn(ushort4 x)
+{
+    return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtn(ushort8 x)
+{
+    return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtn(ushort16 x)
+{
+    return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtn(ushort3 x)
+{
+    return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* ushort -> uint, _rtz variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_uint*. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtz(ushort x)
+{
+    return (uint)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtz(ushort2 x)
+{
+    return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtz(ushort4 x)
+{
+    return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtz(ushort8 x)
+{
+    return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtz(ushort16 x)
+{
+    return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtz(ushort3 x)
+{
+    return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/* ushort -> uint, _rte variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_uint*. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rte(ushort x)
+{
+    return (uint)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rte(ushort2 x)
+{
+    return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rte(ushort4 x)
+{
+    return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rte(ushort8 x)
+{
+    return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rte(ushort16 x)
+{
+    return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rte(ushort3 x)
+{
+    return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/* ushort -> uint, _rtp variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_uint*. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtp(ushort x)
+{
+    return (uint)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtp(ushort2 x)
+{
+    return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtp(ushort4 x)
+{
+    return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtp(ushort8 x)
+{
+    return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtp(ushort16 x)
+{
+    return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtp(ushort3 x)
+{
+    return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/* ushort -> uint, _rtn variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_uint*. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtn(ushort x)
+{
+    return (uint)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtn(ushort2 x)
+{
+    return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtn(ushort4 x)
+{
+    return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtn(ushort8 x)
+{
+    return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtn(ushort16 x)
+{
+    return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtn(ushort3 x)
+{
+    return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#ifdef cles_khr_int64
+/* ushort -> long, _rtz variants (only built when cles_khr_int64 is defined):
+ * the scalar overload is a plain cast; vector overloads split x and forward
+ * the parts to the unsuffixed convert_long*. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtz(ushort x)
+{
+    return (long)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtz(ushort2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtz(ushort4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtz(ushort8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtz(ushort16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtz(ushort3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ushort -> long, _rte variants (only built when cles_khr_int64 is defined):
+ * the scalar overload is a plain cast; vector overloads split x and forward
+ * the parts to the unsuffixed convert_long*. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rte(ushort x)
+{
+    return (long)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rte(ushort2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rte(ushort4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rte(ushort8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rte(ushort16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rte(ushort3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ushort -> long, _rtp variants (only built when cles_khr_int64 is defined):
+ * the scalar overload is a plain cast; vector overloads split x and forward
+ * the parts to the unsuffixed convert_long*. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtp(ushort x)
+{
+    return (long)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtp(ushort2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtp(ushort4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtp(ushort8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtp(ushort16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtp(ushort3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ushort -> long, _rtn variants (only built when cles_khr_int64 is defined):
+ * the scalar overload is a plain cast; vector overloads split x and forward
+ * the parts to the unsuffixed convert_long*. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtn(ushort x)
+{
+    return (long)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtn(ushort2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtn(ushort4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtn(ushort8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtn(ushort16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtn(ushort3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ushort -> ulong, _rtz variants (only built when cles_khr_int64 is defined):
+ * the scalar overload is a plain cast; vector overloads split x and forward
+ * the parts to the unsuffixed convert_ulong*. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtz(ushort x)
+{
+    return (ulong)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtz(ushort2 x)
+{
+    return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtz(ushort4 x)
+{
+    return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtz(ushort8 x)
+{
+    return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtz(ushort16 x)
+{
+    return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtz(ushort3 x)
+{
+    return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ushort -> ulong, _rte variants (only built when cles_khr_int64 is defined):
+ * the scalar overload is a plain cast; vector overloads split x and forward
+ * the parts to the unsuffixed convert_ulong*. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rte(ushort x)
+{
+    return (ulong)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rte(ushort2 x)
+{
+    return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rte(ushort4 x)
+{
+    return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rte(ushort8 x)
+{
+    return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rte(ushort16 x)
+{
+    return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rte(ushort3 x)
+{
+    return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ushort -> ulong, _rtp variants (only built when cles_khr_int64 is defined):
+ * the scalar overload is a plain cast; vector overloads split x and forward
+ * the parts to the unsuffixed convert_ulong*. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtp(ushort x)
+{
+    return (ulong)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtp(ushort2 x)
+{
+    return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtp(ushort4 x)
+{
+    return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtp(ushort8 x)
+{
+    return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtp(ushort16 x)
+{
+    return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtp(ushort3 x)
+{
+    return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ushort -> ulong, _rtn variants (only built when cles_khr_int64 is defined):
+ * the scalar overload is a plain cast; vector overloads split x and forward
+ * the parts to the unsuffixed convert_ulong*. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtn(ushort x)
+{
+    return (ulong)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtn(ushort2 x)
+{
+    return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtn(ushort4 x)
+{
+    return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtn(ushort8 x)
+{
+    return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtn(ushort16 x)
+{
+    return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtn(ushort3 x)
+{
+    return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+/* int -> char, _rtz variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_char*. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtz(int x)
+{
+    return (char)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtz(int2 x)
+{
+    return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtz(int4 x)
+{
+    return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtz(int8 x)
+{
+    return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtz(int16 x)
+{
+    return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtz(int3 x)
+{
+    return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* int -> char, _rte variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_char*. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rte(int x)
+{
+    return (char)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rte(int2 x)
+{
+    return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rte(int4 x)
+{
+    return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rte(int8 x)
+{
+    return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rte(int16 x)
+{
+    return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rte(int3 x)
+{
+    return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* int -> char, _rtp variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_char*. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtp(int x)
+{
+    return (char)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtp(int2 x)
+{
+    return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtp(int4 x)
+{
+    return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtp(int8 x)
+{
+    return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtp(int16 x)
+{
+    return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtp(int3 x)
+{
+    return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* int -> char, _rtn variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_char*. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtn(int x)
+{
+    return (char)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtn(int2 x)
+{
+    return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtn(int4 x)
+{
+    return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtn(int8 x)
+{
+    return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtn(int16 x)
+{
+    return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtn(int3 x)
+{
+    return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+/* int -> uchar, _rtz variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtz(int x)
+{
+    return (uchar)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtz(int2 x)
+{
+    return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtz(int4 x)
+{
+    return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtz(int8 x)
+{
+    return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtz(int16 x)
+{
+    return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtz(int3 x)
+{
+    return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* int -> uchar, _rte variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rte(int x)
+{
+    return (uchar)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rte(int2 x)
+{
+    return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rte(int4 x)
+{
+    return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rte(int8 x)
+{
+    return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rte(int16 x)
+{
+    return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rte(int3 x)
+{
+    return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* int -> uchar, _rtp variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtp(int x)
+{
+    return (uchar)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtp(int2 x)
+{
+    return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtp(int4 x)
+{
+    return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtp(int8 x)
+{
+    return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtp(int16 x)
+{
+    return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtp(int3 x)
+{
+    return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* int -> uchar, _rtn variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_uchar*. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtn(int x)
+{
+    return (uchar)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtn(int2 x)
+{
+    return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtn(int4 x)
+{
+    return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtn(int8 x)
+{
+    return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtn(int16 x)
+{
+    return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtn(int3 x)
+{
+    return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+/* int -> short, _rtz variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtz(int x)
+{
+    return (short)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtz(int2 x)
+{
+    return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtz(int4 x)
+{
+    return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtz(int8 x)
+{
+    return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtz(int16 x)
+{
+    return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtz(int3 x)
+{
+    return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* int -> short, _rte variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rte(int x)
+{
+    return (short)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rte(int2 x)
+{
+    return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rte(int4 x)
+{
+    return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rte(int8 x)
+{
+    return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rte(int16 x)
+{
+    return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rte(int3 x)
+{
+    return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* int -> short, _rtp variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtp(int x)
+{
+    return (short)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtp(int2 x)
+{
+    return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtp(int4 x)
+{
+    return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtp(int8 x)
+{
+    return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtp(int16 x)
+{
+    return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtp(int3 x)
+{
+    return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* int -> short, _rtn variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_short*. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtn(int x)
+{
+    return (short)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtn(int2 x)
+{
+    return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtn(int4 x)
+{
+    return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtn(int8 x)
+{
+    return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtn(int16 x)
+{
+    return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtn(int3 x)
+{
+    return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+/* int -> ushort, _rtz variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_ushort*. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtz(int x)
+{
+    return (ushort)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtz(int2 x)
+{
+    return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtz(int4 x)
+{
+    return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtz(int8 x)
+{
+    return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtz(int16 x)
+{
+    return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtz(int3 x)
+{
+    return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* int -> ushort, _rte variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_ushort*. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rte(int x)
+{
+    return (ushort)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rte(int2 x)
+{
+    return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rte(int4 x)
+{
+    return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rte(int8 x)
+{
+    return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rte(int16 x)
+{
+    return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rte(int3 x)
+{
+    return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* int -> ushort, _rtp variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_ushort*. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtp(int x)
+{
+    return (ushort)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtp(int2 x)
+{
+    return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtp(int4 x)
+{
+    return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtp(int8 x)
+{
+    return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtp(int16 x)
+{
+    return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtp(int3 x)
+{
+    return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* int -> ushort, _rtn variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_ushort*. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtn(int x)
+{
+    return (ushort)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtn(int2 x)
+{
+    return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtn(int4 x)
+{
+    return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtn(int8 x)
+{
+    return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtn(int16 x)
+{
+    return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtn(int3 x)
+{
+    return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+/* int -> int, _rtz variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_int*. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtz(int x)
+{
+    return (int)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtz(int2 x)
+{
+    return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtz(int4 x)
+{
+    return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtz(int8 x)
+{
+    return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtz(int16 x)
+{
+    return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtz(int3 x)
+{
+    return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* int -> int, _rte variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_int*. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rte(int x)
+{
+    return (int)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rte(int2 x)
+{
+    return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rte(int4 x)
+{
+    return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rte(int8 x)
+{
+    return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rte(int16 x)
+{
+    return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rte(int3 x)
+{
+    return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* int -> int, _rtp variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_int*. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtp(int x)
+{
+    return (int)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtp(int2 x)
+{
+    return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtp(int4 x)
+{
+    return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtp(int8 x)
+{
+    return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtp(int16 x)
+{
+    return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtp(int3 x)
+{
+    return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* int -> int, _rtn variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_int*. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtn(int x)
+{
+    return (int)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtn(int2 x)
+{
+    return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtn(int4 x)
+{
+    return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtn(int8 x)
+{
+    return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtn(int16 x)
+{
+    return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtn(int3 x)
+{
+    return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+/* int -> uint, _rtz variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_uint*. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtz(int x)
+{
+    return (uint)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtz(int2 x)
+{
+    return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtz(int4 x)
+{
+    return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtz(int8 x)
+{
+    return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtz(int16 x)
+{
+    return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtz(int3 x)
+{
+    return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/* int -> uint, _rte variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_uint*. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rte(int x)
+{
+    return (uint)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rte(int2 x)
+{
+    return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rte(int4 x)
+{
+    return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rte(int8 x)
+{
+    return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rte(int16 x)
+{
+    return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rte(int3 x)
+{
+    return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/* int -> uint, _rtp variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_uint*. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtp(int x)
+{
+    return (uint)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtp(int2 x)
+{
+    return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtp(int4 x)
+{
+    return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtp(int8 x)
+{
+    return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtp(int16 x)
+{
+    return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtp(int3 x)
+{
+    return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+/* int -> uint, _rtn variants: the scalar overload is a plain cast; vector
+ * overloads split x and forward the parts to the unsuffixed convert_uint*. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtn(int x)
+{
+    return (uint)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtn(int2 x)
+{
+    return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtn(int4 x)
+{
+    return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtn(int8 x)
+{
+    return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtn(int16 x)
+{
+    return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtn(int3 x)
+{
+    return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#ifdef cles_khr_int64
+/* int -> long, _rtz variants (only built when cles_khr_int64 is defined):
+ * the scalar overload is a plain cast; vector overloads split x and forward
+ * the parts to the unsuffixed convert_long*. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtz(int x)
+{
+    return (long)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtz(int2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtz(int4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtz(int8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtz(int16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtz(int3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* int -> long, _rte variants (only built when cles_khr_int64 is defined):
+ * the scalar overload is a plain cast; vector overloads split x and forward
+ * the parts to the unsuffixed convert_long*. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rte(int x)
+{
+    return (long)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rte(int2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rte(int4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rte(int8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rte(int16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rte(int3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* int -> long, _rtp variants (only built when cles_khr_int64 is defined):
+ * the scalar overload is a plain cast; vector overloads split x and forward
+ * the parts to the unsuffixed convert_long*. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtp(int x)
+{
+    return (long)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtp(int2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtp(int4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtp(int8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtp(int16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtp(int3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* int -> long, _rtn variants (only built when cles_khr_int64 is defined):
+ * the scalar overload is a plain cast; vector overloads split x and forward
+ * the parts to the unsuffixed convert_long*. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtn(int x)
+{
+    return (long)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtn(int2 x)
+{
+    return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtn(int4 x)
+{
+    return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtn(int8 x)
+{
+    return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtn(int16 x)
+{
+    return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtn(int3 x)
+{
+    return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* int -> ulong, _rtz variants (only built when cles_khr_int64 is defined):
+ * the scalar overload is a plain cast; vector overloads split x and forward
+ * the parts to the unsuffixed convert_ulong*. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtz(int x)
+{
+    return (ulong)(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtz(int2 x)
+{
+    return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtz(int4 x)
+{
+    return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtz(int8 x)
+{
+    return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtz(int16 x)
+{
+    return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtz(int3 x)
+{
+    return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * int -> ulong, "_rte" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rte(int x)
+{
+ const ulong value = (ulong)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rte(int2 x)
+{
+ const ulong low = convert_ulong(x.lo);
+ const ulong high = convert_ulong(x.hi);
+ return (ulong2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rte(int4 x)
+{
+ const ulong2 low = convert_ulong2(x.lo);
+ const ulong2 high = convert_ulong2(x.hi);
+ return (ulong4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rte(int8 x)
+{
+ const ulong4 low = convert_ulong4(x.lo);
+ const ulong4 high = convert_ulong4(x.hi);
+ return (ulong8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rte(int16 x)
+{
+ const ulong8 low = convert_ulong8(x.lo);
+ const ulong8 high = convert_ulong8(x.hi);
+ return (ulong16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rte(int3 x)
+{
+ const ulong2 head = convert_ulong2(x.s01);
+ const ulong tail = convert_ulong(x.s2);
+ return (ulong3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * int -> ulong, "_rtp" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtp(int x)
+{
+ const ulong value = (ulong)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtp(int2 x)
+{
+ const ulong low = convert_ulong(x.lo);
+ const ulong high = convert_ulong(x.hi);
+ return (ulong2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtp(int4 x)
+{
+ const ulong2 low = convert_ulong2(x.lo);
+ const ulong2 high = convert_ulong2(x.hi);
+ return (ulong4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtp(int8 x)
+{
+ const ulong4 low = convert_ulong4(x.lo);
+ const ulong4 high = convert_ulong4(x.hi);
+ return (ulong8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtp(int16 x)
+{
+ const ulong8 low = convert_ulong8(x.lo);
+ const ulong8 high = convert_ulong8(x.hi);
+ return (ulong16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtp(int3 x)
+{
+ const ulong2 head = convert_ulong2(x.s01);
+ const ulong tail = convert_ulong(x.s2);
+ return (ulong3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * int -> ulong, "_rtn" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtn(int x)
+{
+ const ulong value = (ulong)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtn(int2 x)
+{
+ const ulong low = convert_ulong(x.lo);
+ const ulong high = convert_ulong(x.hi);
+ return (ulong2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtn(int4 x)
+{
+ const ulong2 low = convert_ulong2(x.lo);
+ const ulong2 high = convert_ulong2(x.hi);
+ return (ulong4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtn(int8 x)
+{
+ const ulong4 low = convert_ulong4(x.lo);
+ const ulong4 high = convert_ulong4(x.hi);
+ return (ulong8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtn(int16 x)
+{
+ const ulong8 low = convert_ulong8(x.lo);
+ const ulong8 high = convert_ulong8(x.hi);
+ return (ulong16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtn(int3 x)
+{
+ const ulong2 head = convert_ulong2(x.s01);
+ const ulong tail = convert_ulong(x.s2);
+ return (ulong3)(head, tail);
+}
+#endif
+/*
+ * uint -> char, "_rtz" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtz(uint x)
+{
+ const char value = (char)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtz(uint2 x)
+{
+ const char low = convert_char(x.lo);
+ const char high = convert_char(x.hi);
+ return (char2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtz(uint4 x)
+{
+ const char2 low = convert_char2(x.lo);
+ const char2 high = convert_char2(x.hi);
+ return (char4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtz(uint8 x)
+{
+ const char4 low = convert_char4(x.lo);
+ const char4 high = convert_char4(x.hi);
+ return (char8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtz(uint16 x)
+{
+ const char8 low = convert_char8(x.lo);
+ const char8 high = convert_char8(x.hi);
+ return (char16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtz(uint3 x)
+{
+ const char2 head = convert_char2(x.s01);
+ const char tail = convert_char(x.s2);
+ return (char3)(head, tail);
+}
+/*
+ * uint -> char, "_rte" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rte(uint x)
+{
+ const char value = (char)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rte(uint2 x)
+{
+ const char low = convert_char(x.lo);
+ const char high = convert_char(x.hi);
+ return (char2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rte(uint4 x)
+{
+ const char2 low = convert_char2(x.lo);
+ const char2 high = convert_char2(x.hi);
+ return (char4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rte(uint8 x)
+{
+ const char4 low = convert_char4(x.lo);
+ const char4 high = convert_char4(x.hi);
+ return (char8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rte(uint16 x)
+{
+ const char8 low = convert_char8(x.lo);
+ const char8 high = convert_char8(x.hi);
+ return (char16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rte(uint3 x)
+{
+ const char2 head = convert_char2(x.s01);
+ const char tail = convert_char(x.s2);
+ return (char3)(head, tail);
+}
+/*
+ * uint -> char, "_rtp" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtp(uint x)
+{
+ const char value = (char)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtp(uint2 x)
+{
+ const char low = convert_char(x.lo);
+ const char high = convert_char(x.hi);
+ return (char2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtp(uint4 x)
+{
+ const char2 low = convert_char2(x.lo);
+ const char2 high = convert_char2(x.hi);
+ return (char4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtp(uint8 x)
+{
+ const char4 low = convert_char4(x.lo);
+ const char4 high = convert_char4(x.hi);
+ return (char8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtp(uint16 x)
+{
+ const char8 low = convert_char8(x.lo);
+ const char8 high = convert_char8(x.hi);
+ return (char16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtp(uint3 x)
+{
+ const char2 head = convert_char2(x.s01);
+ const char tail = convert_char(x.s2);
+ return (char3)(head, tail);
+}
+/*
+ * uint -> char, "_rtn" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtn(uint x)
+{
+ const char value = (char)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtn(uint2 x)
+{
+ const char low = convert_char(x.lo);
+ const char high = convert_char(x.hi);
+ return (char2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtn(uint4 x)
+{
+ const char2 low = convert_char2(x.lo);
+ const char2 high = convert_char2(x.hi);
+ return (char4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtn(uint8 x)
+{
+ const char4 low = convert_char4(x.lo);
+ const char4 high = convert_char4(x.hi);
+ return (char8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtn(uint16 x)
+{
+ const char8 low = convert_char8(x.lo);
+ const char8 high = convert_char8(x.hi);
+ return (char16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtn(uint3 x)
+{
+ const char2 head = convert_char2(x.s01);
+ const char tail = convert_char(x.s2);
+ return (char3)(head, tail);
+}
+/*
+ * uint -> uchar, "_rtz" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtz(uint x)
+{
+ const uchar value = (uchar)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtz(uint2 x)
+{
+ const uchar low = convert_uchar(x.lo);
+ const uchar high = convert_uchar(x.hi);
+ return (uchar2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtz(uint4 x)
+{
+ const uchar2 low = convert_uchar2(x.lo);
+ const uchar2 high = convert_uchar2(x.hi);
+ return (uchar4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtz(uint8 x)
+{
+ const uchar4 low = convert_uchar4(x.lo);
+ const uchar4 high = convert_uchar4(x.hi);
+ return (uchar8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtz(uint16 x)
+{
+ const uchar8 low = convert_uchar8(x.lo);
+ const uchar8 high = convert_uchar8(x.hi);
+ return (uchar16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtz(uint3 x)
+{
+ const uchar2 head = convert_uchar2(x.s01);
+ const uchar tail = convert_uchar(x.s2);
+ return (uchar3)(head, tail);
+}
+/*
+ * uint -> uchar, "_rte" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rte(uint x)
+{
+ const uchar value = (uchar)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rte(uint2 x)
+{
+ const uchar low = convert_uchar(x.lo);
+ const uchar high = convert_uchar(x.hi);
+ return (uchar2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rte(uint4 x)
+{
+ const uchar2 low = convert_uchar2(x.lo);
+ const uchar2 high = convert_uchar2(x.hi);
+ return (uchar4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rte(uint8 x)
+{
+ const uchar4 low = convert_uchar4(x.lo);
+ const uchar4 high = convert_uchar4(x.hi);
+ return (uchar8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rte(uint16 x)
+{
+ const uchar8 low = convert_uchar8(x.lo);
+ const uchar8 high = convert_uchar8(x.hi);
+ return (uchar16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rte(uint3 x)
+{
+ const uchar2 head = convert_uchar2(x.s01);
+ const uchar tail = convert_uchar(x.s2);
+ return (uchar3)(head, tail);
+}
+/*
+ * uint -> uchar, "_rtp" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtp(uint x)
+{
+ const uchar value = (uchar)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtp(uint2 x)
+{
+ const uchar low = convert_uchar(x.lo);
+ const uchar high = convert_uchar(x.hi);
+ return (uchar2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtp(uint4 x)
+{
+ const uchar2 low = convert_uchar2(x.lo);
+ const uchar2 high = convert_uchar2(x.hi);
+ return (uchar4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtp(uint8 x)
+{
+ const uchar4 low = convert_uchar4(x.lo);
+ const uchar4 high = convert_uchar4(x.hi);
+ return (uchar8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtp(uint16 x)
+{
+ const uchar8 low = convert_uchar8(x.lo);
+ const uchar8 high = convert_uchar8(x.hi);
+ return (uchar16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtp(uint3 x)
+{
+ const uchar2 head = convert_uchar2(x.s01);
+ const uchar tail = convert_uchar(x.s2);
+ return (uchar3)(head, tail);
+}
+/*
+ * uint -> uchar, "_rtn" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtn(uint x)
+{
+ const uchar value = (uchar)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtn(uint2 x)
+{
+ const uchar low = convert_uchar(x.lo);
+ const uchar high = convert_uchar(x.hi);
+ return (uchar2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtn(uint4 x)
+{
+ const uchar2 low = convert_uchar2(x.lo);
+ const uchar2 high = convert_uchar2(x.hi);
+ return (uchar4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtn(uint8 x)
+{
+ const uchar4 low = convert_uchar4(x.lo);
+ const uchar4 high = convert_uchar4(x.hi);
+ return (uchar8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtn(uint16 x)
+{
+ const uchar8 low = convert_uchar8(x.lo);
+ const uchar8 high = convert_uchar8(x.hi);
+ return (uchar16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtn(uint3 x)
+{
+ const uchar2 head = convert_uchar2(x.s01);
+ const uchar tail = convert_uchar(x.s2);
+ return (uchar3)(head, tail);
+}
+/*
+ * uint -> short, "_rtz" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtz(uint x)
+{
+ const short value = (short)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtz(uint2 x)
+{
+ const short low = convert_short(x.lo);
+ const short high = convert_short(x.hi);
+ return (short2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtz(uint4 x)
+{
+ const short2 low = convert_short2(x.lo);
+ const short2 high = convert_short2(x.hi);
+ return (short4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtz(uint8 x)
+{
+ const short4 low = convert_short4(x.lo);
+ const short4 high = convert_short4(x.hi);
+ return (short8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtz(uint16 x)
+{
+ const short8 low = convert_short8(x.lo);
+ const short8 high = convert_short8(x.hi);
+ return (short16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtz(uint3 x)
+{
+ const short2 head = convert_short2(x.s01);
+ const short tail = convert_short(x.s2);
+ return (short3)(head, tail);
+}
+/*
+ * uint -> short, "_rte" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rte(uint x)
+{
+ const short value = (short)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rte(uint2 x)
+{
+ const short low = convert_short(x.lo);
+ const short high = convert_short(x.hi);
+ return (short2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rte(uint4 x)
+{
+ const short2 low = convert_short2(x.lo);
+ const short2 high = convert_short2(x.hi);
+ return (short4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rte(uint8 x)
+{
+ const short4 low = convert_short4(x.lo);
+ const short4 high = convert_short4(x.hi);
+ return (short8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rte(uint16 x)
+{
+ const short8 low = convert_short8(x.lo);
+ const short8 high = convert_short8(x.hi);
+ return (short16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rte(uint3 x)
+{
+ const short2 head = convert_short2(x.s01);
+ const short tail = convert_short(x.s2);
+ return (short3)(head, tail);
+}
+/*
+ * uint -> short, "_rtp" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtp(uint x)
+{
+ const short value = (short)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtp(uint2 x)
+{
+ const short low = convert_short(x.lo);
+ const short high = convert_short(x.hi);
+ return (short2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtp(uint4 x)
+{
+ const short2 low = convert_short2(x.lo);
+ const short2 high = convert_short2(x.hi);
+ return (short4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtp(uint8 x)
+{
+ const short4 low = convert_short4(x.lo);
+ const short4 high = convert_short4(x.hi);
+ return (short8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtp(uint16 x)
+{
+ const short8 low = convert_short8(x.lo);
+ const short8 high = convert_short8(x.hi);
+ return (short16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtp(uint3 x)
+{
+ const short2 head = convert_short2(x.s01);
+ const short tail = convert_short(x.s2);
+ return (short3)(head, tail);
+}
+/*
+ * uint -> short, "_rtn" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtn(uint x)
+{
+ const short value = (short)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtn(uint2 x)
+{
+ const short low = convert_short(x.lo);
+ const short high = convert_short(x.hi);
+ return (short2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtn(uint4 x)
+{
+ const short2 low = convert_short2(x.lo);
+ const short2 high = convert_short2(x.hi);
+ return (short4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtn(uint8 x)
+{
+ const short4 low = convert_short4(x.lo);
+ const short4 high = convert_short4(x.hi);
+ return (short8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtn(uint16 x)
+{
+ const short8 low = convert_short8(x.lo);
+ const short8 high = convert_short8(x.hi);
+ return (short16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtn(uint3 x)
+{
+ const short2 head = convert_short2(x.s01);
+ const short tail = convert_short(x.s2);
+ return (short3)(head, tail);
+}
+/*
+ * uint -> ushort, "_rtz" variants: the scalar is a plain cast and each
+ * vector form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtz(uint x)
+{
+ const ushort value = (ushort)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtz(uint2 x)
+{
+ const ushort low = convert_ushort(x.lo);
+ const ushort high = convert_ushort(x.hi);
+ return (ushort2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtz(uint4 x)
+{
+ const ushort2 low = convert_ushort2(x.lo);
+ const ushort2 high = convert_ushort2(x.hi);
+ return (ushort4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtz(uint8 x)
+{
+ const ushort4 low = convert_ushort4(x.lo);
+ const ushort4 high = convert_ushort4(x.hi);
+ return (ushort8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtz(uint16 x)
+{
+ const ushort8 low = convert_ushort8(x.lo);
+ const ushort8 high = convert_ushort8(x.hi);
+ return (ushort16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtz(uint3 x)
+{
+ const ushort2 head = convert_ushort2(x.s01);
+ const ushort tail = convert_ushort(x.s2);
+ return (ushort3)(head, tail);
+}
+/*
+ * uint -> ushort, "_rte" variants: the scalar is a plain cast and each
+ * vector form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rte(uint x)
+{
+ const ushort value = (ushort)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rte(uint2 x)
+{
+ const ushort low = convert_ushort(x.lo);
+ const ushort high = convert_ushort(x.hi);
+ return (ushort2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rte(uint4 x)
+{
+ const ushort2 low = convert_ushort2(x.lo);
+ const ushort2 high = convert_ushort2(x.hi);
+ return (ushort4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rte(uint8 x)
+{
+ const ushort4 low = convert_ushort4(x.lo);
+ const ushort4 high = convert_ushort4(x.hi);
+ return (ushort8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rte(uint16 x)
+{
+ const ushort8 low = convert_ushort8(x.lo);
+ const ushort8 high = convert_ushort8(x.hi);
+ return (ushort16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rte(uint3 x)
+{
+ const ushort2 head = convert_ushort2(x.s01);
+ const ushort tail = convert_ushort(x.s2);
+ return (ushort3)(head, tail);
+}
+/*
+ * uint -> ushort, "_rtp" variants: the scalar is a plain cast and each
+ * vector form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtp(uint x)
+{
+ const ushort value = (ushort)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtp(uint2 x)
+{
+ const ushort low = convert_ushort(x.lo);
+ const ushort high = convert_ushort(x.hi);
+ return (ushort2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtp(uint4 x)
+{
+ const ushort2 low = convert_ushort2(x.lo);
+ const ushort2 high = convert_ushort2(x.hi);
+ return (ushort4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtp(uint8 x)
+{
+ const ushort4 low = convert_ushort4(x.lo);
+ const ushort4 high = convert_ushort4(x.hi);
+ return (ushort8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtp(uint16 x)
+{
+ const ushort8 low = convert_ushort8(x.lo);
+ const ushort8 high = convert_ushort8(x.hi);
+ return (ushort16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtp(uint3 x)
+{
+ const ushort2 head = convert_ushort2(x.s01);
+ const ushort tail = convert_ushort(x.s2);
+ return (ushort3)(head, tail);
+}
+/*
+ * uint -> ushort, "_rtn" variants: the scalar is a plain cast and each
+ * vector form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtn(uint x)
+{
+ const ushort value = (ushort)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtn(uint2 x)
+{
+ const ushort low = convert_ushort(x.lo);
+ const ushort high = convert_ushort(x.hi);
+ return (ushort2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtn(uint4 x)
+{
+ const ushort2 low = convert_ushort2(x.lo);
+ const ushort2 high = convert_ushort2(x.hi);
+ return (ushort4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtn(uint8 x)
+{
+ const ushort4 low = convert_ushort4(x.lo);
+ const ushort4 high = convert_ushort4(x.hi);
+ return (ushort8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtn(uint16 x)
+{
+ const ushort8 low = convert_ushort8(x.lo);
+ const ushort8 high = convert_ushort8(x.hi);
+ return (ushort16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtn(uint3 x)
+{
+ const ushort2 head = convert_ushort2(x.s01);
+ const ushort tail = convert_ushort(x.s2);
+ return (ushort3)(head, tail);
+}
+/*
+ * uint -> int, "_rtz" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtz(uint x)
+{
+ const int value = (int)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtz(uint2 x)
+{
+ const int low = convert_int(x.lo);
+ const int high = convert_int(x.hi);
+ return (int2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtz(uint4 x)
+{
+ const int2 low = convert_int2(x.lo);
+ const int2 high = convert_int2(x.hi);
+ return (int4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtz(uint8 x)
+{
+ const int4 low = convert_int4(x.lo);
+ const int4 high = convert_int4(x.hi);
+ return (int8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtz(uint16 x)
+{
+ const int8 low = convert_int8(x.lo);
+ const int8 high = convert_int8(x.hi);
+ return (int16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtz(uint3 x)
+{
+ const int2 head = convert_int2(x.s01);
+ const int tail = convert_int(x.s2);
+ return (int3)(head, tail);
+}
+/*
+ * uint -> int, "_rte" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rte(uint x)
+{
+ const int value = (int)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rte(uint2 x)
+{
+ const int low = convert_int(x.lo);
+ const int high = convert_int(x.hi);
+ return (int2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rte(uint4 x)
+{
+ const int2 low = convert_int2(x.lo);
+ const int2 high = convert_int2(x.hi);
+ return (int4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rte(uint8 x)
+{
+ const int4 low = convert_int4(x.lo);
+ const int4 high = convert_int4(x.hi);
+ return (int8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rte(uint16 x)
+{
+ const int8 low = convert_int8(x.lo);
+ const int8 high = convert_int8(x.hi);
+ return (int16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rte(uint3 x)
+{
+ const int2 head = convert_int2(x.s01);
+ const int tail = convert_int(x.s2);
+ return (int3)(head, tail);
+}
+/*
+ * uint -> int, "_rtp" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtp(uint x)
+{
+ const int value = (int)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtp(uint2 x)
+{
+ const int low = convert_int(x.lo);
+ const int high = convert_int(x.hi);
+ return (int2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtp(uint4 x)
+{
+ const int2 low = convert_int2(x.lo);
+ const int2 high = convert_int2(x.hi);
+ return (int4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtp(uint8 x)
+{
+ const int4 low = convert_int4(x.lo);
+ const int4 high = convert_int4(x.hi);
+ return (int8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtp(uint16 x)
+{
+ const int8 low = convert_int8(x.lo);
+ const int8 high = convert_int8(x.hi);
+ return (int16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtp(uint3 x)
+{
+ const int2 head = convert_int2(x.s01);
+ const int tail = convert_int(x.s2);
+ return (int3)(head, tail);
+}
+/*
+ * uint -> int, "_rtn" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtn(uint x)
+{
+ const int value = (int)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtn(uint2 x)
+{
+ const int low = convert_int(x.lo);
+ const int high = convert_int(x.hi);
+ return (int2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtn(uint4 x)
+{
+ const int2 low = convert_int2(x.lo);
+ const int2 high = convert_int2(x.hi);
+ return (int4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtn(uint8 x)
+{
+ const int4 low = convert_int4(x.lo);
+ const int4 high = convert_int4(x.hi);
+ return (int8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtn(uint16 x)
+{
+ const int8 low = convert_int8(x.lo);
+ const int8 high = convert_int8(x.hi);
+ return (int16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtn(uint3 x)
+{
+ const int2 head = convert_int2(x.s01);
+ const int tail = convert_int(x.s2);
+ return (int3)(head, tail);
+}
+/*
+ * uint -> uint, "_rtz" variants: the scalar is a same-type cast and each
+ * vector form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtz(uint x)
+{
+ const uint value = (uint)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtz(uint2 x)
+{
+ const uint low = convert_uint(x.lo);
+ const uint high = convert_uint(x.hi);
+ return (uint2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtz(uint4 x)
+{
+ const uint2 low = convert_uint2(x.lo);
+ const uint2 high = convert_uint2(x.hi);
+ return (uint4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtz(uint8 x)
+{
+ const uint4 low = convert_uint4(x.lo);
+ const uint4 high = convert_uint4(x.hi);
+ return (uint8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtz(uint16 x)
+{
+ const uint8 low = convert_uint8(x.lo);
+ const uint8 high = convert_uint8(x.hi);
+ return (uint16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtz(uint3 x)
+{
+ const uint2 head = convert_uint2(x.s01);
+ const uint tail = convert_uint(x.s2);
+ return (uint3)(head, tail);
+}
+/*
+ * uint -> uint, "_rte" variants: the scalar is a same-type cast and each
+ * vector form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rte(uint x)
+{
+ const uint value = (uint)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rte(uint2 x)
+{
+ const uint low = convert_uint(x.lo);
+ const uint high = convert_uint(x.hi);
+ return (uint2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rte(uint4 x)
+{
+ const uint2 low = convert_uint2(x.lo);
+ const uint2 high = convert_uint2(x.hi);
+ return (uint4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rte(uint8 x)
+{
+ const uint4 low = convert_uint4(x.lo);
+ const uint4 high = convert_uint4(x.hi);
+ return (uint8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rte(uint16 x)
+{
+ const uint8 low = convert_uint8(x.lo);
+ const uint8 high = convert_uint8(x.hi);
+ return (uint16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rte(uint3 x)
+{
+ const uint2 head = convert_uint2(x.s01);
+ const uint tail = convert_uint(x.s2);
+ return (uint3)(head, tail);
+}
+/*
+ * uint -> uint, "_rtp" variants: the scalar is a same-type cast and each
+ * vector form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtp(uint x)
+{
+ const uint value = (uint)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtp(uint2 x)
+{
+ const uint low = convert_uint(x.lo);
+ const uint high = convert_uint(x.hi);
+ return (uint2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtp(uint4 x)
+{
+ const uint2 low = convert_uint2(x.lo);
+ const uint2 high = convert_uint2(x.hi);
+ return (uint4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtp(uint8 x)
+{
+ const uint4 low = convert_uint4(x.lo);
+ const uint4 high = convert_uint4(x.hi);
+ return (uint8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtp(uint16 x)
+{
+ const uint8 low = convert_uint8(x.lo);
+ const uint8 high = convert_uint8(x.hi);
+ return (uint16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtp(uint3 x)
+{
+ const uint2 head = convert_uint2(x.s01);
+ const uint tail = convert_uint(x.s2);
+ return (uint3)(head, tail);
+}
+/*
+ * uint -> uint, "_rtn" variants: the scalar is a same-type cast and each
+ * vector form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtn(uint x)
+{
+ const uint value = (uint)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtn(uint2 x)
+{
+ const uint low = convert_uint(x.lo);
+ const uint high = convert_uint(x.hi);
+ return (uint2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtn(uint4 x)
+{
+ const uint2 low = convert_uint2(x.lo);
+ const uint2 high = convert_uint2(x.hi);
+ return (uint4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtn(uint8 x)
+{
+ const uint4 low = convert_uint4(x.lo);
+ const uint4 high = convert_uint4(x.hi);
+ return (uint8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtn(uint16 x)
+{
+ const uint8 low = convert_uint8(x.lo);
+ const uint8 high = convert_uint8(x.hi);
+ return (uint16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtn(uint3 x)
+{
+ const uint2 head = convert_uint2(x.s01);
+ const uint tail = convert_uint(x.s2);
+ return (uint3)(head, tail);
+}
+#ifdef cles_khr_int64
+/*
+ * uint -> long, "_rtz" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtz(uint x)
+{
+ const long value = (long)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtz(uint2 x)
+{
+ const long low = convert_long(x.lo);
+ const long high = convert_long(x.hi);
+ return (long2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtz(uint4 x)
+{
+ const long2 low = convert_long2(x.lo);
+ const long2 high = convert_long2(x.hi);
+ return (long4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtz(uint8 x)
+{
+ const long4 low = convert_long4(x.lo);
+ const long4 high = convert_long4(x.hi);
+ return (long8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtz(uint16 x)
+{
+ const long8 low = convert_long8(x.lo);
+ const long8 high = convert_long8(x.hi);
+ return (long16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtz(uint3 x)
+{
+ const long2 head = convert_long2(x.s01);
+ const long tail = convert_long(x.s2);
+ return (long3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * uint -> long, "_rte" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rte(uint x)
+{
+ const long value = (long)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rte(uint2 x)
+{
+ const long low = convert_long(x.lo);
+ const long high = convert_long(x.hi);
+ return (long2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rte(uint4 x)
+{
+ const long2 low = convert_long2(x.lo);
+ const long2 high = convert_long2(x.hi);
+ return (long4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rte(uint8 x)
+{
+ const long4 low = convert_long4(x.lo);
+ const long4 high = convert_long4(x.hi);
+ return (long8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rte(uint16 x)
+{
+ const long8 low = convert_long8(x.lo);
+ const long8 high = convert_long8(x.hi);
+ return (long16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rte(uint3 x)
+{
+ const long2 head = convert_long2(x.s01);
+ const long tail = convert_long(x.s2);
+ return (long3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * uint -> long, "_rtp" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtp(uint x)
+{
+ const long value = (long)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtp(uint2 x)
+{
+ const long low = convert_long(x.lo);
+ const long high = convert_long(x.hi);
+ return (long2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtp(uint4 x)
+{
+ const long2 low = convert_long2(x.lo);
+ const long2 high = convert_long2(x.hi);
+ return (long4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtp(uint8 x)
+{
+ const long4 low = convert_long4(x.lo);
+ const long4 high = convert_long4(x.hi);
+ return (long8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtp(uint16 x)
+{
+ const long8 low = convert_long8(x.lo);
+ const long8 high = convert_long8(x.hi);
+ return (long16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtp(uint3 x)
+{
+ const long2 head = convert_long2(x.s01);
+ const long tail = convert_long(x.s2);
+ return (long3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * uint -> long, "_rtn" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtn(uint x)
+{
+ const long value = (long)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtn(uint2 x)
+{
+ const long low = convert_long(x.lo);
+ const long high = convert_long(x.hi);
+ return (long2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtn(uint4 x)
+{
+ const long2 low = convert_long2(x.lo);
+ const long2 high = convert_long2(x.hi);
+ return (long4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtn(uint8 x)
+{
+ const long4 low = convert_long4(x.lo);
+ const long4 high = convert_long4(x.hi);
+ return (long8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtn(uint16 x)
+{
+ const long8 low = convert_long8(x.lo);
+ const long8 high = convert_long8(x.hi);
+ return (long16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtn(uint3 x)
+{
+ const long2 head = convert_long2(x.s01);
+ const long tail = convert_long(x.s2);
+ return (long3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * uint -> ulong, "_rtz" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtz(uint x)
+{
+ const ulong value = (ulong)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtz(uint2 x)
+{
+ const ulong low = convert_ulong(x.lo);
+ const ulong high = convert_ulong(x.hi);
+ return (ulong2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtz(uint4 x)
+{
+ const ulong2 low = convert_ulong2(x.lo);
+ const ulong2 high = convert_ulong2(x.hi);
+ return (ulong4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtz(uint8 x)
+{
+ const ulong4 low = convert_ulong4(x.lo);
+ const ulong4 high = convert_ulong4(x.hi);
+ return (ulong8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtz(uint16 x)
+{
+ const ulong8 low = convert_ulong8(x.lo);
+ const ulong8 high = convert_ulong8(x.hi);
+ return (ulong16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtz(uint3 x)
+{
+ const ulong2 head = convert_ulong2(x.s01);
+ const ulong tail = convert_ulong(x.s2);
+ return (ulong3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * uint -> ulong, "_rte" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rte(uint x)
+{
+ const ulong value = (ulong)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rte(uint2 x)
+{
+ const ulong low = convert_ulong(x.lo);
+ const ulong high = convert_ulong(x.hi);
+ return (ulong2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rte(uint4 x)
+{
+ const ulong2 low = convert_ulong2(x.lo);
+ const ulong2 high = convert_ulong2(x.hi);
+ return (ulong4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rte(uint8 x)
+{
+ const ulong4 low = convert_ulong4(x.lo);
+ const ulong4 high = convert_ulong4(x.hi);
+ return (ulong8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rte(uint16 x)
+{
+ const ulong8 low = convert_ulong8(x.lo);
+ const ulong8 high = convert_ulong8(x.hi);
+ return (ulong16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rte(uint3 x)
+{
+ const ulong2 head = convert_ulong2(x.s01);
+ const ulong tail = convert_ulong(x.s2);
+ return (ulong3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * uint -> ulong, "_rtp" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtp(uint x)
+{
+ const ulong value = (ulong)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtp(uint2 x)
+{
+ const ulong low = convert_ulong(x.lo);
+ const ulong high = convert_ulong(x.hi);
+ return (ulong2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtp(uint4 x)
+{
+ const ulong2 low = convert_ulong2(x.lo);
+ const ulong2 high = convert_ulong2(x.hi);
+ return (ulong4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtp(uint8 x)
+{
+ const ulong4 low = convert_ulong4(x.lo);
+ const ulong4 high = convert_ulong4(x.hi);
+ return (ulong8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtp(uint16 x)
+{
+ const ulong8 low = convert_ulong8(x.lo);
+ const ulong8 high = convert_ulong8(x.hi);
+ return (ulong16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtp(uint3 x)
+{
+ const ulong2 head = convert_ulong2(x.s01);
+ const ulong tail = convert_ulong(x.s2);
+ return (ulong3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * uint -> ulong, "_rtn" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtn(uint x)
+{
+ const ulong value = (ulong)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtn(uint2 x)
+{
+ const ulong low = convert_ulong(x.lo);
+ const ulong high = convert_ulong(x.hi);
+ return (ulong2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtn(uint4 x)
+{
+ const ulong2 low = convert_ulong2(x.lo);
+ const ulong2 high = convert_ulong2(x.hi);
+ return (ulong4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtn(uint8 x)
+{
+ const ulong4 low = convert_ulong4(x.lo);
+ const ulong4 high = convert_ulong4(x.hi);
+ return (ulong8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtn(uint16 x)
+{
+ const ulong8 low = convert_ulong8(x.lo);
+ const ulong8 high = convert_ulong8(x.hi);
+ return (ulong16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtn(uint3 x)
+{
+ const ulong2 head = convert_ulong2(x.s01);
+ const ulong tail = convert_ulong(x.s2);
+ return (ulong3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * long -> char, "_rtz" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtz(long x)
+{
+ const char value = (char)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtz(long2 x)
+{
+ const char low = convert_char(x.lo);
+ const char high = convert_char(x.hi);
+ return (char2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtz(long4 x)
+{
+ const char2 low = convert_char2(x.lo);
+ const char2 high = convert_char2(x.hi);
+ return (char4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtz(long8 x)
+{
+ const char4 low = convert_char4(x.lo);
+ const char4 high = convert_char4(x.hi);
+ return (char8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtz(long16 x)
+{
+ const char8 low = convert_char8(x.lo);
+ const char8 high = convert_char8(x.hi);
+ return (char16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtz(long3 x)
+{
+ const char2 head = convert_char2(x.s01);
+ const char tail = convert_char(x.s2);
+ return (char3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * long -> char, "_rte" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rte(long x)
+{
+ const char value = (char)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rte(long2 x)
+{
+ const char low = convert_char(x.lo);
+ const char high = convert_char(x.hi);
+ return (char2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rte(long4 x)
+{
+ const char2 low = convert_char2(x.lo);
+ const char2 high = convert_char2(x.hi);
+ return (char4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rte(long8 x)
+{
+ const char4 low = convert_char4(x.lo);
+ const char4 high = convert_char4(x.hi);
+ return (char8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rte(long16 x)
+{
+ const char8 low = convert_char8(x.lo);
+ const char8 high = convert_char8(x.hi);
+ return (char16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rte(long3 x)
+{
+ const char2 head = convert_char2(x.s01);
+ const char tail = convert_char(x.s2);
+ return (char3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * long -> char, "_rtp" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtp(long x)
+{
+ const char value = (char)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtp(long2 x)
+{
+ const char low = convert_char(x.lo);
+ const char high = convert_char(x.hi);
+ return (char2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtp(long4 x)
+{
+ const char2 low = convert_char2(x.lo);
+ const char2 high = convert_char2(x.hi);
+ return (char4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtp(long8 x)
+{
+ const char4 low = convert_char4(x.lo);
+ const char4 high = convert_char4(x.hi);
+ return (char8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtp(long16 x)
+{
+ const char8 low = convert_char8(x.lo);
+ const char8 high = convert_char8(x.hi);
+ return (char16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtp(long3 x)
+{
+ const char2 head = convert_char2(x.s01);
+ const char tail = convert_char(x.s2);
+ return (char3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * long -> char, "_rtn" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtn(long x)
+{
+ const char value = (char)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtn(long2 x)
+{
+ const char low = convert_char(x.lo);
+ const char high = convert_char(x.hi);
+ return (char2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtn(long4 x)
+{
+ const char2 low = convert_char2(x.lo);
+ const char2 high = convert_char2(x.hi);
+ return (char4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtn(long8 x)
+{
+ const char4 low = convert_char4(x.lo);
+ const char4 high = convert_char4(x.hi);
+ return (char8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtn(long16 x)
+{
+ const char8 low = convert_char8(x.lo);
+ const char8 high = convert_char8(x.hi);
+ return (char16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtn(long3 x)
+{
+ const char2 head = convert_char2(x.s01);
+ const char tail = convert_char(x.s2);
+ return (char3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * long -> uchar, "_rtz" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtz(long x)
+{
+ const uchar value = (uchar)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtz(long2 x)
+{
+ const uchar low = convert_uchar(x.lo);
+ const uchar high = convert_uchar(x.hi);
+ return (uchar2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtz(long4 x)
+{
+ const uchar2 low = convert_uchar2(x.lo);
+ const uchar2 high = convert_uchar2(x.hi);
+ return (uchar4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtz(long8 x)
+{
+ const uchar4 low = convert_uchar4(x.lo);
+ const uchar4 high = convert_uchar4(x.hi);
+ return (uchar8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtz(long16 x)
+{
+ const uchar8 low = convert_uchar8(x.lo);
+ const uchar8 high = convert_uchar8(x.hi);
+ return (uchar16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtz(long3 x)
+{
+ const uchar2 head = convert_uchar2(x.s01);
+ const uchar tail = convert_uchar(x.s2);
+ return (uchar3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * long -> uchar, "_rte" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rte(long x)
+{
+ const uchar value = (uchar)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rte(long2 x)
+{
+ const uchar low = convert_uchar(x.lo);
+ const uchar high = convert_uchar(x.hi);
+ return (uchar2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rte(long4 x)
+{
+ const uchar2 low = convert_uchar2(x.lo);
+ const uchar2 high = convert_uchar2(x.hi);
+ return (uchar4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rte(long8 x)
+{
+ const uchar4 low = convert_uchar4(x.lo);
+ const uchar4 high = convert_uchar4(x.hi);
+ return (uchar8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rte(long16 x)
+{
+ const uchar8 low = convert_uchar8(x.lo);
+ const uchar8 high = convert_uchar8(x.hi);
+ return (uchar16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rte(long3 x)
+{
+ const uchar2 head = convert_uchar2(x.s01);
+ const uchar tail = convert_uchar(x.s2);
+ return (uchar3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * long -> uchar, "_rtp" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtp(long x)
+{
+ const uchar value = (uchar)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtp(long2 x)
+{
+ const uchar low = convert_uchar(x.lo);
+ const uchar high = convert_uchar(x.hi);
+ return (uchar2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtp(long4 x)
+{
+ const uchar2 low = convert_uchar2(x.lo);
+ const uchar2 high = convert_uchar2(x.hi);
+ return (uchar4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtp(long8 x)
+{
+ const uchar4 low = convert_uchar4(x.lo);
+ const uchar4 high = convert_uchar4(x.hi);
+ return (uchar8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtp(long16 x)
+{
+ const uchar8 low = convert_uchar8(x.lo);
+ const uchar8 high = convert_uchar8(x.hi);
+ return (uchar16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtp(long3 x)
+{
+ const uchar2 head = convert_uchar2(x.s01);
+ const uchar tail = convert_uchar(x.s2);
+ return (uchar3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * long -> uchar, "_rtn" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtn(long x)
+{
+ const uchar value = (uchar)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtn(long2 x)
+{
+ const uchar low = convert_uchar(x.lo);
+ const uchar high = convert_uchar(x.hi);
+ return (uchar2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtn(long4 x)
+{
+ const uchar2 low = convert_uchar2(x.lo);
+ const uchar2 high = convert_uchar2(x.hi);
+ return (uchar4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtn(long8 x)
+{
+ const uchar4 low = convert_uchar4(x.lo);
+ const uchar4 high = convert_uchar4(x.hi);
+ return (uchar8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtn(long16 x)
+{
+ const uchar8 low = convert_uchar8(x.lo);
+ const uchar8 high = convert_uchar8(x.hi);
+ return (uchar16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtn(long3 x)
+{
+ const uchar2 head = convert_uchar2(x.s01);
+ const uchar tail = convert_uchar(x.s2);
+ return (uchar3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * long -> short, "_rtz" variants: the scalar is a plain cast and each vector
+ * form joins the unsuffixed conversions of its two parts.
+ */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtz(long x)
+{
+ const short value = (short)x;
+ return value;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtz(long2 x)
+{
+ const short low = convert_short(x.lo);
+ const short high = convert_short(x.hi);
+ return (short2)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtz(long4 x)
+{
+ const short2 low = convert_short2(x.lo);
+ const short2 high = convert_short2(x.hi);
+ return (short4)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtz(long8 x)
+{
+ const short4 low = convert_short4(x.lo);
+ const short4 high = convert_short4(x.hi);
+ return (short8)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtz(long16 x)
+{
+ const short8 low = convert_short8(x.lo);
+ const short8 high = convert_short8(x.hi);
+ return (short16)(low, high);
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtz(long3 x)
+{
+ const short2 head = convert_short2(x.s01);
+ const short tail = convert_short(x.s2);
+ return (short3)(head, tail);
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> short, "_rte" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_short* overloads. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rte(long x)
+{
+ const short result = (short)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rte(long2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rte(long4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rte(long8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rte(long16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rte(long3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> short, "_rtp" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_short* overloads. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtp(long x)
+{
+ const short result = (short)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtp(long2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtp(long4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtp(long8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtp(long16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtp(long3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> short, "_rtn" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_short* overloads. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtn(long x)
+{
+ const short result = (short)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtn(long2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtn(long4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtn(long8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtn(long16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtn(long3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> ushort, "_rtz" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_ushort* overloads. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtz(long x)
+{
+ const ushort result = (ushort)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtz(long2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtz(long4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtz(long8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtz(long16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtz(long3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> ushort, "_rte" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_ushort* overloads. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rte(long x)
+{
+ const ushort result = (ushort)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rte(long2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rte(long4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rte(long8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rte(long16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rte(long3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> ushort, "_rtp" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_ushort* overloads. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtp(long x)
+{
+ const ushort result = (ushort)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtp(long2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtp(long4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtp(long8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtp(long16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtp(long3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> ushort, "_rtn" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_ushort* overloads. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtn(long x)
+{
+ const ushort result = (ushort)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtn(long2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtn(long4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtn(long8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtn(long16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtn(long3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> int, "_rtz" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_int* overloads. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtz(long x)
+{
+ const int result = (int)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtz(long2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtz(long4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtz(long8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtz(long16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtz(long3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> int, "_rte" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_int* overloads. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rte(long x)
+{
+ const int result = (int)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rte(long2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rte(long4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rte(long8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rte(long16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rte(long3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> int, "_rtp" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_int* overloads. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtp(long x)
+{
+ const int result = (int)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtp(long2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtp(long4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtp(long8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtp(long16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtp(long3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> int, "_rtn" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_int* overloads. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtn(long x)
+{
+ const int result = (int)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtn(long2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtn(long4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtn(long8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtn(long16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtn(long3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> uint, "_rtz" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_uint* overloads. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtz(long x)
+{
+ const uint result = (uint)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtz(long2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtz(long4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtz(long8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtz(long16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtz(long3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> uint, "_rte" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_uint* overloads. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rte(long x)
+{
+ const uint result = (uint)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rte(long2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rte(long4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rte(long8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rte(long16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rte(long3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> uint, "_rtp" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_uint* overloads. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtp(long x)
+{
+ const uint result = (uint)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtp(long2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtp(long4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtp(long8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtp(long16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtp(long3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> uint, "_rtn" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_uint* overloads. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtn(long x)
+{
+ const uint result = (uint)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtn(long2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtn(long4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtn(long8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtn(long16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtn(long3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> long, "_rtz" names: the scalar is a same-type cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_long* overloads. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtz(long x)
+{
+ const long result = (long)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtz(long2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtz(long4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtz(long8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtz(long16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtz(long3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> long, "_rte" names: the scalar is a same-type cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_long* overloads. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rte(long x)
+{
+ const long result = (long)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rte(long2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rte(long4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rte(long8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rte(long16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rte(long3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> long, "_rtp" names: the scalar is a same-type cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_long* overloads. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtp(long x)
+{
+ const long result = (long)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtp(long2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtp(long4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtp(long8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtp(long16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtp(long3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> long, "_rtn" names: the scalar is a same-type cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_long* overloads. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtn(long x)
+{
+ const long result = (long)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtn(long2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtn(long4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtn(long8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtn(long16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtn(long3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> ulong, "_rtz" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_ulong* overloads. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtz(long x)
+{
+ const ulong result = (ulong)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtz(long2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtz(long4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtz(long8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtz(long16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtz(long3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> ulong, "_rte" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_ulong* overloads. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rte(long x)
+{
+ const ulong result = (ulong)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rte(long2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rte(long4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rte(long8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rte(long16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rte(long3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> ulong, "_rtp" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_ulong* overloads. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtp(long x)
+{
+ const ulong result = (ulong)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtp(long2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtp(long4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtp(long8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtp(long16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtp(long3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* long -> ulong, "_rtn" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_ulong* overloads. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtn(long x)
+{
+ const ulong result = (ulong)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtn(long2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtn(long4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtn(long8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtn(long16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtn(long3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> char, "_rtz" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_char* overloads. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtz(ulong x)
+{
+ const char result = (char)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtz(ulong2 x)
+{
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtz(ulong4 x)
+{
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtz(ulong8 x)
+{
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtz(ulong16 x)
+{
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtz(ulong3 x)
+{
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> char, "_rte" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_char* overloads. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rte(ulong x)
+{
+ const char result = (char)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rte(ulong2 x)
+{
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rte(ulong4 x)
+{
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rte(ulong8 x)
+{
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rte(ulong16 x)
+{
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rte(ulong3 x)
+{
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> char, "_rtp" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_char* overloads. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtp(ulong x)
+{
+ const char result = (char)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtp(ulong2 x)
+{
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtp(ulong4 x)
+{
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtp(ulong8 x)
+{
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtp(ulong16 x)
+{
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtp(ulong3 x)
+{
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> char, "_rtn" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_char* overloads. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtn(ulong x)
+{
+ const char result = (char)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtn(ulong2 x)
+{
+ return (char2)(convert_char(x.s0), convert_char(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtn(ulong4 x)
+{
+ return (char4)(convert_char2(x.s01), convert_char2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtn(ulong8 x)
+{
+ return (char8)(convert_char4(x.s0123), convert_char4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtn(ulong16 x)
+{
+ return (char16)(convert_char8(x.s01234567), convert_char8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtn(ulong3 x)
+{
+ return (char3)(convert_char2(x.xy), convert_char(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> uchar, "_rtz" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_uchar* overloads. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtz(ulong x)
+{
+ const uchar result = (uchar)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtz(ulong2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtz(ulong4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtz(ulong8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtz(ulong16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtz(ulong3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> uchar, "_rte" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_uchar* overloads. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rte(ulong x)
+{
+ const uchar result = (uchar)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rte(ulong2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rte(ulong4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rte(ulong8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rte(ulong16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rte(ulong3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> uchar, "_rtp" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_uchar* overloads. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtp(ulong x)
+{
+ const uchar result = (uchar)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtp(ulong2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtp(ulong4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtp(ulong8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtp(ulong16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtp(ulong3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> uchar, "_rtn" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_uchar* overloads. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtn(ulong x)
+{
+ const uchar result = (uchar)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtn(ulong2 x)
+{
+ return (uchar2)(convert_uchar(x.s0), convert_uchar(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtn(ulong4 x)
+{
+ return (uchar4)(convert_uchar2(x.s01), convert_uchar2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtn(ulong8 x)
+{
+ return (uchar8)(convert_uchar4(x.s0123), convert_uchar4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtn(ulong16 x)
+{
+ return (uchar16)(convert_uchar8(x.s01234567), convert_uchar8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtn(ulong3 x)
+{
+ return (uchar3)(convert_uchar2(x.xy), convert_uchar(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> short, "_rtz" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_short* overloads. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtz(ulong x)
+{
+ const short result = (short)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtz(ulong2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtz(ulong4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtz(ulong8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtz(ulong16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtz(ulong3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> short, "_rte" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_short* overloads. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rte(ulong x)
+{
+ const short result = (short)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rte(ulong2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rte(ulong4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rte(ulong8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rte(ulong16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rte(ulong3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> short, "_rtp" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_short* overloads. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtp(ulong x)
+{
+ const short result = (short)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtp(ulong2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtp(ulong4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtp(ulong8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtp(ulong16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtp(ulong3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> short, "_rtn" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_short* overloads. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtn(ulong x)
+{
+ const short result = (short)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtn(ulong2 x)
+{
+ return (short2)(convert_short(x.s0), convert_short(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtn(ulong4 x)
+{
+ return (short4)(convert_short2(x.s01), convert_short2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtn(ulong8 x)
+{
+ return (short8)(convert_short4(x.s0123), convert_short4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtn(ulong16 x)
+{
+ return (short16)(convert_short8(x.s01234567), convert_short8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtn(ulong3 x)
+{
+ return (short3)(convert_short2(x.xy), convert_short(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> ushort, "_rtz" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_ushort* overloads. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtz(ulong x)
+{
+ const ushort result = (ushort)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtz(ulong2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtz(ulong4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtz(ulong8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtz(ulong16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtz(ulong3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> ushort, "_rte" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_ushort* overloads. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rte(ulong x)
+{
+ const ushort result = (ushort)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rte(ulong2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rte(ulong4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rte(ulong8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rte(ulong16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rte(ulong3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> ushort, "_rtp" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_ushort* overloads. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtp(ulong x)
+{
+ const ushort result = (ushort)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtp(ulong2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtp(ulong4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtp(ulong8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtp(ulong16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtp(ulong3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> ushort, "_rtn" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_ushort* overloads. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtn(ulong x)
+{
+ const ushort result = (ushort)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtn(ulong2 x)
+{
+ return (ushort2)(convert_ushort(x.s0), convert_ushort(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtn(ulong4 x)
+{
+ return (ushort4)(convert_ushort2(x.s01), convert_ushort2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtn(ulong8 x)
+{
+ return (ushort8)(convert_ushort4(x.s0123), convert_ushort4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtn(ulong16 x)
+{
+ return (ushort16)(convert_ushort8(x.s01234567), convert_ushort8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtn(ulong3 x)
+{
+ return (ushort3)(convert_ushort2(x.xy), convert_ushort(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> int, "_rtz" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_int* overloads. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtz(ulong x)
+{
+ const int result = (int)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtz(ulong2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtz(ulong4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtz(ulong8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtz(ulong16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtz(ulong3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> int, "_rte" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_int* overloads. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rte(ulong x)
+{
+ const int result = (int)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rte(ulong2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rte(ulong4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rte(ulong8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rte(ulong16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rte(ulong3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/* ulong -> int, "_rtp" names: the scalar is a plain cast; vectors are
+ * rebuilt piecewise through the unsuffixed convert_int* overloads. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtp(ulong x)
+{
+ const int result = (int)x;
+ return result;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtp(ulong2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtp(ulong4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtp(ulong8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtp(ulong16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtp(ulong3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> int conversions with the _rtn suffix (built only when cles_khr_int64
+ * is defined). The scalar is a plain cast; vector forms convert their
+ * sub-vectors with the un-suffixed convert_int* overloads and rejoin them.
+ */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtn(ulong x)
+{
+ return (int)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtn(ulong2 x)
+{
+ return (int2)(convert_int(x.s0), convert_int(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtn(ulong4 x)
+{
+ return (int4)(convert_int2(x.s01), convert_int2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtn(ulong8 x)
+{
+ return (int8)(convert_int4(x.s0123), convert_int4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtn(ulong16 x)
+{
+ return (int16)(convert_int8(x.s01234567), convert_int8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtn(ulong3 x)
+{
+ return (int3)(convert_int2(x.xy), convert_int(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> uint conversions with the _rtz suffix (built only when cles_khr_int64
+ * is defined). The scalar is a plain cast; vector forms convert their
+ * sub-vectors with the un-suffixed convert_uint* overloads and rejoin them.
+ */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtz(ulong x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtz(ulong2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtz(ulong4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtz(ulong8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtz(ulong16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtz(ulong3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> uint conversions with the _rte suffix (built only when cles_khr_int64
+ * is defined). The scalar is a plain cast; vector forms convert their
+ * sub-vectors with the un-suffixed convert_uint* overloads and rejoin them.
+ */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rte(ulong x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rte(ulong2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rte(ulong4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rte(ulong8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rte(ulong16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rte(ulong3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> uint conversions with the _rtp suffix (built only when cles_khr_int64
+ * is defined). The scalar is a plain cast; vector forms convert their
+ * sub-vectors with the un-suffixed convert_uint* overloads and rejoin them.
+ */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtp(ulong x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtp(ulong2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtp(ulong4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtp(ulong8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtp(ulong16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtp(ulong3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> uint conversions with the _rtn suffix (built only when cles_khr_int64
+ * is defined). The scalar is a plain cast; vector forms convert their
+ * sub-vectors with the un-suffixed convert_uint* overloads and rejoin them.
+ */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtn(ulong x)
+{
+ return (uint)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtn(ulong2 x)
+{
+ return (uint2)(convert_uint(x.s0), convert_uint(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtn(ulong4 x)
+{
+ return (uint4)(convert_uint2(x.s01), convert_uint2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtn(ulong8 x)
+{
+ return (uint8)(convert_uint4(x.s0123), convert_uint4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtn(ulong16 x)
+{
+ return (uint16)(convert_uint8(x.s01234567), convert_uint8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtn(ulong3 x)
+{
+ return (uint3)(convert_uint2(x.xy), convert_uint(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> long conversions with the _rtz suffix (built only when cles_khr_int64
+ * is defined). The scalar is a plain cast; vector forms convert their
+ * sub-vectors with the un-suffixed convert_long* overloads and rejoin them.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtz(ulong x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtz(ulong2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtz(ulong4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtz(ulong8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtz(ulong16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtz(ulong3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> long conversions with the _rte suffix (built only when cles_khr_int64
+ * is defined). The scalar is a plain cast; vector forms convert their
+ * sub-vectors with the un-suffixed convert_long* overloads and rejoin them.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rte(ulong x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rte(ulong2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rte(ulong4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rte(ulong8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rte(ulong16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rte(ulong3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> long conversions with the _rtp suffix (built only when cles_khr_int64
+ * is defined). The scalar is a plain cast; vector forms convert their
+ * sub-vectors with the un-suffixed convert_long* overloads and rejoin them.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtp(ulong x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtp(ulong2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtp(ulong4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtp(ulong8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtp(ulong16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtp(ulong3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> long conversions with the _rtn suffix (built only when cles_khr_int64
+ * is defined). The scalar is a plain cast; vector forms convert their
+ * sub-vectors with the un-suffixed convert_long* overloads and rejoin them.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtn(ulong x)
+{
+ return (long)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtn(ulong2 x)
+{
+ return (long2)(convert_long(x.s0), convert_long(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtn(ulong4 x)
+{
+ return (long4)(convert_long2(x.s01), convert_long2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtn(ulong8 x)
+{
+ return (long8)(convert_long4(x.s0123), convert_long4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtn(ulong16 x)
+{
+ return (long16)(convert_long8(x.s01234567), convert_long8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtn(ulong3 x)
+{
+ return (long3)(convert_long2(x.xy), convert_long(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> ulong conversions with the _rtz suffix (built only when
+ * cles_khr_int64 is defined). The scalar is a same-type cast; vector forms pass
+ * their sub-vectors through the un-suffixed convert_ulong* overloads and rejoin them.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtz(ulong x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtz(ulong2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtz(ulong4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtz(ulong8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtz(ulong16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtz(ulong3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> ulong conversions with the _rte suffix (built only when
+ * cles_khr_int64 is defined). The scalar is a same-type cast; vector forms pass
+ * their sub-vectors through the un-suffixed convert_ulong* overloads and rejoin them.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rte(ulong x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rte(ulong2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rte(ulong4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rte(ulong8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rte(ulong16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rte(ulong3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> ulong conversions with the _rtp suffix (built only when
+ * cles_khr_int64 is defined). The scalar is a same-type cast; vector forms pass
+ * their sub-vectors through the un-suffixed convert_ulong* overloads and rejoin them.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtp(ulong x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtp(ulong2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtp(ulong4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtp(ulong8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtp(ulong16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtp(ulong3 x)
+{
+ return (ulong3)(convert_ulong2(x.xy), convert_ulong(x.z));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * ulong -> ulong conversions with the _rtn suffix (scalar, 2, 4, 8, 16 lanes).
+ * The scalar is a same-type cast; vector forms pass their sub-vectors through
+ * the un-suffixed convert_ulong* overloads and rejoin them.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtn(ulong x)
+{
+ return (ulong)x;
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtn(ulong2 x)
+{
+ return (ulong2)(convert_ulong(x.s0), convert_ulong(x.s1));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtn(ulong4 x)
+{
+ return (ulong4)(convert_ulong2(x.s01), convert_ulong2(x.s23));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtn(ulong8 x)
+{
+ return (ulong8)(convert_ulong4(x.s0123), convert_ulong4(x.s4567));
+}
+
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtn(ulong16 x)
+{
+ return (ulong16)(convert_ulong8(x.s01234567), convert_ulong8(x.s89abcdef));
+}
+
+/*
+ * ulong3 -> ulong3 with the _rtn suffix: the first two lanes go through
+ * convert_ulong2 and the third through convert_ulong, then the result is
+ * reassembled into a 3-lane vector.
+ */
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtn(ulong3 x)
+{
+ return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
+}
+#endif
+/* Saturating char -> char (all widths): source and destination types match, so the argument is returned unchanged. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat(char x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat(char2 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat(char3 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat(char4 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat(char8 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat(char16 x)
+{
+ return x;
+}
+/* Saturating char -> uchar (all widths): each lane is raised to at least 0 with max(), then converted with the plain convert_uchar* overload. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat(char x)
+{
+ const char nonneg = max(x, (char)0);
+ return convert_uchar(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat(char2 x)
+{
+ const char2 nonneg = max(x, (char)0);
+ return convert_uchar2(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat(char3 x)
+{
+ const char3 nonneg = max(x, (char)0);
+ return convert_uchar3(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat(char4 x)
+{
+ const char4 nonneg = max(x, (char)0);
+ return convert_uchar4(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat(char8 x)
+{
+ const char8 nonneg = max(x, (char)0);
+ return convert_uchar8(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat(char16 x)
+{
+ const char16 nonneg = max(x, (char)0);
+ return convert_uchar16(nonneg);
+}
+/* Saturating char -> short (all widths): no clamp is applied; the argument is forwarded to the plain convert_short* overload. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat(char x)
+{
+ return convert_short(x);
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat(char2 x)
+{
+ return convert_short2(x);
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat(char3 x)
+{
+ return convert_short3(x);
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat(char4 x)
+{
+ return convert_short4(x);
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat(char8 x)
+{
+ return convert_short8(x);
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat(char16 x)
+{
+ return convert_short16(x);
+}
+/* Saturating char -> ushort (all widths): each lane is raised to at least 0 with max(), then converted with the plain convert_ushort* overload. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat(char x)
+{
+ const char nonneg = max(x, (char)0);
+ return convert_ushort(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat(char2 x)
+{
+ const char2 nonneg = max(x, (char)0);
+ return convert_ushort2(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat(char3 x)
+{
+ const char3 nonneg = max(x, (char)0);
+ return convert_ushort3(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat(char4 x)
+{
+ const char4 nonneg = max(x, (char)0);
+ return convert_ushort4(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat(char8 x)
+{
+ const char8 nonneg = max(x, (char)0);
+ return convert_ushort8(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat(char16 x)
+{
+ const char16 nonneg = max(x, (char)0);
+ return convert_ushort16(nonneg);
+}
+/* Saturating char -> int (all widths): no clamp is applied; the argument is forwarded to the plain convert_int* overload. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat(char x)
+{
+ return convert_int(x);
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat(char2 x)
+{
+ return convert_int2(x);
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat(char3 x)
+{
+ return convert_int3(x);
+}
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat(char4 x)
+{
+ return convert_int4(x);
+}
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat(char8 x)
+{
+ return convert_int8(x);
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat(char16 x)
+{
+ return convert_int16(x);
+}
+/* Saturating char -> uint (all widths): each lane is raised to at least 0 with max(), then converted with the plain convert_uint* overload. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat(char x)
+{
+ const char nonneg = max(x, (char)0);
+ return convert_uint(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat(char2 x)
+{
+ const char2 nonneg = max(x, (char)0);
+ return convert_uint2(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat(char3 x)
+{
+ const char3 nonneg = max(x, (char)0);
+ return convert_uint3(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat(char4 x)
+{
+ const char4 nonneg = max(x, (char)0);
+ return convert_uint4(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat(char8 x)
+{
+ const char8 nonneg = max(x, (char)0);
+ return convert_uint8(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat(char16 x)
+{
+ const char16 nonneg = max(x, (char)0);
+ return convert_uint16(nonneg);
+}
+#ifdef cles_khr_int64
+/*
+ * Saturating char -> long (all widths), built only when cles_khr_int64 is
+ * defined. No clamp is applied; the argument is forwarded to the plain
+ * convert_long* overload.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat(char x)
+{
+ return convert_long(x);
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat(char2 x)
+{
+ return convert_long2(x);
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat(char3 x)
+{
+ return convert_long3(x);
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat(char4 x)
+{
+ return convert_long4(x);
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat(char8 x)
+{
+ return convert_long8(x);
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat(char16 x)
+{
+ return convert_long16(x);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating char -> ulong (all widths), built only when cles_khr_int64 is
+ * defined. Each lane is raised to at least 0 with max(), then converted with
+ * the plain convert_ulong* overload.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat(char x)
+{
+ const char nonneg = max(x, (char)0);
+ return convert_ulong(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat(char2 x)
+{
+ const char2 nonneg = max(x, (char)0);
+ return convert_ulong2(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat(char3 x)
+{
+ const char3 nonneg = max(x, (char)0);
+ return convert_ulong3(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat(char4 x)
+{
+ const char4 nonneg = max(x, (char)0);
+ return convert_ulong4(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat(char8 x)
+{
+ const char8 nonneg = max(x, (char)0);
+ return convert_ulong8(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat(char16 x)
+{
+ const char16 nonneg = max(x, (char)0);
+ return convert_ulong16(nonneg);
+}
+#endif
+/* Saturating uchar -> char (all widths): each lane is capped at CHAR_MAX with min(), then converted with the plain convert_char* overload. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat(uchar x)
+{
+ const uchar capped = min(x, (uchar)CHAR_MAX);
+ return convert_char(capped);
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat(uchar2 x)
+{
+ const uchar2 capped = min(x, (uchar)CHAR_MAX);
+ return convert_char2(capped);
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat(uchar3 x)
+{
+ const uchar3 capped = min(x, (uchar)CHAR_MAX);
+ return convert_char3(capped);
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat(uchar4 x)
+{
+ const uchar4 capped = min(x, (uchar)CHAR_MAX);
+ return convert_char4(capped);
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat(uchar8 x)
+{
+ const uchar8 capped = min(x, (uchar)CHAR_MAX);
+ return convert_char8(capped);
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat(uchar16 x)
+{
+ const uchar16 capped = min(x, (uchar)CHAR_MAX);
+ return convert_char16(capped);
+}
+/* Saturating uchar -> uchar (all widths): source and destination types match, so the argument is returned unchanged. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat(uchar x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat(uchar2 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat(uchar3 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat(uchar4 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat(uchar8 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat(uchar16 x)
+{
+ return x;
+}
+/* Saturating uchar -> short (all widths): no clamp is applied; the argument is forwarded to the plain convert_short* overload. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat(uchar x)
+{
+ return convert_short(x);
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat(uchar2 x)
+{
+ return convert_short2(x);
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat(uchar3 x)
+{
+ return convert_short3(x);
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat(uchar4 x)
+{
+ return convert_short4(x);
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat(uchar8 x)
+{
+ return convert_short8(x);
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat(uchar16 x)
+{
+ return convert_short16(x);
+}
+/* Saturating uchar -> ushort (all widths): no clamp is applied; the argument is forwarded to the plain convert_ushort* overload. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat(uchar x)
+{
+ return convert_ushort(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat(uchar2 x)
+{
+ return convert_ushort2(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat(uchar3 x)
+{
+ return convert_ushort3(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat(uchar4 x)
+{
+ return convert_ushort4(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat(uchar8 x)
+{
+ return convert_ushort8(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat(uchar16 x)
+{
+ return convert_ushort16(x);
+}
+/* Saturating uchar -> int (all widths): no clamp is applied; the argument is forwarded to the plain convert_int* overload. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat(uchar x)
+{
+ return convert_int(x);
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat(uchar2 x)
+{
+ return convert_int2(x);
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat(uchar3 x)
+{
+ return convert_int3(x);
+}
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat(uchar4 x)
+{
+ return convert_int4(x);
+}
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat(uchar8 x)
+{
+ return convert_int8(x);
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat(uchar16 x)
+{
+ return convert_int16(x);
+}
+/* Saturating uchar -> uint (all widths): no clamp is applied; the argument is forwarded to the plain convert_uint* overload. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat(uchar x)
+{
+ return convert_uint(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat(uchar2 x)
+{
+ return convert_uint2(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat(uchar3 x)
+{
+ return convert_uint3(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat(uchar4 x)
+{
+ return convert_uint4(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat(uchar8 x)
+{
+ return convert_uint8(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat(uchar16 x)
+{
+ return convert_uint16(x);
+}
+#ifdef cles_khr_int64
+/*
+ * Saturating uchar -> long (all widths), built only when cles_khr_int64 is
+ * defined. No clamp is applied; the argument is forwarded to the plain
+ * convert_long* overload.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat(uchar x)
+{
+ return convert_long(x);
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat(uchar2 x)
+{
+ return convert_long2(x);
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat(uchar3 x)
+{
+ return convert_long3(x);
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat(uchar4 x)
+{
+ return convert_long4(x);
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat(uchar8 x)
+{
+ return convert_long8(x);
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat(uchar16 x)
+{
+ return convert_long16(x);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating uchar -> ulong (all widths), built only when cles_khr_int64 is
+ * defined. No clamp is applied; the argument is forwarded to the plain
+ * convert_ulong* overload.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat(uchar x)
+{
+ return convert_ulong(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat(uchar2 x)
+{
+ return convert_ulong2(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat(uchar3 x)
+{
+ return convert_ulong3(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat(uchar4 x)
+{
+ return convert_ulong4(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat(uchar8 x)
+{
+ return convert_ulong8(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat(uchar16 x)
+{
+ return convert_ulong16(x);
+}
+#endif
+/* Saturating short -> char (all widths): each lane is clamped to [CHAR_MIN, CHAR_MAX], then converted with the plain convert_char* overload. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat(short x)
+{
+ const short bounded = clamp(x, (short)CHAR_MIN, (short)CHAR_MAX);
+ return convert_char(bounded);
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat(short2 x)
+{
+ const short2 bounded = clamp(x, (short)CHAR_MIN, (short)CHAR_MAX);
+ return convert_char2(bounded);
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat(short3 x)
+{
+ const short3 bounded = clamp(x, (short)CHAR_MIN, (short)CHAR_MAX);
+ return convert_char3(bounded);
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat(short4 x)
+{
+ const short4 bounded = clamp(x, (short)CHAR_MIN, (short)CHAR_MAX);
+ return convert_char4(bounded);
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat(short8 x)
+{
+ const short8 bounded = clamp(x, (short)CHAR_MIN, (short)CHAR_MAX);
+ return convert_char8(bounded);
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat(short16 x)
+{
+ const short16 bounded = clamp(x, (short)CHAR_MIN, (short)CHAR_MAX);
+ return convert_char16(bounded);
+}
+/* Saturating short -> uchar (all widths): each lane is clamped to [0, UCHAR_MAX], then converted with the plain convert_uchar* overload. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat(short x)
+{
+ const short bounded = clamp(x, (short)0, (short)UCHAR_MAX);
+ return convert_uchar(bounded);
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat(short2 x)
+{
+ const short2 bounded = clamp(x, (short)0, (short)UCHAR_MAX);
+ return convert_uchar2(bounded);
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat(short3 x)
+{
+ const short3 bounded = clamp(x, (short)0, (short)UCHAR_MAX);
+ return convert_uchar3(bounded);
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat(short4 x)
+{
+ const short4 bounded = clamp(x, (short)0, (short)UCHAR_MAX);
+ return convert_uchar4(bounded);
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat(short8 x)
+{
+ const short8 bounded = clamp(x, (short)0, (short)UCHAR_MAX);
+ return convert_uchar8(bounded);
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat(short16 x)
+{
+ const short16 bounded = clamp(x, (short)0, (short)UCHAR_MAX);
+ return convert_uchar16(bounded);
+}
+/* Saturating short -> short (all widths): source and destination types match, so the argument is returned unchanged. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat(short x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat(short2 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat(short3 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat(short4 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat(short8 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat(short16 x)
+{
+ return x;
+}
+/* Saturating short -> ushort (all widths): each lane is raised to at least 0 with max(), then converted with the plain convert_ushort* overload. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat(short x)
+{
+ const short nonneg = max(x, (short)0);
+ return convert_ushort(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat(short2 x)
+{
+ const short2 nonneg = max(x, (short)0);
+ return convert_ushort2(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat(short3 x)
+{
+ const short3 nonneg = max(x, (short)0);
+ return convert_ushort3(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat(short4 x)
+{
+ const short4 nonneg = max(x, (short)0);
+ return convert_ushort4(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat(short8 x)
+{
+ const short8 nonneg = max(x, (short)0);
+ return convert_ushort8(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat(short16 x)
+{
+ const short16 nonneg = max(x, (short)0);
+ return convert_ushort16(nonneg);
+}
+/* Saturating short -> int (all widths): no clamp is applied; the argument is forwarded to the plain convert_int* overload. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat(short x)
+{
+ return convert_int(x);
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat(short2 x)
+{
+ return convert_int2(x);
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat(short3 x)
+{
+ return convert_int3(x);
+}
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat(short4 x)
+{
+ return convert_int4(x);
+}
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat(short8 x)
+{
+ return convert_int8(x);
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat(short16 x)
+{
+ return convert_int16(x);
+}
+/* Saturating short -> uint (all widths): each lane is raised to at least 0 with max(), then converted with the plain convert_uint* overload. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat(short x)
+{
+ const short nonneg = max(x, (short)0);
+ return convert_uint(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat(short2 x)
+{
+ const short2 nonneg = max(x, (short)0);
+ return convert_uint2(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat(short3 x)
+{
+ const short3 nonneg = max(x, (short)0);
+ return convert_uint3(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat(short4 x)
+{
+ const short4 nonneg = max(x, (short)0);
+ return convert_uint4(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat(short8 x)
+{
+ const short8 nonneg = max(x, (short)0);
+ return convert_uint8(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat(short16 x)
+{
+ const short16 nonneg = max(x, (short)0);
+ return convert_uint16(nonneg);
+}
+#ifdef cles_khr_int64
+/*
+ * Saturating short -> long (all widths), built only when cles_khr_int64 is
+ * defined. No clamp is applied; the argument is forwarded to the plain
+ * convert_long* overload.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat(short x)
+{
+ return convert_long(x);
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat(short2 x)
+{
+ return convert_long2(x);
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat(short3 x)
+{
+ return convert_long3(x);
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat(short4 x)
+{
+ return convert_long4(x);
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat(short8 x)
+{
+ return convert_long8(x);
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat(short16 x)
+{
+ return convert_long16(x);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating short -> ulong (all widths), built only when cles_khr_int64 is
+ * defined. Each lane is raised to at least 0 with max(), then converted with
+ * the plain convert_ulong* overload.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat(short x)
+{
+ const short nonneg = max(x, (short)0);
+ return convert_ulong(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat(short2 x)
+{
+ const short2 nonneg = max(x, (short)0);
+ return convert_ulong2(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat(short3 x)
+{
+ const short3 nonneg = max(x, (short)0);
+ return convert_ulong3(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat(short4 x)
+{
+ const short4 nonneg = max(x, (short)0);
+ return convert_ulong4(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat(short8 x)
+{
+ const short8 nonneg = max(x, (short)0);
+ return convert_ulong8(nonneg);
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat(short16 x)
+{
+ const short16 nonneg = max(x, (short)0);
+ return convert_ulong16(nonneg);
+}
+#endif
+/* Saturating ushort -> char (all widths): each lane is capped at CHAR_MAX with min(), then converted with the plain convert_char* overload. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat(ushort x)
+{
+ const ushort capped = min(x, (ushort)CHAR_MAX);
+ return convert_char(capped);
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat(ushort2 x)
+{
+ const ushort2 capped = min(x, (ushort)CHAR_MAX);
+ return convert_char2(capped);
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat(ushort3 x)
+{
+ const ushort3 capped = min(x, (ushort)CHAR_MAX);
+ return convert_char3(capped);
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat(ushort4 x)
+{
+ const ushort4 capped = min(x, (ushort)CHAR_MAX);
+ return convert_char4(capped);
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat(ushort8 x)
+{
+ const ushort8 capped = min(x, (ushort)CHAR_MAX);
+ return convert_char8(capped);
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat(ushort16 x)
+{
+ const ushort16 capped = min(x, (ushort)CHAR_MAX);
+ return convert_char16(capped);
+}
+/* Saturating ushort -> uchar (all widths): each lane is capped at UCHAR_MAX with min(), then converted with the plain convert_uchar* overload. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat(ushort x)
+{
+ const ushort capped = min(x, (ushort)UCHAR_MAX);
+ return convert_uchar(capped);
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat(ushort2 x)
+{
+ const ushort2 capped = min(x, (ushort)UCHAR_MAX);
+ return convert_uchar2(capped);
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat(ushort3 x)
+{
+ const ushort3 capped = min(x, (ushort)UCHAR_MAX);
+ return convert_uchar3(capped);
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat(ushort4 x)
+{
+ const ushort4 capped = min(x, (ushort)UCHAR_MAX);
+ return convert_uchar4(capped);
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat(ushort8 x)
+{
+ const ushort8 capped = min(x, (ushort)UCHAR_MAX);
+ return convert_uchar8(capped);
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat(ushort16 x)
+{
+ const ushort16 capped = min(x, (ushort)UCHAR_MAX);
+ return convert_uchar16(capped);
+}
+/* Saturating ushort -> short (all widths): each lane is capped at SHRT_MAX with min(), then converted with the plain convert_short* overload. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat(ushort x)
+{
+ const ushort capped = min(x, (ushort)SHRT_MAX);
+ return convert_short(capped);
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat(ushort2 x)
+{
+ const ushort2 capped = min(x, (ushort)SHRT_MAX);
+ return convert_short2(capped);
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat(ushort3 x)
+{
+ const ushort3 capped = min(x, (ushort)SHRT_MAX);
+ return convert_short3(capped);
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat(ushort4 x)
+{
+ const ushort4 capped = min(x, (ushort)SHRT_MAX);
+ return convert_short4(capped);
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat(ushort8 x)
+{
+ const ushort8 capped = min(x, (ushort)SHRT_MAX);
+ return convert_short8(capped);
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat(ushort16 x)
+{
+ const ushort16 capped = min(x, (ushort)SHRT_MAX);
+ return convert_short16(capped);
+}
+/* Saturating ushort -> ushort (all widths): source and destination types match, so the argument is returned unchanged. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat(ushort x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat(ushort2 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat(ushort3 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat(ushort4 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat(ushort8 x)
+{
+ return x;
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat(ushort16 x)
+{
+ return x;
+}
+/* Saturating ushort -> int (all widths): no clamp is applied; the argument is forwarded to the plain convert_int* overload. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat(ushort x)
+{
+ return convert_int(x);
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat(ushort2 x)
+{
+ return convert_int2(x);
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat(ushort3 x)
+{
+ return convert_int3(x);
+}
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat(ushort4 x)
+{
+ return convert_int4(x);
+}
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat(ushort8 x)
+{
+ return convert_int8(x);
+}
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat(ushort16 x)
+{
+ return convert_int16(x);
+}
+/* Saturating ushort -> uint (all widths): no clamp is applied; the argument is forwarded to the plain convert_uint* overload. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat(ushort x)
+{
+ return convert_uint(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat(ushort2 x)
+{
+ return convert_uint2(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat(ushort3 x)
+{
+ return convert_uint3(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat(ushort4 x)
+{
+ return convert_uint4(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat(ushort8 x)
+{
+ return convert_uint8(x);
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat(ushort16 x)
+{
+ return convert_uint16(x);
+}
+#ifdef cles_khr_int64
+/*
+ * Saturating ushort -> long (all widths), built only when cles_khr_int64 is
+ * defined. No clamp is applied; the argument is forwarded to the plain
+ * convert_long* overload.
+ */
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat(ushort x)
+{
+ return convert_long(x);
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat(ushort2 x)
+{
+ return convert_long2(x);
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat(ushort3 x)
+{
+ return convert_long3(x);
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat(ushort4 x)
+{
+ return convert_long4(x);
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat(ushort8 x)
+{
+ return convert_long8(x);
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat(ushort16 x)
+{
+ return convert_long16(x);
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating ushort -> ulong (all widths), built only when cles_khr_int64 is
+ * defined. No clamp is applied; the argument is forwarded to the plain
+ * convert_ulong* overload.
+ */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat(ushort x)
+{
+ return convert_ulong(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat(ushort2 x)
+{
+ return convert_ulong2(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat(ushort3 x)
+{
+ return convert_ulong3(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat(ushort4 x)
+{
+ return convert_ulong4(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat(ushort8 x)
+{
+ return convert_ulong8(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat(ushort16 x)
+{
+ return convert_ulong16(x);
+}
+#endif
+/* Saturating int -> char (all widths): each lane is clamped to [CHAR_MIN, CHAR_MAX], then converted with the plain convert_char* overload. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat(int x)
+{
+ const int bounded = clamp(x, (int)CHAR_MIN, (int)CHAR_MAX);
+ return convert_char(bounded);
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat(int2 x)
+{
+ const int2 bounded = clamp(x, (int)CHAR_MIN, (int)CHAR_MAX);
+ return convert_char2(bounded);
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat(int3 x)
+{
+ const int3 bounded = clamp(x, (int)CHAR_MIN, (int)CHAR_MAX);
+ return convert_char3(bounded);
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat(int4 x)
+{
+ const int4 bounded = clamp(x, (int)CHAR_MIN, (int)CHAR_MAX);
+ return convert_char4(bounded);
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat(int8 x)
+{
+ const int8 bounded = clamp(x, (int)CHAR_MIN, (int)CHAR_MAX);
+ return convert_char8(bounded);
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat(int16 x)
+{
+ const int16 bounded = clamp(x, (int)CHAR_MIN, (int)CHAR_MAX);
+ return convert_char16(bounded);
+}
+/*
+ * Saturating int -> uchar conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The input is clamped into [0, UCHAR_MAX] before the narrowing conversion.
+ */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat(int x)
+{
+ return convert_uchar(clamp(x, (int)0, (int)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat(int2 x)
+{
+ return convert_uchar2(clamp(x, (int)0, (int)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat(int3 x)
+{
+ return convert_uchar3(clamp(x, (int)0, (int)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat(int4 x)
+{
+ return convert_uchar4(clamp(x, (int)0, (int)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat(int8 x)
+{
+ return convert_uchar8(clamp(x, (int)0, (int)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat(int16 x)
+{
+ return convert_uchar16(clamp(x, (int)0, (int)UCHAR_MAX));
+}
+/*
+ * Saturating int -> short conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The input is clamped into [SHRT_MIN, SHRT_MAX] before the narrowing
+ * conversion.
+ */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat(int x)
+{
+ return convert_short(clamp(x, (int)SHRT_MIN, (int)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat(int2 x)
+{
+ return convert_short2(clamp(x, (int)SHRT_MIN, (int)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat(int3 x)
+{
+ return convert_short3(clamp(x, (int)SHRT_MIN, (int)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat(int4 x)
+{
+ return convert_short4(clamp(x, (int)SHRT_MIN, (int)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat(int8 x)
+{
+ return convert_short8(clamp(x, (int)SHRT_MIN, (int)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat(int16 x)
+{
+ return convert_short16(clamp(x, (int)SHRT_MIN, (int)SHRT_MAX));
+}
+/*
+ * Saturating int -> ushort conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The input is clamped into [0, USHRT_MAX] before the narrowing conversion.
+ */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat(int x)
+{
+ return convert_ushort(clamp(x, (int)0, (int)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat(int2 x)
+{
+ return convert_ushort2(clamp(x, (int)0, (int)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat(int3 x)
+{
+ return convert_ushort3(clamp(x, (int)0, (int)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat(int4 x)
+{
+ return convert_ushort4(clamp(x, (int)0, (int)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat(int8 x)
+{
+ return convert_ushort8(clamp(x, (int)0, (int)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat(int16 x)
+{
+ return convert_ushort16(clamp(x, (int)0, (int)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat(int x)
+{
+ return x; /* int -> int: source and destination types match, value passes through */
+}
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat(int2 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat(int3 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat(int4 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat(int8 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat(int16 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+/*
+ * Saturating int -> uint conversions (scalar and 2/3/4/8/16-wide vectors).
+ * Only a lower bound is applied: negative inputs are raised to 0 before the
+ * conversion.
+ */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat(int x)
+{
+ return convert_uint(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat(int2 x)
+{
+ return convert_uint2(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat(int3 x)
+{
+ return convert_uint3(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat(int4 x)
+{
+ return convert_uint4(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat(int8 x)
+{
+ return convert_uint8(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat(int16 x)
+{
+ return convert_uint16(max(x, (int)0));
+}
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat(int x)
+{
+ return convert_long(x); /* signed widening: nothing to saturate */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat(int2 x)
+{
+ return convert_long2(x); /* signed widening: nothing to saturate */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat(int3 x)
+{
+ return convert_long3(x); /* signed widening: nothing to saturate */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat(int4 x)
+{
+ return convert_long4(x); /* signed widening: nothing to saturate */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat(int8 x)
+{
+ return convert_long8(x); /* signed widening: nothing to saturate */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat(int16 x)
+{
+ return convert_long16(x); /* signed widening: nothing to saturate */
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating int -> ulong conversions (scalar and 2/3/4/8/16-wide vectors).
+ * Only a lower bound is applied: negative inputs are raised to 0 before the
+ * conversion. Compiled only when cles_khr_int64 is defined.
+ */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat(int x)
+{
+ return convert_ulong(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat(int2 x)
+{
+ return convert_ulong2(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat(int3 x)
+{
+ return convert_ulong3(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat(int4 x)
+{
+ return convert_ulong4(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat(int8 x)
+{
+ return convert_ulong8(max(x, (int)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat(int16 x)
+{
+ return convert_ulong16(max(x, (int)0));
+}
+#endif
+/*
+ * Saturating uint -> char conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The source is unsigned, so only the upper bound CHAR_MAX is enforced before
+ * the narrowing conversion.
+ */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat(uint x)
+{
+ return convert_char(min(x, (uint)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat(uint2 x)
+{
+ return convert_char2(min(x, (uint)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat(uint3 x)
+{
+ return convert_char3(min(x, (uint)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat(uint4 x)
+{
+ return convert_char4(min(x, (uint)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat(uint8 x)
+{
+ return convert_char8(min(x, (uint)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat(uint16 x)
+{
+ return convert_char16(min(x, (uint)CHAR_MAX));
+}
+/*
+ * Saturating uint -> uchar conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The input is capped at UCHAR_MAX before the narrowing conversion.
+ */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat(uint x)
+{
+ return convert_uchar(min(x, (uint)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat(uint2 x)
+{
+ return convert_uchar2(min(x, (uint)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat(uint3 x)
+{
+ return convert_uchar3(min(x, (uint)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat(uint4 x)
+{
+ return convert_uchar4(min(x, (uint)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat(uint8 x)
+{
+ return convert_uchar8(min(x, (uint)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat(uint16 x)
+{
+ return convert_uchar16(min(x, (uint)UCHAR_MAX));
+}
+/*
+ * Saturating uint -> short conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The source is unsigned, so only the upper bound SHRT_MAX is enforced before
+ * the narrowing conversion.
+ */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat(uint x)
+{
+ return convert_short(min(x, (uint)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat(uint2 x)
+{
+ return convert_short2(min(x, (uint)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat(uint3 x)
+{
+ return convert_short3(min(x, (uint)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat(uint4 x)
+{
+ return convert_short4(min(x, (uint)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat(uint8 x)
+{
+ return convert_short8(min(x, (uint)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat(uint16 x)
+{
+ return convert_short16(min(x, (uint)SHRT_MAX));
+}
+/*
+ * Saturating uint -> ushort conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The input is capped at USHRT_MAX before the narrowing conversion.
+ */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat(uint x)
+{
+ return convert_ushort(min(x, (uint)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat(uint2 x)
+{
+ return convert_ushort2(min(x, (uint)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat(uint3 x)
+{
+ return convert_ushort3(min(x, (uint)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat(uint4 x)
+{
+ return convert_ushort4(min(x, (uint)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat(uint8 x)
+{
+ return convert_ushort8(min(x, (uint)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat(uint16 x)
+{
+ return convert_ushort16(min(x, (uint)USHRT_MAX));
+}
+/*
+ * Saturating uint -> int conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The source is unsigned, so only the upper bound INT_MAX is enforced before
+ * the conversion.
+ */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat(uint x)
+{
+ return convert_int(min(x, (uint)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat(uint2 x)
+{
+ return convert_int2(min(x, (uint)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat(uint3 x)
+{
+ return convert_int3(min(x, (uint)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat(uint4 x)
+{
+ return convert_int4(min(x, (uint)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat(uint8 x)
+{
+ return convert_int8(min(x, (uint)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat(uint16 x)
+{
+ return convert_int16(min(x, (uint)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat(uint x)
+{
+ return x; /* uint -> uint: source and destination types match, value passes through */
+}
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat(uint2 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat(uint3 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat(uint4 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat(uint8 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat(uint16 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat(uint x)
+{
+ return convert_long(x); /* every uint value fits in long: nothing to saturate */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat(uint2 x)
+{
+ return convert_long2(x); /* every uint value fits in long: nothing to saturate */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat(uint3 x)
+{
+ return convert_long3(x); /* every uint value fits in long: nothing to saturate */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat(uint4 x)
+{
+ return convert_long4(x); /* every uint value fits in long: nothing to saturate */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat(uint8 x)
+{
+ return convert_long8(x); /* every uint value fits in long: nothing to saturate */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat(uint16 x)
+{
+ return convert_long16(x); /* every uint value fits in long: nothing to saturate */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat(uint x)
+{
+ return convert_ulong(x); /* unsigned widening: nothing to saturate */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat(uint2 x)
+{
+ return convert_ulong2(x); /* unsigned widening: nothing to saturate */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat(uint3 x)
+{
+ return convert_ulong3(x); /* unsigned widening: nothing to saturate */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat(uint4 x)
+{
+ return convert_ulong4(x); /* unsigned widening: nothing to saturate */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat(uint8 x)
+{
+ return convert_ulong8(x); /* unsigned widening: nothing to saturate */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat(uint16 x)
+{
+ return convert_ulong16(x); /* unsigned widening: nothing to saturate */
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating long -> char conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The input is clamped into [CHAR_MIN, CHAR_MAX] before the narrowing
+ * conversion. Compiled only when cles_khr_int64 is defined.
+ */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat(long x)
+{
+ return convert_char(clamp(x, (long)CHAR_MIN, (long)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat(long2 x)
+{
+ return convert_char2(clamp(x, (long)CHAR_MIN, (long)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat(long3 x)
+{
+ return convert_char3(clamp(x, (long)CHAR_MIN, (long)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat(long4 x)
+{
+ return convert_char4(clamp(x, (long)CHAR_MIN, (long)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat(long8 x)
+{
+ return convert_char8(clamp(x, (long)CHAR_MIN, (long)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat(long16 x)
+{
+ return convert_char16(clamp(x, (long)CHAR_MIN, (long)CHAR_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating long -> uchar conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The input is clamped into [0, UCHAR_MAX] before the narrowing conversion.
+ * Compiled only when cles_khr_int64 is defined.
+ */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat(long x)
+{
+ return convert_uchar(clamp(x, (long)0, (long)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat(long2 x)
+{
+ return convert_uchar2(clamp(x, (long)0, (long)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat(long3 x)
+{
+ return convert_uchar3(clamp(x, (long)0, (long)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat(long4 x)
+{
+ return convert_uchar4(clamp(x, (long)0, (long)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat(long8 x)
+{
+ return convert_uchar8(clamp(x, (long)0, (long)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat(long16 x)
+{
+ return convert_uchar16(clamp(x, (long)0, (long)UCHAR_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating long -> short conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The input is clamped into [SHRT_MIN, SHRT_MAX] before the narrowing
+ * conversion. Compiled only when cles_khr_int64 is defined.
+ */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat(long x)
+{
+ return convert_short(clamp(x, (long)SHRT_MIN, (long)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat(long2 x)
+{
+ return convert_short2(clamp(x, (long)SHRT_MIN, (long)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat(long3 x)
+{
+ return convert_short3(clamp(x, (long)SHRT_MIN, (long)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat(long4 x)
+{
+ return convert_short4(clamp(x, (long)SHRT_MIN, (long)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat(long8 x)
+{
+ return convert_short8(clamp(x, (long)SHRT_MIN, (long)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat(long16 x)
+{
+ return convert_short16(clamp(x, (long)SHRT_MIN, (long)SHRT_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating long -> ushort conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The input is clamped into [0, USHRT_MAX] before the narrowing conversion.
+ * Compiled only when cles_khr_int64 is defined.
+ */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat(long x)
+{
+ return convert_ushort(clamp(x, (long)0, (long)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat(long2 x)
+{
+ return convert_ushort2(clamp(x, (long)0, (long)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat(long3 x)
+{
+ return convert_ushort3(clamp(x, (long)0, (long)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat(long4 x)
+{
+ return convert_ushort4(clamp(x, (long)0, (long)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat(long8 x)
+{
+ return convert_ushort8(clamp(x, (long)0, (long)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat(long16 x)
+{
+ return convert_ushort16(clamp(x, (long)0, (long)USHRT_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating long -> int conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The input is clamped into [INT_MIN, INT_MAX] before the narrowing
+ * conversion. Compiled only when cles_khr_int64 is defined.
+ */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat(long x)
+{
+ return convert_int(clamp(x, (long)INT_MIN, (long)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat(long2 x)
+{
+ return convert_int2(clamp(x, (long)INT_MIN, (long)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat(long3 x)
+{
+ return convert_int3(clamp(x, (long)INT_MIN, (long)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat(long4 x)
+{
+ return convert_int4(clamp(x, (long)INT_MIN, (long)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat(long8 x)
+{
+ return convert_int8(clamp(x, (long)INT_MIN, (long)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat(long16 x)
+{
+ return convert_int16(clamp(x, (long)INT_MIN, (long)INT_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating long -> uint conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The input is clamped into [0, UINT_MAX] before the narrowing conversion.
+ * Compiled only when cles_khr_int64 is defined.
+ */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat(long x)
+{
+ return convert_uint(clamp(x, (long)0, (long)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat(long2 x)
+{
+ return convert_uint2(clamp(x, (long)0, (long)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat(long3 x)
+{
+ return convert_uint3(clamp(x, (long)0, (long)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat(long4 x)
+{
+ return convert_uint4(clamp(x, (long)0, (long)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat(long8 x)
+{
+ return convert_uint8(clamp(x, (long)0, (long)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat(long16 x)
+{
+ return convert_uint16(clamp(x, (long)0, (long)UINT_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat(long x)
+{
+ return x; /* long -> long: source and destination types match, value passes through */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat(long2 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat(long3 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat(long4 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat(long8 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat(long16 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating long -> ulong conversions (scalar and 2/3/4/8/16-wide vectors).
+ * Only a lower bound is applied: negative inputs are raised to 0 before the
+ * conversion. Compiled only when cles_khr_int64 is defined.
+ */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat(long x)
+{
+ return convert_ulong(max(x, (long)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat(long2 x)
+{
+ return convert_ulong2(max(x, (long)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat(long3 x)
+{
+ return convert_ulong3(max(x, (long)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat(long4 x)
+{
+ return convert_ulong4(max(x, (long)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat(long8 x)
+{
+ return convert_ulong8(max(x, (long)0));
+}
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat(long16 x)
+{
+ return convert_ulong16(max(x, (long)0));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating ulong -> char conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The source is unsigned, so only the upper bound CHAR_MAX is enforced before
+ * the narrowing conversion. Compiled only when cles_khr_int64 is defined.
+ */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat(ulong x)
+{
+ return convert_char(min(x, (ulong)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat(ulong2 x)
+{
+ return convert_char2(min(x, (ulong)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat(ulong3 x)
+{
+ return convert_char3(min(x, (ulong)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat(ulong4 x)
+{
+ return convert_char4(min(x, (ulong)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat(ulong8 x)
+{
+ return convert_char8(min(x, (ulong)CHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat(ulong16 x)
+{
+ return convert_char16(min(x, (ulong)CHAR_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating ulong -> uchar conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The input is capped at UCHAR_MAX before the narrowing conversion.
+ * Compiled only when cles_khr_int64 is defined.
+ */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat(ulong x)
+{
+ return convert_uchar(min(x, (ulong)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat(ulong2 x)
+{
+ return convert_uchar2(min(x, (ulong)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat(ulong3 x)
+{
+ return convert_uchar3(min(x, (ulong)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat(ulong4 x)
+{
+ return convert_uchar4(min(x, (ulong)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat(ulong8 x)
+{
+ return convert_uchar8(min(x, (ulong)UCHAR_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat(ulong16 x)
+{
+ return convert_uchar16(min(x, (ulong)UCHAR_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating ulong -> short conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The source is unsigned, so only the upper bound SHRT_MAX is enforced before
+ * the narrowing conversion. Compiled only when cles_khr_int64 is defined.
+ */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat(ulong x)
+{
+ return convert_short(min(x, (ulong)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat(ulong2 x)
+{
+ return convert_short2(min(x, (ulong)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat(ulong3 x)
+{
+ return convert_short3(min(x, (ulong)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat(ulong4 x)
+{
+ return convert_short4(min(x, (ulong)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat(ulong8 x)
+{
+ return convert_short8(min(x, (ulong)SHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat(ulong16 x)
+{
+ return convert_short16(min(x, (ulong)SHRT_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating ulong -> ushort conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The input is capped at USHRT_MAX before the narrowing conversion.
+ * Compiled only when cles_khr_int64 is defined.
+ */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat(ulong x)
+{
+ return convert_ushort(min(x, (ulong)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat(ulong2 x)
+{
+ return convert_ushort2(min(x, (ulong)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat(ulong3 x)
+{
+ return convert_ushort3(min(x, (ulong)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat(ulong4 x)
+{
+ return convert_ushort4(min(x, (ulong)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat(ulong8 x)
+{
+ return convert_ushort8(min(x, (ulong)USHRT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat(ulong16 x)
+{
+ return convert_ushort16(min(x, (ulong)USHRT_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating ulong -> int conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The source is unsigned, so only the upper bound INT_MAX is enforced before
+ * the narrowing conversion. Compiled only when cles_khr_int64 is defined.
+ */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat(ulong x)
+{
+ return convert_int(min(x, (ulong)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat(ulong2 x)
+{
+ return convert_int2(min(x, (ulong)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat(ulong3 x)
+{
+ return convert_int3(min(x, (ulong)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat(ulong4 x)
+{
+ return convert_int4(min(x, (ulong)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat(ulong8 x)
+{
+ return convert_int8(min(x, (ulong)INT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat(ulong16 x)
+{
+ return convert_int16(min(x, (ulong)INT_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating ulong -> uint conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The input is capped at UINT_MAX before the narrowing conversion.
+ * Compiled only when cles_khr_int64 is defined.
+ */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat(ulong x)
+{
+ return convert_uint(min(x, (ulong)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat(ulong2 x)
+{
+ return convert_uint2(min(x, (ulong)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat(ulong3 x)
+{
+ return convert_uint3(min(x, (ulong)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat(ulong4 x)
+{
+ return convert_uint4(min(x, (ulong)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat(ulong8 x)
+{
+ return convert_uint8(min(x, (ulong)UINT_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat(ulong16 x)
+{
+ return convert_uint16(min(x, (ulong)UINT_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating ulong -> long conversions (scalar and 2/3/4/8/16-wide vectors).
+ * The source is unsigned, so only the upper bound LONG_MAX is enforced before
+ * the conversion. Compiled only when cles_khr_int64 is defined.
+ */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat(ulong x)
+{
+ return convert_long(min(x, (ulong)LONG_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat(ulong2 x)
+{
+ return convert_long2(min(x, (ulong)LONG_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat(ulong3 x)
+{
+ return convert_long3(min(x, (ulong)LONG_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat(ulong4 x)
+{
+ return convert_long4(min(x, (ulong)LONG_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat(ulong8 x)
+{
+ return convert_long8(min(x, (ulong)LONG_MAX));
+}
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat(ulong16 x)
+{
+ return convert_long16(min(x, (ulong)LONG_MAX));
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat(ulong x)
+{
+ return x; /* ulong -> ulong: source and destination types match, value passes through */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat(ulong2 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat(ulong3 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat(ulong4 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat(ulong8 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat(ulong16 x)
+{
+ return x; /* same type in and out: value passes through */
+}
+#endif
+/*
+ * Saturating float -> char conversions (scalar and 2/3/4/8/16-wide vectors).
+ *
+ * The raw convert_charN() result is patched with select():
+ *  - x below CHAR_MIN -> CHAR_MIN
+ *  - x above CHAR_MAX -> CHAR_MAX
+ *  - NaN              -> 0, as the OpenCL saturated-conversion rules require.
+ *    NaN fails both range comparisons, so it needs its own test; x != x is
+ *    true only for NaN.
+ * The comparison result is converted to the destination type so it can be
+ * used directly as the select() mask.
+ */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat(float x)
+{
+ char y = convert_char(x);
+ y = select(y, (char)CHAR_MIN, convert_char(x < (float)CHAR_MIN));
+ y = select(y, (char)CHAR_MAX, convert_char(x > (float)CHAR_MAX));
+ y = select(y, (char)0, convert_char(x != x)); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat(float2 x)
+{
+ char2 y = convert_char2(x);
+ y = select(y, (char2)CHAR_MIN, convert_char2(x < (float2)CHAR_MIN));
+ y = select(y, (char2)CHAR_MAX, convert_char2(x > (float2)CHAR_MAX));
+ y = select(y, (char2)0, convert_char2(x != x)); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat(float3 x)
+{
+ char3 y = convert_char3(x);
+ y = select(y, (char3)CHAR_MIN, convert_char3(x < (float3)CHAR_MIN));
+ y = select(y, (char3)CHAR_MAX, convert_char3(x > (float3)CHAR_MAX));
+ y = select(y, (char3)0, convert_char3(x != x)); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat(float4 x)
+{
+ char4 y = convert_char4(x);
+ y = select(y, (char4)CHAR_MIN, convert_char4(x < (float4)CHAR_MIN));
+ y = select(y, (char4)CHAR_MAX, convert_char4(x > (float4)CHAR_MAX));
+ y = select(y, (char4)0, convert_char4(x != x)); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat(float8 x)
+{
+ char8 y = convert_char8(x);
+ y = select(y, (char8)CHAR_MIN, convert_char8(x < (float8)CHAR_MIN));
+ y = select(y, (char8)CHAR_MAX, convert_char8(x > (float8)CHAR_MAX));
+ y = select(y, (char8)0, convert_char8(x != x)); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat(float16 x)
+{
+ char16 y = convert_char16(x);
+ y = select(y, (char16)CHAR_MIN, convert_char16(x < (float16)CHAR_MIN));
+ y = select(y, (char16)CHAR_MAX, convert_char16(x > (float16)CHAR_MAX));
+ y = select(y, (char16)0, convert_char16(x != x)); /* NaN -> 0 */
+ return y;
+}
+/*
+ * Saturating float -> uchar conversions (scalar and 2/3/4/8/16-wide vectors).
+ *
+ * The raw convert_ucharN() result is patched with select():
+ *  - x below 0         -> 0
+ *  - x above UCHAR_MAX -> UCHAR_MAX
+ *  - NaN               -> 0, as the OpenCL saturated-conversion rules require.
+ *    NaN fails both range comparisons, so it needs its own test; x != x is
+ *    true only for NaN.
+ * The comparison result is converted to charN and reinterpreted as ucharN so
+ * it matches the mask type select() expects for ucharN operands.
+ */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat(float x)
+{
+ uchar y = convert_uchar(x);
+ y = select(y, (uchar)0, as_uchar(convert_char(x < (float)0)));
+ y = select(y, (uchar)UCHAR_MAX, as_uchar(convert_char(x > (float)UCHAR_MAX)));
+ y = select(y, (uchar)0, as_uchar(convert_char(x != x))); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat(float2 x)
+{
+ uchar2 y = convert_uchar2(x);
+ y = select(y, (uchar2)0, as_uchar2(convert_char2(x < (float2)0)));
+ y = select(y, (uchar2)UCHAR_MAX, as_uchar2(convert_char2(x > (float2)UCHAR_MAX)));
+ y = select(y, (uchar2)0, as_uchar2(convert_char2(x != x))); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat(float3 x)
+{
+ uchar3 y = convert_uchar3(x);
+ y = select(y, (uchar3)0, as_uchar3(convert_char3(x < (float3)0)));
+ y = select(y, (uchar3)UCHAR_MAX, as_uchar3(convert_char3(x > (float3)UCHAR_MAX)));
+ y = select(y, (uchar3)0, as_uchar3(convert_char3(x != x))); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat(float4 x)
+{
+ uchar4 y = convert_uchar4(x);
+ y = select(y, (uchar4)0, as_uchar4(convert_char4(x < (float4)0)));
+ y = select(y, (uchar4)UCHAR_MAX, as_uchar4(convert_char4(x > (float4)UCHAR_MAX)));
+ y = select(y, (uchar4)0, as_uchar4(convert_char4(x != x))); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat(float8 x)
+{
+ uchar8 y = convert_uchar8(x);
+ y = select(y, (uchar8)0, as_uchar8(convert_char8(x < (float8)0)));
+ y = select(y, (uchar8)UCHAR_MAX, as_uchar8(convert_char8(x > (float8)UCHAR_MAX)));
+ y = select(y, (uchar8)0, as_uchar8(convert_char8(x != x))); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat(float16 x)
+{
+ uchar16 y = convert_uchar16(x);
+ y = select(y, (uchar16)0, as_uchar16(convert_char16(x < (float16)0)));
+ y = select(y, (uchar16)UCHAR_MAX, as_uchar16(convert_char16(x > (float16)UCHAR_MAX)));
+ y = select(y, (uchar16)0, as_uchar16(convert_char16(x != x))); /* NaN -> 0 */
+ return y;
+}
+/*
+ * Saturating float -> short conversions (scalar and 2/3/4/8/16-wide vectors).
+ *
+ * The raw convert_shortN() result is patched with select():
+ *  - x below SHRT_MIN -> SHRT_MIN
+ *  - x above SHRT_MAX -> SHRT_MAX
+ *  - NaN              -> 0, as the OpenCL saturated-conversion rules require.
+ *    NaN fails both range comparisons, so it needs its own test; x != x is
+ *    true only for NaN.
+ * The comparison result is converted to the destination type so it can be
+ * used directly as the select() mask.
+ */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat(float x)
+{
+ short y = convert_short(x);
+ y = select(y, (short)SHRT_MIN, convert_short(x < (float)SHRT_MIN));
+ y = select(y, (short)SHRT_MAX, convert_short(x > (float)SHRT_MAX));
+ y = select(y, (short)0, convert_short(x != x)); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat(float2 x)
+{
+ short2 y = convert_short2(x);
+ y = select(y, (short2)SHRT_MIN, convert_short2(x < (float2)SHRT_MIN));
+ y = select(y, (short2)SHRT_MAX, convert_short2(x > (float2)SHRT_MAX));
+ y = select(y, (short2)0, convert_short2(x != x)); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat(float3 x)
+{
+ short3 y = convert_short3(x);
+ y = select(y, (short3)SHRT_MIN, convert_short3(x < (float3)SHRT_MIN));
+ y = select(y, (short3)SHRT_MAX, convert_short3(x > (float3)SHRT_MAX));
+ y = select(y, (short3)0, convert_short3(x != x)); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat(float4 x)
+{
+ short4 y = convert_short4(x);
+ y = select(y, (short4)SHRT_MIN, convert_short4(x < (float4)SHRT_MIN));
+ y = select(y, (short4)SHRT_MAX, convert_short4(x > (float4)SHRT_MAX));
+ y = select(y, (short4)0, convert_short4(x != x)); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat(float8 x)
+{
+ short8 y = convert_short8(x);
+ y = select(y, (short8)SHRT_MIN, convert_short8(x < (float8)SHRT_MIN));
+ y = select(y, (short8)SHRT_MAX, convert_short8(x > (float8)SHRT_MAX));
+ y = select(y, (short8)0, convert_short8(x != x)); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat(float16 x)
+{
+ short16 y = convert_short16(x);
+ y = select(y, (short16)SHRT_MIN, convert_short16(x < (float16)SHRT_MIN));
+ y = select(y, (short16)SHRT_MAX, convert_short16(x > (float16)SHRT_MAX));
+ y = select(y, (short16)0, convert_short16(x != x)); /* NaN -> 0 */
+ return y;
+}
+/*
+ * Saturating float -> ushort conversions (scalar and 2/3/4/8/16-wide vectors).
+ *
+ * The raw convert_ushortN() result is patched with select():
+ *  - x below 0         -> 0
+ *  - x above USHRT_MAX -> USHRT_MAX
+ *  - NaN               -> 0, as the OpenCL saturated-conversion rules require.
+ *    NaN fails both range comparisons, so it needs its own test; x != x is
+ *    true only for NaN.
+ * The comparison result is converted to shortN and reinterpreted as ushortN so
+ * it matches the mask type select() expects for ushortN operands.
+ */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat(float x)
+{
+ ushort y = convert_ushort(x);
+ y = select(y, (ushort)0, as_ushort(convert_short(x < (float)0)));
+ y = select(y, (ushort)USHRT_MAX, as_ushort(convert_short(x > (float)USHRT_MAX)));
+ y = select(y, (ushort)0, as_ushort(convert_short(x != x))); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat(float2 x)
+{
+ ushort2 y = convert_ushort2(x);
+ y = select(y, (ushort2)0, as_ushort2(convert_short2(x < (float2)0)));
+ y = select(y, (ushort2)USHRT_MAX, as_ushort2(convert_short2(x > (float2)USHRT_MAX)));
+ y = select(y, (ushort2)0, as_ushort2(convert_short2(x != x))); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat(float3 x)
+{
+ ushort3 y = convert_ushort3(x);
+ y = select(y, (ushort3)0, as_ushort3(convert_short3(x < (float3)0)));
+ y = select(y, (ushort3)USHRT_MAX, as_ushort3(convert_short3(x > (float3)USHRT_MAX)));
+ y = select(y, (ushort3)0, as_ushort3(convert_short3(x != x))); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat(float4 x)
+{
+ ushort4 y = convert_ushort4(x);
+ y = select(y, (ushort4)0, as_ushort4(convert_short4(x < (float4)0)));
+ y = select(y, (ushort4)USHRT_MAX, as_ushort4(convert_short4(x > (float4)USHRT_MAX)));
+ y = select(y, (ushort4)0, as_ushort4(convert_short4(x != x))); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat(float8 x)
+{
+ ushort8 y = convert_ushort8(x);
+ y = select(y, (ushort8)0, as_ushort8(convert_short8(x < (float8)0)));
+ y = select(y, (ushort8)USHRT_MAX, as_ushort8(convert_short8(x > (float8)USHRT_MAX)));
+ y = select(y, (ushort8)0, as_ushort8(convert_short8(x != x))); /* NaN -> 0 */
+ return y;
+}
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat(float16 x)
+{
+ ushort16 y = convert_ushort16(x);
+ y = select(y, (ushort16)0, as_ushort16(convert_short16(x < (float16)0)));
+ y = select(y, (ushort16)USHRT_MAX, as_ushort16(convert_short16(x > (float16)USHRT_MAX)));
+ y = select(y, (ushort16)0, as_ushort16(convert_short16(x != x))); /* NaN -> 0 */
+ return y;
+}
+/* Saturating float -> int conversion, clamped to [INT_MIN, INT_MAX]. The upper test uses >=
+ * because (float)INT_MAX rounds up to 2^31, which int cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat(float x)
+{
+ int y = convert_int(x);
+ y = select(y, (int)INT_MIN, convert_int(x < (float)INT_MIN));
+ y = select(y, (int)INT_MAX, convert_int(x >= (float)INT_MAX));
+ return y;
+}
+/* Saturating float2 -> int2 conversion, each lane clamped to [INT_MIN, INT_MAX]. The upper test
+ * uses >= because (float)INT_MAX rounds up to 2^31, which int cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat(float2 x)
+{
+ int2 y = convert_int2(x);
+ y = select(y, (int2)INT_MIN, convert_int2(x < (float2)INT_MIN));
+ y = select(y, (int2)INT_MAX, convert_int2(x >= (float2)INT_MAX));
+ return y;
+}
+/* Saturating float3 -> int3 conversion, each lane clamped to [INT_MIN, INT_MAX]. The upper test
+ * uses >= because (float)INT_MAX rounds up to 2^31, which int cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat(float3 x)
+{
+ int3 y = convert_int3(x);
+ y = select(y, (int3)INT_MIN, convert_int3(x < (float3)INT_MIN));
+ y = select(y, (int3)INT_MAX, convert_int3(x >= (float3)INT_MAX));
+ return y;
+}
+/* Saturating float4 -> int4 conversion, each lane clamped to [INT_MIN, INT_MAX]. The upper test
+ * uses >= because (float)INT_MAX rounds up to 2^31, which int cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat(float4 x)
+{
+ int4 y = convert_int4(x);
+ y = select(y, (int4)INT_MIN, convert_int4(x < (float4)INT_MIN));
+ y = select(y, (int4)INT_MAX, convert_int4(x >= (float4)INT_MAX));
+ return y;
+}
+/* Saturating float8 -> int8 conversion, each lane clamped to [INT_MIN, INT_MAX]. The upper test
+ * uses >= because (float)INT_MAX rounds up to 2^31, which int cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat(float8 x)
+{
+ int8 y = convert_int8(x);
+ y = select(y, (int8)INT_MIN, convert_int8(x < (float8)INT_MIN));
+ y = select(y, (int8)INT_MAX, convert_int8(x >= (float8)INT_MAX));
+ return y;
+}
+/* Saturating float16 -> int16 conversion, each lane clamped to [INT_MIN, INT_MAX]. The upper test
+ * uses >= because (float)INT_MAX rounds up to 2^31, which int cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat(float16 x)
+{
+ int16 y = convert_int16(x);
+ y = select(y, (int16)INT_MIN, convert_int16(x < (float16)INT_MIN));
+ y = select(y, (int16)INT_MAX, convert_int16(x >= (float16)INT_MAX));
+ return y;
+}
+/* Saturating float -> uint conversion, clamped to [0, UINT_MAX]. The upper test uses >=
+ * because (float)UINT_MAX rounds up to 2^32, which uint cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat(float x)
+{
+ uint y = convert_uint(x);
+ y = select(y, (uint)0, as_uint(convert_int(x < (float)0)));
+ y = select(y, (uint)UINT_MAX, as_uint(convert_int(x >= (float)UINT_MAX)));
+ return y;
+}
+/* Saturating float2 -> uint2 conversion, each lane clamped to [0, UINT_MAX]. The upper test
+ * uses >= because (float)UINT_MAX rounds up to 2^32, which uint cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat(float2 x)
+{
+ uint2 y = convert_uint2(x);
+ y = select(y, (uint2)0, as_uint2(convert_int2(x < (float2)0)));
+ y = select(y, (uint2)UINT_MAX, as_uint2(convert_int2(x >= (float2)UINT_MAX)));
+ return y;
+}
+/* Saturating float3 -> uint3 conversion, each lane clamped to [0, UINT_MAX]. The upper test
+ * uses >= because (float)UINT_MAX rounds up to 2^32, which uint cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat(float3 x)
+{
+ uint3 y = convert_uint3(x);
+ y = select(y, (uint3)0, as_uint3(convert_int3(x < (float3)0)));
+ y = select(y, (uint3)UINT_MAX, as_uint3(convert_int3(x >= (float3)UINT_MAX)));
+ return y;
+}
+/* Saturating float4 -> uint4 conversion, each lane clamped to [0, UINT_MAX]. The upper test
+ * uses >= because (float)UINT_MAX rounds up to 2^32, which uint cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat(float4 x)
+{
+ uint4 y = convert_uint4(x);
+ y = select(y, (uint4)0, as_uint4(convert_int4(x < (float4)0)));
+ y = select(y, (uint4)UINT_MAX, as_uint4(convert_int4(x >= (float4)UINT_MAX)));
+ return y;
+}
+/* Saturating float8 -> uint8 conversion, each lane clamped to [0, UINT_MAX]. The upper test
+ * uses >= because (float)UINT_MAX rounds up to 2^32, which uint cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat(float8 x)
+{
+ uint8 y = convert_uint8(x);
+ y = select(y, (uint8)0, as_uint8(convert_int8(x < (float8)0)));
+ y = select(y, (uint8)UINT_MAX, as_uint8(convert_int8(x >= (float8)UINT_MAX)));
+ return y;
+}
+/* Saturating float16 -> uint16 conversion, each lane clamped to [0, UINT_MAX]. The upper test
+ * uses >= because (float)UINT_MAX rounds up to 2^32, which uint cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat(float16 x)
+{
+ uint16 y = convert_uint16(x);
+ y = select(y, (uint16)0, as_uint16(convert_int16(x < (float16)0)));
+ y = select(y, (uint16)UINT_MAX, as_uint16(convert_int16(x >= (float16)UINT_MAX)));
+ return y;
+}
+#ifdef cles_khr_int64
+/* Saturating float -> long conversion, clamped to [LONG_MIN, LONG_MAX]. The upper test uses >=
+ * because (float)LONG_MAX rounds up to 2^63, which long cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat(float x)
+{
+ long y = convert_long(x);
+ y = select(y, (long)LONG_MIN, convert_long(x < (float)LONG_MIN));
+ y = select(y, (long)LONG_MAX, convert_long(x >= (float)LONG_MAX));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float2 -> long2 conversion, each lane clamped to [LONG_MIN, LONG_MAX]. The upper
+ * test uses >= because (float)LONG_MAX rounds up to 2^63, which long cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat(float2 x)
+{
+ long2 y = convert_long2(x);
+ y = select(y, (long2)LONG_MIN, convert_long2(x < (float2)LONG_MIN));
+ y = select(y, (long2)LONG_MAX, convert_long2(x >= (float2)LONG_MAX));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float3 -> long3 conversion, each lane clamped to [LONG_MIN, LONG_MAX]. The upper
+ * test uses >= because (float)LONG_MAX rounds up to 2^63, which long cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat(float3 x)
+{
+ long3 y = convert_long3(x);
+ y = select(y, (long3)LONG_MIN, convert_long3(x < (float3)LONG_MIN));
+ y = select(y, (long3)LONG_MAX, convert_long3(x >= (float3)LONG_MAX));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float4 -> long4 conversion, each lane clamped to [LONG_MIN, LONG_MAX]. The upper
+ * test uses >= because (float)LONG_MAX rounds up to 2^63, which long cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat(float4 x)
+{
+ long4 y = convert_long4(x);
+ y = select(y, (long4)LONG_MIN, convert_long4(x < (float4)LONG_MIN));
+ y = select(y, (long4)LONG_MAX, convert_long4(x >= (float4)LONG_MAX));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float8 -> long8 conversion, each lane clamped to [LONG_MIN, LONG_MAX]. The upper
+ * test uses >= because (float)LONG_MAX rounds up to 2^63, which long cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat(float8 x)
+{
+ long8 y = convert_long8(x);
+ y = select(y, (long8)LONG_MIN, convert_long8(x < (float8)LONG_MIN));
+ y = select(y, (long8)LONG_MAX, convert_long8(x >= (float8)LONG_MAX));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float16 -> long16 conversion, each lane clamped to [LONG_MIN, LONG_MAX]. The upper
+ * test uses >= because (float)LONG_MAX rounds up to 2^63, which long cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat(float16 x)
+{
+ long16 y = convert_long16(x);
+ y = select(y, (long16)LONG_MIN, convert_long16(x < (float16)LONG_MIN));
+ y = select(y, (long16)LONG_MAX, convert_long16(x >= (float16)LONG_MAX));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float -> ulong conversion, clamped to [0, ULONG_MAX]. The upper test uses >=
+ * because (float)ULONG_MAX rounds up to 2^64, which ulong cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat(float x)
+{
+ ulong y = convert_ulong(x);
+ y = select(y, (ulong)0, as_ulong(convert_long(x < (float)0)));
+ y = select(y, (ulong)ULONG_MAX, as_ulong(convert_long(x >= (float)ULONG_MAX)));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float2 -> ulong2 conversion, each lane clamped to [0, ULONG_MAX]. The upper test
+ * uses >= because (float)ULONG_MAX rounds up to 2^64, which ulong cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat(float2 x)
+{
+ ulong2 y = convert_ulong2(x);
+ y = select(y, (ulong2)0, as_ulong2(convert_long2(x < (float2)0)));
+ y = select(y, (ulong2)ULONG_MAX, as_ulong2(convert_long2(x >= (float2)ULONG_MAX)));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float3 -> ulong3 conversion, each lane clamped to [0, ULONG_MAX]. The upper test
+ * uses >= because (float)ULONG_MAX rounds up to 2^64, which ulong cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat(float3 x)
+{
+ ulong3 y = convert_ulong3(x);
+ y = select(y, (ulong3)0, as_ulong3(convert_long3(x < (float3)0)));
+ y = select(y, (ulong3)ULONG_MAX, as_ulong3(convert_long3(x >= (float3)ULONG_MAX)));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float4 -> ulong4 conversion, each lane clamped to [0, ULONG_MAX]. The upper test
+ * uses >= because (float)ULONG_MAX rounds up to 2^64, which ulong cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat(float4 x)
+{
+ ulong4 y = convert_ulong4(x);
+ y = select(y, (ulong4)0, as_ulong4(convert_long4(x < (float4)0)));
+ y = select(y, (ulong4)ULONG_MAX, as_ulong4(convert_long4(x >= (float4)ULONG_MAX)));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float8 -> ulong8 conversion, each lane clamped to [0, ULONG_MAX]. The upper test
+ * uses >= because (float)ULONG_MAX rounds up to 2^64, which ulong cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat(float8 x)
+{
+ ulong8 y = convert_ulong8(x);
+ y = select(y, (ulong8)0, as_ulong8(convert_long8(x < (float8)0)));
+ y = select(y, (ulong8)ULONG_MAX, as_ulong8(convert_long8(x >= (float8)ULONG_MAX)));
+ return y;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating float16 -> ulong16 conversion, each lane clamped to [0, ULONG_MAX]. The upper test
+ * uses >= because (float)ULONG_MAX rounds up to 2^64, which ulong cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat(float16 x)
+{
+ ulong16 y = convert_ulong16(x);
+ y = select(y, (ulong16)0, as_ulong16(convert_long16(x < (float16)0)));
+ y = select(y, (ulong16)ULONG_MAX, as_ulong16(convert_long16(x >= (float16)ULONG_MAX)));
+ return y;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double -> char conversion: the result is clamped to [CHAR_MIN, CHAR_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat(double x)
+{
+ const char below = convert_char(x < (double)CHAR_MIN);
+ const char above = convert_char(x > (double)CHAR_MAX);
+ char result = convert_char(x);
+ result = select(result, (char)CHAR_MIN, below);
+ return select(result, (char)CHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double2 -> char2 conversion: each lane is clamped to [CHAR_MIN, CHAR_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat(double2 x)
+{
+ const char2 below = convert_char2(x < (double2)CHAR_MIN);
+ const char2 above = convert_char2(x > (double2)CHAR_MAX);
+ char2 result = convert_char2(x);
+ result = select(result, (char2)CHAR_MIN, below);
+ return select(result, (char2)CHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double3 -> char3 conversion: each lane is clamped to [CHAR_MIN, CHAR_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat(double3 x)
+{
+ const char3 below = convert_char3(x < (double3)CHAR_MIN);
+ const char3 above = convert_char3(x > (double3)CHAR_MAX);
+ char3 result = convert_char3(x);
+ result = select(result, (char3)CHAR_MIN, below);
+ return select(result, (char3)CHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double4 -> char4 conversion: each lane is clamped to [CHAR_MIN, CHAR_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat(double4 x)
+{
+ const char4 below = convert_char4(x < (double4)CHAR_MIN);
+ const char4 above = convert_char4(x > (double4)CHAR_MAX);
+ char4 result = convert_char4(x);
+ result = select(result, (char4)CHAR_MIN, below);
+ return select(result, (char4)CHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double8 -> char8 conversion: each lane is clamped to [CHAR_MIN, CHAR_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat(double8 x)
+{
+ const char8 below = convert_char8(x < (double8)CHAR_MIN);
+ const char8 above = convert_char8(x > (double8)CHAR_MAX);
+ char8 result = convert_char8(x);
+ result = select(result, (char8)CHAR_MIN, below);
+ return select(result, (char8)CHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double16 -> char16 conversion: each lane is clamped to [CHAR_MIN, CHAR_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat(double16 x)
+{
+ const char16 below = convert_char16(x < (double16)CHAR_MIN);
+ const char16 above = convert_char16(x > (double16)CHAR_MAX);
+ char16 result = convert_char16(x);
+ result = select(result, (char16)CHAR_MIN, below);
+ return select(result, (char16)CHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double -> uchar conversion: the result is clamped to [0, UCHAR_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat(double x)
+{
+ const uchar below = as_uchar(convert_char(x < (double)0));
+ const uchar above = as_uchar(convert_char(x > (double)UCHAR_MAX));
+ uchar result = convert_uchar(x);
+ result = select(result, (uchar)0, below);
+ return select(result, (uchar)UCHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double2 -> uchar2 conversion: each lane is clamped to [0, UCHAR_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat(double2 x)
+{
+ const uchar2 below = as_uchar2(convert_char2(x < (double2)0));
+ const uchar2 above = as_uchar2(convert_char2(x > (double2)UCHAR_MAX));
+ uchar2 result = convert_uchar2(x);
+ result = select(result, (uchar2)0, below);
+ return select(result, (uchar2)UCHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double3 -> uchar3 conversion: each lane is clamped to [0, UCHAR_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat(double3 x)
+{
+ const uchar3 below = as_uchar3(convert_char3(x < (double3)0));
+ const uchar3 above = as_uchar3(convert_char3(x > (double3)UCHAR_MAX));
+ uchar3 result = convert_uchar3(x);
+ result = select(result, (uchar3)0, below);
+ return select(result, (uchar3)UCHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double4 -> uchar4 conversion: each lane is clamped to [0, UCHAR_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat(double4 x)
+{
+ const uchar4 below = as_uchar4(convert_char4(x < (double4)0));
+ const uchar4 above = as_uchar4(convert_char4(x > (double4)UCHAR_MAX));
+ uchar4 result = convert_uchar4(x);
+ result = select(result, (uchar4)0, below);
+ return select(result, (uchar4)UCHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double8 -> uchar8 conversion: each lane is clamped to [0, UCHAR_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat(double8 x)
+{
+ const uchar8 below = as_uchar8(convert_char8(x < (double8)0));
+ const uchar8 above = as_uchar8(convert_char8(x > (double8)UCHAR_MAX));
+ uchar8 result = convert_uchar8(x);
+ result = select(result, (uchar8)0, below);
+ return select(result, (uchar8)UCHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double16 -> uchar16 conversion: each lane is clamped to [0, UCHAR_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat(double16 x)
+{
+ const uchar16 below = as_uchar16(convert_char16(x < (double16)0));
+ const uchar16 above = as_uchar16(convert_char16(x > (double16)UCHAR_MAX));
+ uchar16 result = convert_uchar16(x);
+ result = select(result, (uchar16)0, below);
+ return select(result, (uchar16)UCHAR_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double -> short conversion: the result is clamped to [SHRT_MIN, SHRT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat(double x)
+{
+ const short below = convert_short(x < (double)SHRT_MIN);
+ const short above = convert_short(x > (double)SHRT_MAX);
+ short result = convert_short(x);
+ result = select(result, (short)SHRT_MIN, below);
+ return select(result, (short)SHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double2 -> short2 conversion: each lane is clamped to [SHRT_MIN, SHRT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat(double2 x)
+{
+ const short2 below = convert_short2(x < (double2)SHRT_MIN);
+ const short2 above = convert_short2(x > (double2)SHRT_MAX);
+ short2 result = convert_short2(x);
+ result = select(result, (short2)SHRT_MIN, below);
+ return select(result, (short2)SHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double3 -> short3 conversion: each lane is clamped to [SHRT_MIN, SHRT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat(double3 x)
+{
+ const short3 below = convert_short3(x < (double3)SHRT_MIN);
+ const short3 above = convert_short3(x > (double3)SHRT_MAX);
+ short3 result = convert_short3(x);
+ result = select(result, (short3)SHRT_MIN, below);
+ return select(result, (short3)SHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double4 -> short4 conversion: each lane is clamped to [SHRT_MIN, SHRT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat(double4 x)
+{
+ const short4 below = convert_short4(x < (double4)SHRT_MIN);
+ const short4 above = convert_short4(x > (double4)SHRT_MAX);
+ short4 result = convert_short4(x);
+ result = select(result, (short4)SHRT_MIN, below);
+ return select(result, (short4)SHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double8 -> short8 conversion: each lane is clamped to [SHRT_MIN, SHRT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat(double8 x)
+{
+ const short8 below = convert_short8(x < (double8)SHRT_MIN);
+ const short8 above = convert_short8(x > (double8)SHRT_MAX);
+ short8 result = convert_short8(x);
+ result = select(result, (short8)SHRT_MIN, below);
+ return select(result, (short8)SHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double16 -> short16 conversion: each lane is clamped to [SHRT_MIN, SHRT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat(double16 x)
+{
+ const short16 below = convert_short16(x < (double16)SHRT_MIN);
+ const short16 above = convert_short16(x > (double16)SHRT_MAX);
+ short16 result = convert_short16(x);
+ result = select(result, (short16)SHRT_MIN, below);
+ return select(result, (short16)SHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double -> ushort conversion: the result is clamped to [0, USHRT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat(double x)
+{
+ const ushort below = as_ushort(convert_short(x < (double)0));
+ const ushort above = as_ushort(convert_short(x > (double)USHRT_MAX));
+ ushort result = convert_ushort(x);
+ result = select(result, (ushort)0, below);
+ return select(result, (ushort)USHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double2 -> ushort2 conversion: each lane is clamped to [0, USHRT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat(double2 x)
+{
+ const ushort2 below = as_ushort2(convert_short2(x < (double2)0));
+ const ushort2 above = as_ushort2(convert_short2(x > (double2)USHRT_MAX));
+ ushort2 result = convert_ushort2(x);
+ result = select(result, (ushort2)0, below);
+ return select(result, (ushort2)USHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double3 -> ushort3 conversion: each lane is clamped to [0, USHRT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat(double3 x)
+{
+ const ushort3 below = as_ushort3(convert_short3(x < (double3)0));
+ const ushort3 above = as_ushort3(convert_short3(x > (double3)USHRT_MAX));
+ ushort3 result = convert_ushort3(x);
+ result = select(result, (ushort3)0, below);
+ return select(result, (ushort3)USHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double4 -> ushort4 conversion: each lane is clamped to [0, USHRT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat(double4 x)
+{
+ const ushort4 below = as_ushort4(convert_short4(x < (double4)0));
+ const ushort4 above = as_ushort4(convert_short4(x > (double4)USHRT_MAX));
+ ushort4 result = convert_ushort4(x);
+ result = select(result, (ushort4)0, below);
+ return select(result, (ushort4)USHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double8 -> ushort8 conversion: each lane is clamped to [0, USHRT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat(double8 x)
+{
+ const ushort8 below = as_ushort8(convert_short8(x < (double8)0));
+ const ushort8 above = as_ushort8(convert_short8(x > (double8)USHRT_MAX));
+ ushort8 result = convert_ushort8(x);
+ result = select(result, (ushort8)0, below);
+ return select(result, (ushort8)USHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double16 -> ushort16 conversion: each lane is clamped to [0, USHRT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat(double16 x)
+{
+ const ushort16 below = as_ushort16(convert_short16(x < (double16)0));
+ const ushort16 above = as_ushort16(convert_short16(x > (double16)USHRT_MAX));
+ ushort16 result = convert_ushort16(x);
+ result = select(result, (ushort16)0, below);
+ return select(result, (ushort16)USHRT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double -> int conversion: the result is clamped to [INT_MIN, INT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat(double x)
+{
+ const int below = convert_int(x < (double)INT_MIN);
+ const int above = convert_int(x > (double)INT_MAX);
+ int result = convert_int(x);
+ result = select(result, (int)INT_MIN, below);
+ return select(result, (int)INT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double2 -> int2 conversion: each lane is clamped to [INT_MIN, INT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat(double2 x)
+{
+ const int2 below = convert_int2(x < (double2)INT_MIN);
+ const int2 above = convert_int2(x > (double2)INT_MAX);
+ int2 result = convert_int2(x);
+ result = select(result, (int2)INT_MIN, below);
+ return select(result, (int2)INT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double3 -> int3 conversion: each lane is clamped to [INT_MIN, INT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat(double3 x)
+{
+ const int3 below = convert_int3(x < (double3)INT_MIN);
+ const int3 above = convert_int3(x > (double3)INT_MAX);
+ int3 result = convert_int3(x);
+ result = select(result, (int3)INT_MIN, below);
+ return select(result, (int3)INT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double4 -> int4 conversion: each lane is clamped to [INT_MIN, INT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat(double4 x)
+{
+ const int4 below = convert_int4(x < (double4)INT_MIN);
+ const int4 above = convert_int4(x > (double4)INT_MAX);
+ int4 result = convert_int4(x);
+ result = select(result, (int4)INT_MIN, below);
+ return select(result, (int4)INT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double8 -> int8 conversion: each lane is clamped to [INT_MIN, INT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat(double8 x)
+{
+ const int8 below = convert_int8(x < (double8)INT_MIN);
+ const int8 above = convert_int8(x > (double8)INT_MAX);
+ int8 result = convert_int8(x);
+ result = select(result, (int8)INT_MIN, below);
+ return select(result, (int8)INT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double16 -> int16 conversion: each lane is clamped to [INT_MIN, INT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat(double16 x)
+{
+ const int16 below = convert_int16(x < (double16)INT_MIN);
+ const int16 above = convert_int16(x > (double16)INT_MAX);
+ int16 result = convert_int16(x);
+ result = select(result, (int16)INT_MIN, below);
+ return select(result, (int16)INT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double -> uint conversion: the result is clamped to [0, UINT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat(double x)
+{
+ const uint below = as_uint(convert_int(x < (double)0));
+ const uint above = as_uint(convert_int(x > (double)UINT_MAX));
+ uint result = convert_uint(x);
+ result = select(result, (uint)0, below);
+ return select(result, (uint)UINT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double2 -> uint2 conversion: each lane is clamped to [0, UINT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat(double2 x)
+{
+ const uint2 below = as_uint2(convert_int2(x < (double2)0));
+ const uint2 above = as_uint2(convert_int2(x > (double2)UINT_MAX));
+ uint2 result = convert_uint2(x);
+ result = select(result, (uint2)0, below);
+ return select(result, (uint2)UINT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double3 -> uint3 conversion: each lane is clamped to [0, UINT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat(double3 x)
+{
+ const uint3 below = as_uint3(convert_int3(x < (double3)0));
+ const uint3 above = as_uint3(convert_int3(x > (double3)UINT_MAX));
+ uint3 result = convert_uint3(x);
+ result = select(result, (uint3)0, below);
+ return select(result, (uint3)UINT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double4 -> uint4 conversion: each lane is clamped to [0, UINT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat(double4 x)
+{
+ const uint4 below = as_uint4(convert_int4(x < (double4)0));
+ const uint4 above = as_uint4(convert_int4(x > (double4)UINT_MAX));
+ uint4 result = convert_uint4(x);
+ result = select(result, (uint4)0, below);
+ return select(result, (uint4)UINT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double8 -> uint8 conversion: each lane is clamped to [0, UINT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat(double8 x)
+{
+ const uint8 below = as_uint8(convert_int8(x < (double8)0));
+ const uint8 above = as_uint8(convert_int8(x > (double8)UINT_MAX));
+ uint8 result = convert_uint8(x);
+ result = select(result, (uint8)0, below);
+ return select(result, (uint8)UINT_MAX, above);
+}
+#endif
+#ifdef cl_khr_fp64
+/* Saturating double16 -> uint16 conversion: each lane is clamped to [0, UINT_MAX]. */
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat(double16 x)
+{
+ const uint16 below = as_uint16(convert_int16(x < (double16)0));
+ const uint16 above = as_uint16(convert_int16(x > (double16)UINT_MAX));
+ uint16 result = convert_uint16(x);
+ result = select(result, (uint16)0, below);
+ return select(result, (uint16)UINT_MAX, above);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double -> long conversion, clamped to [LONG_MIN, LONG_MAX]. The upper test uses >=
+ * because (double)LONG_MAX rounds up to 2^63, which long cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat(double x)
+{
+ long y = convert_long(x);
+ y = select(y, (long)LONG_MIN, convert_long(x < (double)LONG_MIN));
+ y = select(y, (long)LONG_MAX, convert_long(x >= (double)LONG_MAX));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double2 -> long2 conversion, each lane clamped to [LONG_MIN, LONG_MAX]. The upper
+ * test uses >= because (double)LONG_MAX rounds up to 2^63, which long cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat(double2 x)
+{
+ long2 y = convert_long2(x);
+ y = select(y, (long2)LONG_MIN, convert_long2(x < (double2)LONG_MIN));
+ y = select(y, (long2)LONG_MAX, convert_long2(x >= (double2)LONG_MAX));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double3 -> long3 conversion, each lane clamped to [LONG_MIN, LONG_MAX]. The upper
+ * test uses >= because (double)LONG_MAX rounds up to 2^63, which long cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat(double3 x)
+{
+ long3 y = convert_long3(x);
+ y = select(y, (long3)LONG_MIN, convert_long3(x < (double3)LONG_MIN));
+ y = select(y, (long3)LONG_MAX, convert_long3(x >= (double3)LONG_MAX));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double4 -> long4 conversion, each lane clamped to [LONG_MIN, LONG_MAX]. The upper
+ * test uses >= because (double)LONG_MAX rounds up to 2^63, which long cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat(double4 x)
+{
+ long4 y = convert_long4(x);
+ y = select(y, (long4)LONG_MIN, convert_long4(x < (double4)LONG_MIN));
+ y = select(y, (long4)LONG_MAX, convert_long4(x >= (double4)LONG_MAX));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double8 -> long8 conversion, each lane clamped to [LONG_MIN, LONG_MAX]. The upper
+ * test uses >= because (double)LONG_MAX rounds up to 2^63, which long cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat(double8 x)
+{
+ long8 y = convert_long8(x);
+ y = select(y, (long8)LONG_MIN, convert_long8(x < (double8)LONG_MIN));
+ y = select(y, (long8)LONG_MAX, convert_long8(x >= (double8)LONG_MAX));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double16 -> long16 conversion, each lane clamped to [LONG_MIN, LONG_MAX]. The upper
+ * test uses >= because (double)LONG_MAX rounds up to 2^63, which long cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat(double16 x)
+{
+ long16 y = convert_long16(x);
+ y = select(y, (long16)LONG_MIN, convert_long16(x < (double16)LONG_MIN));
+ y = select(y, (long16)LONG_MAX, convert_long16(x >= (double16)LONG_MAX));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double -> ulong conversion, clamped to [0, ULONG_MAX]. The upper test uses >=
+ * because (double)ULONG_MAX rounds up to 2^64, which ulong cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat(double x)
+{
+ ulong y = convert_ulong(x);
+ y = select(y, (ulong)0, as_ulong(convert_long(x < (double)0)));
+ y = select(y, (ulong)ULONG_MAX, as_ulong(convert_long(x >= (double)ULONG_MAX)));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double2 -> ulong2 conversion, each lane clamped to [0, ULONG_MAX]. The upper test
+ * uses >= because (double)ULONG_MAX rounds up to 2^64, which ulong cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat(double2 x)
+{
+ ulong2 y = convert_ulong2(x);
+ y = select(y, (ulong2)0, as_ulong2(convert_long2(x < (double2)0)));
+ y = select(y, (ulong2)ULONG_MAX, as_ulong2(convert_long2(x >= (double2)ULONG_MAX)));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double3 -> ulong3 conversion, each lane clamped to [0, ULONG_MAX]. The upper test
+ * uses >= because (double)ULONG_MAX rounds up to 2^64, which ulong cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat(double3 x)
+{
+ ulong3 y = convert_ulong3(x);
+ y = select(y, (ulong3)0, as_ulong3(convert_long3(x < (double3)0)));
+ y = select(y, (ulong3)ULONG_MAX, as_ulong3(convert_long3(x >= (double3)ULONG_MAX)));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double4 -> ulong4 conversion, each lane clamped to [0, ULONG_MAX]. The upper test
+ * uses >= because (double)ULONG_MAX rounds up to 2^64, which ulong cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat(double4 x)
+{
+ ulong4 y = convert_ulong4(x);
+ y = select(y, (ulong4)0, as_ulong4(convert_long4(x < (double4)0)));
+ y = select(y, (ulong4)ULONG_MAX, as_ulong4(convert_long4(x >= (double4)ULONG_MAX)));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double8 -> ulong8 conversion, each lane clamped to [0, ULONG_MAX]. The upper test
+ * uses >= because (double)ULONG_MAX rounds up to 2^64, which ulong cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat(double8 x)
+{
+ ulong8 y = convert_ulong8(x);
+ y = select(y, (ulong8)0, as_ulong8(convert_long8(x < (double8)0)));
+ y = select(y, (ulong8)ULONG_MAX, as_ulong8(convert_long8(x >= (double8)ULONG_MAX)));
+ return y;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Saturating double16 -> ulong16 conversion, each lane clamped to [0, ULONG_MAX]. The upper test
+ * uses >= because (double)ULONG_MAX rounds up to 2^64, which ulong cannot hold. */
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat(double16 x)
+{
+ ulong16 y = convert_ulong16(x);
+ y = select(y, (ulong16)0, as_ulong16(convert_long16(x < (double16)0)));
+ y = select(y, (ulong16)ULONG_MAX, as_ulong16(convert_long16(x >= (double16)ULONG_MAX)));
+ return y;
+}
+#endif
+/* _rtz variant of convert_char_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtz(char x)
+{
+ const char saturated = convert_char_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_char_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rte(char x)
+{
+ const char saturated = convert_char_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_char_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtp(char x)
+{
+ const char saturated = convert_char_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_char_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtn(char x)
+{
+ const char saturated = convert_char_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_char2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtz(char2 x)
+{
+ const char2 saturated = convert_char2_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_char2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rte(char2 x)
+{
+ const char2 saturated = convert_char2_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_char2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtp(char2 x)
+{
+ const char2 saturated = convert_char2_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_char2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtn(char2 x)
+{
+ const char2 saturated = convert_char2_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_char3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtz(char3 x)
+{
+ const char3 saturated = convert_char3_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_char3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rte(char3 x)
+{
+ const char3 saturated = convert_char3_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_char3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtp(char3 x)
+{
+ const char3 saturated = convert_char3_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_char3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtn(char3 x)
+{
+ const char3 saturated = convert_char3_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_char4_sat for char4 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtz(char4 x)
+{
+ const char4 saturated = convert_char4_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_char4_sat for char4 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rte(char4 x)
+{
+ const char4 saturated = convert_char4_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_char4_sat for char4 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtp(char4 x)
+{
+ const char4 saturated = convert_char4_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_char4_sat for char4 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtn(char4 x)
+{
+ const char4 saturated = convert_char4_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_char8_sat for char8 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtz(char8 x)
+{
+ const char8 saturated = convert_char8_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_char8_sat for char8 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rte(char8 x)
+{
+ const char8 saturated = convert_char8_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_char8_sat for char8 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtp(char8 x)
+{
+ const char8 saturated = convert_char8_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_char8_sat for char8 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtn(char8 x)
+{
+ const char8 saturated = convert_char8_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_char16_sat for char16 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtz(char16 x)
+{
+ const char16 saturated = convert_char16_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_char16_sat for char16 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rte(char16 x)
+{
+ const char16 saturated = convert_char16_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_char16_sat for char16 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtp(char16 x)
+{
+ const char16 saturated = convert_char16_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_char16_sat for char16 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtn(char16 x)
+{
+ const char16 saturated = convert_char16_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_uchar_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtz(char x)
+{
+ const uchar saturated = convert_uchar_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_uchar_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rte(char x)
+{
+ const uchar saturated = convert_uchar_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_uchar_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtp(char x)
+{
+ const uchar saturated = convert_uchar_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_uchar_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtn(char x)
+{
+ const uchar saturated = convert_uchar_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_uchar2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtz(char2 x)
+{
+ const uchar2 saturated = convert_uchar2_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_uchar2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rte(char2 x)
+{
+ const uchar2 saturated = convert_uchar2_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_uchar2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtp(char2 x)
+{
+ const uchar2 saturated = convert_uchar2_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_uchar2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtn(char2 x)
+{
+ const uchar2 saturated = convert_uchar2_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_uchar3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtz(char3 x)
+{
+ const uchar3 saturated = convert_uchar3_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_uchar3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rte(char3 x)
+{
+ const uchar3 saturated = convert_uchar3_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_uchar3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtp(char3 x)
+{
+ const uchar3 saturated = convert_uchar3_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_uchar3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtn(char3 x)
+{
+ const uchar3 saturated = convert_uchar3_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_uchar4_sat for char4 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtz(char4 x)
+{
+ const uchar4 saturated = convert_uchar4_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_uchar4_sat for char4 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rte(char4 x)
+{
+ const uchar4 saturated = convert_uchar4_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_uchar4_sat for char4 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtp(char4 x)
+{
+ const uchar4 saturated = convert_uchar4_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_uchar4_sat for char4 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtn(char4 x)
+{
+ const uchar4 saturated = convert_uchar4_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_uchar8_sat for char8 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtz(char8 x)
+{
+ const uchar8 saturated = convert_uchar8_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_uchar8_sat for char8 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rte(char8 x)
+{
+ const uchar8 saturated = convert_uchar8_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_uchar8_sat for char8 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtp(char8 x)
+{
+ const uchar8 saturated = convert_uchar8_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_uchar8_sat for char8 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtn(char8 x)
+{
+ const uchar8 saturated = convert_uchar8_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_uchar16_sat for char16 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtz(char16 x)
+{
+ const uchar16 saturated = convert_uchar16_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_uchar16_sat for char16 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rte(char16 x)
+{
+ const uchar16 saturated = convert_uchar16_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_uchar16_sat for char16 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtp(char16 x)
+{
+ const uchar16 saturated = convert_uchar16_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_uchar16_sat for char16 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtn(char16 x)
+{
+ const uchar16 saturated = convert_uchar16_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_short_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtz(char x)
+{
+ const short saturated = convert_short_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_short_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rte(char x)
+{
+ const short saturated = convert_short_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_short_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtp(char x)
+{
+ const short saturated = convert_short_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_short_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtn(char x)
+{
+ const short saturated = convert_short_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_short2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtz(char2 x)
+{
+ const short2 saturated = convert_short2_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_short2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rte(char2 x)
+{
+ const short2 saturated = convert_short2_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_short2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtp(char2 x)
+{
+ const short2 saturated = convert_short2_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_short2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtn(char2 x)
+{
+ const short2 saturated = convert_short2_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_short3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtz(char3 x)
+{
+ const short3 saturated = convert_short3_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_short3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rte(char3 x)
+{
+ const short3 saturated = convert_short3_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_short3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtp(char3 x)
+{
+ const short3 saturated = convert_short3_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_short3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtn(char3 x)
+{
+ const short3 saturated = convert_short3_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_short4_sat for char4 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtz(char4 x)
+{
+ const short4 saturated = convert_short4_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_short4_sat for char4 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rte(char4 x)
+{
+ const short4 saturated = convert_short4_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_short4_sat for char4 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtp(char4 x)
+{
+ const short4 saturated = convert_short4_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_short4_sat for char4 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtn(char4 x)
+{
+ const short4 saturated = convert_short4_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_short8_sat for char8 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtz(char8 x)
+{
+ const short8 saturated = convert_short8_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_short8_sat for char8 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rte(char8 x)
+{
+ const short8 saturated = convert_short8_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_short8_sat for char8 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtp(char8 x)
+{
+ const short8 saturated = convert_short8_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_short8_sat for char8 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtn(char8 x)
+{
+ const short8 saturated = convert_short8_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_short16_sat for char16 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtz(char16 x)
+{
+ const short16 saturated = convert_short16_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_short16_sat for char16 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rte(char16 x)
+{
+ const short16 saturated = convert_short16_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_short16_sat for char16 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtp(char16 x)
+{
+ const short16 saturated = convert_short16_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_short16_sat for char16 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtn(char16 x)
+{
+ const short16 saturated = convert_short16_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_ushort_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtz(char x)
+{
+ const ushort saturated = convert_ushort_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_ushort_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rte(char x)
+{
+ const ushort saturated = convert_ushort_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_ushort_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtp(char x)
+{
+ const ushort saturated = convert_ushort_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_ushort_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtn(char x)
+{
+ const ushort saturated = convert_ushort_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_ushort2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtz(char2 x)
+{
+ const ushort2 saturated = convert_ushort2_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_ushort2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rte(char2 x)
+{
+ const ushort2 saturated = convert_ushort2_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_ushort2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtp(char2 x)
+{
+ const ushort2 saturated = convert_ushort2_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_ushort2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtn(char2 x)
+{
+ const ushort2 saturated = convert_ushort2_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_ushort3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtz(char3 x)
+{
+ const ushort3 saturated = convert_ushort3_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_ushort3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rte(char3 x)
+{
+ const ushort3 saturated = convert_ushort3_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_ushort3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtp(char3 x)
+{
+ const ushort3 saturated = convert_ushort3_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_ushort3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtn(char3 x)
+{
+ const ushort3 saturated = convert_ushort3_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_ushort4_sat for char4 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtz(char4 x)
+{
+ const ushort4 saturated = convert_ushort4_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_ushort4_sat for char4 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rte(char4 x)
+{
+ const ushort4 saturated = convert_ushort4_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_ushort4_sat for char4 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtp(char4 x)
+{
+ const ushort4 saturated = convert_ushort4_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_ushort4_sat for char4 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtn(char4 x)
+{
+ const ushort4 saturated = convert_ushort4_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_ushort8_sat for char8 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtz(char8 x)
+{
+ const ushort8 saturated = convert_ushort8_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_ushort8_sat for char8 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rte(char8 x)
+{
+ const ushort8 saturated = convert_ushort8_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_ushort8_sat for char8 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtp(char8 x)
+{
+ const ushort8 saturated = convert_ushort8_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_ushort8_sat for char8 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtn(char8 x)
+{
+ const ushort8 saturated = convert_ushort8_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_ushort16_sat for char16 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtz(char16 x)
+{
+ const ushort16 saturated = convert_ushort16_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_ushort16_sat for char16 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rte(char16 x)
+{
+ const ushort16 saturated = convert_ushort16_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_ushort16_sat for char16 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtp(char16 x)
+{
+ const ushort16 saturated = convert_ushort16_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_ushort16_sat for char16 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtn(char16 x)
+{
+ const ushort16 saturated = convert_ushort16_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_int_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtz(char x)
+{
+ const int saturated = convert_int_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_int_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rte(char x)
+{
+ const int saturated = convert_int_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_int_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtp(char x)
+{
+ const int saturated = convert_int_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_int_sat for char input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtn(char x)
+{
+ const int saturated = convert_int_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_int2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtz(char2 x)
+{
+ const int2 saturated = convert_int2_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_int2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rte(char2 x)
+{
+ const int2 saturated = convert_int2_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_int2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtp(char2 x)
+{
+ const int2 saturated = convert_int2_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_int2_sat for char2 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtn(char2 x)
+{
+ const int2 saturated = convert_int2_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_int3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtz(char3 x)
+{
+ const int3 saturated = convert_int3_sat(x);
+ return saturated;
+}
+
+/* _rte variant of convert_int3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rte(char3 x)
+{
+ const int3 saturated = convert_int3_sat(x);
+ return saturated;
+}
+
+/* _rtp variant of convert_int3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtp(char3 x)
+{
+ const int3 saturated = convert_int3_sat(x);
+ return saturated;
+}
+
+/* _rtn variant of convert_int3_sat for char3 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtn(char3 x)
+{
+ const int3 saturated = convert_int3_sat(x);
+ return saturated;
+}
+
+/* _rtz variant of convert_int4_sat for char4 input; the argument is forwarded as-is. */
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtz(char4 x)
+{
+ const int4 saturated = convert_int4_sat(x);
+ return saturated;
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rte(char4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtp(char4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtn(char4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtz(char8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rte(char8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtp(char8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtn(char8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtz(char16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rte(char16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtp(char16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtn(char16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtz(char x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rte(char x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtp(char x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtn(char x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtz(char2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rte(char2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtp(char2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtn(char2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtz(char3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rte(char3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtp(char3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtn(char3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtz(char4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rte(char4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtp(char4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtn(char4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtz(char8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rte(char8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtp(char8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtn(char8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtz(char16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rte(char16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtp(char16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtn(char16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtz(char x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rte(char x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtp(char x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtn(char x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtz(char2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rte(char2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtp(char2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtn(char2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtz(char3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rte(char3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtp(char3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtn(char3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtz(char4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rte(char4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtp(char4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtn(char4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtz(char8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rte(char8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtp(char8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtn(char8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtz(char16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rte(char16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtp(char16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtn(char16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtz(char x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rte(char x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtp(char x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtn(char x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtz(char2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rte(char2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtp(char2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtn(char2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtz(char3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rte(char3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtp(char3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtn(char3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtz(char4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rte(char4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtp(char4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtn(char4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtz(char8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rte(char8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtp(char8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtn(char8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtz(char16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rte(char16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtp(char16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtn(char16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtz(uchar x)
+{
+ return convert_char_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rte(uchar x)
+{
+ return convert_char_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtp(uchar x)
+{
+ return convert_char_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtn(uchar x)
+{
+ return convert_char_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtz(uchar2 x)
+{
+ return convert_char2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rte(uchar2 x)
+{
+ return convert_char2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtp(uchar2 x)
+{
+ return convert_char2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtn(uchar2 x)
+{
+ return convert_char2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtz(uchar3 x)
+{
+ return convert_char3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rte(uchar3 x)
+{
+ return convert_char3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtp(uchar3 x)
+{
+ return convert_char3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtn(uchar3 x)
+{
+ return convert_char3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtz(uchar4 x)
+{
+ return convert_char4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rte(uchar4 x)
+{
+ return convert_char4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtp(uchar4 x)
+{
+ return convert_char4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtn(uchar4 x)
+{
+ return convert_char4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtz(uchar8 x)
+{
+ return convert_char8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rte(uchar8 x)
+{
+ return convert_char8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtp(uchar8 x)
+{
+ return convert_char8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtn(uchar8 x)
+{
+ return convert_char8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtz(uchar16 x)
+{
+ return convert_char16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rte(uchar16 x)
+{
+ return convert_char16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtp(uchar16 x)
+{
+ return convert_char16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtn(uchar16 x)
+{
+ return convert_char16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtz(uchar x)
+{
+ return convert_uchar_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rte(uchar x)
+{
+ return convert_uchar_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtp(uchar x)
+{
+ return convert_uchar_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtn(uchar x)
+{
+ return convert_uchar_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtz(uchar2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rte(uchar2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtp(uchar2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtn(uchar2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtz(uchar3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rte(uchar3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtp(uchar3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtn(uchar3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtz(uchar4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rte(uchar4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtp(uchar4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtn(uchar4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtz(uchar8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rte(uchar8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtp(uchar8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtn(uchar8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtz(uchar16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rte(uchar16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtp(uchar16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtn(uchar16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtz(uchar x)
+{
+ return convert_short_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rte(uchar x)
+{
+ return convert_short_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtp(uchar x)
+{
+ return convert_short_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtn(uchar x)
+{
+ return convert_short_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtz(uchar2 x)
+{
+ return convert_short2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rte(uchar2 x)
+{
+ return convert_short2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtp(uchar2 x)
+{
+ return convert_short2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtn(uchar2 x)
+{
+ return convert_short2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtz(uchar3 x)
+{
+ return convert_short3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rte(uchar3 x)
+{
+ return convert_short3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtp(uchar3 x)
+{
+ return convert_short3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtn(uchar3 x)
+{
+ return convert_short3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtz(uchar4 x)
+{
+ return convert_short4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rte(uchar4 x)
+{
+ return convert_short4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtp(uchar4 x)
+{
+ return convert_short4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtn(uchar4 x)
+{
+ return convert_short4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtz(uchar8 x)
+{
+ return convert_short8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rte(uchar8 x)
+{
+ return convert_short8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtp(uchar8 x)
+{
+ return convert_short8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtn(uchar8 x)
+{
+ return convert_short8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtz(uchar16 x)
+{
+ return convert_short16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rte(uchar16 x)
+{
+ return convert_short16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtp(uchar16 x)
+{
+ return convert_short16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtn(uchar16 x)
+{
+ return convert_short16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtz(uchar x)
+{
+ return convert_ushort_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rte(uchar x)
+{
+ return convert_ushort_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtp(uchar x)
+{
+ return convert_ushort_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtn(uchar x)
+{
+ return convert_ushort_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtz(uchar2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rte(uchar2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtp(uchar2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtn(uchar2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtz(uchar3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rte(uchar3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtp(uchar3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtn(uchar3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtz(uchar4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rte(uchar4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtp(uchar4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtn(uchar4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtz(uchar8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rte(uchar8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtp(uchar8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtn(uchar8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtz(uchar16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rte(uchar16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtp(uchar16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtn(uchar16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtz(uchar x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rte(uchar x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtp(uchar x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtn(uchar x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtz(uchar2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rte(uchar2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtp(uchar2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtn(uchar2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtz(uchar3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rte(uchar3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtp(uchar3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtn(uchar3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtz(uchar4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rte(uchar4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtp(uchar4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtn(uchar4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtz(uchar8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rte(uchar8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtp(uchar8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtn(uchar8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtz(uchar16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rte(uchar16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtp(uchar16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtn(uchar16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtz(uchar x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rte(uchar x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtp(uchar x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtn(uchar x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtz(uchar2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rte(uchar2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtp(uchar2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtn(uchar2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtz(uchar3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rte(uchar3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtp(uchar3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtn(uchar3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtz(uchar4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rte(uchar4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtp(uchar4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtn(uchar4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtz(uchar8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rte(uchar8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtp(uchar8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtn(uchar8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtz(uchar16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rte(uchar16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtp(uchar16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtn(uchar16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtz(uchar x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rte(uchar x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtp(uchar x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtn(uchar x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtz(uchar2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rte(uchar2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtp(uchar2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtn(uchar2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtz(uchar3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rte(uchar3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtp(uchar3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtn(uchar3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtz(uchar4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rte(uchar4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtp(uchar4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtn(uchar4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtz(uchar8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rte(uchar8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtp(uchar8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtn(uchar8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtz(uchar16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rte(uchar16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtp(uchar16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtn(uchar16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtz(uchar x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rte(uchar x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtp(uchar x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtn(uchar x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtz(uchar2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rte(uchar2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtp(uchar2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtn(uchar2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtz(uchar3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rte(uchar3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtp(uchar3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtn(uchar3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtz(uchar4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rte(uchar4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtp(uchar4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtn(uchar4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtz(uchar8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rte(uchar8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtp(uchar8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtn(uchar8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtz(uchar16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rte(uchar16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtp(uchar16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtn(uchar16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+/*
+ * Saturating short -> char conversions carrying a rounding-mode suffix,
+ * for the scalar type and vector widths 2, 3, 4, 8 and 16.
+ * Each _rtz/_rte/_rtp/_rtn overload forwards x to the unsuffixed
+ * convert_<DST>_sat overload, so all four suffixes yield the same value.
+ */
+#define _CLC_SAT_RMODE_FWD(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+_CLC_SAT_RMODE_FWD(char, short)
+_CLC_SAT_RMODE_FWD(char2, short2)
+_CLC_SAT_RMODE_FWD(char3, short3)
+_CLC_SAT_RMODE_FWD(char4, short4)
+_CLC_SAT_RMODE_FWD(char8, short8)
+_CLC_SAT_RMODE_FWD(char16, short16)
+
+#undef _CLC_SAT_RMODE_FWD
+
+/*
+ * Saturating short -> uchar conversions carrying a rounding-mode suffix,
+ * for the scalar type and vector widths 2, 3, 4, 8 and 16.
+ * Each _rtz/_rte/_rtp/_rtn overload forwards x to the unsuffixed
+ * convert_<DST>_sat overload, so all four suffixes yield the same value.
+ */
+#define _CLC_SAT_RMODE_FWD(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+_CLC_SAT_RMODE_FWD(uchar, short)
+_CLC_SAT_RMODE_FWD(uchar2, short2)
+_CLC_SAT_RMODE_FWD(uchar3, short3)
+_CLC_SAT_RMODE_FWD(uchar4, short4)
+_CLC_SAT_RMODE_FWD(uchar8, short8)
+_CLC_SAT_RMODE_FWD(uchar16, short16)
+
+#undef _CLC_SAT_RMODE_FWD
+
+/*
+ * Saturating short -> short conversions carrying a rounding-mode suffix,
+ * for the scalar type and vector widths 2, 3, 4, 8 and 16.
+ * Each _rtz/_rte/_rtp/_rtn overload forwards x to the unsuffixed
+ * convert_<DST>_sat overload, so all four suffixes yield the same value.
+ */
+#define _CLC_SAT_RMODE_FWD(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+_CLC_SAT_RMODE_FWD(short, short)
+_CLC_SAT_RMODE_FWD(short2, short2)
+_CLC_SAT_RMODE_FWD(short3, short3)
+_CLC_SAT_RMODE_FWD(short4, short4)
+_CLC_SAT_RMODE_FWD(short8, short8)
+_CLC_SAT_RMODE_FWD(short16, short16)
+
+#undef _CLC_SAT_RMODE_FWD
+
+/*
+ * Saturating short -> ushort conversions carrying a rounding-mode suffix,
+ * for the scalar type and vector widths 2, 3, 4, 8 and 16.
+ * Each _rtz/_rte/_rtp/_rtn overload forwards x to the unsuffixed
+ * convert_<DST>_sat overload, so all four suffixes yield the same value.
+ */
+#define _CLC_SAT_RMODE_FWD(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+_CLC_SAT_RMODE_FWD(ushort, short)
+_CLC_SAT_RMODE_FWD(ushort2, short2)
+_CLC_SAT_RMODE_FWD(ushort3, short3)
+_CLC_SAT_RMODE_FWD(ushort4, short4)
+_CLC_SAT_RMODE_FWD(ushort8, short8)
+_CLC_SAT_RMODE_FWD(ushort16, short16)
+
+#undef _CLC_SAT_RMODE_FWD
+
+/*
+ * Saturating short -> int conversions carrying a rounding-mode suffix,
+ * for the scalar type and vector widths 2, 3, 4, 8 and 16.
+ * Each _rtz/_rte/_rtp/_rtn overload forwards x to the unsuffixed
+ * convert_<DST>_sat overload, so all four suffixes yield the same value.
+ */
+#define _CLC_SAT_RMODE_FWD(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+_CLC_SAT_RMODE_FWD(int, short)
+_CLC_SAT_RMODE_FWD(int2, short2)
+_CLC_SAT_RMODE_FWD(int3, short3)
+_CLC_SAT_RMODE_FWD(int4, short4)
+_CLC_SAT_RMODE_FWD(int8, short8)
+_CLC_SAT_RMODE_FWD(int16, short16)
+
+#undef _CLC_SAT_RMODE_FWD
+
+/*
+ * Saturating short -> uint conversions carrying a rounding-mode suffix,
+ * for the scalar type and vector widths 2, 3, 4, 8 and 16.
+ * Each _rtz/_rte/_rtp/_rtn overload forwards x to the unsuffixed
+ * convert_<DST>_sat overload, so all four suffixes yield the same value.
+ */
+#define _CLC_SAT_RMODE_FWD(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+_CLC_SAT_RMODE_FWD(uint, short)
+_CLC_SAT_RMODE_FWD(uint2, short2)
+_CLC_SAT_RMODE_FWD(uint3, short3)
+_CLC_SAT_RMODE_FWD(uint4, short4)
+_CLC_SAT_RMODE_FWD(uint8, short8)
+_CLC_SAT_RMODE_FWD(uint16, short16)
+
+#undef _CLC_SAT_RMODE_FWD
+
+#ifdef cles_khr_int64
+/*
+ * Saturating short -> long conversions carrying a rounding-mode suffix,
+ * for the scalar type and vector widths 2, 3, 4, 8 and 16.
+ * Each _rtz/_rte/_rtp/_rtn overload forwards x to the unsuffixed
+ * convert_<DST>_sat overload, so all four suffixes yield the same value.
+ * Only compiled when cles_khr_int64 is defined.
+ */
+#define _CLC_SAT_RMODE_FWD(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+_CLC_SAT_RMODE_FWD(long, short)
+_CLC_SAT_RMODE_FWD(long2, short2)
+_CLC_SAT_RMODE_FWD(long3, short3)
+_CLC_SAT_RMODE_FWD(long4, short4)
+_CLC_SAT_RMODE_FWD(long8, short8)
+_CLC_SAT_RMODE_FWD(long16, short16)
+
+#undef _CLC_SAT_RMODE_FWD
+#endif
+#ifdef cles_khr_int64
+/*
+ * Saturating short -> ulong conversions carrying a rounding-mode suffix,
+ * for the scalar type and vector widths 2, 3, 4, 8 and 16.
+ * Each _rtz/_rte/_rtp/_rtn overload forwards x to the unsuffixed
+ * convert_<DST>_sat overload, so all four suffixes yield the same value.
+ * Only compiled when cles_khr_int64 is defined.
+ */
+#define _CLC_SAT_RMODE_FWD(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+_CLC_SAT_RMODE_FWD(ulong, short)
+_CLC_SAT_RMODE_FWD(ulong2, short2)
+_CLC_SAT_RMODE_FWD(ulong3, short3)
+_CLC_SAT_RMODE_FWD(ulong4, short4)
+_CLC_SAT_RMODE_FWD(ulong8, short8)
+_CLC_SAT_RMODE_FWD(ulong16, short16)
+
+#undef _CLC_SAT_RMODE_FWD
+#endif
+/*
+ * Saturating ushort -> char conversions carrying a rounding-mode suffix,
+ * for the scalar type and vector widths 2, 3, 4, 8 and 16.
+ * Each _rtz/_rte/_rtp/_rtn overload forwards x to the unsuffixed
+ * convert_<DST>_sat overload, so all four suffixes yield the same value.
+ */
+#define _CLC_SAT_RMODE_FWD(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+_CLC_SAT_RMODE_FWD(char, ushort)
+_CLC_SAT_RMODE_FWD(char2, ushort2)
+_CLC_SAT_RMODE_FWD(char3, ushort3)
+_CLC_SAT_RMODE_FWD(char4, ushort4)
+_CLC_SAT_RMODE_FWD(char8, ushort8)
+_CLC_SAT_RMODE_FWD(char16, ushort16)
+
+#undef _CLC_SAT_RMODE_FWD
+
+/*
+ * Saturating ushort -> uchar conversions carrying a rounding-mode suffix,
+ * for the scalar type and vector widths 2, 3, 4, 8 and 16.
+ * Each _rtz/_rte/_rtp/_rtn overload forwards x to the unsuffixed
+ * convert_<DST>_sat overload, so all four suffixes yield the same value.
+ */
+#define _CLC_SAT_RMODE_FWD(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+_CLC_SAT_RMODE_FWD(uchar, ushort)
+_CLC_SAT_RMODE_FWD(uchar2, ushort2)
+_CLC_SAT_RMODE_FWD(uchar3, ushort3)
+_CLC_SAT_RMODE_FWD(uchar4, ushort4)
+_CLC_SAT_RMODE_FWD(uchar8, ushort8)
+_CLC_SAT_RMODE_FWD(uchar16, ushort16)
+
+#undef _CLC_SAT_RMODE_FWD
+
+/*
+ * Saturating ushort -> short conversions carrying a rounding-mode suffix,
+ * for the scalar type and vector widths 2, 3, 4, 8 and 16.
+ * Each _rtz/_rte/_rtp/_rtn overload forwards x to the unsuffixed
+ * convert_<DST>_sat overload, so all four suffixes yield the same value.
+ */
+#define _CLC_SAT_RMODE_FWD(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+_CLC_SAT_RMODE_FWD(short, ushort)
+_CLC_SAT_RMODE_FWD(short2, ushort2)
+_CLC_SAT_RMODE_FWD(short3, ushort3)
+_CLC_SAT_RMODE_FWD(short4, ushort4)
+_CLC_SAT_RMODE_FWD(short8, ushort8)
+_CLC_SAT_RMODE_FWD(short16, ushort16)
+
+#undef _CLC_SAT_RMODE_FWD
+
+/*
+ * Saturating ushort -> ushort conversions carrying a rounding-mode suffix,
+ * for the scalar type and vector widths 2, 3, 4, 8 and 16.
+ * Each _rtz/_rte/_rtp/_rtn overload forwards x to the unsuffixed
+ * convert_<DST>_sat overload, so all four suffixes yield the same value.
+ */
+#define _CLC_SAT_RMODE_FWD(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+_CLC_SAT_RMODE_FWD(ushort, ushort)
+_CLC_SAT_RMODE_FWD(ushort2, ushort2)
+_CLC_SAT_RMODE_FWD(ushort3, ushort3)
+_CLC_SAT_RMODE_FWD(ushort4, ushort4)
+_CLC_SAT_RMODE_FWD(ushort8, ushort8)
+_CLC_SAT_RMODE_FWD(ushort16, ushort16)
+
+#undef _CLC_SAT_RMODE_FWD
+
+/*
+ * Saturating ushort -> int conversions carrying a rounding-mode suffix,
+ * for the scalar type and vector widths 2, 3, 4, 8 and 16.
+ * Each _rtz/_rte/_rtp/_rtn overload forwards x to the unsuffixed
+ * convert_<DST>_sat overload, so all four suffixes yield the same value.
+ */
+#define _CLC_SAT_RMODE_FWD(DST, SRC) \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtz(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rte(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtp(SRC x) { return convert_##DST##_sat(x); } \
+    _CLC_DEF _CLC_OVERLOAD DST convert_##DST##_sat_rtn(SRC x) { return convert_##DST##_sat(x); }
+
+_CLC_SAT_RMODE_FWD(int, ushort)
+_CLC_SAT_RMODE_FWD(int2, ushort2)
+_CLC_SAT_RMODE_FWD(int3, ushort3)
+_CLC_SAT_RMODE_FWD(int4, ushort4)
+_CLC_SAT_RMODE_FWD(int8, ushort8)
+_CLC_SAT_RMODE_FWD(int16, ushort16)
+
+#undef _CLC_SAT_RMODE_FWD
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtz(ushort x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rte(ushort x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtp(ushort x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtn(ushort x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtz(ushort2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rte(ushort2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtp(ushort2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtn(ushort2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtz(ushort3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rte(ushort3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtp(ushort3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtn(ushort3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtz(ushort4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rte(ushort4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtp(ushort4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtn(ushort4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtz(ushort8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rte(ushort8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtp(ushort8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtn(ushort8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtz(ushort16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rte(ushort16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtp(ushort16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtn(ushort16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtz(ushort x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rte(ushort x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtp(ushort x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtn(ushort x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtz(ushort2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rte(ushort2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtp(ushort2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtn(ushort2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtz(ushort3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rte(ushort3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtp(ushort3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtn(ushort3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtz(ushort4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rte(ushort4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtp(ushort4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtn(ushort4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtz(ushort8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rte(ushort8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtp(ushort8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtn(ushort8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtz(ushort16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rte(ushort16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtp(ushort16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtn(ushort16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtz(ushort x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rte(ushort x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtp(ushort x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtn(ushort x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtz(ushort2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rte(ushort2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtp(ushort2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtn(ushort2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtz(ushort3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rte(ushort3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtp(ushort3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtn(ushort3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtz(ushort4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rte(ushort4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtp(ushort4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtn(ushort4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtz(ushort8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rte(ushort8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtp(ushort8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtn(ushort8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtz(ushort16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rte(ushort16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtp(ushort16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtn(ushort16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtz(int x)
+{
+ return convert_char_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rte(int x)
+{
+ return convert_char_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtp(int x)
+{
+ return convert_char_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtn(int x)
+{
+ return convert_char_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtz(int2 x)
+{
+ return convert_char2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rte(int2 x)
+{
+ return convert_char2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtp(int2 x)
+{
+ return convert_char2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtn(int2 x)
+{
+ return convert_char2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtz(int3 x)
+{
+ return convert_char3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rte(int3 x)
+{
+ return convert_char3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtp(int3 x)
+{
+ return convert_char3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtn(int3 x)
+{
+ return convert_char3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtz(int4 x)
+{
+ return convert_char4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rte(int4 x)
+{
+ return convert_char4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtp(int4 x)
+{
+ return convert_char4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtn(int4 x)
+{
+ return convert_char4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtz(int8 x)
+{
+ return convert_char8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rte(int8 x)
+{
+ return convert_char8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtp(int8 x)
+{
+ return convert_char8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtn(int8 x)
+{
+ return convert_char8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtz(int16 x)
+{
+ return convert_char16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rte(int16 x)
+{
+ return convert_char16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtp(int16 x)
+{
+ return convert_char16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtn(int16 x)
+{
+ return convert_char16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtz(int x)
+{
+ return convert_uchar_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rte(int x)
+{
+ return convert_uchar_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtp(int x)
+{
+ return convert_uchar_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtn(int x)
+{
+ return convert_uchar_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtz(int2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rte(int2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtp(int2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtn(int2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtz(int3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rte(int3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtp(int3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtn(int3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtz(int4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rte(int4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtp(int4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtn(int4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtz(int8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rte(int8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtp(int8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtn(int8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtz(int16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rte(int16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtp(int16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtn(int16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtz(int x)
+{
+ return convert_short_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rte(int x)
+{
+ return convert_short_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtp(int x)
+{
+ return convert_short_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtn(int x)
+{
+ return convert_short_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtz(int2 x)
+{
+ return convert_short2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rte(int2 x)
+{
+ return convert_short2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtp(int2 x)
+{
+ return convert_short2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtn(int2 x)
+{
+ return convert_short2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtz(int3 x)
+{
+ return convert_short3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rte(int3 x)
+{
+ return convert_short3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtp(int3 x)
+{
+ return convert_short3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtn(int3 x)
+{
+ return convert_short3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtz(int4 x)
+{
+ return convert_short4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rte(int4 x)
+{
+ return convert_short4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtp(int4 x)
+{
+ return convert_short4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtn(int4 x)
+{
+ return convert_short4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtz(int8 x)
+{
+ return convert_short8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rte(int8 x)
+{
+ return convert_short8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtp(int8 x)
+{
+ return convert_short8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtn(int8 x)
+{
+ return convert_short8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtz(int16 x)
+{
+ return convert_short16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rte(int16 x)
+{
+ return convert_short16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtp(int16 x)
+{
+ return convert_short16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtn(int16 x)
+{
+ return convert_short16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtz(int x)
+{
+ return convert_ushort_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rte(int x)
+{
+ return convert_ushort_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtp(int x)
+{
+ return convert_ushort_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtn(int x)
+{
+ return convert_ushort_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtz(int2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rte(int2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtp(int2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtn(int2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtz(int3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rte(int3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtp(int3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtn(int3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtz(int4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rte(int4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtp(int4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtn(int4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtz(int8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rte(int8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtp(int8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtn(int8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtz(int16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rte(int16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtp(int16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtn(int16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtz(int x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rte(int x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtp(int x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtn(int x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtz(int2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rte(int2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtp(int2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtn(int2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtz(int3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rte(int3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtp(int3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtn(int3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtz(int4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rte(int4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtp(int4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtn(int4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtz(int8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rte(int8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtp(int8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtn(int8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtz(int16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rte(int16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtp(int16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtn(int16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtz(int x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rte(int x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtp(int x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtn(int x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtz(int2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rte(int2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtp(int2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtn(int2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtz(int3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rte(int3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtp(int3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtn(int3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtz(int4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rte(int4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtp(int4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtn(int4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtz(int8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rte(int8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtp(int8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtn(int8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtz(int16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rte(int16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtp(int16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtn(int16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtz(int x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rte(int x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtp(int x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtn(int x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtz(int2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rte(int2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtp(int2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtn(int2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtz(int3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rte(int3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtp(int3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtn(int3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtz(int4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rte(int4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtp(int4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtn(int4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtz(int8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rte(int8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtp(int8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtn(int8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtz(int16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rte(int16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtp(int16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtn(int16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtz(int x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rte(int x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtp(int x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtn(int x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtz(int2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rte(int2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtp(int2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtn(int2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtz(int3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rte(int3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtp(int3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtn(int3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtz(int4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rte(int4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtp(int4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtn(int4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtz(int8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rte(int8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtp(int8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtn(int8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtz(int16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rte(int16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtp(int16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtn(int16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtz(uint x)
+{
+ return convert_char_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rte(uint x)
+{
+ return convert_char_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtp(uint x)
+{
+ return convert_char_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtn(uint x)
+{
+ return convert_char_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtz(uint2 x)
+{
+ return convert_char2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rte(uint2 x)
+{
+ return convert_char2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtp(uint2 x)
+{
+ return convert_char2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtn(uint2 x)
+{
+ return convert_char2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtz(uint3 x)
+{
+ return convert_char3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rte(uint3 x)
+{
+ return convert_char3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtp(uint3 x)
+{
+ return convert_char3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtn(uint3 x)
+{
+ return convert_char3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtz(uint4 x)
+{
+ return convert_char4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rte(uint4 x)
+{
+ return convert_char4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtp(uint4 x)
+{
+ return convert_char4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtn(uint4 x)
+{
+ return convert_char4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtz(uint8 x)
+{
+ return convert_char8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rte(uint8 x)
+{
+ return convert_char8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtp(uint8 x)
+{
+ return convert_char8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtn(uint8 x)
+{
+ return convert_char8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtz(uint16 x)
+{
+ return convert_char16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rte(uint16 x)
+{
+ return convert_char16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtp(uint16 x)
+{
+ return convert_char16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtn(uint16 x)
+{
+ return convert_char16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtz(uint x)
+{
+ return convert_uchar_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rte(uint x)
+{
+ return convert_uchar_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtp(uint x)
+{
+ return convert_uchar_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtn(uint x)
+{
+ return convert_uchar_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtz(uint2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rte(uint2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtp(uint2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtn(uint2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtz(uint3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rte(uint3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtp(uint3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtn(uint3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtz(uint4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rte(uint4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtp(uint4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtn(uint4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtz(uint8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rte(uint8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtp(uint8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtn(uint8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtz(uint16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rte(uint16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtp(uint16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtn(uint16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtz(uint x)
+{
+ return convert_short_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rte(uint x)
+{
+ return convert_short_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtp(uint x)
+{
+ return convert_short_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtn(uint x)
+{
+ return convert_short_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtz(uint2 x)
+{
+ return convert_short2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rte(uint2 x)
+{
+ return convert_short2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtp(uint2 x)
+{
+ return convert_short2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtn(uint2 x)
+{
+ return convert_short2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtz(uint3 x)
+{
+ return convert_short3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rte(uint3 x)
+{
+ return convert_short3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtp(uint3 x)
+{
+ return convert_short3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtn(uint3 x)
+{
+ return convert_short3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtz(uint4 x)
+{
+ return convert_short4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rte(uint4 x)
+{
+ return convert_short4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtp(uint4 x)
+{
+ return convert_short4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtn(uint4 x)
+{
+ return convert_short4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtz(uint8 x)
+{
+ return convert_short8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rte(uint8 x)
+{
+ return convert_short8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtp(uint8 x)
+{
+ return convert_short8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtn(uint8 x)
+{
+ return convert_short8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtz(uint16 x)
+{
+ return convert_short16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rte(uint16 x)
+{
+ return convert_short16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtp(uint16 x)
+{
+ return convert_short16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtn(uint16 x)
+{
+ return convert_short16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtz(uint x)
+{
+ return convert_ushort_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rte(uint x)
+{
+ return convert_ushort_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtp(uint x)
+{
+ return convert_ushort_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtn(uint x)
+{
+ return convert_ushort_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtz(uint2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rte(uint2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtp(uint2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtn(uint2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtz(uint3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rte(uint3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtp(uint3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtn(uint3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtz(uint4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rte(uint4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtp(uint4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort4 convert_ushort4_sat_rtn(uint4 x)
+{
+ return convert_ushort4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtz(uint8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rte(uint8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtp(uint8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort8 convert_ushort8_sat_rtn(uint8 x)
+{
+ return convert_ushort8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtz(uint16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rte(uint16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtp(uint16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+ushort16 convert_ushort16_sat_rtn(uint16 x)
+{
+ return convert_ushort16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtz(uint x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rte(uint x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtp(uint x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int convert_int_sat_rtn(uint x)
+{
+ return convert_int_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtz(uint2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rte(uint2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtp(uint2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int2 convert_int2_sat_rtn(uint2 x)
+{
+ return convert_int2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtz(uint3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rte(uint3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtp(uint3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int3 convert_int3_sat_rtn(uint3 x)
+{
+ return convert_int3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtz(uint4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rte(uint4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtp(uint4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtn(uint4 x)
+{
+ return convert_int4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtz(uint8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rte(uint8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtp(uint8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int8 convert_int8_sat_rtn(uint8 x)
+{
+ return convert_int8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtz(uint16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rte(uint16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtp(uint16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+int16 convert_int16_sat_rtn(uint16 x)
+{
+ return convert_int16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtz(uint x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rte(uint x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtp(uint x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint convert_uint_sat_rtn(uint x)
+{
+ return convert_uint_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtz(uint2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rte(uint2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtp(uint2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint2 convert_uint2_sat_rtn(uint2 x)
+{
+ return convert_uint2_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtz(uint3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rte(uint3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtp(uint3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint3 convert_uint3_sat_rtn(uint3 x)
+{
+ return convert_uint3_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtz(uint4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rte(uint4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtp(uint4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint4 convert_uint4_sat_rtn(uint4 x)
+{
+ return convert_uint4_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtz(uint8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rte(uint8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtp(uint8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint8 convert_uint8_sat_rtn(uint8 x)
+{
+ return convert_uint8_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtz(uint16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rte(uint16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtp(uint16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD
+uint16 convert_uint16_sat_rtn(uint16 x)
+{
+ return convert_uint16_sat(x);
+}
+
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtz(uint x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rte(uint x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtp(uint x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long convert_long_sat_rtn(uint x)
+{
+ return convert_long_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtz(uint2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rte(uint2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtp(uint2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long2 convert_long2_sat_rtn(uint2 x)
+{
+ return convert_long2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtz(uint3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rte(uint3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtp(uint3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtn(uint3 x)
+{
+ return convert_long3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtz(uint4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rte(uint4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtp(uint4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtn(uint4 x)
+{
+ return convert_long4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtz(uint8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rte(uint8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtp(uint8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtn(uint8 x)
+{
+ return convert_long8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtz(uint16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rte(uint16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtp(uint16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtn(uint16 x)
+{
+ return convert_long16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtz(uint x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rte(uint x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtp(uint x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtn(uint x)
+{
+ return convert_ulong_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtz(uint2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rte(uint2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtp(uint2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtn(uint2 x)
+{
+ return convert_ulong2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtz(uint3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rte(uint3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtp(uint3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtn(uint3 x)
+{
+ return convert_ulong3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtz(uint4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rte(uint4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtp(uint4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtn(uint4 x)
+{
+ return convert_ulong4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtz(uint8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rte(uint8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtp(uint8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtn(uint8 x)
+{
+ return convert_ulong8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtz(uint16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rte(uint16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtp(uint16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtn(uint16 x)
+{
+ return convert_ulong16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtz(long x)
+{
+ return convert_char_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rte(long x)
+{
+ return convert_char_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtp(long x)
+{
+ return convert_char_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char convert_char_sat_rtn(long x)
+{
+ return convert_char_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtz(long2 x)
+{
+ return convert_char2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rte(long2 x)
+{
+ return convert_char2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtp(long2 x)
+{
+ return convert_char2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char2 convert_char2_sat_rtn(long2 x)
+{
+ return convert_char2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtz(long3 x)
+{
+ return convert_char3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rte(long3 x)
+{
+ return convert_char3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtp(long3 x)
+{
+ return convert_char3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char3 convert_char3_sat_rtn(long3 x)
+{
+ return convert_char3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtz(long4 x)
+{
+ return convert_char4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rte(long4 x)
+{
+ return convert_char4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtp(long4 x)
+{
+ return convert_char4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char4 convert_char4_sat_rtn(long4 x)
+{
+ return convert_char4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtz(long8 x)
+{
+ return convert_char8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rte(long8 x)
+{
+ return convert_char8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtp(long8 x)
+{
+ return convert_char8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char8 convert_char8_sat_rtn(long8 x)
+{
+ return convert_char8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtz(long16 x)
+{
+ return convert_char16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rte(long16 x)
+{
+ return convert_char16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtp(long16 x)
+{
+ return convert_char16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+char16 convert_char16_sat_rtn(long16 x)
+{
+ return convert_char16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtz(long x)
+{
+ return convert_uchar_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rte(long x)
+{
+ return convert_uchar_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtp(long x)
+{
+ return convert_uchar_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar convert_uchar_sat_rtn(long x)
+{
+ return convert_uchar_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtz(long2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rte(long2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtp(long2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar2 convert_uchar2_sat_rtn(long2 x)
+{
+ return convert_uchar2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtz(long3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rte(long3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtp(long3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar3 convert_uchar3_sat_rtn(long3 x)
+{
+ return convert_uchar3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtz(long4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rte(long4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtp(long4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar4 convert_uchar4_sat_rtn(long4 x)
+{
+ return convert_uchar4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtz(long8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rte(long8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtp(long8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtn(long8 x)
+{
+ return convert_uchar8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtz(long16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rte(long16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtp(long16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+uchar16 convert_uchar16_sat_rtn(long16 x)
+{
+ return convert_uchar16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtz(long x)
+{
+ return convert_short_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rte(long x)
+{
+ return convert_short_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtp(long x)
+{
+ return convert_short_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short convert_short_sat_rtn(long x)
+{
+ return convert_short_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtz(long2 x)
+{
+ return convert_short2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rte(long2 x)
+{
+ return convert_short2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtp(long2 x)
+{
+ return convert_short2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short2 convert_short2_sat_rtn(long2 x)
+{
+ return convert_short2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtz(long3 x)
+{
+ return convert_short3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rte(long3 x)
+{
+ return convert_short3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtp(long3 x)
+{
+ return convert_short3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short3 convert_short3_sat_rtn(long3 x)
+{
+ return convert_short3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtz(long4 x)
+{
+ return convert_short4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rte(long4 x)
+{
+ return convert_short4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtp(long4 x)
+{
+ return convert_short4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short4 convert_short4_sat_rtn(long4 x)
+{
+ return convert_short4_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtz(long8 x)
+{
+ return convert_short8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rte(long8 x)
+{
+ return convert_short8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtp(long8 x)
+{
+ return convert_short8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short8 convert_short8_sat_rtn(long8 x)
+{
+ return convert_short8_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtz(long16 x)
+{
+ return convert_short16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rte(long16 x)
+{
+ return convert_short16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtp(long16 x)
+{
+ return convert_short16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+short16 convert_short16_sat_rtn(long16 x)
+{
+ return convert_short16_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtz(long x)
+{
+ return convert_ushort_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rte(long x)
+{
+ return convert_ushort_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtp(long x)
+{
+ return convert_ushort_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ushort convert_ushort_sat_rtn(long x)
+{
+ return convert_ushort_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtz(long2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rte(long2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtp(long2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ushort2 convert_ushort2_sat_rtn(long2 x)
+{
+ return convert_ushort2_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtz(long3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rte(long3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtp(long3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD
+ushort3 convert_ushort3_sat_rtn(long3 x)
+{
+ return convert_ushort3_sat(x);
+}
+
+#endif
+#ifdef cles_khr_int64
+/* Saturating long4 -> ushort4 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_ushort4_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtz(long4 x) { return convert_ushort4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rte(long4 x) { return convert_ushort4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtp(long4 x) { return convert_ushort4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtn(long4 x) { return convert_ushort4_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long8 -> ushort8 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_ushort8_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtz(long8 x) { return convert_ushort8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rte(long8 x) { return convert_ushort8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtp(long8 x) { return convert_ushort8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtn(long8 x) { return convert_ushort8_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long16 -> ushort16 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_ushort16_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtz(long16 x) { return convert_ushort16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rte(long16 x) { return convert_ushort16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtp(long16 x) { return convert_ushort16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtn(long16 x) { return convert_ushort16_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long -> int conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_int_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtz(long x) { return convert_int_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rte(long x) { return convert_int_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtp(long x) { return convert_int_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtn(long x) { return convert_int_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long2 -> int2 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_int2_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtz(long2 x) { return convert_int2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rte(long2 x) { return convert_int2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtp(long2 x) { return convert_int2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtn(long2 x) { return convert_int2_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long3 -> int3 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_int3_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtz(long3 x) { return convert_int3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rte(long3 x) { return convert_int3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtp(long3 x) { return convert_int3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtn(long3 x) { return convert_int3_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long4 -> int4 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_int4_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtz(long4 x) { return convert_int4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rte(long4 x) { return convert_int4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtp(long4 x) { return convert_int4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtn(long4 x) { return convert_int4_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long8 -> int8 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_int8_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtz(long8 x) { return convert_int8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rte(long8 x) { return convert_int8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtp(long8 x) { return convert_int8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtn(long8 x) { return convert_int8_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long16 -> int16 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_int16_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtz(long16 x) { return convert_int16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rte(long16 x) { return convert_int16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtp(long16 x) { return convert_int16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtn(long16 x) { return convert_int16_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long -> uint conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_uint_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtz(long x) { return convert_uint_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rte(long x) { return convert_uint_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtp(long x) { return convert_uint_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtn(long x) { return convert_uint_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long2 -> uint2 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_uint2_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtz(long2 x) { return convert_uint2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rte(long2 x) { return convert_uint2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtp(long2 x) { return convert_uint2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtn(long2 x) { return convert_uint2_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long3 -> uint3 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_uint3_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtz(long3 x) { return convert_uint3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rte(long3 x) { return convert_uint3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtp(long3 x) { return convert_uint3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtn(long3 x) { return convert_uint3_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long4 -> uint4 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_uint4_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtz(long4 x) { return convert_uint4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rte(long4 x) { return convert_uint4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtp(long4 x) { return convert_uint4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtn(long4 x) { return convert_uint4_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long8 -> uint8 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_uint8_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtz(long8 x) { return convert_uint8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rte(long8 x) { return convert_uint8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtp(long8 x) { return convert_uint8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtn(long8 x) { return convert_uint8_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long16 -> uint16 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_uint16_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtz(long16 x) { return convert_uint16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rte(long16 x) { return convert_uint16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtp(long16 x) { return convert_uint16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtn(long16 x) { return convert_uint16_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long -> long conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_long_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtz(long x) { return convert_long_sat(x); }
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rte(long x) { return convert_long_sat(x); }
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtp(long x) { return convert_long_sat(x); }
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtn(long x) { return convert_long_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long2 -> long2 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_long2_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtz(long2 x) { return convert_long2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rte(long2 x) { return convert_long2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtp(long2 x) { return convert_long2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtn(long2 x) { return convert_long2_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long3 -> long3 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_long3_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtz(long3 x) { return convert_long3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rte(long3 x) { return convert_long3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtp(long3 x) { return convert_long3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtn(long3 x) { return convert_long3_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long4 -> long4 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_long4_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rtz(long4 x) { return convert_long4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rte(long4 x) { return convert_long4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rtp(long4 x) { return convert_long4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rtn(long4 x) { return convert_long4_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long8 -> long8 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_long8_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rtz(long8 x) { return convert_long8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rte(long8 x) { return convert_long8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rtp(long8 x) { return convert_long8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rtn(long8 x) { return convert_long8_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long16 -> long16 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_long16_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rtz(long16 x) { return convert_long16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rte(long16 x) { return convert_long16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rtp(long16 x) { return convert_long16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rtn(long16 x) { return convert_long16_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long -> ulong conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_ulong_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rtz(long x) { return convert_ulong_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rte(long x) { return convert_ulong_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rtp(long x) { return convert_ulong_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rtn(long x) { return convert_ulong_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long2 -> ulong2 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_ulong2_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rtz(long2 x) { return convert_ulong2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rte(long2 x) { return convert_ulong2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rtp(long2 x) { return convert_ulong2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rtn(long2 x) { return convert_ulong2_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long3 -> ulong3 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_ulong3_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rtz(long3 x) { return convert_ulong3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rte(long3 x) { return convert_ulong3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rtp(long3 x) { return convert_ulong3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rtn(long3 x) { return convert_ulong3_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long4 -> ulong4 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_ulong4_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rtz(long4 x) { return convert_ulong4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rte(long4 x) { return convert_ulong4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rtp(long4 x) { return convert_ulong4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rtn(long4 x) { return convert_ulong4_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long8 -> ulong8 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_ulong8_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rtz(long8 x) { return convert_ulong8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rte(long8 x) { return convert_ulong8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rtp(long8 x) { return convert_ulong8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rtn(long8 x) { return convert_ulong8_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating long16 -> ulong16 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_ulong16_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtz(long16 x) { return convert_ulong16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rte(long16 x) { return convert_ulong16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtp(long16 x) { return convert_ulong16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtn(long16 x) { return convert_ulong16_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong -> char conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_char_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtz(ulong x) { return convert_char_sat(x); }
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rte(ulong x) { return convert_char_sat(x); }
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtp(ulong x) { return convert_char_sat(x); }
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtn(ulong x) { return convert_char_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong2 -> char2 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_char2_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtz(ulong2 x) { return convert_char2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rte(ulong2 x) { return convert_char2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtp(ulong2 x) { return convert_char2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtn(ulong2 x) { return convert_char2_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong3 -> char3 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_char3_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtz(ulong3 x) { return convert_char3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rte(ulong3 x) { return convert_char3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtp(ulong3 x) { return convert_char3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtn(ulong3 x) { return convert_char3_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong4 -> char4 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_char4_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtz(ulong4 x) { return convert_char4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rte(ulong4 x) { return convert_char4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtp(ulong4 x) { return convert_char4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtn(ulong4 x) { return convert_char4_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong8 -> char8 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_char8_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtz(ulong8 x) { return convert_char8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rte(ulong8 x) { return convert_char8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtp(ulong8 x) { return convert_char8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtn(ulong8 x) { return convert_char8_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong16 -> char16 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_char16_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtz(ulong16 x) { return convert_char16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rte(ulong16 x) { return convert_char16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtp(ulong16 x) { return convert_char16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtn(ulong16 x) { return convert_char16_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong -> uchar conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_uchar_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtz(ulong x) { return convert_uchar_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rte(ulong x) { return convert_uchar_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtp(ulong x) { return convert_uchar_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtn(ulong x) { return convert_uchar_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong2 -> uchar2 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_uchar2_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtz(ulong2 x) { return convert_uchar2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rte(ulong2 x) { return convert_uchar2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtp(ulong2 x) { return convert_uchar2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtn(ulong2 x) { return convert_uchar2_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong3 -> uchar3 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_uchar3_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtz(ulong3 x) { return convert_uchar3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rte(ulong3 x) { return convert_uchar3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtp(ulong3 x) { return convert_uchar3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtn(ulong3 x) { return convert_uchar3_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong4 -> uchar4 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_uchar4_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rtz(ulong4 x) { return convert_uchar4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rte(ulong4 x) { return convert_uchar4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rtp(ulong4 x) { return convert_uchar4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rtn(ulong4 x) { return convert_uchar4_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong8 -> uchar8 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_uchar8_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rtz(ulong8 x) { return convert_uchar8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rte(ulong8 x) { return convert_uchar8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rtp(ulong8 x) { return convert_uchar8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rtn(ulong8 x) { return convert_uchar8_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong16 -> uchar16 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_uchar16_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rtz(ulong16 x) { return convert_uchar16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rte(ulong16 x) { return convert_uchar16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rtp(ulong16 x) { return convert_uchar16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rtn(ulong16 x) { return convert_uchar16_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong -> short conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_short_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rtz(ulong x) { return convert_short_sat(x); }
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rte(ulong x) { return convert_short_sat(x); }
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rtp(ulong x) { return convert_short_sat(x); }
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rtn(ulong x) { return convert_short_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong2 -> short2 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_short2_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rtz(ulong2 x) { return convert_short2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rte(ulong2 x) { return convert_short2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rtp(ulong2 x) { return convert_short2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rtn(ulong2 x) { return convert_short2_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong3 -> short3 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_short3_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rtz(ulong3 x) { return convert_short3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rte(ulong3 x) { return convert_short3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rtp(ulong3 x) { return convert_short3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rtn(ulong3 x) { return convert_short3_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong4 -> short4 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_short4_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rtz(ulong4 x) { return convert_short4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rte(ulong4 x) { return convert_short4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rtp(ulong4 x) { return convert_short4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rtn(ulong4 x) { return convert_short4_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong8 -> short8 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_short8_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rtz(ulong8 x) { return convert_short8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rte(ulong8 x) { return convert_short8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rtp(ulong8 x) { return convert_short8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rtn(ulong8 x) { return convert_short8_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong16 -> short16 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_short16_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtz(ulong16 x) { return convert_short16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rte(ulong16 x) { return convert_short16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtp(ulong16 x) { return convert_short16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtn(ulong16 x) { return convert_short16_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong -> ushort conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_ushort_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtz(ulong x) { return convert_ushort_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rte(ulong x) { return convert_ushort_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtp(ulong x) { return convert_ushort_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtn(ulong x) { return convert_ushort_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong2 -> ushort2 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_ushort2_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtz(ulong2 x) { return convert_ushort2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rte(ulong2 x) { return convert_ushort2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtp(ulong2 x) { return convert_ushort2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtn(ulong2 x) { return convert_ushort2_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong3 -> ushort3 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_ushort3_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtz(ulong3 x) { return convert_ushort3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rte(ulong3 x) { return convert_ushort3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtp(ulong3 x) { return convert_ushort3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtn(ulong3 x) { return convert_ushort3_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong4 -> ushort4 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_ushort4_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtz(ulong4 x) { return convert_ushort4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rte(ulong4 x) { return convert_ushort4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtp(ulong4 x) { return convert_ushort4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtn(ulong4 x) { return convert_ushort4_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong8 -> ushort8 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_ushort8_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtz(ulong8 x) { return convert_ushort8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rte(ulong8 x) { return convert_ushort8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtp(ulong8 x) { return convert_ushort8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtn(ulong8 x) { return convert_ushort8_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong16 -> ushort16 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_ushort16_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtz(ulong16 x) { return convert_ushort16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rte(ulong16 x) { return convert_ushort16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtp(ulong16 x) { return convert_ushort16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtn(ulong16 x) { return convert_ushort16_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong -> int conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_int_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtz(ulong x) { return convert_int_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rte(ulong x) { return convert_int_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtp(ulong x) { return convert_int_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtn(ulong x) { return convert_int_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong2 -> int2 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_int2_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtz(ulong2 x) { return convert_int2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rte(ulong2 x) { return convert_int2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtp(ulong2 x) { return convert_int2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtn(ulong2 x) { return convert_int2_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong3 -> int3 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_int3_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtz(ulong3 x) { return convert_int3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rte(ulong3 x) { return convert_int3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtp(ulong3 x) { return convert_int3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtn(ulong3 x) { return convert_int3_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong4 -> int4 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_int4_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtz(ulong4 x) { return convert_int4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rte(ulong4 x) { return convert_int4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtp(ulong4 x) { return convert_int4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtn(ulong4 x) { return convert_int4_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong8 -> int8 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_int8_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtz(ulong8 x) { return convert_int8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rte(ulong8 x) { return convert_int8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtp(ulong8 x) { return convert_int8_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtn(ulong8 x) { return convert_int8_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong16 -> int16 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_int16_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtz(ulong16 x) { return convert_int16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rte(ulong16 x) { return convert_int16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtp(ulong16 x) { return convert_int16_sat(x); }
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtn(ulong16 x) { return convert_int16_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong -> uint conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_uint_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtz(ulong x) { return convert_uint_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rte(ulong x) { return convert_uint_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtp(ulong x) { return convert_uint_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtn(ulong x) { return convert_uint_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong2 -> uint2 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_uint2_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtz(ulong2 x) { return convert_uint2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rte(ulong2 x) { return convert_uint2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtp(ulong2 x) { return convert_uint2_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtn(ulong2 x) { return convert_uint2_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong3 -> uint3 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_uint3_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtz(ulong3 x) { return convert_uint3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rte(ulong3 x) { return convert_uint3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtp(ulong3 x) { return convert_uint3_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtn(ulong3 x) { return convert_uint3_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong4 -> uint4 conversion, rounding-mode variants.
+ * Each one forwards its argument to convert_uint4_sat() unchanged. */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtz(ulong4 x) { return convert_uint4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rte(ulong4 x) { return convert_uint4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtp(ulong4 x) { return convert_uint4_sat(x); }
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtn(ulong4 x) { return convert_uint4_sat(x); }
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong8 -> uint8 conversions, one per rounding-mode suffix.
+ * None applies a rounding step: each forwards to convert_uint8_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtz(ulong8 x)
+{
+    const uint8 result = convert_uint8_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rte(ulong8 x)
+{
+    const uint8 result = convert_uint8_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtp(ulong8 x)
+{
+    const uint8 result = convert_uint8_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtn(ulong8 x)
+{
+    const uint8 result = convert_uint8_sat(x);
+    return result;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong16 -> uint16 conversions, one per rounding-mode suffix.
+ * None applies a rounding step: each forwards to convert_uint16_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtz(ulong16 x)
+{
+    const uint16 result = convert_uint16_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rte(ulong16 x)
+{
+    const uint16 result = convert_uint16_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtp(ulong16 x)
+{
+    const uint16 result = convert_uint16_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtn(ulong16 x)
+{
+    const uint16 result = convert_uint16_sat(x);
+    return result;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong -> long conversions, one per rounding-mode suffix.
+ * None applies a rounding step: each forwards to convert_long_sat(). */
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtz(ulong x)
+{
+    const long result = convert_long_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rte(ulong x)
+{
+    const long result = convert_long_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtp(ulong x)
+{
+    const long result = convert_long_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtn(ulong x)
+{
+    const long result = convert_long_sat(x);
+    return result;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong2 -> long2 conversions, one per rounding-mode suffix.
+ * None applies a rounding step: each forwards to convert_long2_sat(). */
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtz(ulong2 x)
+{
+    const long2 result = convert_long2_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rte(ulong2 x)
+{
+    const long2 result = convert_long2_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtp(ulong2 x)
+{
+    const long2 result = convert_long2_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtn(ulong2 x)
+{
+    const long2 result = convert_long2_sat(x);
+    return result;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong3 -> long3 conversions, one per rounding-mode suffix.
+ * None applies a rounding step: each forwards to convert_long3_sat(). */
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtz(ulong3 x)
+{
+    const long3 result = convert_long3_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rte(ulong3 x)
+{
+    const long3 result = convert_long3_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtp(ulong3 x)
+{
+    const long3 result = convert_long3_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtn(ulong3 x)
+{
+    const long3 result = convert_long3_sat(x);
+    return result;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong4 -> long4 conversions, one per rounding-mode suffix.
+ * None applies a rounding step: each forwards to convert_long4_sat(). */
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rtz(ulong4 x)
+{
+    const long4 result = convert_long4_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rte(ulong4 x)
+{
+    const long4 result = convert_long4_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rtp(ulong4 x)
+{
+    const long4 result = convert_long4_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rtn(ulong4 x)
+{
+    const long4 result = convert_long4_sat(x);
+    return result;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong8 -> long8 conversions, one per rounding-mode suffix.
+ * None applies a rounding step: each forwards to convert_long8_sat(). */
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rtz(ulong8 x)
+{
+    const long8 result = convert_long8_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rte(ulong8 x)
+{
+    const long8 result = convert_long8_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rtp(ulong8 x)
+{
+    const long8 result = convert_long8_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rtn(ulong8 x)
+{
+    const long8 result = convert_long8_sat(x);
+    return result;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong16 -> long16 conversions, one per rounding-mode suffix.
+ * None applies a rounding step: each forwards to convert_long16_sat(). */
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rtz(ulong16 x)
+{
+    const long16 result = convert_long16_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rte(ulong16 x)
+{
+    const long16 result = convert_long16_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rtp(ulong16 x)
+{
+    const long16 result = convert_long16_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rtn(ulong16 x)
+{
+    const long16 result = convert_long16_sat(x);
+    return result;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong -> ulong conversions, one per rounding-mode suffix.
+ * None applies a rounding step: each forwards to convert_ulong_sat(). */
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rtz(ulong x)
+{
+    const ulong result = convert_ulong_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rte(ulong x)
+{
+    const ulong result = convert_ulong_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rtp(ulong x)
+{
+    const ulong result = convert_ulong_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rtn(ulong x)
+{
+    const ulong result = convert_ulong_sat(x);
+    return result;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong2 -> ulong2 conversions, one per rounding-mode suffix.
+ * None applies a rounding step: each forwards to convert_ulong2_sat(). */
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rtz(ulong2 x)
+{
+    const ulong2 result = convert_ulong2_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rte(ulong2 x)
+{
+    const ulong2 result = convert_ulong2_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rtp(ulong2 x)
+{
+    const ulong2 result = convert_ulong2_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rtn(ulong2 x)
+{
+    const ulong2 result = convert_ulong2_sat(x);
+    return result;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong3 -> ulong3 conversions, one per rounding-mode suffix.
+ * None applies a rounding step: each forwards to convert_ulong3_sat(). */
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rtz(ulong3 x)
+{
+    const ulong3 result = convert_ulong3_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rte(ulong3 x)
+{
+    const ulong3 result = convert_ulong3_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rtp(ulong3 x)
+{
+    const ulong3 result = convert_ulong3_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rtn(ulong3 x)
+{
+    const ulong3 result = convert_ulong3_sat(x);
+    return result;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong4 -> ulong4 conversions, one per rounding-mode suffix.
+ * None applies a rounding step: each forwards to convert_ulong4_sat(). */
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rtz(ulong4 x)
+{
+    const ulong4 result = convert_ulong4_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rte(ulong4 x)
+{
+    const ulong4 result = convert_ulong4_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rtp(ulong4 x)
+{
+    const ulong4 result = convert_ulong4_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rtn(ulong4 x)
+{
+    const ulong4 result = convert_ulong4_sat(x);
+    return result;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong8 -> ulong8 conversions, one per rounding-mode suffix.
+ * None applies a rounding step: each forwards to convert_ulong8_sat(). */
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rtz(ulong8 x)
+{
+    const ulong8 result = convert_ulong8_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rte(ulong8 x)
+{
+    const ulong8 result = convert_ulong8_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rtp(ulong8 x)
+{
+    const ulong8 result = convert_ulong8_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rtn(ulong8 x)
+{
+    const ulong8 result = convert_ulong8_sat(x);
+    return result;
+}
+#endif
+#ifdef cles_khr_int64
+/* Saturating ulong16 -> ulong16 conversions, one per rounding-mode suffix.
+ * None applies a rounding step: each forwards to convert_ulong16_sat(). */
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtz(ulong16 x)
+{
+    const ulong16 result = convert_ulong16_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rte(ulong16 x)
+{
+    const ulong16 result = convert_ulong16_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtp(ulong16 x)
+{
+    const ulong16 result = convert_ulong16_sat(x);
+    return result;
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtn(ulong16 x)
+{
+    const ulong16 result = convert_ulong16_sat(x);
+    return result;
+}
+#endif
+/* float -> char conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtz(float x)
+{
+    return convert_char(x);
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtz(float x)
+{
+    return convert_char_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_rte(float x)
+{
+    return convert_char(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rte(float x)
+{
+    return convert_char_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtp(float x)
+{
+    return convert_char(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtp(float x)
+{
+    return convert_char_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtn(float x)
+{
+    return convert_char(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtn(float x)
+{
+    return convert_char_sat(floor(x));
+}
+/* float2 -> char2 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtz(float2 x)
+{
+    return convert_char2(x);
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtz(float2 x)
+{
+    return convert_char2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rte(float2 x)
+{
+    return convert_char2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rte(float2 x)
+{
+    return convert_char2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtp(float2 x)
+{
+    return convert_char2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtp(float2 x)
+{
+    return convert_char2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtn(float2 x)
+{
+    return convert_char2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtn(float2 x)
+{
+    return convert_char2_sat(floor(x));
+}
+/* float3 -> char3 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtz(float3 x)
+{
+    return convert_char3(x);
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtz(float3 x)
+{
+    return convert_char3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rte(float3 x)
+{
+    return convert_char3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rte(float3 x)
+{
+    return convert_char3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtp(float3 x)
+{
+    return convert_char3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtp(float3 x)
+{
+    return convert_char3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtn(float3 x)
+{
+    return convert_char3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtn(float3 x)
+{
+    return convert_char3_sat(floor(x));
+}
+/* float4 -> char4 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtz(float4 x)
+{
+    return convert_char4(x);
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtz(float4 x)
+{
+    return convert_char4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rte(float4 x)
+{
+    return convert_char4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rte(float4 x)
+{
+    return convert_char4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtp(float4 x)
+{
+    return convert_char4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtp(float4 x)
+{
+    return convert_char4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtn(float4 x)
+{
+    return convert_char4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtn(float4 x)
+{
+    return convert_char4_sat(floor(x));
+}
+/* float8 -> char8 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtz(float8 x)
+{
+    return convert_char8(x);
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtz(float8 x)
+{
+    return convert_char8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rte(float8 x)
+{
+    return convert_char8(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rte(float8 x)
+{
+    return convert_char8_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtp(float8 x)
+{
+    return convert_char8(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtp(float8 x)
+{
+    return convert_char8_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtn(float8 x)
+{
+    return convert_char8(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtn(float8 x)
+{
+    return convert_char8_sat(floor(x));
+}
+/* float16 -> char16 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtz(float16 x)
+{
+    return convert_char16(x);
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtz(float16 x)
+{
+    return convert_char16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rte(float16 x)
+{
+    return convert_char16(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rte(float16 x)
+{
+    return convert_char16_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtp(float16 x)
+{
+    return convert_char16(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtp(float16 x)
+{
+    return convert_char16_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtn(float16 x)
+{
+    return convert_char16(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtn(float16 x)
+{
+    return convert_char16_sat(floor(x));
+}
+/* float -> uchar conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtz(float x)
+{
+    return convert_uchar(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtz(float x)
+{
+    return convert_uchar_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rte(float x)
+{
+    return convert_uchar(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rte(float x)
+{
+    return convert_uchar_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtp(float x)
+{
+    return convert_uchar(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtp(float x)
+{
+    return convert_uchar_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtn(float x)
+{
+    return convert_uchar(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtn(float x)
+{
+    return convert_uchar_sat(floor(x));
+}
+/* float2 -> uchar2 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtz(float2 x)
+{
+    return convert_uchar2(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtz(float2 x)
+{
+    return convert_uchar2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rte(float2 x)
+{
+    return convert_uchar2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rte(float2 x)
+{
+    return convert_uchar2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtp(float2 x)
+{
+    return convert_uchar2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtp(float2 x)
+{
+    return convert_uchar2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtn(float2 x)
+{
+    return convert_uchar2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtn(float2 x)
+{
+    return convert_uchar2_sat(floor(x));
+}
+/* float3 -> uchar3 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtz(float3 x)
+{
+    return convert_uchar3(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtz(float3 x)
+{
+    return convert_uchar3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rte(float3 x)
+{
+    return convert_uchar3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rte(float3 x)
+{
+    return convert_uchar3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtp(float3 x)
+{
+    return convert_uchar3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtp(float3 x)
+{
+    return convert_uchar3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtn(float3 x)
+{
+    return convert_uchar3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtn(float3 x)
+{
+    return convert_uchar3_sat(floor(x));
+}
+/* float4 -> uchar4 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtz(float4 x)
+{
+    return convert_uchar4(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rtz(float4 x)
+{
+    return convert_uchar4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rte(float4 x)
+{
+    return convert_uchar4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rte(float4 x)
+{
+    return convert_uchar4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtp(float4 x)
+{
+    return convert_uchar4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rtp(float4 x)
+{
+    return convert_uchar4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtn(float4 x)
+{
+    return convert_uchar4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rtn(float4 x)
+{
+    return convert_uchar4_sat(floor(x));
+}
+/* float8 -> uchar8 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtz(float8 x)
+{
+    return convert_uchar8(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rtz(float8 x)
+{
+    return convert_uchar8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rte(float8 x)
+{
+    return convert_uchar8(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rte(float8 x)
+{
+    return convert_uchar8_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtp(float8 x)
+{
+    return convert_uchar8(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rtp(float8 x)
+{
+    return convert_uchar8_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtn(float8 x)
+{
+    return convert_uchar8(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rtn(float8 x)
+{
+    return convert_uchar8_sat(floor(x));
+}
+/* float16 -> uchar16 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtz(float16 x)
+{
+    return convert_uchar16(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rtz(float16 x)
+{
+    return convert_uchar16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rte(float16 x)
+{
+    return convert_uchar16(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rte(float16 x)
+{
+    return convert_uchar16_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtp(float16 x)
+{
+    return convert_uchar16(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rtp(float16 x)
+{
+    return convert_uchar16_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtn(float16 x)
+{
+    return convert_uchar16(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rtn(float16 x)
+{
+    return convert_uchar16_sat(floor(x));
+}
+/* float -> short conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtz(float x)
+{
+    return convert_short(x);
+}
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rtz(float x)
+{
+    return convert_short_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD short convert_short_rte(float x)
+{
+    return convert_short(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rte(float x)
+{
+    return convert_short_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtp(float x)
+{
+    return convert_short(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rtp(float x)
+{
+    return convert_short_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtn(float x)
+{
+    return convert_short(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rtn(float x)
+{
+    return convert_short_sat(floor(x));
+}
+/* float2 -> short2 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtz(float2 x)
+{
+    return convert_short2(x);
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rtz(float2 x)
+{
+    return convert_short2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rte(float2 x)
+{
+    return convert_short2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rte(float2 x)
+{
+    return convert_short2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtp(float2 x)
+{
+    return convert_short2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rtp(float2 x)
+{
+    return convert_short2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtn(float2 x)
+{
+    return convert_short2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rtn(float2 x)
+{
+    return convert_short2_sat(floor(x));
+}
+/* float3 -> short3 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtz(float3 x)
+{
+    return convert_short3(x);
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rtz(float3 x)
+{
+    return convert_short3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rte(float3 x)
+{
+    return convert_short3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rte(float3 x)
+{
+    return convert_short3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtp(float3 x)
+{
+    return convert_short3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rtp(float3 x)
+{
+    return convert_short3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtn(float3 x)
+{
+    return convert_short3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rtn(float3 x)
+{
+    return convert_short3_sat(floor(x));
+}
+/* float4 -> short4 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtz(float4 x)
+{
+    return convert_short4(x);
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rtz(float4 x)
+{
+    return convert_short4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rte(float4 x)
+{
+    return convert_short4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rte(float4 x)
+{
+    return convert_short4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtp(float4 x)
+{
+    return convert_short4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rtp(float4 x)
+{
+    return convert_short4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtn(float4 x)
+{
+    return convert_short4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rtn(float4 x)
+{
+    return convert_short4_sat(floor(x));
+}
+/* float8 -> short8 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtz(float8 x)
+{
+    return convert_short8(x);
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rtz(float8 x)
+{
+    return convert_short8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rte(float8 x)
+{
+    return convert_short8(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rte(float8 x)
+{
+    return convert_short8_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtp(float8 x)
+{
+    return convert_short8(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rtp(float8 x)
+{
+    return convert_short8_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtn(float8 x)
+{
+    return convert_short8(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rtn(float8 x)
+{
+    return convert_short8_sat(floor(x));
+}
+/* float16 -> short16 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtz(float16 x)
+{
+    return convert_short16(x);
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtz(float16 x)
+{
+    return convert_short16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rte(float16 x)
+{
+    return convert_short16(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rte(float16 x)
+{
+    return convert_short16_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtp(float16 x)
+{
+    return convert_short16(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtp(float16 x)
+{
+    return convert_short16_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtn(float16 x)
+{
+    return convert_short16(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtn(float16 x)
+{
+    return convert_short16_sat(floor(x));
+}
+/* float -> ushort conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtz(float x)
+{
+    return convert_ushort(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtz(float x)
+{
+    return convert_ushort_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rte(float x)
+{
+    return convert_ushort(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rte(float x)
+{
+    return convert_ushort_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtp(float x)
+{
+    return convert_ushort(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtp(float x)
+{
+    return convert_ushort_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtn(float x)
+{
+    return convert_ushort(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtn(float x)
+{
+    return convert_ushort_sat(floor(x));
+}
+/* float2 -> ushort2 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtz(float2 x)
+{
+    return convert_ushort2(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtz(float2 x)
+{
+    return convert_ushort2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rte(float2 x)
+{
+    return convert_ushort2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rte(float2 x)
+{
+    return convert_ushort2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtp(float2 x)
+{
+    return convert_ushort2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtp(float2 x)
+{
+    return convert_ushort2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtn(float2 x)
+{
+    return convert_ushort2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtn(float2 x)
+{
+    return convert_ushort2_sat(floor(x));
+}
+/* float3 -> ushort3 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtz(float3 x)
+{
+    return convert_ushort3(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtz(float3 x)
+{
+    return convert_ushort3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rte(float3 x)
+{
+    return convert_ushort3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rte(float3 x)
+{
+    return convert_ushort3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtp(float3 x)
+{
+    return convert_ushort3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtp(float3 x)
+{
+    return convert_ushort3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtn(float3 x)
+{
+    return convert_ushort3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtn(float3 x)
+{
+    return convert_ushort3_sat(floor(x));
+}
+/* float4 -> ushort4 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtz(float4 x)
+{
+    return convert_ushort4(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtz(float4 x)
+{
+    return convert_ushort4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rte(float4 x)
+{
+    return convert_ushort4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rte(float4 x)
+{
+    return convert_ushort4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtp(float4 x)
+{
+    return convert_ushort4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtp(float4 x)
+{
+    return convert_ushort4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtn(float4 x)
+{
+    return convert_ushort4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtn(float4 x)
+{
+    return convert_ushort4_sat(floor(x));
+}
+/* float8 -> ushort8 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtz(float8 x)
+{
+    return convert_ushort8(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtz(float8 x)
+{
+    return convert_ushort8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rte(float8 x)
+{
+    return convert_ushort8(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rte(float8 x)
+{
+    return convert_ushort8_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtp(float8 x)
+{
+    return convert_ushort8(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtp(float8 x)
+{
+    return convert_ushort8_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtn(float8 x)
+{
+    return convert_ushort8(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtn(float8 x)
+{
+    return convert_ushort8_sat(floor(x));
+}
+/* float16 -> ushort16 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtz(float16 x)
+{
+    return convert_ushort16(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtz(float16 x)
+{
+    return convert_ushort16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rte(float16 x)
+{
+    return convert_ushort16(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rte(float16 x)
+{
+    return convert_ushort16_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtp(float16 x)
+{
+    return convert_ushort16(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtp(float16 x)
+{
+    return convert_ushort16_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtn(float16 x)
+{
+    return convert_ushort16(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtn(float16 x)
+{
+    return convert_ushort16_sat(floor(x));
+}
+/* float -> int conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtz(float x)
+{
+    return convert_int(x);
+}
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtz(float x)
+{
+    return convert_int_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD int convert_int_rte(float x)
+{
+    return convert_int(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rte(float x)
+{
+    return convert_int_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtp(float x)
+{
+    return convert_int(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtp(float x)
+{
+    return convert_int_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtn(float x)
+{
+    return convert_int(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtn(float x)
+{
+    return convert_int_sat(floor(x));
+}
+/* float2 -> int2 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtz(float2 x)
+{
+    return convert_int2(x);
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtz(float2 x)
+{
+    return convert_int2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rte(float2 x)
+{
+    return convert_int2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rte(float2 x)
+{
+    return convert_int2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtp(float2 x)
+{
+    return convert_int2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtp(float2 x)
+{
+    return convert_int2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtn(float2 x)
+{
+    return convert_int2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtn(float2 x)
+{
+    return convert_int2_sat(floor(x));
+}
+/* float3 -> int3 conversions with an explicit rounding-mode suffix.
+ * _rte/_rtp/_rtn first apply rint()/ceil()/floor() to x; _rtz passes x
+ * through unchanged.  Plain and _sat variants are provided for each. */
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtz(float3 x)
+{
+    return convert_int3(x);
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtz(float3 x)
+{
+    return convert_int3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rte(float3 x)
+{
+    return convert_int3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rte(float3 x)
+{
+    return convert_int3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtp(float3 x)
+{
+    return convert_int3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtp(float3 x)
+{
+    return convert_int3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtn(float3 x)
+{
+    return convert_int3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtn(float3 x)
+{
+    return convert_int3_sat(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtz(float4 x)
+{
+ return convert_int4(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rtz(float4 x)
+{
+ return convert_int4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rte(float4 x)
+{
+ x = rint(x);
+ return convert_int4(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_sat_rte(float4 x)
+{
+ x = rint(x);
+ return convert_int4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+int4 convert_int4_rtp(float4 x)
+{
+ x = ceil(x);
+ return convert_int4(x);
+}
+/* Applies ceil() to x, then converts the result with convert_int4_sat(). */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtp(float4 x)
+{
+ return convert_int4_sat(ceil(x));
+}
+/* Applies floor() to x, then converts the result with convert_int4(). */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtn(float4 x)
+{
+ return convert_int4(floor(x));
+}
+/* Applies floor() to x, then converts the result with convert_int4_sat(). */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtn(float4 x)
+{
+ return convert_int4_sat(floor(x));
+}
+/* Passes x straight to convert_int8(); no rounding call precedes it. */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtz(float8 x)
+{
+ return convert_int8(x);
+}
+/* Passes x straight to convert_int8_sat(); no rounding call precedes it. */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtz(float8 x)
+{
+ return convert_int8_sat(x);
+}
+/* Applies rint() to x, then converts the result with convert_int8(). */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rte(float8 x)
+{
+ return convert_int8(rint(x));
+}
+/* Applies rint() to x, then converts the result with convert_int8_sat(). */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rte(float8 x)
+{
+ return convert_int8_sat(rint(x));
+}
+/* Applies ceil() to x, then converts the result with convert_int8(). */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtp(float8 x)
+{
+ return convert_int8(ceil(x));
+}
+/* Applies ceil() to x, then converts the result with convert_int8_sat(). */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtp(float8 x)
+{
+ return convert_int8_sat(ceil(x));
+}
+/* Applies floor() to x, then converts the result with convert_int8(). */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtn(float8 x)
+{
+ return convert_int8(floor(x));
+}
+/* Applies floor() to x, then converts the result with convert_int8_sat(). */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtn(float8 x)
+{
+ return convert_int8_sat(floor(x));
+}
+/* Passes x straight to convert_int16(); no rounding call precedes it. */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtz(float16 x)
+{
+ return convert_int16(x);
+}
+/* Passes x straight to convert_int16_sat(); no rounding call precedes it. */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtz(float16 x)
+{
+ return convert_int16_sat(x);
+}
+/* Applies rint() to x, then converts the result with convert_int16(). */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rte(float16 x)
+{
+ return convert_int16(rint(x));
+}
+/* Applies rint() to x, then converts the result with convert_int16_sat(). */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rte(float16 x)
+{
+ return convert_int16_sat(rint(x));
+}
+/* Applies ceil() to x, then converts the result with convert_int16(). */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtp(float16 x)
+{
+ return convert_int16(ceil(x));
+}
+/* Applies ceil() to x, then converts the result with convert_int16_sat(). */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtp(float16 x)
+{
+ return convert_int16_sat(ceil(x));
+}
+/* Applies floor() to x, then converts the result with convert_int16(). */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtn(float16 x)
+{
+ return convert_int16(floor(x));
+}
+/* Applies floor() to x, then converts the result with convert_int16_sat(). */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtn(float16 x)
+{
+ return convert_int16_sat(floor(x));
+}
+/* Passes x straight to convert_uint(); no rounding call precedes it. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtz(float x)
+{
+ return convert_uint(x);
+}
+/* Passes x straight to convert_uint_sat(); no rounding call precedes it. */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtz(float x)
+{
+ return convert_uint_sat(x);
+}
+/* Applies rint() to x, then converts the result with convert_uint(). */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rte(float x)
+{
+ return convert_uint(rint(x));
+}
+/* Applies rint() to x, then converts the result with convert_uint_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rte(float x)
+{
+ return convert_uint_sat(rint(x));
+}
+/* Applies ceil() to x, then converts the result with convert_uint(). */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtp(float x)
+{
+ return convert_uint(ceil(x));
+}
+/* Applies ceil() to x, then converts the result with convert_uint_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtp(float x)
+{
+ return convert_uint_sat(ceil(x));
+}
+/* Applies floor() to x, then converts the result with convert_uint(). */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtn(float x)
+{
+ return convert_uint(floor(x));
+}
+/* Applies floor() to x, then converts the result with convert_uint_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtn(float x)
+{
+ return convert_uint_sat(floor(x));
+}
+/* Passes x straight to convert_uint2(); no rounding call precedes it. */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtz(float2 x)
+{
+ return convert_uint2(x);
+}
+/* Passes x straight to convert_uint2_sat(); no rounding call precedes it. */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtz(float2 x)
+{
+ return convert_uint2_sat(x);
+}
+/* Applies rint() to x, then converts the result with convert_uint2(). */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rte(float2 x)
+{
+ return convert_uint2(rint(x));
+}
+/* Applies rint() to x, then converts the result with convert_uint2_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rte(float2 x)
+{
+ return convert_uint2_sat(rint(x));
+}
+/* Applies ceil() to x, then converts the result with convert_uint2(). */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtp(float2 x)
+{
+ return convert_uint2(ceil(x));
+}
+/* Applies ceil() to x, then converts the result with convert_uint2_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtp(float2 x)
+{
+ return convert_uint2_sat(ceil(x));
+}
+/* Applies floor() to x, then converts the result with convert_uint2(). */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtn(float2 x)
+{
+ return convert_uint2(floor(x));
+}
+/* Applies floor() to x, then converts the result with convert_uint2_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtn(float2 x)
+{
+ return convert_uint2_sat(floor(x));
+}
+/* Passes x straight to convert_uint3(); no rounding call precedes it. */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtz(float3 x)
+{
+ return convert_uint3(x);
+}
+/* Passes x straight to convert_uint3_sat(); no rounding call precedes it. */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtz(float3 x)
+{
+ return convert_uint3_sat(x);
+}
+/* Applies rint() to x, then converts the result with convert_uint3(). */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rte(float3 x)
+{
+ return convert_uint3(rint(x));
+}
+/* Applies rint() to x, then converts the result with convert_uint3_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rte(float3 x)
+{
+ return convert_uint3_sat(rint(x));
+}
+/* Applies ceil() to x, then converts the result with convert_uint3(). */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtp(float3 x)
+{
+ return convert_uint3(ceil(x));
+}
+/* Applies ceil() to x, then converts the result with convert_uint3_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtp(float3 x)
+{
+ return convert_uint3_sat(ceil(x));
+}
+/* Applies floor() to x, then converts the result with convert_uint3(). */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtn(float3 x)
+{
+ return convert_uint3(floor(x));
+}
+/* Applies floor() to x, then converts the result with convert_uint3_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtn(float3 x)
+{
+ return convert_uint3_sat(floor(x));
+}
+/* Passes x straight to convert_uint4(); no rounding call precedes it. */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtz(float4 x)
+{
+ return convert_uint4(x);
+}
+/* Passes x straight to convert_uint4_sat(); no rounding call precedes it. */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtz(float4 x)
+{
+ return convert_uint4_sat(x);
+}
+/* Applies rint() to x, then converts the result with convert_uint4(). */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rte(float4 x)
+{
+ return convert_uint4(rint(x));
+}
+/* Applies rint() to x, then converts the result with convert_uint4_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rte(float4 x)
+{
+ return convert_uint4_sat(rint(x));
+}
+/* Applies ceil() to x, then converts the result with convert_uint4(). */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtp(float4 x)
+{
+ return convert_uint4(ceil(x));
+}
+/* Applies ceil() to x, then converts the result with convert_uint4_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtp(float4 x)
+{
+ return convert_uint4_sat(ceil(x));
+}
+/* Applies floor() to x, then converts the result with convert_uint4(). */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtn(float4 x)
+{
+ return convert_uint4(floor(x));
+}
+/* Applies floor() to x, then converts the result with convert_uint4_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtn(float4 x)
+{
+ return convert_uint4_sat(floor(x));
+}
+/* Passes x straight to convert_uint8(); no rounding call precedes it. */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtz(float8 x)
+{
+ return convert_uint8(x);
+}
+/* Passes x straight to convert_uint8_sat(); no rounding call precedes it. */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtz(float8 x)
+{
+ return convert_uint8_sat(x);
+}
+/* Applies rint() to x, then converts the result with convert_uint8(). */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rte(float8 x)
+{
+ return convert_uint8(rint(x));
+}
+/* Applies rint() to x, then converts the result with convert_uint8_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rte(float8 x)
+{
+ return convert_uint8_sat(rint(x));
+}
+/* Applies ceil() to x, then converts the result with convert_uint8(). */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtp(float8 x)
+{
+ return convert_uint8(ceil(x));
+}
+/* Applies ceil() to x, then converts the result with convert_uint8_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtp(float8 x)
+{
+ return convert_uint8_sat(ceil(x));
+}
+/* Applies floor() to x, then converts the result with convert_uint8(). */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtn(float8 x)
+{
+ return convert_uint8(floor(x));
+}
+/* Applies floor() to x, then converts the result with convert_uint8_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtn(float8 x)
+{
+ return convert_uint8_sat(floor(x));
+}
+/* Passes x straight to convert_uint16(); no rounding call precedes it. */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtz(float16 x)
+{
+ return convert_uint16(x);
+}
+/* Passes x straight to convert_uint16_sat(); no rounding call precedes it. */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtz(float16 x)
+{
+ return convert_uint16_sat(x);
+}
+/* Applies rint() to x, then converts the result with convert_uint16(). */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rte(float16 x)
+{
+ return convert_uint16(rint(x));
+}
+/* Applies rint() to x, then converts the result with convert_uint16_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rte(float16 x)
+{
+ return convert_uint16_sat(rint(x));
+}
+/* Applies ceil() to x, then converts the result with convert_uint16(). */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtp(float16 x)
+{
+ return convert_uint16(ceil(x));
+}
+/* Applies ceil() to x, then converts the result with convert_uint16_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtp(float16 x)
+{
+ return convert_uint16_sat(ceil(x));
+}
+/* Applies floor() to x, then converts the result with convert_uint16(). */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtn(float16 x)
+{
+ return convert_uint16(floor(x));
+}
+/* Applies floor() to x, then converts the result with convert_uint16_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtn(float16 x)
+{
+ return convert_uint16_sat(floor(x));
+}
+/*
+ * float -> long overloads, one per rounding-mode suffix, built only when
+ * cles_khr_int64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_long_sat() instead of convert_long().
+ */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtz(float x)
+{
+ return convert_long(x);
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtz(float x)
+{
+ return convert_long_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_rte(float x)
+{
+ return convert_long(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rte(float x)
+{
+ return convert_long_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtp(float x)
+{
+ return convert_long(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtp(float x)
+{
+ return convert_long_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtn(float x)
+{
+ return convert_long(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtn(float x)
+{
+ return convert_long_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+/*
+ * float2 -> long2 overloads, one per rounding-mode suffix, built only when
+ * cles_khr_int64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_long2_sat() instead of convert_long2().
+ */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtz(float2 x)
+{
+ return convert_long2(x);
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtz(float2 x)
+{
+ return convert_long2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rte(float2 x)
+{
+ return convert_long2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rte(float2 x)
+{
+ return convert_long2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtp(float2 x)
+{
+ return convert_long2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtp(float2 x)
+{
+ return convert_long2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtn(float2 x)
+{
+ return convert_long2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtn(float2 x)
+{
+ return convert_long2_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+/*
+ * float3 -> long3 overloads, one per rounding-mode suffix, built only when
+ * cles_khr_int64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_long3_sat() instead of convert_long3().
+ */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtz(float3 x)
+{
+ return convert_long3(x);
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtz(float3 x)
+{
+ return convert_long3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rte(float3 x)
+{
+ return convert_long3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rte(float3 x)
+{
+ return convert_long3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtp(float3 x)
+{
+ return convert_long3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtp(float3 x)
+{
+ return convert_long3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtn(float3 x)
+{
+ return convert_long3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtn(float3 x)
+{
+ return convert_long3_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+/*
+ * float4 -> long4 overloads, one per rounding-mode suffix, built only when
+ * cles_khr_int64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_long4_sat() instead of convert_long4().
+ */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtz(float4 x)
+{
+ return convert_long4(x);
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rtz(float4 x)
+{
+ return convert_long4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rte(float4 x)
+{
+ return convert_long4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rte(float4 x)
+{
+ return convert_long4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtp(float4 x)
+{
+ return convert_long4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rtp(float4 x)
+{
+ return convert_long4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_rtn(float4 x)
+{
+ return convert_long4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD long4 convert_long4_sat_rtn(float4 x)
+{
+ return convert_long4_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+/*
+ * float8 -> long8 overloads, one per rounding-mode suffix, built only when
+ * cles_khr_int64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_long8_sat() instead of convert_long8().
+ */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtz(float8 x)
+{
+ return convert_long8(x);
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rtz(float8 x)
+{
+ return convert_long8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rte(float8 x)
+{
+ return convert_long8(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rte(float8 x)
+{
+ return convert_long8_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtp(float8 x)
+{
+ return convert_long8(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rtp(float8 x)
+{
+ return convert_long8_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_rtn(float8 x)
+{
+ return convert_long8(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD long8 convert_long8_sat_rtn(float8 x)
+{
+ return convert_long8_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+/*
+ * float16 -> long16 overloads, one per rounding-mode suffix, built only when
+ * cles_khr_int64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_long16_sat() instead of convert_long16().
+ */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtz(float16 x)
+{
+ return convert_long16(x);
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rtz(float16 x)
+{
+ return convert_long16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rte(float16 x)
+{
+ return convert_long16(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rte(float16 x)
+{
+ return convert_long16_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtp(float16 x)
+{
+ return convert_long16(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rtp(float16 x)
+{
+ return convert_long16_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_rtn(float16 x)
+{
+ return convert_long16(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD long16 convert_long16_sat_rtn(float16 x)
+{
+ return convert_long16_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+/*
+ * float -> ulong overloads, one per rounding-mode suffix, built only when
+ * cles_khr_int64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_ulong_sat() instead of convert_ulong().
+ */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtz(float x)
+{
+ return convert_ulong(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rtz(float x)
+{
+ return convert_ulong_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rte(float x)
+{
+ return convert_ulong(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rte(float x)
+{
+ return convert_ulong_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtp(float x)
+{
+ return convert_ulong(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rtp(float x)
+{
+ return convert_ulong_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_rtn(float x)
+{
+ return convert_ulong(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong convert_ulong_sat_rtn(float x)
+{
+ return convert_ulong_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+/*
+ * float2 -> ulong2 overloads, one per rounding-mode suffix, built only when
+ * cles_khr_int64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_ulong2_sat() instead of convert_ulong2().
+ */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtz(float2 x)
+{
+ return convert_ulong2(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rtz(float2 x)
+{
+ return convert_ulong2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rte(float2 x)
+{
+ return convert_ulong2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rte(float2 x)
+{
+ return convert_ulong2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtp(float2 x)
+{
+ return convert_ulong2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rtp(float2 x)
+{
+ return convert_ulong2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_rtn(float2 x)
+{
+ return convert_ulong2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong2 convert_ulong2_sat_rtn(float2 x)
+{
+ return convert_ulong2_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+/*
+ * float3 -> ulong3 overloads, one per rounding-mode suffix, built only when
+ * cles_khr_int64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_ulong3_sat() instead of convert_ulong3().
+ */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtz(float3 x)
+{
+ return convert_ulong3(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rtz(float3 x)
+{
+ return convert_ulong3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rte(float3 x)
+{
+ return convert_ulong3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rte(float3 x)
+{
+ return convert_ulong3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtp(float3 x)
+{
+ return convert_ulong3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rtp(float3 x)
+{
+ return convert_ulong3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_rtn(float3 x)
+{
+ return convert_ulong3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong3 convert_ulong3_sat_rtn(float3 x)
+{
+ return convert_ulong3_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+/*
+ * float4 -> ulong4 overloads, one per rounding-mode suffix, built only when
+ * cles_khr_int64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_ulong4_sat() instead of convert_ulong4().
+ */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtz(float4 x)
+{
+ return convert_ulong4(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rtz(float4 x)
+{
+ return convert_ulong4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rte(float4 x)
+{
+ return convert_ulong4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rte(float4 x)
+{
+ return convert_ulong4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtp(float4 x)
+{
+ return convert_ulong4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rtp(float4 x)
+{
+ return convert_ulong4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_rtn(float4 x)
+{
+ return convert_ulong4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong4 convert_ulong4_sat_rtn(float4 x)
+{
+ return convert_ulong4_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+/*
+ * float8 -> ulong8 overloads, one per rounding-mode suffix, built only when
+ * cles_khr_int64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_ulong8_sat() instead of convert_ulong8().
+ */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtz(float8 x)
+{
+ return convert_ulong8(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rtz(float8 x)
+{
+ return convert_ulong8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rte(float8 x)
+{
+ return convert_ulong8(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rte(float8 x)
+{
+ return convert_ulong8_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtp(float8 x)
+{
+ return convert_ulong8(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rtp(float8 x)
+{
+ return convert_ulong8_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_rtn(float8 x)
+{
+ return convert_ulong8(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong8 convert_ulong8_sat_rtn(float8 x)
+{
+ return convert_ulong8_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+/*
+ * float16 -> ulong16 overloads, one per rounding-mode suffix, built only when
+ * cles_khr_int64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_ulong16_sat() instead of convert_ulong16().
+ */
+#ifdef cles_khr_int64
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtz(float16 x)
+{
+ return convert_ulong16(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtz(float16 x)
+{
+ return convert_ulong16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rte(float16 x)
+{
+ return convert_ulong16(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rte(float16 x)
+{
+ return convert_ulong16_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtp(float16 x)
+{
+ return convert_ulong16(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtp(float16 x)
+{
+ return convert_ulong16_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_rtn(float16 x)
+{
+ return convert_ulong16(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD ulong16 convert_ulong16_sat_rtn(float16 x)
+{
+ return convert_ulong16_sat(floor(x));
+}
+#endif /* cles_khr_int64 */
+/*
+ * double -> char overloads, one per rounding-mode suffix, built only when
+ * cl_khr_fp64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_char_sat() instead of convert_char().
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtz(double x)
+{
+ return convert_char(x);
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtz(double x)
+{
+ return convert_char_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_rte(double x)
+{
+ return convert_char(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rte(double x)
+{
+ return convert_char_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtp(double x)
+{
+ return convert_char(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtp(double x)
+{
+ return convert_char_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_rtn(double x)
+{
+ return convert_char(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char convert_char_sat_rtn(double x)
+{
+ return convert_char_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+/*
+ * double2 -> char2 overloads, one per rounding-mode suffix, built only when
+ * cl_khr_fp64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_char2_sat() instead of convert_char2().
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtz(double2 x)
+{
+ return convert_char2(x);
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtz(double2 x)
+{
+ return convert_char2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rte(double2 x)
+{
+ return convert_char2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rte(double2 x)
+{
+ return convert_char2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtp(double2 x)
+{
+ return convert_char2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtp(double2 x)
+{
+ return convert_char2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_rtn(double2 x)
+{
+ return convert_char2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char2 convert_char2_sat_rtn(double2 x)
+{
+ return convert_char2_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+/*
+ * double3 -> char3 overloads, one per rounding-mode suffix, built only when
+ * cl_khr_fp64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_char3_sat() instead of convert_char3().
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtz(double3 x)
+{
+ return convert_char3(x);
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtz(double3 x)
+{
+ return convert_char3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rte(double3 x)
+{
+ return convert_char3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rte(double3 x)
+{
+ return convert_char3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtp(double3 x)
+{
+ return convert_char3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtp(double3 x)
+{
+ return convert_char3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_rtn(double3 x)
+{
+ return convert_char3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char3 convert_char3_sat_rtn(double3 x)
+{
+ return convert_char3_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+/*
+ * double4 -> char4 overloads, one per rounding-mode suffix, built only when
+ * cl_khr_fp64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_char4_sat() instead of convert_char4().
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtz(double4 x)
+{
+ return convert_char4(x);
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtz(double4 x)
+{
+ return convert_char4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rte(double4 x)
+{
+ return convert_char4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rte(double4 x)
+{
+ return convert_char4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtp(double4 x)
+{
+ return convert_char4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtp(double4 x)
+{
+ return convert_char4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_rtn(double4 x)
+{
+ return convert_char4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char4 convert_char4_sat_rtn(double4 x)
+{
+ return convert_char4_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+/*
+ * double8 -> char8 overloads, one per rounding-mode suffix, built only when
+ * cl_khr_fp64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_char8_sat() instead of convert_char8().
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtz(double8 x)
+{
+ return convert_char8(x);
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtz(double8 x)
+{
+ return convert_char8_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rte(double8 x)
+{
+ return convert_char8(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rte(double8 x)
+{
+ return convert_char8_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtp(double8 x)
+{
+ return convert_char8(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtp(double8 x)
+{
+ return convert_char8_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_rtn(double8 x)
+{
+ return convert_char8(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char8 convert_char8_sat_rtn(double8 x)
+{
+ return convert_char8_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+/*
+ * double16 -> char16 overloads, one per rounding-mode suffix, built only when
+ * cl_khr_fp64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_char16_sat() instead of convert_char16().
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtz(double16 x)
+{
+ return convert_char16(x);
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtz(double16 x)
+{
+ return convert_char16_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rte(double16 x)
+{
+ return convert_char16(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rte(double16 x)
+{
+ return convert_char16_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtp(double16 x)
+{
+ return convert_char16(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtp(double16 x)
+{
+ return convert_char16_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_rtn(double16 x)
+{
+ return convert_char16(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD char16 convert_char16_sat_rtn(double16 x)
+{
+ return convert_char16_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+/*
+ * double -> uchar overloads, one per rounding-mode suffix, built only when
+ * cl_khr_fp64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_uchar_sat() instead of convert_uchar().
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtz(double x)
+{
+ return convert_uchar(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtz(double x)
+{
+ return convert_uchar_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rte(double x)
+{
+ return convert_uchar(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rte(double x)
+{
+ return convert_uchar_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtp(double x)
+{
+ return convert_uchar(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtp(double x)
+{
+ return convert_uchar_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_rtn(double x)
+{
+ return convert_uchar(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar convert_uchar_sat_rtn(double x)
+{
+ return convert_uchar_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+/*
+ * double2 -> uchar2 overloads, one per rounding-mode suffix, built only when
+ * cl_khr_fp64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_uchar2_sat() instead of convert_uchar2().
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtz(double2 x)
+{
+ return convert_uchar2(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtz(double2 x)
+{
+ return convert_uchar2_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rte(double2 x)
+{
+ return convert_uchar2(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rte(double2 x)
+{
+ return convert_uchar2_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtp(double2 x)
+{
+ return convert_uchar2(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtp(double2 x)
+{
+ return convert_uchar2_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_rtn(double2 x)
+{
+ return convert_uchar2(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar2 convert_uchar2_sat_rtn(double2 x)
+{
+ return convert_uchar2_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+/*
+ * double3 -> uchar3 overloads, one per rounding-mode suffix, built only when
+ * cl_khr_fp64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_uchar3_sat() instead of convert_uchar3().
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtz(double3 x)
+{
+ return convert_uchar3(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtz(double3 x)
+{
+ return convert_uchar3_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rte(double3 x)
+{
+ return convert_uchar3(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rte(double3 x)
+{
+ return convert_uchar3_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtp(double3 x)
+{
+ return convert_uchar3(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtp(double3 x)
+{
+ return convert_uchar3_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_rtn(double3 x)
+{
+ return convert_uchar3(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar3 convert_uchar3_sat_rtn(double3 x)
+{
+ return convert_uchar3_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+/*
+ * double4 -> uchar4 overloads, one per rounding-mode suffix, built only when
+ * cl_khr_fp64 is defined. _rtz forms pass x directly to the converter;
+ * _rte/_rtp/_rtn forms first apply rint()/ceil()/floor(). _sat forms call
+ * convert_uchar4_sat() instead of convert_uchar4().
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtz(double4 x)
+{
+ return convert_uchar4(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rtz(double4 x)
+{
+ return convert_uchar4_sat(x);
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rte(double4 x)
+{
+ return convert_uchar4(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rte(double4 x)
+{
+ return convert_uchar4_sat(rint(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtp(double4 x)
+{
+ return convert_uchar4(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rtp(double4 x)
+{
+ return convert_uchar4_sat(ceil(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_rtn(double4 x)
+{
+ return convert_uchar4(floor(x));
+}
+_CLC_DEF _CLC_OVERLOAD uchar4 convert_uchar4_sat_rtn(double4 x)
+{
+ return convert_uchar4_sat(floor(x));
+}
+#endif /* cl_khr_fp64 */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_rtz(double8 x)
+{
+ return convert_uchar8(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+uchar8 convert_uchar8_sat_rtz(double8 x)
+{
+ return convert_uchar8_sat(x);
+}
+#endif
+#if defined(cl_khr_fp64)
+/* double8 -> uchar8, _rte: rint(), then convert_uchar8(). */
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rte(double8 x)
+{
+ const double8 rounded = rint(x);
+ return convert_uchar8(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> uchar8, _sat_rte: rint(), then convert_uchar8_sat(). */
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rte(double8 x)
+{
+ const double8 rounded = rint(x);
+ return convert_uchar8_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> uchar8, _rtp: ceil(), then convert_uchar8(). */
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtp(double8 x)
+{
+ const double8 rounded = ceil(x);
+ return convert_uchar8(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> uchar8, _sat_rtp: ceil(), then convert_uchar8_sat(). */
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rtp(double8 x)
+{
+ const double8 rounded = ceil(x);
+ return convert_uchar8_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> uchar8, _rtn: floor(), then convert_uchar8(). */
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_rtn(double8 x)
+{
+ const double8 rounded = floor(x);
+ return convert_uchar8(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> uchar8, _sat_rtn: floor(), then convert_uchar8_sat(). */
+_CLC_DEF _CLC_OVERLOAD uchar8 convert_uchar8_sat_rtn(double8 x)
+{
+ const double8 rounded = floor(x);
+ return convert_uchar8_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> uchar16, _rtz: forwards x unchanged to convert_uchar16(). */
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtz(double16 x)
+{
+ return convert_uchar16(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> uchar16, _sat_rtz: forwards x unchanged to convert_uchar16_sat(). */
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rtz(double16 x)
+{
+ return convert_uchar16_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> uchar16, _rte: rint(), then convert_uchar16(). */
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rte(double16 x)
+{
+ const double16 rounded = rint(x);
+ return convert_uchar16(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> uchar16, _sat_rte: rint(), then convert_uchar16_sat(). */
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rte(double16 x)
+{
+ const double16 rounded = rint(x);
+ return convert_uchar16_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> uchar16, _rtp: ceil(), then convert_uchar16(). */
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtp(double16 x)
+{
+ const double16 rounded = ceil(x);
+ return convert_uchar16(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> uchar16, _sat_rtp: ceil(), then convert_uchar16_sat(). */
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rtp(double16 x)
+{
+ const double16 rounded = ceil(x);
+ return convert_uchar16_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> uchar16, _rtn: floor(), then convert_uchar16(). */
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_rtn(double16 x)
+{
+ const double16 rounded = floor(x);
+ return convert_uchar16(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> uchar16, _sat_rtn: floor(), then convert_uchar16_sat(). */
+_CLC_DEF _CLC_OVERLOAD uchar16 convert_uchar16_sat_rtn(double16 x)
+{
+ const double16 rounded = floor(x);
+ return convert_uchar16_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> short, _rtz: forwards x unchanged to convert_short(). */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtz(double x)
+{
+ return convert_short(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> short, _sat_rtz: forwards x unchanged to convert_short_sat(). */
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rtz(double x)
+{
+ return convert_short_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> short, _rte: rint(), then convert_short(). */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rte(double x)
+{
+ const double rounded = rint(x);
+ return convert_short(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> short, _sat_rte: rint(), then convert_short_sat(). */
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rte(double x)
+{
+ const double rounded = rint(x);
+ return convert_short_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> short, _rtp: ceil(), then convert_short(). */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtp(double x)
+{
+ const double rounded = ceil(x);
+ return convert_short(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> short, _sat_rtp: ceil(), then convert_short_sat(). */
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rtp(double x)
+{
+ const double rounded = ceil(x);
+ return convert_short_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> short, _rtn: floor(), then convert_short(). */
+_CLC_DEF _CLC_OVERLOAD short convert_short_rtn(double x)
+{
+ const double rounded = floor(x);
+ return convert_short(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> short, _sat_rtn: floor(), then convert_short_sat(). */
+_CLC_DEF _CLC_OVERLOAD short convert_short_sat_rtn(double x)
+{
+ const double rounded = floor(x);
+ return convert_short_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> short2, _rtz: forwards x unchanged to convert_short2(). */
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtz(double2 x)
+{
+ return convert_short2(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> short2, _sat_rtz: forwards x unchanged to convert_short2_sat(). */
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rtz(double2 x)
+{
+ return convert_short2_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> short2, _rte: rint(), then convert_short2(). */
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rte(double2 x)
+{
+ const double2 rounded = rint(x);
+ return convert_short2(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> short2, _sat_rte: rint(), then convert_short2_sat(). */
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rte(double2 x)
+{
+ const double2 rounded = rint(x);
+ return convert_short2_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> short2, _rtp: ceil(), then convert_short2(). */
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtp(double2 x)
+{
+ const double2 rounded = ceil(x);
+ return convert_short2(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> short2, _sat_rtp: ceil(), then convert_short2_sat(). */
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rtp(double2 x)
+{
+ const double2 rounded = ceil(x);
+ return convert_short2_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> short2, _rtn: floor(), then convert_short2(). */
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_rtn(double2 x)
+{
+ const double2 rounded = floor(x);
+ return convert_short2(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> short2, _sat_rtn: floor(), then convert_short2_sat(). */
+_CLC_DEF _CLC_OVERLOAD short2 convert_short2_sat_rtn(double2 x)
+{
+ const double2 rounded = floor(x);
+ return convert_short2_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> short3, _rtz: forwards x unchanged to convert_short3(). */
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtz(double3 x)
+{
+ return convert_short3(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> short3, _sat_rtz: forwards x unchanged to convert_short3_sat(). */
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rtz(double3 x)
+{
+ return convert_short3_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> short3, _rte: rint(), then convert_short3(). */
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rte(double3 x)
+{
+ const double3 rounded = rint(x);
+ return convert_short3(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> short3, _sat_rte: rint(), then convert_short3_sat(). */
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rte(double3 x)
+{
+ const double3 rounded = rint(x);
+ return convert_short3_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> short3, _rtp: ceil(), then convert_short3(). */
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtp(double3 x)
+{
+ const double3 rounded = ceil(x);
+ return convert_short3(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> short3, _sat_rtp: ceil(), then convert_short3_sat(). */
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rtp(double3 x)
+{
+ const double3 rounded = ceil(x);
+ return convert_short3_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> short3, _rtn: floor(), then convert_short3(). */
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_rtn(double3 x)
+{
+ const double3 rounded = floor(x);
+ return convert_short3(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> short3, _sat_rtn: floor(), then convert_short3_sat(). */
+_CLC_DEF _CLC_OVERLOAD short3 convert_short3_sat_rtn(double3 x)
+{
+ const double3 rounded = floor(x);
+ return convert_short3_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> short4, _rtz: forwards x unchanged to convert_short4(). */
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtz(double4 x)
+{
+ return convert_short4(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> short4, _sat_rtz: forwards x unchanged to convert_short4_sat(). */
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rtz(double4 x)
+{
+ return convert_short4_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> short4, _rte: rint(), then convert_short4(). */
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rte(double4 x)
+{
+ const double4 rounded = rint(x);
+ return convert_short4(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> short4, _sat_rte: rint(), then convert_short4_sat(). */
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rte(double4 x)
+{
+ const double4 rounded = rint(x);
+ return convert_short4_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> short4, _rtp: ceil(), then convert_short4(). */
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtp(double4 x)
+{
+ const double4 rounded = ceil(x);
+ return convert_short4(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> short4, _sat_rtp: ceil(), then convert_short4_sat(). */
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rtp(double4 x)
+{
+ const double4 rounded = ceil(x);
+ return convert_short4_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> short4, _rtn: floor(), then convert_short4(). */
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_rtn(double4 x)
+{
+ const double4 rounded = floor(x);
+ return convert_short4(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> short4, _sat_rtn: floor(), then convert_short4_sat(). */
+_CLC_DEF _CLC_OVERLOAD short4 convert_short4_sat_rtn(double4 x)
+{
+ const double4 rounded = floor(x);
+ return convert_short4_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> short8, _rtz: forwards x unchanged to convert_short8(). */
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtz(double8 x)
+{
+ return convert_short8(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> short8, _sat_rtz: forwards x unchanged to convert_short8_sat(). */
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rtz(double8 x)
+{
+ return convert_short8_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> short8, _rte: rint(), then convert_short8(). */
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rte(double8 x)
+{
+ const double8 rounded = rint(x);
+ return convert_short8(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> short8, _sat_rte: rint(), then convert_short8_sat(). */
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rte(double8 x)
+{
+ const double8 rounded = rint(x);
+ return convert_short8_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> short8, _rtp: ceil(), then convert_short8(). */
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtp(double8 x)
+{
+ const double8 rounded = ceil(x);
+ return convert_short8(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> short8, _sat_rtp: ceil(), then convert_short8_sat(). */
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rtp(double8 x)
+{
+ const double8 rounded = ceil(x);
+ return convert_short8_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> short8, _rtn: floor(), then convert_short8(). */
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_rtn(double8 x)
+{
+ const double8 rounded = floor(x);
+ return convert_short8(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> short8, _sat_rtn: floor(), then convert_short8_sat(). */
+_CLC_DEF _CLC_OVERLOAD short8 convert_short8_sat_rtn(double8 x)
+{
+ const double8 rounded = floor(x);
+ return convert_short8_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> short16, _rtz: forwards x unchanged to convert_short16(). */
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtz(double16 x)
+{
+ return convert_short16(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> short16, _sat_rtz: forwards x unchanged to convert_short16_sat(). */
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtz(double16 x)
+{
+ return convert_short16_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> short16, _rte: rint(), then convert_short16(). */
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rte(double16 x)
+{
+ const double16 rounded = rint(x);
+ return convert_short16(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> short16, _sat_rte: rint(), then convert_short16_sat(). */
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rte(double16 x)
+{
+ const double16 rounded = rint(x);
+ return convert_short16_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> short16, _rtp: ceil(), then convert_short16(). */
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtp(double16 x)
+{
+ const double16 rounded = ceil(x);
+ return convert_short16(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> short16, _sat_rtp: ceil(), then convert_short16_sat(). */
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtp(double16 x)
+{
+ const double16 rounded = ceil(x);
+ return convert_short16_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> short16, _rtn: floor(), then convert_short16(). */
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_rtn(double16 x)
+{
+ const double16 rounded = floor(x);
+ return convert_short16(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> short16, _sat_rtn: floor(), then convert_short16_sat(). */
+_CLC_DEF _CLC_OVERLOAD short16 convert_short16_sat_rtn(double16 x)
+{
+ const double16 rounded = floor(x);
+ return convert_short16_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> ushort, _rtz: forwards x unchanged to convert_ushort(). */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtz(double x)
+{
+ return convert_ushort(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> ushort, _sat_rtz: forwards x unchanged to convert_ushort_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtz(double x)
+{
+ return convert_ushort_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> ushort, _rte: rint(), then convert_ushort(). */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rte(double x)
+{
+ const double rounded = rint(x);
+ return convert_ushort(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> ushort, _sat_rte: rint(), then convert_ushort_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rte(double x)
+{
+ const double rounded = rint(x);
+ return convert_ushort_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> ushort, _rtp: ceil(), then convert_ushort(). */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtp(double x)
+{
+ const double rounded = ceil(x);
+ return convert_ushort(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> ushort, _sat_rtp: ceil(), then convert_ushort_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtp(double x)
+{
+ const double rounded = ceil(x);
+ return convert_ushort_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> ushort, _rtn: floor(), then convert_ushort(). */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_rtn(double x)
+{
+ const double rounded = floor(x);
+ return convert_ushort(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> ushort, _sat_rtn: floor(), then convert_ushort_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort convert_ushort_sat_rtn(double x)
+{
+ const double rounded = floor(x);
+ return convert_ushort_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> ushort2, _rtz: forwards x unchanged to convert_ushort2(). */
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtz(double2 x)
+{
+ return convert_ushort2(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> ushort2, _sat_rtz: forwards x unchanged to convert_ushort2_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtz(double2 x)
+{
+ return convert_ushort2_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> ushort2, _rte: rint(), then convert_ushort2(). */
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rte(double2 x)
+{
+ const double2 rounded = rint(x);
+ return convert_ushort2(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> ushort2, _sat_rte: rint(), then convert_ushort2_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rte(double2 x)
+{
+ const double2 rounded = rint(x);
+ return convert_ushort2_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> ushort2, _rtp: ceil(), then convert_ushort2(). */
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtp(double2 x)
+{
+ const double2 rounded = ceil(x);
+ return convert_ushort2(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> ushort2, _sat_rtp: ceil(), then convert_ushort2_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtp(double2 x)
+{
+ const double2 rounded = ceil(x);
+ return convert_ushort2_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> ushort2, _rtn: floor(), then convert_ushort2(). */
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_rtn(double2 x)
+{
+ const double2 rounded = floor(x);
+ return convert_ushort2(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> ushort2, _sat_rtn: floor(), then convert_ushort2_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort2 convert_ushort2_sat_rtn(double2 x)
+{
+ const double2 rounded = floor(x);
+ return convert_ushort2_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> ushort3, _rtz: forwards x unchanged to convert_ushort3(). */
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtz(double3 x)
+{
+ return convert_ushort3(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> ushort3, _sat_rtz: forwards x unchanged to convert_ushort3_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtz(double3 x)
+{
+ return convert_ushort3_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> ushort3, _rte: rint(), then convert_ushort3(). */
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rte(double3 x)
+{
+ const double3 rounded = rint(x);
+ return convert_ushort3(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> ushort3, _sat_rte: rint(), then convert_ushort3_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rte(double3 x)
+{
+ const double3 rounded = rint(x);
+ return convert_ushort3_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> ushort3, _rtp: ceil(), then convert_ushort3(). */
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtp(double3 x)
+{
+ const double3 rounded = ceil(x);
+ return convert_ushort3(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> ushort3, _sat_rtp: ceil(), then convert_ushort3_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtp(double3 x)
+{
+ const double3 rounded = ceil(x);
+ return convert_ushort3_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> ushort3, _rtn: floor(), then convert_ushort3(). */
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_rtn(double3 x)
+{
+ const double3 rounded = floor(x);
+ return convert_ushort3(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> ushort3, _sat_rtn: floor(), then convert_ushort3_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort3 convert_ushort3_sat_rtn(double3 x)
+{
+ const double3 rounded = floor(x);
+ return convert_ushort3_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> ushort4, _rtz: forwards x unchanged to convert_ushort4(). */
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtz(double4 x)
+{
+ return convert_ushort4(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> ushort4, _sat_rtz: forwards x unchanged to convert_ushort4_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtz(double4 x)
+{
+ return convert_ushort4_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> ushort4, _rte: rint(), then convert_ushort4(). */
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rte(double4 x)
+{
+ const double4 rounded = rint(x);
+ return convert_ushort4(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> ushort4, _sat_rte: rint(), then convert_ushort4_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rte(double4 x)
+{
+ const double4 rounded = rint(x);
+ return convert_ushort4_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> ushort4, _rtp: ceil(), then convert_ushort4(). */
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtp(double4 x)
+{
+ const double4 rounded = ceil(x);
+ return convert_ushort4(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> ushort4, _sat_rtp: ceil(), then convert_ushort4_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtp(double4 x)
+{
+ const double4 rounded = ceil(x);
+ return convert_ushort4_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> ushort4, _rtn: floor(), then convert_ushort4(). */
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_rtn(double4 x)
+{
+ const double4 rounded = floor(x);
+ return convert_ushort4(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> ushort4, _sat_rtn: floor(), then convert_ushort4_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort4 convert_ushort4_sat_rtn(double4 x)
+{
+ const double4 rounded = floor(x);
+ return convert_ushort4_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> ushort8, _rtz: forwards x unchanged to convert_ushort8(). */
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtz(double8 x)
+{
+ return convert_ushort8(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> ushort8, _sat_rtz: forwards x unchanged to convert_ushort8_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtz(double8 x)
+{
+ return convert_ushort8_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> ushort8, _rte: rint(), then convert_ushort8(). */
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rte(double8 x)
+{
+ const double8 rounded = rint(x);
+ return convert_ushort8(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> ushort8, _sat_rte: rint(), then convert_ushort8_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rte(double8 x)
+{
+ const double8 rounded = rint(x);
+ return convert_ushort8_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> ushort8, _rtp: ceil(), then convert_ushort8(). */
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtp(double8 x)
+{
+ const double8 rounded = ceil(x);
+ return convert_ushort8(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> ushort8, _sat_rtp: ceil(), then convert_ushort8_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtp(double8 x)
+{
+ const double8 rounded = ceil(x);
+ return convert_ushort8_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> ushort8, _rtn: floor(), then convert_ushort8(). */
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_rtn(double8 x)
+{
+ const double8 rounded = floor(x);
+ return convert_ushort8(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> ushort8, _sat_rtn: floor(), then convert_ushort8_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort8 convert_ushort8_sat_rtn(double8 x)
+{
+ const double8 rounded = floor(x);
+ return convert_ushort8_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> ushort16, _rtz: forwards x unchanged to convert_ushort16(). */
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtz(double16 x)
+{
+ return convert_ushort16(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> ushort16, _sat_rtz: forwards x unchanged to convert_ushort16_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtz(double16 x)
+{
+ return convert_ushort16_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> ushort16, _rte: rint(), then convert_ushort16(). */
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rte(double16 x)
+{
+ const double16 rounded = rint(x);
+ return convert_ushort16(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> ushort16, _sat_rte: rint(), then convert_ushort16_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rte(double16 x)
+{
+ const double16 rounded = rint(x);
+ return convert_ushort16_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> ushort16, _rtp: ceil(), then convert_ushort16(). */
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtp(double16 x)
+{
+ const double16 rounded = ceil(x);
+ return convert_ushort16(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> ushort16, _sat_rtp: ceil(), then convert_ushort16_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtp(double16 x)
+{
+ const double16 rounded = ceil(x);
+ return convert_ushort16_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> ushort16, _rtn: floor(), then convert_ushort16(). */
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_rtn(double16 x)
+{
+ const double16 rounded = floor(x);
+ return convert_ushort16(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> ushort16, _sat_rtn: floor(), then convert_ushort16_sat(). */
+_CLC_DEF _CLC_OVERLOAD ushort16 convert_ushort16_sat_rtn(double16 x)
+{
+ const double16 rounded = floor(x);
+ return convert_ushort16_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> int, _rtz: forwards x unchanged to convert_int(). */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtz(double x)
+{
+ return convert_int(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> int, _sat_rtz: forwards x unchanged to convert_int_sat(). */
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtz(double x)
+{
+ return convert_int_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> int, _rte: rint(), then convert_int(). */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rte(double x)
+{
+ const double rounded = rint(x);
+ return convert_int(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> int, _sat_rte: rint(), then convert_int_sat(). */
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rte(double x)
+{
+ const double rounded = rint(x);
+ return convert_int_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> int, _rtp: ceil(), then convert_int(). */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtp(double x)
+{
+ const double rounded = ceil(x);
+ return convert_int(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> int, _sat_rtp: ceil(), then convert_int_sat(). */
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtp(double x)
+{
+ const double rounded = ceil(x);
+ return convert_int_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> int, _rtn: floor(), then convert_int(). */
+_CLC_DEF _CLC_OVERLOAD int convert_int_rtn(double x)
+{
+ const double rounded = floor(x);
+ return convert_int(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> int, _sat_rtn: floor(), then convert_int_sat(). */
+_CLC_DEF _CLC_OVERLOAD int convert_int_sat_rtn(double x)
+{
+ const double rounded = floor(x);
+ return convert_int_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> int2, _rtz: forwards x unchanged to convert_int2(). */
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtz(double2 x)
+{
+ return convert_int2(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> int2, _sat_rtz: forwards x unchanged to convert_int2_sat(). */
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtz(double2 x)
+{
+ return convert_int2_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> int2, _rte: rint(), then convert_int2(). */
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rte(double2 x)
+{
+ const double2 rounded = rint(x);
+ return convert_int2(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> int2, _sat_rte: rint(), then convert_int2_sat(). */
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rte(double2 x)
+{
+ const double2 rounded = rint(x);
+ return convert_int2_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> int2, _rtp: ceil(), then convert_int2(). */
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtp(double2 x)
+{
+ const double2 rounded = ceil(x);
+ return convert_int2(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> int2, _sat_rtp: ceil(), then convert_int2_sat(). */
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtp(double2 x)
+{
+ const double2 rounded = ceil(x);
+ return convert_int2_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> int2, _rtn: floor(), then convert_int2(). */
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_rtn(double2 x)
+{
+ const double2 rounded = floor(x);
+ return convert_int2(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> int2, _sat_rtn: floor(), then convert_int2_sat(). */
+_CLC_DEF _CLC_OVERLOAD int2 convert_int2_sat_rtn(double2 x)
+{
+ const double2 rounded = floor(x);
+ return convert_int2_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> int3, _rtz: forwards x unchanged to convert_int3(). */
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtz(double3 x)
+{
+ return convert_int3(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> int3, _sat_rtz: forwards x unchanged to convert_int3_sat(). */
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtz(double3 x)
+{
+ return convert_int3_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> int3, _rte: rint(), then convert_int3(). */
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rte(double3 x)
+{
+ const double3 rounded = rint(x);
+ return convert_int3(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> int3, _sat_rte: rint(), then convert_int3_sat(). */
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rte(double3 x)
+{
+ const double3 rounded = rint(x);
+ return convert_int3_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> int3, _rtp: ceil(), then convert_int3(). */
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtp(double3 x)
+{
+ const double3 rounded = ceil(x);
+ return convert_int3(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> int3, _sat_rtp: ceil(), then convert_int3_sat(). */
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtp(double3 x)
+{
+ const double3 rounded = ceil(x);
+ return convert_int3_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> int3, _rtn: floor(), then convert_int3(). */
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_rtn(double3 x)
+{
+ const double3 rounded = floor(x);
+ return convert_int3(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> int3, _sat_rtn: floor(), then convert_int3_sat(). */
+_CLC_DEF _CLC_OVERLOAD int3 convert_int3_sat_rtn(double3 x)
+{
+ const double3 rounded = floor(x);
+ return convert_int3_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> int4, _rtz: forwards x unchanged to convert_int4(). */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtz(double4 x)
+{
+ return convert_int4(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> int4, _sat_rtz: forwards x unchanged to convert_int4_sat(). */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtz(double4 x)
+{
+ return convert_int4_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> int4, _rte: rint(), then convert_int4(). */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rte(double4 x)
+{
+ const double4 rounded = rint(x);
+ return convert_int4(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> int4, _sat_rte: rint(), then convert_int4_sat(). */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rte(double4 x)
+{
+ const double4 rounded = rint(x);
+ return convert_int4_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> int4, _rtp: ceil(), then convert_int4(). */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtp(double4 x)
+{
+ const double4 rounded = ceil(x);
+ return convert_int4(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> int4, _sat_rtp: ceil(), then convert_int4_sat(). */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtp(double4 x)
+{
+ const double4 rounded = ceil(x);
+ return convert_int4_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> int4, _rtn: floor(), then convert_int4(). */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_rtn(double4 x)
+{
+ const double4 rounded = floor(x);
+ return convert_int4(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> int4, _sat_rtn: floor(), then convert_int4_sat(). */
+_CLC_DEF _CLC_OVERLOAD int4 convert_int4_sat_rtn(double4 x)
+{
+ const double4 rounded = floor(x);
+ return convert_int4_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> int8, _rtz: forwards x unchanged to convert_int8(). */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtz(double8 x)
+{
+ return convert_int8(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> int8, _sat_rtz: forwards x unchanged to convert_int8_sat(). */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtz(double8 x)
+{
+ return convert_int8_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> int8, _rte: rint(), then convert_int8(). */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rte(double8 x)
+{
+ const double8 rounded = rint(x);
+ return convert_int8(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> int8, _sat_rte: rint(), then convert_int8_sat(). */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rte(double8 x)
+{
+ const double8 rounded = rint(x);
+ return convert_int8_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> int8, _rtp: ceil(), then convert_int8(). */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtp(double8 x)
+{
+ const double8 rounded = ceil(x);
+ return convert_int8(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> int8, _sat_rtp: ceil(), then convert_int8_sat(). */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtp(double8 x)
+{
+ const double8 rounded = ceil(x);
+ return convert_int8_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> int8, _rtn: floor(), then convert_int8(). */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_rtn(double8 x)
+{
+ const double8 rounded = floor(x);
+ return convert_int8(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> int8, _sat_rtn: floor(), then convert_int8_sat(). */
+_CLC_DEF _CLC_OVERLOAD int8 convert_int8_sat_rtn(double8 x)
+{
+ const double8 rounded = floor(x);
+ return convert_int8_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> int16, _rtz: forwards x unchanged to convert_int16(). */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtz(double16 x)
+{
+ return convert_int16(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> int16, _sat_rtz: forwards x unchanged to convert_int16_sat(). */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtz(double16 x)
+{
+ return convert_int16_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> int16, _rte: rint(), then convert_int16(). */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rte(double16 x)
+{
+ const double16 rounded = rint(x);
+ return convert_int16(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> int16, _sat_rte: rint(), then convert_int16_sat(). */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rte(double16 x)
+{
+ const double16 rounded = rint(x);
+ return convert_int16_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> int16, _rtp: ceil(), then convert_int16(). */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtp(double16 x)
+{
+ const double16 rounded = ceil(x);
+ return convert_int16(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> int16, _sat_rtp: ceil(), then convert_int16_sat(). */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtp(double16 x)
+{
+ const double16 rounded = ceil(x);
+ return convert_int16_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> int16, _rtn: floor(), then convert_int16(). */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_rtn(double16 x)
+{
+ const double16 rounded = floor(x);
+ return convert_int16(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> int16, _sat_rtn: floor(), then convert_int16_sat(). */
+_CLC_DEF _CLC_OVERLOAD int16 convert_int16_sat_rtn(double16 x)
+{
+ const double16 rounded = floor(x);
+ return convert_int16_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> uint, _rtz: forwards x unchanged to convert_uint(). */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtz(double x)
+{
+ return convert_uint(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> uint, _sat_rtz: forwards x unchanged to convert_uint_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtz(double x)
+{
+ return convert_uint_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> uint, _rte: rint(), then convert_uint(). */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rte(double x)
+{
+ const double rounded = rint(x);
+ return convert_uint(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> uint, _sat_rte: rint(), then convert_uint_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rte(double x)
+{
+ const double rounded = rint(x);
+ return convert_uint_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> uint, _rtp: ceil(), then convert_uint(). */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtp(double x)
+{
+ const double rounded = ceil(x);
+ return convert_uint(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> uint, _sat_rtp: ceil(), then convert_uint_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtp(double x)
+{
+ const double rounded = ceil(x);
+ return convert_uint_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> uint, _rtn: floor(), then convert_uint(). */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_rtn(double x)
+{
+ const double rounded = floor(x);
+ return convert_uint(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double -> uint, _sat_rtn: floor(), then convert_uint_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint convert_uint_sat_rtn(double x)
+{
+ const double rounded = floor(x);
+ return convert_uint_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> uint2, _rtz: forwards x unchanged to convert_uint2(). */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtz(double2 x)
+{
+ return convert_uint2(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> uint2, _sat_rtz: forwards x unchanged to convert_uint2_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtz(double2 x)
+{
+ return convert_uint2_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> uint2, _rte: rint(), then convert_uint2(). */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rte(double2 x)
+{
+ const double2 rounded = rint(x);
+ return convert_uint2(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> uint2, _sat_rte: rint(), then convert_uint2_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rte(double2 x)
+{
+ const double2 rounded = rint(x);
+ return convert_uint2_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> uint2, _rtp: ceil(), then convert_uint2(). */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtp(double2 x)
+{
+ const double2 rounded = ceil(x);
+ return convert_uint2(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> uint2, _sat_rtp: ceil(), then convert_uint2_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtp(double2 x)
+{
+ const double2 rounded = ceil(x);
+ return convert_uint2_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> uint2, _rtn: floor(), then convert_uint2(). */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_rtn(double2 x)
+{
+ const double2 rounded = floor(x);
+ return convert_uint2(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double2 -> uint2, _sat_rtn: floor(), then convert_uint2_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint2 convert_uint2_sat_rtn(double2 x)
+{
+ const double2 rounded = floor(x);
+ return convert_uint2_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> uint3, _rtz: forwards x unchanged to convert_uint3(). */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtz(double3 x)
+{
+ return convert_uint3(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> uint3, _sat_rtz: forwards x unchanged to convert_uint3_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtz(double3 x)
+{
+ return convert_uint3_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> uint3, _rte: rint(), then convert_uint3(). */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rte(double3 x)
+{
+ const double3 rounded = rint(x);
+ return convert_uint3(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> uint3, _sat_rte: rint(), then convert_uint3_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rte(double3 x)
+{
+ const double3 rounded = rint(x);
+ return convert_uint3_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> uint3, _rtp: ceil(), then convert_uint3(). */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtp(double3 x)
+{
+ const double3 rounded = ceil(x);
+ return convert_uint3(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> uint3, _sat_rtp: ceil(), then convert_uint3_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtp(double3 x)
+{
+ const double3 rounded = ceil(x);
+ return convert_uint3_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> uint3, _rtn: floor(), then convert_uint3(). */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_rtn(double3 x)
+{
+ const double3 rounded = floor(x);
+ return convert_uint3(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double3 -> uint3, _sat_rtn: floor(), then convert_uint3_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint3 convert_uint3_sat_rtn(double3 x)
+{
+ const double3 rounded = floor(x);
+ return convert_uint3_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> uint4, _rtz: forwards x unchanged to convert_uint4(). */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtz(double4 x)
+{
+ return convert_uint4(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> uint4, _sat_rtz: forwards x unchanged to convert_uint4_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtz(double4 x)
+{
+ return convert_uint4_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> uint4, _rte: rint(), then convert_uint4(). */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rte(double4 x)
+{
+ const double4 rounded = rint(x);
+ return convert_uint4(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> uint4, _sat_rte: rint(), then convert_uint4_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rte(double4 x)
+{
+ const double4 rounded = rint(x);
+ return convert_uint4_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> uint4, _rtp: ceil(), then convert_uint4(). */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtp(double4 x)
+{
+ const double4 rounded = ceil(x);
+ return convert_uint4(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> uint4, _sat_rtp: ceil(), then convert_uint4_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtp(double4 x)
+{
+ const double4 rounded = ceil(x);
+ return convert_uint4_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> uint4, _rtn: floor(), then convert_uint4(). */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_rtn(double4 x)
+{
+ const double4 rounded = floor(x);
+ return convert_uint4(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double4 -> uint4, _sat_rtn: floor(), then convert_uint4_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint4 convert_uint4_sat_rtn(double4 x)
+{
+ const double4 rounded = floor(x);
+ return convert_uint4_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> uint8, _rtz: forwards x unchanged to convert_uint8(). */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtz(double8 x)
+{
+ return convert_uint8(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> uint8, _sat_rtz: forwards x unchanged to convert_uint8_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtz(double8 x)
+{
+ return convert_uint8_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> uint8, _rte: rint(), then convert_uint8(). */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rte(double8 x)
+{
+ const double8 rounded = rint(x);
+ return convert_uint8(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> uint8, _sat_rte: rint(), then convert_uint8_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rte(double8 x)
+{
+ const double8 rounded = rint(x);
+ return convert_uint8_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> uint8, _rtp: ceil(), then convert_uint8(). */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtp(double8 x)
+{
+ const double8 rounded = ceil(x);
+ return convert_uint8(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> uint8, _sat_rtp: ceil(), then convert_uint8_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtp(double8 x)
+{
+ const double8 rounded = ceil(x);
+ return convert_uint8_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> uint8, _rtn: floor(), then convert_uint8(). */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_rtn(double8 x)
+{
+ const double8 rounded = floor(x);
+ return convert_uint8(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double8 -> uint8, _sat_rtn: floor(), then convert_uint8_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint8 convert_uint8_sat_rtn(double8 x)
+{
+ const double8 rounded = floor(x);
+ return convert_uint8_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> uint16, _rtz: forwards x unchanged to convert_uint16(). */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtz(double16 x)
+{
+ return convert_uint16(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> uint16, _sat_rtz: forwards x unchanged to convert_uint16_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtz(double16 x)
+{
+ return convert_uint16_sat(x);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> uint16, _rte: rint(), then convert_uint16(). */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rte(double16 x)
+{
+ const double16 rounded = rint(x);
+ return convert_uint16(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> uint16, _sat_rte: rint(), then convert_uint16_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rte(double16 x)
+{
+ const double16 rounded = rint(x);
+ return convert_uint16_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> uint16, _rtp: ceil(), then convert_uint16(). */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtp(double16 x)
+{
+ const double16 rounded = ceil(x);
+ return convert_uint16(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> uint16, _sat_rtp: ceil(), then convert_uint16_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtp(double16 x)
+{
+ const double16 rounded = ceil(x);
+ return convert_uint16_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> uint16, _rtn: floor(), then convert_uint16(). */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_rtn(double16 x)
+{
+ const double16 rounded = floor(x);
+ return convert_uint16(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64)
+/* double16 -> uint16, _sat_rtn: floor(), then convert_uint16_sat(). */
+_CLC_DEF _CLC_OVERLOAD uint16 convert_uint16_sat_rtn(double16 x)
+{
+ const double16 rounded = floor(x);
+ return convert_uint16_sat(rounded);
+}
+#endif /* cl_khr_fp64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double -> long, _rtz: forwards x unchanged to convert_long(). */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtz(double x)
+{
+ return convert_long(x);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double -> long, _sat_rtz: forwards x unchanged to convert_long_sat(). */
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtz(double x)
+{
+ return convert_long_sat(x);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double -> long, _rte: rint(), then convert_long(). */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rte(double x)
+{
+ const double rounded = rint(x);
+ return convert_long(rounded);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double -> long, _sat_rte: rint(), then convert_long_sat(). */
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rte(double x)
+{
+ const double rounded = rint(x);
+ return convert_long_sat(rounded);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double -> long, _rtp: ceil(), then convert_long(). */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtp(double x)
+{
+ const double rounded = ceil(x);
+ return convert_long(rounded);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double -> long, _sat_rtp: ceil(), then convert_long_sat(). */
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtp(double x)
+{
+ const double rounded = ceil(x);
+ return convert_long_sat(rounded);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double -> long, _rtn: floor(), then convert_long(). */
+_CLC_DEF _CLC_OVERLOAD long convert_long_rtn(double x)
+{
+ const double rounded = floor(x);
+ return convert_long(rounded);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double -> long, _sat_rtn: floor(), then convert_long_sat(). */
+_CLC_DEF _CLC_OVERLOAD long convert_long_sat_rtn(double x)
+{
+ const double rounded = floor(x);
+ return convert_long_sat(rounded);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double2 -> long2, _rtz: forwards x unchanged to convert_long2(). */
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtz(double2 x)
+{
+ return convert_long2(x);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double2 -> long2, _sat_rtz: forwards x unchanged to convert_long2_sat(). */
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtz(double2 x)
+{
+ return convert_long2_sat(x);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double2 -> long2, _rte: rint(), then convert_long2(). */
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rte(double2 x)
+{
+ const double2 rounded = rint(x);
+ return convert_long2(rounded);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double2 -> long2, _sat_rte: rint(), then convert_long2_sat(). */
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rte(double2 x)
+{
+ const double2 rounded = rint(x);
+ return convert_long2_sat(rounded);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double2 -> long2, _rtp: ceil(), then convert_long2(). */
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtp(double2 x)
+{
+ const double2 rounded = ceil(x);
+ return convert_long2(rounded);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double2 -> long2, _sat_rtp: ceil(), then convert_long2_sat(). */
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtp(double2 x)
+{
+ const double2 rounded = ceil(x);
+ return convert_long2_sat(rounded);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double2 -> long2, _rtn: floor(), then convert_long2(). */
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_rtn(double2 x)
+{
+ const double2 rounded = floor(x);
+ return convert_long2(rounded);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double2 -> long2, _sat_rtn: floor(), then convert_long2_sat(). */
+_CLC_DEF _CLC_OVERLOAD long2 convert_long2_sat_rtn(double2 x)
+{
+ const double2 rounded = floor(x);
+ return convert_long2_sat(rounded);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double3 -> long3, _rtz: forwards x unchanged to convert_long3(). */
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtz(double3 x)
+{
+ return convert_long3(x);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double3 -> long3, _sat_rtz: forwards x unchanged to convert_long3_sat(). */
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtz(double3 x)
+{
+ return convert_long3_sat(x);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double3 -> long3, _rte: rint(), then convert_long3(). */
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rte(double3 x)
+{
+ const double3 rounded = rint(x);
+ return convert_long3(rounded);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double3 -> long3, _sat_rte: rint(), then convert_long3_sat(). */
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rte(double3 x)
+{
+ const double3 rounded = rint(x);
+ return convert_long3_sat(rounded);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double3 -> long3, _rtp: ceil(), then convert_long3(). */
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_rtp(double3 x)
+{
+ const double3 rounded = ceil(x);
+ return convert_long3(rounded);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* double3 -> long3, _sat_rtp: ceil(), then convert_long3_sat(). */
+_CLC_DEF _CLC_OVERLOAD long3 convert_long3_sat_rtp(double3 x)
+{
+ const double3 rounded = ceil(x);
+ return convert_long3_sat(rounded);
+}
+#endif /* cl_khr_fp64 && cles_khr_int64 */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_rtn(double3 x)
+{
+ x = floor(x);
+ return convert_long3(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long3 convert_long3_sat_rtn(double3 x)
+{
+ x = floor(x);
+ return convert_long3_sat(x);
+}
+#endif
+/*
+ * double4 -> long4 with an explicit rounding mode (plain and _sat forms).
+ * _rtz forwards straight to the default conversion; _rte/_rtp/_rtn round the
+ * input with rint()/ceil()/floor() first and then convert.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtz(double4 x)
+{
+ return convert_long4(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtz(double4 x)
+{
+ return convert_long4_sat(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rte(double4 x)
+{
+ return convert_long4(rint(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rte(double4 x)
+{
+ return convert_long4_sat(rint(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtp(double4 x)
+{
+ return convert_long4(ceil(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtp(double4 x)
+{
+ return convert_long4_sat(ceil(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_rtn(double4 x)
+{
+ return convert_long4(floor(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long4 convert_long4_sat_rtn(double4 x)
+{
+ return convert_long4_sat(floor(x));
+}
+#endif
+/*
+ * double8 -> long8 with an explicit rounding mode (plain and _sat forms).
+ * _rtz forwards straight to the default conversion; _rte/_rtp/_rtn round the
+ * input with rint()/ceil()/floor() first and then convert.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtz(double8 x)
+{
+ return convert_long8(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtz(double8 x)
+{
+ return convert_long8_sat(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rte(double8 x)
+{
+ return convert_long8(rint(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rte(double8 x)
+{
+ return convert_long8_sat(rint(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtp(double8 x)
+{
+ return convert_long8(ceil(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtp(double8 x)
+{
+ return convert_long8_sat(ceil(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_rtn(double8 x)
+{
+ return convert_long8(floor(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long8 convert_long8_sat_rtn(double8 x)
+{
+ return convert_long8_sat(floor(x));
+}
+#endif
+/*
+ * double16 -> long16 with an explicit rounding mode (plain and _sat forms).
+ * _rtz forwards straight to the default conversion; _rte/_rtp/_rtn round the
+ * input with rint()/ceil()/floor() first and then convert.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtz(double16 x)
+{
+ return convert_long16(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtz(double16 x)
+{
+ return convert_long16_sat(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rte(double16 x)
+{
+ return convert_long16(rint(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rte(double16 x)
+{
+ return convert_long16_sat(rint(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtp(double16 x)
+{
+ return convert_long16(ceil(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtp(double16 x)
+{
+ return convert_long16_sat(ceil(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_rtn(double16 x)
+{
+ return convert_long16(floor(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+long16 convert_long16_sat_rtn(double16 x)
+{
+ return convert_long16_sat(floor(x));
+}
+#endif
+/*
+ * double -> ulong with an explicit rounding mode (plain and _sat forms).
+ * _rtz forwards straight to the default conversion; _rte/_rtp/_rtn round the
+ * input with rint()/ceil()/floor() first and then convert.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtz(double x)
+{
+ return convert_ulong(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtz(double x)
+{
+ return convert_ulong_sat(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rte(double x)
+{
+ return convert_ulong(rint(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rte(double x)
+{
+ return convert_ulong_sat(rint(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtp(double x)
+{
+ return convert_ulong(ceil(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtp(double x)
+{
+ return convert_ulong_sat(ceil(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_rtn(double x)
+{
+ return convert_ulong(floor(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong convert_ulong_sat_rtn(double x)
+{
+ return convert_ulong_sat(floor(x));
+}
+#endif
+/*
+ * double2 -> ulong2 with an explicit rounding mode (plain and _sat forms).
+ * _rtz forwards straight to the default conversion; _rte/_rtp/_rtn round the
+ * input with rint()/ceil()/floor() first and then convert.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtz(double2 x)
+{
+ return convert_ulong2(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtz(double2 x)
+{
+ return convert_ulong2_sat(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rte(double2 x)
+{
+ return convert_ulong2(rint(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rte(double2 x)
+{
+ return convert_ulong2_sat(rint(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtp(double2 x)
+{
+ return convert_ulong2(ceil(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtp(double2 x)
+{
+ return convert_ulong2_sat(ceil(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_rtn(double2 x)
+{
+ return convert_ulong2(floor(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong2 convert_ulong2_sat_rtn(double2 x)
+{
+ return convert_ulong2_sat(floor(x));
+}
+#endif
+/*
+ * double3 -> ulong3 with an explicit rounding mode (plain and _sat forms).
+ * _rtz forwards straight to the default conversion; _rte/_rtp/_rtn round the
+ * input with rint()/ceil()/floor() first and then convert.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtz(double3 x)
+{
+ return convert_ulong3(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtz(double3 x)
+{
+ return convert_ulong3_sat(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rte(double3 x)
+{
+ return convert_ulong3(rint(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rte(double3 x)
+{
+ return convert_ulong3_sat(rint(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtp(double3 x)
+{
+ return convert_ulong3(ceil(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtp(double3 x)
+{
+ return convert_ulong3_sat(ceil(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_rtn(double3 x)
+{
+ return convert_ulong3(floor(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong3 convert_ulong3_sat_rtn(double3 x)
+{
+ return convert_ulong3_sat(floor(x));
+}
+#endif
+/*
+ * double4 -> ulong4 with an explicit rounding mode (plain and _sat forms).
+ * _rtz forwards straight to the default conversion; _rte/_rtp/_rtn round the
+ * input with rint()/ceil()/floor() first and then convert.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtz(double4 x)
+{
+ return convert_ulong4(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtz(double4 x)
+{
+ return convert_ulong4_sat(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rte(double4 x)
+{
+ return convert_ulong4(rint(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rte(double4 x)
+{
+ return convert_ulong4_sat(rint(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtp(double4 x)
+{
+ return convert_ulong4(ceil(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtp(double4 x)
+{
+ return convert_ulong4_sat(ceil(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_rtn(double4 x)
+{
+ return convert_ulong4(floor(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong4 convert_ulong4_sat_rtn(double4 x)
+{
+ return convert_ulong4_sat(floor(x));
+}
+#endif
+/*
+ * double8 -> ulong8 with an explicit rounding mode (plain and _sat forms).
+ * _rtz forwards straight to the default conversion; _rte/_rtp/_rtn round the
+ * input with rint()/ceil()/floor() first and then convert.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtz(double8 x)
+{
+ return convert_ulong8(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtz(double8 x)
+{
+ return convert_ulong8_sat(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rte(double8 x)
+{
+ return convert_ulong8(rint(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rte(double8 x)
+{
+ return convert_ulong8_sat(rint(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtp(double8 x)
+{
+ return convert_ulong8(ceil(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtp(double8 x)
+{
+ return convert_ulong8_sat(ceil(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_rtn(double8 x)
+{
+ return convert_ulong8(floor(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong8 convert_ulong8_sat_rtn(double8 x)
+{
+ return convert_ulong8_sat(floor(x));
+}
+#endif
+/*
+ * double16 -> ulong16 with an explicit rounding mode (plain and _sat forms).
+ * _rtz forwards straight to the default conversion; _rte/_rtp/_rtn round the
+ * input with rint()/ceil()/floor() first and then convert.
+ */
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtz(double16 x)
+{
+ return convert_ulong16(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtz(double16 x)
+{
+ return convert_ulong16_sat(x);
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rte(double16 x)
+{
+ return convert_ulong16(rint(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rte(double16 x)
+{
+ return convert_ulong16_sat(rint(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtp(double16 x)
+{
+ return convert_ulong16(ceil(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtp(double16 x)
+{
+ return convert_ulong16_sat(ceil(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_rtn(double16 x)
+{
+ return convert_ulong16(floor(x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+ulong16 convert_ulong16_sat_rtn(double16 x)
+{
+ return convert_ulong16_sat(floor(x));
+}
+#endif
+/*
+ * char -> float with an explicit rounding mode.
+ * r is the default conversion of x and y is r converted back to char; the
+ * comparison of y against x decides whether r is replaced by its neighbour
+ * from nextafter(). _rte returns the default conversion unchanged.
+ * Fix: y used to be initialised from itself (convert_char(y)), which reads an
+ * indeterminate value; it must be the round trip of r.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(char x)
+{
+ float r = convert_float(x);
+ char y = convert_char(r);
+ uchar abs_x = abs(x);
+ uchar abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(char x)
+{
+ return convert_float(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(char x)
+{
+ float r = convert_float(x);
+ char y = convert_char(r);
+ return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(char x)
+{
+ float r = convert_float(x);
+ char y = convert_char(r);
+ return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+}
+/*
+ * char2 -> float2 with an explicit rounding mode.
+ * r is the default conversion of x and y is r converted back to char2; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_char2(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(char2 x)
+{
+ float2 r = convert_float2(x);
+ char2 y = convert_char2(r);
+ uchar2 abs_x = abs(x);
+ uchar2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(char2 x)
+{
+ return convert_float2(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(char2 x)
+{
+ float2 r = convert_float2(x);
+ char2 y = convert_char2(r);
+ return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(char2 x)
+{
+ float2 r = convert_float2(x);
+ char2 y = convert_char2(r);
+ return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+}
+/*
+ * char3 -> float3 with an explicit rounding mode.
+ * r is the default conversion of x and y is r converted back to char3; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_char3(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(char3 x)
+{
+ float3 r = convert_float3(x);
+ char3 y = convert_char3(r);
+ uchar3 abs_x = abs(x);
+ uchar3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(char3 x)
+{
+ return convert_float3(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(char3 x)
+{
+ float3 r = convert_float3(x);
+ char3 y = convert_char3(r);
+ return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(char3 x)
+{
+ float3 r = convert_float3(x);
+ char3 y = convert_char3(r);
+ return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+}
+/*
+ * char4 -> float4 with an explicit rounding mode.
+ * r is the default conversion of x and y is r converted back to char4; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_char4(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(char4 x)
+{
+ float4 r = convert_float4(x);
+ char4 y = convert_char4(r);
+ uchar4 abs_x = abs(x);
+ uchar4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(char4 x)
+{
+ return convert_float4(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(char4 x)
+{
+ float4 r = convert_float4(x);
+ char4 y = convert_char4(r);
+ return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(char4 x)
+{
+ float4 r = convert_float4(x);
+ char4 y = convert_char4(r);
+ return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+}
+/*
+ * char8 -> float8 with an explicit rounding mode.
+ * r is the default conversion of x and y is r converted back to char8; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_char8(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(char8 x)
+{
+ float8 r = convert_float8(x);
+ char8 y = convert_char8(r);
+ uchar8 abs_x = abs(x);
+ uchar8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(char8 x)
+{
+ return convert_float8(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(char8 x)
+{
+ float8 r = convert_float8(x);
+ char8 y = convert_char8(r);
+ return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(char8 x)
+{
+ float8 r = convert_float8(x);
+ char8 y = convert_char8(r);
+ return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+}
+/*
+ * char16 -> float16 with an explicit rounding mode.
+ * r is the default conversion of x and y is r converted back to char16; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_char16(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(char16 x)
+{
+ float16 r = convert_float16(x);
+ char16 y = convert_char16(r);
+ uchar16 abs_x = abs(x);
+ uchar16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(char16 x)
+{
+ return convert_float16(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(char16 x)
+{
+ float16 r = convert_float16(x);
+ char16 y = convert_char16(r);
+ return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(char16 x)
+{
+ float16 r = convert_float16(x);
+ char16 y = convert_char16(r);
+ return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+}
+/*
+ * char -> double with an explicit rounding mode (only with cl_khr_fp64).
+ * r is the default conversion of x and y is r converted back to char; the
+ * comparison of y against x decides whether r is replaced by its neighbour
+ * from nextafter(). _rte returns the default conversion unchanged.
+ * Fix: y used to be initialised from itself (convert_char(y)), which reads an
+ * indeterminate value; it must be the round trip of r.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(char x)
+{
+ double r = convert_double(x);
+ char y = convert_char(r);
+ uchar abs_x = abs(x);
+ uchar abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(char x)
+{
+ return convert_double(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(char x)
+{
+ double r = convert_double(x);
+ char y = convert_char(r);
+ return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(char x)
+{
+ double r = convert_double(x);
+ char y = convert_char(r);
+ return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+}
+#endif
+/*
+ * char2 -> double2 with an explicit rounding mode (only with cl_khr_fp64).
+ * r is the default conversion of x and y is r converted back to char2; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_char2(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(char2 x)
+{
+ double2 r = convert_double2(x);
+ char2 y = convert_char2(r);
+ uchar2 abs_x = abs(x);
+ uchar2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(char2 x)
+{
+ return convert_double2(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(char2 x)
+{
+ double2 r = convert_double2(x);
+ char2 y = convert_char2(r);
+ return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(char2 x)
+{
+ double2 r = convert_double2(x);
+ char2 y = convert_char2(r);
+ return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+}
+#endif
+/*
+ * char3 -> double3 with an explicit rounding mode (only with cl_khr_fp64).
+ * r is the default conversion of x and y is r converted back to char3; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_char3(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(char3 x)
+{
+ double3 r = convert_double3(x);
+ char3 y = convert_char3(r);
+ uchar3 abs_x = abs(x);
+ uchar3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(char3 x)
+{
+ return convert_double3(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(char3 x)
+{
+ double3 r = convert_double3(x);
+ char3 y = convert_char3(r);
+ return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(char3 x)
+{
+ double3 r = convert_double3(x);
+ char3 y = convert_char3(r);
+ return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+}
+#endif
+/*
+ * char4 -> double4 with an explicit rounding mode (only with cl_khr_fp64).
+ * r is the default conversion of x and y is r converted back to char4; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_char4(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(char4 x)
+{
+ double4 r = convert_double4(x);
+ char4 y = convert_char4(r);
+ uchar4 abs_x = abs(x);
+ uchar4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(char4 x)
+{
+ return convert_double4(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(char4 x)
+{
+ double4 r = convert_double4(x);
+ char4 y = convert_char4(r);
+ return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(char4 x)
+{
+ double4 r = convert_double4(x);
+ char4 y = convert_char4(r);
+ return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+}
+#endif
+/*
+ * char8 -> double8 with an explicit rounding mode (only with cl_khr_fp64).
+ * r is the default conversion of x and y is r converted back to char8; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_char8(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(char8 x)
+{
+ double8 r = convert_double8(x);
+ char8 y = convert_char8(r);
+ uchar8 abs_x = abs(x);
+ uchar8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(char8 x)
+{
+ return convert_double8(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(char8 x)
+{
+ double8 r = convert_double8(x);
+ char8 y = convert_char8(r);
+ return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(char8 x)
+{
+ double8 r = convert_double8(x);
+ char8 y = convert_char8(r);
+ return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+}
+#endif
+/*
+ * char16 -> double16 with an explicit rounding mode (only with cl_khr_fp64).
+ * r is the default conversion of x and y is r converted back to char16; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_char16(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(char16 x)
+{
+ double16 r = convert_double16(x);
+ char16 y = convert_char16(r);
+ uchar16 abs_x = abs(x);
+ uchar16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(char16 x)
+{
+ return convert_double16(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(char16 x)
+{
+ double16 r = convert_double16(x);
+ char16 y = convert_char16(r);
+ return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(char16 x)
+{
+ double16 r = convert_double16(x);
+ char16 y = convert_char16(r);
+ return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+}
+#endif
+/*
+ * uchar -> float with an explicit rounding mode.
+ * r is the default conversion of x and y is r converted back to uchar; the
+ * comparison of y against x decides whether r is replaced by its neighbour
+ * from nextafter(). _rte returns the default conversion unchanged.
+ * Fix: y used to be initialised from itself (convert_uchar(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(uchar x)
+{
+ float r = convert_float(x);
+ uchar y = convert_uchar(r);
+ uchar abs_x = abs(x);
+ uchar abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(uchar x)
+{
+ return convert_float(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(uchar x)
+{
+ float r = convert_float(x);
+ uchar y = convert_uchar(r);
+ return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(uchar x)
+{
+ float r = convert_float(x);
+ uchar y = convert_uchar(r);
+ return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+}
+/*
+ * uchar2 -> float2 with an explicit rounding mode.
+ * r is the default conversion of x and y is r converted back to uchar2; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_uchar2(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(uchar2 x)
+{
+ float2 r = convert_float2(x);
+ uchar2 y = convert_uchar2(r);
+ uchar2 abs_x = abs(x);
+ uchar2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(uchar2 x)
+{
+ return convert_float2(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(uchar2 x)
+{
+ float2 r = convert_float2(x);
+ uchar2 y = convert_uchar2(r);
+ return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(uchar2 x)
+{
+ float2 r = convert_float2(x);
+ uchar2 y = convert_uchar2(r);
+ return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+}
+/*
+ * uchar3 -> float3 with an explicit rounding mode.
+ * r is the default conversion of x and y is r converted back to uchar3; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_uchar3(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(uchar3 x)
+{
+ float3 r = convert_float3(x);
+ uchar3 y = convert_uchar3(r);
+ uchar3 abs_x = abs(x);
+ uchar3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(uchar3 x)
+{
+ return convert_float3(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(uchar3 x)
+{
+ float3 r = convert_float3(x);
+ uchar3 y = convert_uchar3(r);
+ return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(uchar3 x)
+{
+ float3 r = convert_float3(x);
+ uchar3 y = convert_uchar3(r);
+ return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+}
+/*
+ * uchar4 -> float4 with an explicit rounding mode.
+ * r is the default conversion of x and y is r converted back to uchar4; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_uchar4(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(uchar4 x)
+{
+ float4 r = convert_float4(x);
+ uchar4 y = convert_uchar4(r);
+ uchar4 abs_x = abs(x);
+ uchar4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(uchar4 x)
+{
+ return convert_float4(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(uchar4 x)
+{
+ float4 r = convert_float4(x);
+ uchar4 y = convert_uchar4(r);
+ return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(uchar4 x)
+{
+ float4 r = convert_float4(x);
+ uchar4 y = convert_uchar4(r);
+ return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+}
+/*
+ * uchar8 -> float8 with an explicit rounding mode.
+ * r is the default conversion of x and y is r converted back to uchar8; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_uchar8(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(uchar8 x)
+{
+ float8 r = convert_float8(x);
+ uchar8 y = convert_uchar8(r);
+ uchar8 abs_x = abs(x);
+ uchar8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(uchar8 x)
+{
+ return convert_float8(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(uchar8 x)
+{
+ float8 r = convert_float8(x);
+ uchar8 y = convert_uchar8(r);
+ return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(uchar8 x)
+{
+ float8 r = convert_float8(x);
+ uchar8 y = convert_uchar8(r);
+ return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+}
+/*
+ * uchar16 -> float16 with an explicit rounding mode.
+ * r is the default conversion of x and y is r converted back to uchar16; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_uchar16(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(uchar16 x)
+{
+ float16 r = convert_float16(x);
+ uchar16 y = convert_uchar16(r);
+ uchar16 abs_x = abs(x);
+ uchar16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(uchar16 x)
+{
+ return convert_float16(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(uchar16 x)
+{
+ float16 r = convert_float16(x);
+ uchar16 y = convert_uchar16(r);
+ return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(uchar16 x)
+{
+ float16 r = convert_float16(x);
+ uchar16 y = convert_uchar16(r);
+ return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+}
+/*
+ * uchar -> double with an explicit rounding mode (only with cl_khr_fp64).
+ * r is the default conversion of x and y is r converted back to uchar; the
+ * comparison of y against x decides whether r is replaced by its neighbour
+ * from nextafter(). _rte returns the default conversion unchanged.
+ * Fix: y used to be initialised from itself (convert_uchar(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(uchar x)
+{
+ double r = convert_double(x);
+ uchar y = convert_uchar(r);
+ uchar abs_x = abs(x);
+ uchar abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(uchar x)
+{
+ return convert_double(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(uchar x)
+{
+ double r = convert_double(x);
+ uchar y = convert_uchar(r);
+ return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(uchar x)
+{
+ double r = convert_double(x);
+ uchar y = convert_uchar(r);
+ return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+}
+#endif
+/*
+ * uchar2 -> double2 with an explicit rounding mode (only with cl_khr_fp64).
+ * r is the default conversion of x and y is r converted back to uchar2; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_uchar2(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(uchar2 x)
+{
+ double2 r = convert_double2(x);
+ uchar2 y = convert_uchar2(r);
+ uchar2 abs_x = abs(x);
+ uchar2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(uchar2 x)
+{
+ return convert_double2(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(uchar2 x)
+{
+ double2 r = convert_double2(x);
+ uchar2 y = convert_uchar2(r);
+ return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(uchar2 x)
+{
+ double2 r = convert_double2(x);
+ uchar2 y = convert_uchar2(r);
+ return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+}
+#endif
+/*
+ * uchar3 -> double3 with an explicit rounding mode (only with cl_khr_fp64).
+ * r is the default conversion of x and y is r converted back to uchar3; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_uchar3(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(uchar3 x)
+{
+ double3 r = convert_double3(x);
+ uchar3 y = convert_uchar3(r);
+ uchar3 abs_x = abs(x);
+ uchar3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(uchar3 x)
+{
+ return convert_double3(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(uchar3 x)
+{
+ double3 r = convert_double3(x);
+ uchar3 y = convert_uchar3(r);
+ return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(uchar3 x)
+{
+ double3 r = convert_double3(x);
+ uchar3 y = convert_uchar3(r);
+ return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+}
+#endif
+/*
+ * uchar4 -> double4 with an explicit rounding mode (only with cl_khr_fp64).
+ * r is the default conversion of x and y is r converted back to uchar4; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_uchar4(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(uchar4 x)
+{
+ double4 r = convert_double4(x);
+ uchar4 y = convert_uchar4(r);
+ uchar4 abs_x = abs(x);
+ uchar4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(uchar4 x)
+{
+ return convert_double4(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(uchar4 x)
+{
+ double4 r = convert_double4(x);
+ uchar4 y = convert_uchar4(r);
+ return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(uchar4 x)
+{
+ double4 r = convert_double4(x);
+ uchar4 y = convert_uchar4(r);
+ return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+}
+#endif
+/*
+ * uchar8 -> double8 with an explicit rounding mode (only with cl_khr_fp64).
+ * r is the default conversion of x and y is r converted back to uchar8; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_uchar8(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(uchar8 x)
+{
+ double8 r = convert_double8(x);
+ uchar8 y = convert_uchar8(r);
+ uchar8 abs_x = abs(x);
+ uchar8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(uchar8 x)
+{
+ return convert_double8(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(uchar8 x)
+{
+ double8 r = convert_double8(x);
+ uchar8 y = convert_uchar8(r);
+ return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(uchar8 x)
+{
+ double8 r = convert_double8(x);
+ uchar8 y = convert_uchar8(r);
+ return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+}
+#endif
+/*
+ * uchar16 -> double16 with an explicit rounding mode (only with cl_khr_fp64).
+ * r is the default conversion of x and y is r converted back to uchar16; the
+ * comparison of y against x decides per component whether r is replaced by
+ * its neighbour from nextafter(). _rte returns the default conversion.
+ * Fix: y used to be initialised from itself (convert_uchar16(y)), which reads
+ * an indeterminate value; it must be the round trip of r.
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(uchar16 x)
+{
+ double16 r = convert_double16(x);
+ uchar16 y = convert_uchar16(r);
+ uchar16 abs_x = abs(x);
+ uchar16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(uchar16 x)
+{
+ return convert_double16(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(uchar16 x)
+{
+ double16 r = convert_double16(x);
+ uchar16 y = convert_uchar16(r);
+ return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(uchar16 x)
+{
+ double16 r = convert_double16(x);
+ uchar16 y = convert_uchar16(r);
+ return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+}
+#endif
+/*
+ * short -> float conversions with an explicit rounding mode.
+ * _rte returns convert_float(x) unchanged. The directed modes convert that
+ * result r back to short as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(short x)
+{
+ float r = convert_float(x);
+ short y = convert_short(r);
+ ushort abs_x = abs(x);
+ ushort abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(short x)
+{
+ return convert_float(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(short x)
+{
+ float r = convert_float(x);
+ short y = convert_short(r);
+ return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(short x)
+{
+ float r = convert_float(x);
+ short y = convert_short(r);
+ return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+}
+/*
+ * short2 -> float2 conversions with an explicit rounding mode.
+ * _rte returns convert_float2(x) unchanged. The directed modes convert that
+ * result r back to short2 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(short2 x)
+{
+ float2 r = convert_float2(x);
+ short2 y = convert_short2(r);
+ ushort2 abs_x = abs(x);
+ ushort2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(short2 x)
+{
+ return convert_float2(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(short2 x)
+{
+ float2 r = convert_float2(x);
+ short2 y = convert_short2(r);
+ return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(short2 x)
+{
+ float2 r = convert_float2(x);
+ short2 y = convert_short2(r);
+ return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+}
+/*
+ * short3 -> float3 conversions with an explicit rounding mode.
+ * _rte returns convert_float3(x) unchanged. The directed modes convert that
+ * result r back to short3 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(short3 x)
+{
+ float3 r = convert_float3(x);
+ short3 y = convert_short3(r);
+ ushort3 abs_x = abs(x);
+ ushort3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(short3 x)
+{
+ return convert_float3(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(short3 x)
+{
+ float3 r = convert_float3(x);
+ short3 y = convert_short3(r);
+ return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(short3 x)
+{
+ float3 r = convert_float3(x);
+ short3 y = convert_short3(r);
+ return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+}
+/*
+ * short4 -> float4 conversions with an explicit rounding mode.
+ * _rte returns convert_float4(x) unchanged. The directed modes convert that
+ * result r back to short4 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(short4 x)
+{
+ float4 r = convert_float4(x);
+ short4 y = convert_short4(r);
+ ushort4 abs_x = abs(x);
+ ushort4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(short4 x)
+{
+ return convert_float4(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(short4 x)
+{
+ float4 r = convert_float4(x);
+ short4 y = convert_short4(r);
+ return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(short4 x)
+{
+ float4 r = convert_float4(x);
+ short4 y = convert_short4(r);
+ return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+}
+/*
+ * short8 -> float8 conversions with an explicit rounding mode.
+ * _rte returns convert_float8(x) unchanged. The directed modes convert that
+ * result r back to short8 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(short8 x)
+{
+ float8 r = convert_float8(x);
+ short8 y = convert_short8(r);
+ ushort8 abs_x = abs(x);
+ ushort8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(short8 x)
+{
+ return convert_float8(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(short8 x)
+{
+ float8 r = convert_float8(x);
+ short8 y = convert_short8(r);
+ return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(short8 x)
+{
+ float8 r = convert_float8(x);
+ short8 y = convert_short8(r);
+ return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+}
+/*
+ * short16 -> float16 conversions with an explicit rounding mode.
+ * _rte returns convert_float16(x) unchanged. The directed modes convert that
+ * result r back to short16 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(short16 x)
+{
+ float16 r = convert_float16(x);
+ short16 y = convert_short16(r);
+ ushort16 abs_x = abs(x);
+ ushort16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(short16 x)
+{
+ return convert_float16(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(short16 x)
+{
+ float16 r = convert_float16(x);
+ short16 y = convert_short16(r);
+ return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(short16 x)
+{
+ float16 r = convert_float16(x);
+ short16 y = convert_short16(r);
+ return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+}
+/*
+ * short -> double conversions with an explicit rounding mode.
+ * _rte returns convert_double(x) unchanged. The directed modes convert that
+ * result r back to short as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(short x)
+{
+ double r = convert_double(x);
+ short y = convert_short(r);
+ ushort abs_x = abs(x);
+ ushort abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(short x)
+{
+ return convert_double(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(short x)
+{
+ double r = convert_double(x);
+ short y = convert_short(r);
+ return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(short x)
+{
+ double r = convert_double(x);
+ short y = convert_short(r);
+ return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+}
+#endif
+/*
+ * short2 -> double2 conversions with an explicit rounding mode.
+ * _rte returns convert_double2(x) unchanged. The directed modes convert that
+ * result r back to short2 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(short2 x)
+{
+ double2 r = convert_double2(x);
+ short2 y = convert_short2(r);
+ ushort2 abs_x = abs(x);
+ ushort2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(short2 x)
+{
+ return convert_double2(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(short2 x)
+{
+ double2 r = convert_double2(x);
+ short2 y = convert_short2(r);
+ return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(short2 x)
+{
+ double2 r = convert_double2(x);
+ short2 y = convert_short2(r);
+ return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+}
+#endif
+/*
+ * short3 -> double3 conversions with an explicit rounding mode.
+ * _rte returns convert_double3(x) unchanged. The directed modes convert that
+ * result r back to short3 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(short3 x)
+{
+ double3 r = convert_double3(x);
+ short3 y = convert_short3(r);
+ ushort3 abs_x = abs(x);
+ ushort3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(short3 x)
+{
+ return convert_double3(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(short3 x)
+{
+ double3 r = convert_double3(x);
+ short3 y = convert_short3(r);
+ return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(short3 x)
+{
+ double3 r = convert_double3(x);
+ short3 y = convert_short3(r);
+ return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+}
+#endif
+/*
+ * short4 -> double4 conversions with an explicit rounding mode.
+ * _rte returns convert_double4(x) unchanged. The directed modes convert that
+ * result r back to short4 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(short4 x)
+{
+ double4 r = convert_double4(x);
+ short4 y = convert_short4(r);
+ ushort4 abs_x = abs(x);
+ ushort4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(short4 x)
+{
+ return convert_double4(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(short4 x)
+{
+ double4 r = convert_double4(x);
+ short4 y = convert_short4(r);
+ return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(short4 x)
+{
+ double4 r = convert_double4(x);
+ short4 y = convert_short4(r);
+ return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+}
+#endif
+/*
+ * short8 -> double8 conversions with an explicit rounding mode.
+ * _rte returns convert_double8(x) unchanged. The directed modes convert that
+ * result r back to short8 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(short8 x)
+{
+ double8 r = convert_double8(x);
+ short8 y = convert_short8(r);
+ ushort8 abs_x = abs(x);
+ ushort8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(short8 x)
+{
+ return convert_double8(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(short8 x)
+{
+ double8 r = convert_double8(x);
+ short8 y = convert_short8(r);
+ return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(short8 x)
+{
+ double8 r = convert_double8(x);
+ short8 y = convert_short8(r);
+ return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+}
+#endif
+/*
+ * short16 -> double16 conversions with an explicit rounding mode.
+ * _rte returns convert_double16(x) unchanged. The directed modes convert that
+ * result r back to short16 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(short16 x)
+{
+ double16 r = convert_double16(x);
+ short16 y = convert_short16(r);
+ ushort16 abs_x = abs(x);
+ ushort16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(short16 x)
+{
+ return convert_double16(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(short16 x)
+{
+ double16 r = convert_double16(x);
+ short16 y = convert_short16(r);
+ return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(short16 x)
+{
+ double16 r = convert_double16(x);
+ short16 y = convert_short16(r);
+ return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+}
+#endif
+/*
+ * ushort -> float conversions with an explicit rounding mode.
+ * _rte returns convert_float(x) unchanged. The directed modes convert that
+ * result r back to ushort as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(ushort x)
+{
+ float r = convert_float(x);
+ ushort y = convert_ushort(r);
+ ushort abs_x = abs(x);
+ ushort abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(ushort x)
+{
+ return convert_float(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(ushort x)
+{
+ float r = convert_float(x);
+ ushort y = convert_ushort(r);
+ return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(ushort x)
+{
+ float r = convert_float(x);
+ ushort y = convert_ushort(r);
+ return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+}
+/*
+ * ushort2 -> float2 conversions with an explicit rounding mode.
+ * _rte returns convert_float2(x) unchanged. The directed modes convert that
+ * result r back to ushort2 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(ushort2 x)
+{
+ float2 r = convert_float2(x);
+ ushort2 y = convert_ushort2(r);
+ ushort2 abs_x = abs(x);
+ ushort2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(ushort2 x)
+{
+ return convert_float2(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(ushort2 x)
+{
+ float2 r = convert_float2(x);
+ ushort2 y = convert_ushort2(r);
+ return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(ushort2 x)
+{
+ float2 r = convert_float2(x);
+ ushort2 y = convert_ushort2(r);
+ return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+}
+/*
+ * ushort3 -> float3 conversions with an explicit rounding mode.
+ * _rte returns convert_float3(x) unchanged. The directed modes convert that
+ * result r back to ushort3 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(ushort3 x)
+{
+ float3 r = convert_float3(x);
+ ushort3 y = convert_ushort3(r);
+ ushort3 abs_x = abs(x);
+ ushort3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(ushort3 x)
+{
+ return convert_float3(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(ushort3 x)
+{
+ float3 r = convert_float3(x);
+ ushort3 y = convert_ushort3(r);
+ return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(ushort3 x)
+{
+ float3 r = convert_float3(x);
+ ushort3 y = convert_ushort3(r);
+ return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+}
+/*
+ * ushort4 -> float4 conversions with an explicit rounding mode.
+ * _rte returns convert_float4(x) unchanged. The directed modes convert that
+ * result r back to ushort4 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(ushort4 x)
+{
+ float4 r = convert_float4(x);
+ ushort4 y = convert_ushort4(r);
+ ushort4 abs_x = abs(x);
+ ushort4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(ushort4 x)
+{
+ return convert_float4(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(ushort4 x)
+{
+ float4 r = convert_float4(x);
+ ushort4 y = convert_ushort4(r);
+ return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(ushort4 x)
+{
+ float4 r = convert_float4(x);
+ ushort4 y = convert_ushort4(r);
+ return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+}
+/*
+ * ushort8 -> float8 conversions with an explicit rounding mode.
+ * _rte returns convert_float8(x) unchanged. The directed modes convert that
+ * result r back to ushort8 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(ushort8 x)
+{
+ float8 r = convert_float8(x);
+ ushort8 y = convert_ushort8(r);
+ ushort8 abs_x = abs(x);
+ ushort8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(ushort8 x)
+{
+ return convert_float8(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(ushort8 x)
+{
+ float8 r = convert_float8(x);
+ ushort8 y = convert_ushort8(r);
+ return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(ushort8 x)
+{
+ float8 r = convert_float8(x);
+ ushort8 y = convert_ushort8(r);
+ return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+}
+/*
+ * ushort16 -> float16 conversions with an explicit rounding mode.
+ * _rte returns convert_float16(x) unchanged. The directed modes convert that
+ * result r back to ushort16 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(ushort16 x)
+{
+ float16 r = convert_float16(x);
+ ushort16 y = convert_ushort16(r);
+ ushort16 abs_x = abs(x);
+ ushort16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(ushort16 x)
+{
+ return convert_float16(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(ushort16 x)
+{
+ float16 r = convert_float16(x);
+ ushort16 y = convert_ushort16(r);
+ return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(ushort16 x)
+{
+ float16 r = convert_float16(x);
+ ushort16 y = convert_ushort16(r);
+ return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+}
+/*
+ * ushort -> double conversions with an explicit rounding mode.
+ * _rte returns convert_double(x) unchanged. The directed modes convert that
+ * result r back to ushort as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(ushort x)
+{
+ double r = convert_double(x);
+ ushort y = convert_ushort(r);
+ ushort abs_x = abs(x);
+ ushort abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(ushort x)
+{
+ return convert_double(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(ushort x)
+{
+ double r = convert_double(x);
+ ushort y = convert_ushort(r);
+ return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(ushort x)
+{
+ double r = convert_double(x);
+ ushort y = convert_ushort(r);
+ return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+}
+#endif
+/*
+ * ushort2 -> double2 conversions with an explicit rounding mode.
+ * _rte returns convert_double2(x) unchanged. The directed modes convert that
+ * result r back to ushort2 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(ushort2 x)
+{
+ double2 r = convert_double2(x);
+ ushort2 y = convert_ushort2(r);
+ ushort2 abs_x = abs(x);
+ ushort2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(ushort2 x)
+{
+ return convert_double2(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(ushort2 x)
+{
+ double2 r = convert_double2(x);
+ ushort2 y = convert_ushort2(r);
+ return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(ushort2 x)
+{
+ double2 r = convert_double2(x);
+ ushort2 y = convert_ushort2(r);
+ return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+}
+#endif
+/*
+ * ushort3 -> double3 conversions with an explicit rounding mode.
+ * _rte returns convert_double3(x) unchanged. The directed modes convert that
+ * result r back to ushort3 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(ushort3 x)
+{
+ double3 r = convert_double3(x);
+ ushort3 y = convert_ushort3(r);
+ ushort3 abs_x = abs(x);
+ ushort3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(ushort3 x)
+{
+ return convert_double3(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(ushort3 x)
+{
+ double3 r = convert_double3(x);
+ ushort3 y = convert_ushort3(r);
+ return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(ushort3 x)
+{
+ double3 r = convert_double3(x);
+ ushort3 y = convert_ushort3(r);
+ return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+}
+#endif
+/*
+ * ushort4 -> double4 conversions with an explicit rounding mode.
+ * _rte returns convert_double4(x) unchanged. The directed modes convert that
+ * result r back to ushort4 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(ushort4 x)
+{
+ double4 r = convert_double4(x);
+ ushort4 y = convert_ushort4(r);
+ ushort4 abs_x = abs(x);
+ ushort4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(ushort4 x)
+{
+ return convert_double4(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(ushort4 x)
+{
+ double4 r = convert_double4(x);
+ ushort4 y = convert_ushort4(r);
+ return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(ushort4 x)
+{
+ double4 r = convert_double4(x);
+ ushort4 y = convert_ushort4(r);
+ return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+}
+#endif
+/*
+ * ushort8 -> double8 conversions with an explicit rounding mode.
+ * _rte returns convert_double8(x) unchanged. The directed modes convert that
+ * result r back to ushort8 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(ushort8 x)
+{
+ double8 r = convert_double8(x);
+ ushort8 y = convert_ushort8(r);
+ ushort8 abs_x = abs(x);
+ ushort8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(ushort8 x)
+{
+ return convert_double8(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(ushort8 x)
+{
+ double8 r = convert_double8(x);
+ ushort8 y = convert_ushort8(r);
+ return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(ushort8 x)
+{
+ double8 r = convert_double8(x);
+ ushort8 y = convert_ushort8(r);
+ return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+}
+#endif
+/*
+ * ushort16 -> double16 conversions with an explicit rounding mode.
+ * _rte returns convert_double16(x) unchanged. The directed modes convert that
+ * result r back to ushort16 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(ushort16 x)
+{
+ double16 r = convert_double16(x);
+ ushort16 y = convert_ushort16(r);
+ ushort16 abs_x = abs(x);
+ ushort16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(ushort16 x)
+{
+ return convert_double16(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(ushort16 x)
+{
+ double16 r = convert_double16(x);
+ ushort16 y = convert_ushort16(r);
+ return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(ushort16 x)
+{
+ double16 r = convert_double16(x);
+ ushort16 y = convert_ushort16(r);
+ return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+}
+#endif
+/*
+ * int -> float conversions with an explicit rounding mode.
+ * _rte returns convert_float(x) unchanged. The directed modes convert that
+ * result r back to int as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(int x)
+{
+ float r = convert_float(x);
+ int y = convert_int(r);
+ uint abs_x = abs(x);
+ uint abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(int x)
+{
+ return convert_float(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(int x)
+{
+ float r = convert_float(x);
+ int y = convert_int(r);
+ return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(int x)
+{
+ float r = convert_float(x);
+ int y = convert_int(r);
+ return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+}
+/*
+ * int2 -> float2 conversions with an explicit rounding mode.
+ * _rte returns convert_float2(x) unchanged. The directed modes convert that
+ * result r back to int2 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(int2 x)
+{
+ float2 r = convert_float2(x);
+ int2 y = convert_int2(r);
+ uint2 abs_x = abs(x);
+ uint2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(int2 x)
+{
+ return convert_float2(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(int2 x)
+{
+ float2 r = convert_float2(x);
+ int2 y = convert_int2(r);
+ return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(int2 x)
+{
+ float2 r = convert_float2(x);
+ int2 y = convert_int2(r);
+ return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+}
+/*
+ * int3 -> float3 conversions with an explicit rounding mode.
+ * _rte returns convert_float3(x) unchanged. The directed modes convert that
+ * result r back to int3 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(int3 x)
+{
+ float3 r = convert_float3(x);
+ int3 y = convert_int3(r);
+ uint3 abs_x = abs(x);
+ uint3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(int3 x)
+{
+ return convert_float3(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(int3 x)
+{
+ float3 r = convert_float3(x);
+ int3 y = convert_int3(r);
+ return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(int3 x)
+{
+ float3 r = convert_float3(x);
+ int3 y = convert_int3(r);
+ return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+}
+/*
+ * int4 -> float4 conversions with an explicit rounding mode.
+ * _rte returns convert_float4(x) unchanged. The directed modes convert that
+ * result r back to int4 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(int4 x)
+{
+ float4 r = convert_float4(x);
+ int4 y = convert_int4(r);
+ uint4 abs_x = abs(x);
+ uint4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(int4 x)
+{
+ return convert_float4(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(int4 x)
+{
+ float4 r = convert_float4(x);
+ int4 y = convert_int4(r);
+ return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(int4 x)
+{
+ float4 r = convert_float4(x);
+ int4 y = convert_int4(r);
+ return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+}
+/*
+ * int8 -> float8 conversions with an explicit rounding mode.
+ * _rte returns convert_float8(x) unchanged. The directed modes convert that
+ * result r back to int8 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(int8 x)
+{
+ float8 r = convert_float8(x);
+ int8 y = convert_int8(r);
+ uint8 abs_x = abs(x);
+ uint8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(int8 x)
+{
+ return convert_float8(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(int8 x)
+{
+ float8 r = convert_float8(x);
+ int8 y = convert_int8(r);
+ return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(int8 x)
+{
+ float8 r = convert_float8(x);
+ int8 y = convert_int8(r);
+ return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+}
+/*
+ * int16 -> float16 conversions with an explicit rounding mode.
+ * _rte returns convert_float16(x) unchanged. The directed modes convert that
+ * result r back to int16 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(int16 x)
+{
+ float16 r = convert_float16(x);
+ int16 y = convert_int16(r);
+ uint16 abs_x = abs(x);
+ uint16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(int16 x)
+{
+ return convert_float16(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(int16 x)
+{
+ float16 r = convert_float16(x);
+ int16 y = convert_int16(r);
+ return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(int16 x)
+{
+ float16 r = convert_float16(x);
+ int16 y = convert_int16(r);
+ return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+}
+/*
+ * int -> double conversions with an explicit rounding mode.
+ * _rte returns convert_double(x) unchanged. The directed modes convert that
+ * result r back to int as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(int x)
+{
+ double r = convert_double(x);
+ int y = convert_int(r);
+ uint abs_x = abs(x);
+ uint abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(int x)
+{
+ return convert_double(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(int x)
+{
+ double r = convert_double(x);
+ int y = convert_int(r);
+ return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(int x)
+{
+ double r = convert_double(x);
+ int y = convert_int(r);
+ return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+}
+#endif
+/*
+ * int2 -> double2 conversions with an explicit rounding mode.
+ * _rte returns convert_double2(x) unchanged. The directed modes convert that
+ * result r back to int2 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(int2 x)
+{
+ double2 r = convert_double2(x);
+ int2 y = convert_int2(r);
+ uint2 abs_x = abs(x);
+ uint2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(int2 x)
+{
+ return convert_double2(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(int2 x)
+{
+ double2 r = convert_double2(x);
+ int2 y = convert_int2(r);
+ return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(int2 x)
+{
+ double2 r = convert_double2(x);
+ int2 y = convert_int2(r);
+ return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+}
+#endif
+/*
+ * int3 -> double3 conversions with an explicit rounding mode.
+ * _rte returns convert_double3(x) unchanged. The directed modes convert that
+ * result r back to int3 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(int3 x)
+{
+ double3 r = convert_double3(x);
+ int3 y = convert_int3(r);
+ uint3 abs_x = abs(x);
+ uint3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(int3 x)
+{
+ return convert_double3(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(int3 x)
+{
+ double3 r = convert_double3(x);
+ int3 y = convert_int3(r);
+ return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(int3 x)
+{
+ double3 r = convert_double3(x);
+ int3 y = convert_int3(r);
+ return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+}
+#endif
+/*
+ * int4 -> double4 conversions with an explicit rounding mode.
+ * _rte returns convert_double4(x) unchanged. The directed modes convert that
+ * result r back to int4 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(int4 x)
+{
+ double4 r = convert_double4(x);
+ int4 y = convert_int4(r);
+ uint4 abs_x = abs(x);
+ uint4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(int4 x)
+{
+ return convert_double4(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(int4 x)
+{
+ double4 r = convert_double4(x);
+ int4 y = convert_int4(r);
+ return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(int4 x)
+{
+ double4 r = convert_double4(x);
+ int4 y = convert_int4(r);
+ return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+}
+#endif
+/*
+ * int8 -> double8 conversions with an explicit rounding mode.
+ * _rte returns convert_double8(x) unchanged. The directed modes convert that
+ * result r back to int8 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(int8 x)
+{
+ double8 r = convert_double8(x);
+ int8 y = convert_int8(r);
+ uint8 abs_x = abs(x);
+ uint8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(int8 x)
+{
+ return convert_double8(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(int8 x)
+{
+ double8 r = convert_double8(x);
+ int8 y = convert_int8(r);
+ return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(int8 x)
+{
+ double8 r = convert_double8(x);
+ int8 y = convert_int8(r);
+ return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+}
+#endif
+/*
+ * int16 -> double16 conversions with an explicit rounding mode.
+ * _rte returns convert_double16(x) unchanged. The directed modes convert that
+ * result r back to int16 as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(int16 x)
+{
+ double16 r = convert_double16(x);
+ int16 y = convert_int16(r);
+ uint16 abs_x = abs(x);
+ uint16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(int16 x)
+{
+ return convert_double16(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(int16 x)
+{
+ double16 r = convert_double16(x);
+ int16 y = convert_int16(r);
+ return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(int16 x)
+{
+ double16 r = convert_double16(x);
+ int16 y = convert_int16(r);
+ return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+}
+#endif
+/*
+ * uint -> float conversions with an explicit rounding mode.
+ * _rte returns convert_float(x) unchanged. The directed modes convert that
+ * result r back to uint as y and compare y with x to detect rounding:
+ *   _rtz: |y| > |x| selects nextafter(r) toward zero
+ *   _rtp: y < x     selects nextafter(r) toward +INFINITY
+ *   _rtn: y > x     selects nextafter(r) toward -INFINITY
+ */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(uint x)
+{
+ float r = convert_float(x);
+ uint y = convert_uint(r);
+ uint abs_x = abs(x);
+ uint abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(uint x)
+{
+ return convert_float(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(uint x)
+{
+ float r = convert_float(x);
+ uint y = convert_uint(r);
+ return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(uint x)
+{
+ float r = convert_float(x);
+ uint y = convert_uint(r);
+ return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+}
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(uint2 x)
+{
+ float2 r = convert_float2(x);
+ uint2 y = convert_uint2(r);
+ uint2 abs_x = abs(x);
+ uint2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+}
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(uint2 x)
+{
+ float2 r = convert_float2(x);
+ return r;
+}
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(uint2 x)
+{
+ float2 r = convert_float2(x);
+ uint2 y = convert_uint2(r);
+ return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+}
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(uint2 x)
+{
+ float2 r = convert_float2(x);
+ uint2 y = convert_uint2(r);
+ return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+}
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(uint3 x)
+{
+ float3 r = convert_float3(x);
+ uint3 y = convert_uint3(r);
+ uint3 abs_x = abs(x);
+ uint3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+}
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(uint3 x)
+{
+ float3 r = convert_float3(x);
+ return r;
+}
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(uint3 x)
+{
+ float3 r = convert_float3(x);
+ uint3 y = convert_uint3(r);
+ return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+}
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(uint3 x)
+{
+ float3 r = convert_float3(x);
+ uint3 y = convert_uint3(r);
+ return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+}
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(uint4 x)
+{
+ float4 r = convert_float4(x);
+ uint4 y = convert_uint4(r);
+ uint4 abs_x = abs(x);
+ uint4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+}
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(uint4 x)
+{
+ float4 r = convert_float4(x);
+ return r;
+}
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(uint4 x)
+{
+ float4 r = convert_float4(x);
+ uint4 y = convert_uint4(r);
+ return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+}
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(uint4 x)
+{
+ float4 r = convert_float4(x);
+ uint4 y = convert_uint4(r);
+ return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+}
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(uint8 x)
+{
+ float8 r = convert_float8(x);
+ uint8 y = convert_uint8(r);
+ uint8 abs_x = abs(x);
+ uint8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+}
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(uint8 x)
+{
+ float8 r = convert_float8(x);
+ return r;
+}
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(uint8 x)
+{
+ float8 r = convert_float8(x);
+ uint8 y = convert_uint8(r);
+ return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+}
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(uint8 x)
+{
+ float8 r = convert_float8(x);
+ uint8 y = convert_uint8(r);
+ return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+}
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(uint16 x)
+{
+ float16 r = convert_float16(x);
+ uint16 y = convert_uint16(r);
+ uint16 abs_x = abs(x);
+ uint16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+}
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(uint16 x)
+{
+ float16 r = convert_float16(x);
+ return r;
+}
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(uint16 x)
+{
+ float16 r = convert_float16(x);
+ uint16 y = convert_uint16(r);
+ return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+}
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(uint16 x)
+{
+ float16 r = convert_float16(x);
+ uint16 y = convert_uint16(r);
+ return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+}
+#ifdef cl_khr_fp64
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(uint x)
+{
+ double r = convert_double(x);
+ uint y = convert_uint(r);
+ uint abs_x = abs(x);
+ uint abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(uint x)
+{
+ double r = convert_double(x);
+ return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(uint x)
+{
+ double r = convert_double(x);
+ uint y = convert_uint(r);
+ return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(uint x)
+{
+ double r = convert_double(x);
+ uint y = convert_uint(r);
+ return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(uint2 x)
+{
+ double2 r = convert_double2(x);
+ uint2 y = convert_uint2(r);
+ uint2 abs_x = abs(x);
+ uint2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(uint2 x)
+{
+ double2 r = convert_double2(x);
+ return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(uint2 x)
+{
+ double2 r = convert_double2(x);
+ uint2 y = convert_uint2(r);
+ return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(uint2 x)
+{
+ double2 r = convert_double2(x);
+ uint2 y = convert_uint2(r);
+ return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(uint3 x)
+{
+ double3 r = convert_double3(x);
+ uint3 y = convert_uint3(r);
+ uint3 abs_x = abs(x);
+ uint3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(uint3 x)
+{
+ double3 r = convert_double3(x);
+ return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(uint3 x)
+{
+ double3 r = convert_double3(x);
+ uint3 y = convert_uint3(r);
+ return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(uint3 x)
+{
+ double3 r = convert_double3(x);
+ uint3 y = convert_uint3(r);
+ return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(uint4 x)
+{
+ double4 r = convert_double4(x);
+ uint4 y = convert_uint4(r);
+ uint4 abs_x = abs(x);
+ uint4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(uint4 x)
+{
+ double4 r = convert_double4(x);
+ return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(uint4 x)
+{
+ double4 r = convert_double4(x);
+ uint4 y = convert_uint4(r);
+ return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(uint4 x)
+{
+ double4 r = convert_double4(x);
+ uint4 y = convert_uint4(r);
+ return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(uint8 x)
+{
+ double8 r = convert_double8(x);
+ uint8 y = convert_uint8(r);
+ uint8 abs_x = abs(x);
+ uint8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(uint8 x)
+{
+ double8 r = convert_double8(x);
+ return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(uint8 x)
+{
+ double8 r = convert_double8(x);
+ uint8 y = convert_uint8(r);
+ return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(uint8 x)
+{
+ double8 r = convert_double8(x);
+ uint8 y = convert_uint8(r);
+ return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(uint16 x)
+{
+ double16 r = convert_double16(x);
+ uint16 y = convert_uint16(r);
+ uint16 abs_x = abs(x);
+ uint16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(uint16 x)
+{
+ double16 r = convert_double16(x);
+ return r;
+}
+#endif
+#ifdef cl_khr_fp64
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(uint16 x)
+{
+ double16 r = convert_double16(x);
+ uint16 y = convert_uint16(r);
+ return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+}
+#endif
+#ifdef cl_khr_fp64
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(uint16 x)
+{
+ double16 r = convert_double16(x);
+ uint16 y = convert_uint16(r);
+ return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(long x)
+{
+ float r = convert_float(x);
+ long y = convert_long(r);
+ ulong abs_x = abs(x);
+ ulong abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(long x)
+{
+ float r = convert_float(x);
+ return r;
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(long x)
+{
+ float r = convert_float(x);
+ long y = convert_long(r);
+ return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(long x)
+{
+ float r = convert_float(x);
+ long y = convert_long(r);
+ return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(long2 x)
+{
+ float2 r = convert_float2(x);
+ long2 y = convert_long2(r);
+ ulong2 abs_x = abs(x);
+ ulong2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(long2 x)
+{
+ float2 r = convert_float2(x);
+ return r;
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(long2 x)
+{
+ float2 r = convert_float2(x);
+ long2 y = convert_long2(r);
+ return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(long2 x)
+{
+ float2 r = convert_float2(x);
+ long2 y = convert_long2(r);
+ return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(long3 x)
+{
+ float3 r = convert_float3(x);
+ long3 y = convert_long3(r);
+ ulong3 abs_x = abs(x);
+ ulong3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(long3 x)
+{
+ float3 r = convert_float3(x);
+ return r;
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(long3 x)
+{
+ float3 r = convert_float3(x);
+ long3 y = convert_long3(r);
+ return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(long3 x)
+{
+ float3 r = convert_float3(x);
+ long3 y = convert_long3(r);
+ return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(long4 x)
+{
+ float4 r = convert_float4(x);
+ long4 y = convert_long4(r);
+ ulong4 abs_x = abs(x);
+ ulong4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(long4 x)
+{
+ float4 r = convert_float4(x);
+ return r;
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(long4 x)
+{
+ float4 r = convert_float4(x);
+ long4 y = convert_long4(r);
+ return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(long4 x)
+{
+ float4 r = convert_float4(x);
+ long4 y = convert_long4(r);
+ return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(long8 x)
+{
+ float8 r = convert_float8(x);
+ long8 y = convert_long8(r);
+ ulong8 abs_x = abs(x);
+ ulong8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(long8 x)
+{
+ float8 r = convert_float8(x);
+ return r;
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(long8 x)
+{
+ float8 r = convert_float8(x);
+ long8 y = convert_long8(r);
+ return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(long8 x)
+{
+ float8 r = convert_float8(x);
+ long8 y = convert_long8(r);
+ return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(long16 x)
+{
+ float16 r = convert_float16(x);
+ long16 y = convert_long16(r);
+ ulong16 abs_x = abs(x);
+ ulong16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(long16 x)
+{
+ float16 r = convert_float16(x);
+ return r;
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(long16 x)
+{
+ float16 r = convert_float16(x);
+ long16 y = convert_long16(r);
+ return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(long16 x)
+{
+ float16 r = convert_float16(x);
+ long16 y = convert_long16(r);
+ return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(long x)
+{
+ double r = convert_double(x);
+ long y = convert_long(r);
+ ulong abs_x = abs(x);
+ ulong abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(long x)
+{
+ double r = convert_double(x);
+ return r;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(long x)
+{
+ double r = convert_double(x);
+ long y = convert_long(r);
+ return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(long x)
+{
+ double r = convert_double(x);
+ long y = convert_long(r);
+ return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(long2 x)
+{
+ double2 r = convert_double2(x);
+ long2 y = convert_long2(r);
+ ulong2 abs_x = abs(x);
+ ulong2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(long2 x)
+{
+ double2 r = convert_double2(x);
+ return r;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(long2 x)
+{
+ double2 r = convert_double2(x);
+ long2 y = convert_long2(r);
+ return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(long2 x)
+{
+ double2 r = convert_double2(x);
+ long2 y = convert_long2(r);
+ return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(long3 x)
+{
+ double3 r = convert_double3(x);
+ long3 y = convert_long3(r);
+ ulong3 abs_x = abs(x);
+ ulong3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(long3 x)
+{
+ double3 r = convert_double3(x);
+ return r;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(long3 x)
+{
+ double3 r = convert_double3(x);
+ long3 y = convert_long3(r);
+ return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(long3 x)
+{
+ double3 r = convert_double3(x);
+ long3 y = convert_long3(r);
+ return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(long4 x)
+{
+ double4 r = convert_double4(x);
+ long4 y = convert_long4(r);
+ ulong4 abs_x = abs(x);
+ ulong4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(long4 x)
+{
+ double4 r = convert_double4(x);
+ return r;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(long4 x)
+{
+ double4 r = convert_double4(x);
+ long4 y = convert_long4(r);
+ return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(long4 x)
+{
+ double4 r = convert_double4(x);
+ long4 y = convert_long4(r);
+ return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(long8 x)
+{
+ double8 r = convert_double8(x);
+ long8 y = convert_long8(r);
+ ulong8 abs_x = abs(x);
+ ulong8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(long8 x)
+{
+ double8 r = convert_double8(x);
+ return r;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(long8 x)
+{
+ double8 r = convert_double8(x);
+ long8 y = convert_long8(r);
+ return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(long8 x)
+{
+ double8 r = convert_double8(x);
+ long8 y = convert_long8(r);
+ return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(long16 x)
+{
+ double16 r = convert_double16(x);
+ long16 y = convert_long16(r);
+ ulong16 abs_x = abs(x);
+ ulong16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(long16 x)
+{
+ double16 r = convert_double16(x);
+ return r;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(long16 x)
+{
+ double16 r = convert_double16(x);
+ long16 y = convert_long16(r);
+ return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(long16 x)
+{
+ double16 r = convert_double16(x);
+ long16 y = convert_long16(r);
+ return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(ulong x)
+{
+ float r = convert_float(x);
+ ulong y = convert_ulong(r);
+ ulong abs_x = abs(x);
+ ulong abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(ulong x)
+{
+ float r = convert_float(x);
+ return r;
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(ulong x)
+{
+ float r = convert_float(x);
+ ulong y = convert_ulong(r);
+ return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(ulong x)
+{
+ float r = convert_float(x);
+ ulong y = convert_ulong(r);
+ return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(ulong2 x)
+{
+ float2 r = convert_float2(x);
+ ulong2 y = convert_ulong2(r);
+ ulong2 abs_x = abs(x);
+ ulong2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(ulong2 x)
+{
+ float2 r = convert_float2(x);
+ return r;
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(ulong2 x)
+{
+ float2 r = convert_float2(x);
+ ulong2 y = convert_ulong2(r);
+ return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(ulong2 x)
+{
+ float2 r = convert_float2(x);
+ ulong2 y = convert_ulong2(r);
+ return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(ulong3 x)
+{
+ float3 r = convert_float3(x);
+ ulong3 y = convert_ulong3(r);
+ ulong3 abs_x = abs(x);
+ ulong3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(ulong3 x)
+{
+ float3 r = convert_float3(x);
+ return r;
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(ulong3 x)
+{
+ float3 r = convert_float3(x);
+ ulong3 y = convert_ulong3(r);
+ return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(ulong3 x)
+{
+ float3 r = convert_float3(x);
+ ulong3 y = convert_ulong3(r);
+ return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(ulong4 x)
+{
+ float4 r = convert_float4(x);
+ ulong4 y = convert_ulong4(r);
+ ulong4 abs_x = abs(x);
+ ulong4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(ulong4 x)
+{
+ float4 r = convert_float4(x);
+ return r;
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(ulong4 x)
+{
+ float4 r = convert_float4(x);
+ ulong4 y = convert_ulong4(r);
+ return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(ulong4 x)
+{
+ float4 r = convert_float4(x);
+ ulong4 y = convert_ulong4(r);
+ return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(ulong8 x)
+{
+ float8 r = convert_float8(x);
+ ulong8 y = convert_ulong8(r);
+ ulong8 abs_x = abs(x);
+ ulong8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(ulong8 x)
+{
+ float8 r = convert_float8(x);
+ return r;
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(ulong8 x)
+{
+ float8 r = convert_float8(x);
+ ulong8 y = convert_ulong8(r);
+ return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(ulong8 x)
+{
+ float8 r = convert_float8(x);
+ ulong8 y = convert_ulong8(r);
+ return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(ulong16 x)
+{
+ float16 r = convert_float16(x);
+ ulong16 y = convert_ulong16(r);
+ ulong16 abs_x = abs(x);
+ ulong16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+}
+#endif
+#ifdef cles_khr_int64
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(ulong16 x)
+{
+ float16 r = convert_float16(x);
+ return r;
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(ulong16 x)
+{
+ float16 r = convert_float16(x);
+ ulong16 y = convert_ulong16(r);
+ return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+}
+#endif
+#ifdef cles_khr_int64
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(ulong16 x)
+{
+ float16 r = convert_float16(x);
+ ulong16 y = convert_ulong16(r);
+ return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(ulong x)
+{
+ double r = convert_double(x);
+ ulong y = convert_ulong(r);
+ ulong abs_x = abs(x);
+ ulong abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(ulong x)
+{
+ double r = convert_double(x);
+ return r;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(ulong x)
+{
+ double r = convert_double(x);
+ ulong y = convert_ulong(r);
+ return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(ulong x)
+{
+ double r = convert_double(x);
+ ulong y = convert_ulong(r);
+ return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(ulong2 x)
+{
+ double2 r = convert_double2(x);
+ ulong2 y = convert_ulong2(r);
+ ulong2 abs_x = abs(x);
+ ulong2 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(ulong2 x)
+{
+ double2 r = convert_double2(x);
+ return r;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(ulong2 x)
+{
+ double2 r = convert_double2(x);
+ ulong2 y = convert_ulong2(r);
+ return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(ulong2 x)
+{
+ double2 r = convert_double2(x);
+ ulong2 y = convert_ulong2(r);
+ return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(ulong3 x)
+{
+ double3 r = convert_double3(x);
+ ulong3 y = convert_ulong3(r);
+ ulong3 abs_x = abs(x);
+ ulong3 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(ulong3 x)
+{
+ double3 r = convert_double3(x);
+ return r;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(ulong3 x)
+{
+ double3 r = convert_double3(x);
+ ulong3 y = convert_ulong3(r);
+ return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(ulong3 x)
+{
+ double3 r = convert_double3(x);
+ ulong3 y = convert_ulong3(r);
+ return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(ulong4 x)
+{
+ double4 r = convert_double4(x);
+ ulong4 y = convert_ulong4(r);
+ ulong4 abs_x = abs(x);
+ ulong4 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(ulong4 x)
+{
+ double4 r = convert_double4(x);
+ return r;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(ulong4 x)
+{
+ double4 r = convert_double4(x);
+ ulong4 y = convert_ulong4(r);
+ return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(ulong4 x)
+{
+ double4 r = convert_double4(x);
+ ulong4 y = convert_ulong4(r);
+ return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(ulong8 x)
+{
+ double8 r = convert_double8(x);
+ ulong8 y = convert_ulong8(r);
+ ulong8 abs_x = abs(x);
+ ulong8 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(ulong8 x)
+{
+ double8 r = convert_double8(x);
+ return r;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(ulong8 x)
+{
+ double8 r = convert_double8(x);
+ ulong8 y = convert_ulong8(r);
+ return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward -infinity: where r converted back (y) is above x, step r one value down. */
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(ulong8 x)
+{
+ double8 r = convert_double8(x);
+ ulong8 y = convert_ulong8(r);
+ return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward zero: where r converted back (y) exceeds x in magnitude, step r one value toward 0. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(ulong16 x)
+{
+ double16 r = convert_double16(x);
+ ulong16 y = convert_ulong16(r);
+ ulong16 abs_x = abs(x);
+ ulong16 abs_y = abs(y);
+ return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* The _rte variant forwards to the unsuffixed conversion. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(ulong16 x)
+{
+ double16 r = convert_double16(x);
+ return r;
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+/* Round toward +infinity: where r converted back (y) is below x, step r one value up. */
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(ulong16 x)
+{
+ double16 r = convert_double16(x);
+ ulong16 y = convert_ulong16(r);
+ return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+}
+#endif
+#if defined(cl_khr_fp64) && defined(cles_khr_int64)
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(ulong16 x)
+{
+ double16 r = convert_double16(x);
+ ulong16 y = convert_ulong16(y);
+ return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+}
+#endif
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(float x) /* round toward zero */
+{
+ float r = convert_float(x);
+ float y = convert_float(r); /* r mapped back to the source type for comparison with x */
+ float abs_x = fabs(x);
+ float abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(float x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_float(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(float x) /* round toward +infinity */
+{
+ float r = convert_float(x);
+ float y = convert_float(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float)INFINITY), convert_int(y < x)); /* r fell below x: step up */
+}
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(float x) /* round toward -infinity */
+{
+ float r = convert_float(x);
+ float y = convert_float(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x)); /* r rose above x: step down */
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(float2 x) /* round toward zero */
+{
+ float2 r = convert_float2(x);
+ float2 y = convert_float2(r); /* r mapped back to the source type for comparison with x */
+ float2 abs_x = fabs(x);
+ float2 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(float2 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_float2(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(float2 x) /* round toward +infinity */
+{
+ float2 r = convert_float2(x);
+ float2 y = convert_float2(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x)); /* r fell below x: step up */
+}
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(float2 x) /* round toward -infinity */
+{
+ float2 r = convert_float2(x);
+ float2 y = convert_float2(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x)); /* r rose above x: step down */
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(float3 x) /* round toward zero */
+{
+ float3 r = convert_float3(x);
+ float3 y = convert_float3(r); /* r mapped back to the source type for comparison with x */
+ float3 abs_x = fabs(x);
+ float3 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(float3 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_float3(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(float3 x) /* round toward +infinity */
+{
+ float3 r = convert_float3(x);
+ float3 y = convert_float3(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x)); /* r fell below x: step up */
+}
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(float3 x) /* round toward -infinity */
+{
+ float3 r = convert_float3(x);
+ float3 y = convert_float3(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x)); /* r rose above x: step down */
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(float4 x) /* round toward zero */
+{
+ float4 r = convert_float4(x);
+ float4 y = convert_float4(r); /* r mapped back to the source type for comparison with x */
+ float4 abs_x = fabs(x);
+ float4 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(float4 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_float4(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(float4 x) /* round toward +infinity */
+{
+ float4 r = convert_float4(x);
+ float4 y = convert_float4(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x)); /* r fell below x: step up */
+}
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(float4 x) /* round toward -infinity */
+{
+ float4 r = convert_float4(x);
+ float4 y = convert_float4(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x)); /* r rose above x: step down */
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(float8 x) /* round toward zero */
+{
+ float8 r = convert_float8(x);
+ float8 y = convert_float8(r); /* r mapped back to the source type for comparison with x */
+ float8 abs_x = fabs(x);
+ float8 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(float8 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_float8(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(float8 x) /* round toward +infinity */
+{
+ float8 r = convert_float8(x);
+ float8 y = convert_float8(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x)); /* r fell below x: step up */
+}
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(float8 x) /* round toward -infinity */
+{
+ float8 r = convert_float8(x);
+ float8 y = convert_float8(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x)); /* r rose above x: step down */
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(float16 x) /* round toward zero */
+{
+ float16 r = convert_float16(x);
+ float16 y = convert_float16(r); /* r mapped back to the source type for comparison with x */
+ float16 abs_x = fabs(x);
+ float16 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(float16 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_float16(x);
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(float16 x) /* round toward +infinity */
+{
+ float16 r = convert_float16(x);
+ float16 y = convert_float16(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x)); /* r fell below x: step up */
+}
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(float16 x) /* round toward -infinity */
+{
+ float16 r = convert_float16(x);
+ float16 y = convert_float16(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x)); /* r rose above x: step down */
+}
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(float x) /* round toward zero */
+{
+ double r = convert_double(x);
+ float y = convert_float(r); /* r mapped back to the source type for comparison with x */
+ float abs_x = fabs(x);
+ float abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(float x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_double(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(float x) /* round toward +infinity */
+{
+ double r = convert_double(x);
+ float y = convert_float(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double)INFINITY), convert_long(y < x)); /* r fell below x: step up */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(float x) /* round toward -infinity */
+{
+ double r = convert_double(x);
+ float y = convert_float(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x)); /* r rose above x: step down */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(float2 x) /* round toward zero */
+{
+ double2 r = convert_double2(x);
+ float2 y = convert_float2(r); /* r mapped back to the source type for comparison with x */
+ float2 abs_x = fabs(x);
+ float2 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(float2 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_double2(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(float2 x) /* round toward +infinity */
+{
+ double2 r = convert_double2(x);
+ float2 y = convert_float2(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x)); /* r fell below x: step up */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(float2 x) /* round toward -infinity */
+{
+ double2 r = convert_double2(x);
+ float2 y = convert_float2(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x)); /* r rose above x: step down */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(float3 x) /* round toward zero */
+{
+ double3 r = convert_double3(x);
+ float3 y = convert_float3(r); /* r mapped back to the source type for comparison with x */
+ float3 abs_x = fabs(x);
+ float3 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(float3 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_double3(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(float3 x) /* round toward +infinity */
+{
+ double3 r = convert_double3(x);
+ float3 y = convert_float3(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x)); /* r fell below x: step up */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(float3 x) /* round toward -infinity */
+{
+ double3 r = convert_double3(x);
+ float3 y = convert_float3(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x)); /* r rose above x: step down */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(float4 x) /* round toward zero */
+{
+ double4 r = convert_double4(x);
+ float4 y = convert_float4(r); /* r mapped back to the source type for comparison with x */
+ float4 abs_x = fabs(x);
+ float4 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(float4 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_double4(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(float4 x) /* round toward +infinity */
+{
+ double4 r = convert_double4(x);
+ float4 y = convert_float4(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x)); /* r fell below x: step up */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(float4 x) /* round toward -infinity */
+{
+ double4 r = convert_double4(x);
+ float4 y = convert_float4(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x)); /* r rose above x: step down */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(float8 x) /* round toward zero */
+{
+ double8 r = convert_double8(x);
+ float8 y = convert_float8(r); /* r mapped back to the source type for comparison with x */
+ float8 abs_x = fabs(x);
+ float8 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(float8 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_double8(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(float8 x) /* round toward +infinity */
+{
+ double8 r = convert_double8(x);
+ float8 y = convert_float8(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x)); /* r fell below x: step up */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(float8 x) /* round toward -infinity */
+{
+ double8 r = convert_double8(x);
+ float8 y = convert_float8(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x)); /* r rose above x: step down */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(float16 x) /* round toward zero */
+{
+ double16 r = convert_double16(x);
+ float16 y = convert_float16(r); /* r mapped back to the source type for comparison with x */
+ float16 abs_x = fabs(x);
+ float16 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(float16 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_double16(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(float16 x) /* round toward +infinity */
+{
+ double16 r = convert_double16(x);
+ float16 y = convert_float16(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x)); /* r fell below x: step up */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(float16 x) /* round toward -infinity */
+{
+ double16 r = convert_double16(x);
+ float16 y = convert_float16(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x)); /* r rose above x: step down */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtz(double x) /* round toward zero */
+{
+ float r = convert_float(x);
+ double y = convert_double(r); /* r mapped back to the source type for comparison with x */
+ double abs_x = fabs(x);
+ double abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rte(double x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_float(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtp(double x) /* round toward +infinity */
+{
+ float r = convert_float(x);
+ double y = convert_double(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float)INFINITY), convert_int(y < x)); /* r fell below x: step up */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float convert_float_rtn(double x) /* round toward -infinity */
+{
+ float r = convert_float(x);
+ double y = convert_double(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x)); /* r rose above x: step down */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtz(double2 x) /* round toward zero */
+{
+ float2 r = convert_float2(x);
+ double2 y = convert_double2(r); /* r mapped back to the source type for comparison with x */
+ double2 abs_x = fabs(x);
+ double2 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rte(double2 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_float2(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtp(double2 x) /* round toward +infinity */
+{
+ float2 r = convert_float2(x);
+ double2 y = convert_double2(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x)); /* r fell below x: step up */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float2 convert_float2_rtn(double2 x) /* round toward -infinity */
+{
+ float2 r = convert_float2(x);
+ double2 y = convert_double2(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x)); /* r rose above x: step down */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtz(double3 x) /* round toward zero */
+{
+ float3 r = convert_float3(x);
+ double3 y = convert_double3(r); /* r mapped back to the source type for comparison with x */
+ double3 abs_x = fabs(x);
+ double3 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rte(double3 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_float3(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtp(double3 x) /* round toward +infinity */
+{
+ float3 r = convert_float3(x);
+ double3 y = convert_double3(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x)); /* r fell below x: step up */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float3 convert_float3_rtn(double3 x) /* round toward -infinity */
+{
+ float3 r = convert_float3(x);
+ double3 y = convert_double3(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x)); /* r rose above x: step down */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtz(double4 x) /* round toward zero */
+{
+ float4 r = convert_float4(x);
+ double4 y = convert_double4(r); /* r mapped back to the source type for comparison with x */
+ double4 abs_x = fabs(x);
+ double4 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rte(double4 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_float4(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtp(double4 x) /* round toward +infinity */
+{
+ float4 r = convert_float4(x);
+ double4 y = convert_double4(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x)); /* r fell below x: step up */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float4 convert_float4_rtn(double4 x) /* round toward -infinity */
+{
+ float4 r = convert_float4(x);
+ double4 y = convert_double4(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x)); /* r rose above x: step down */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtz(double8 x) /* round toward zero */
+{
+ float8 r = convert_float8(x);
+ double8 y = convert_double8(r); /* r mapped back to the source type for comparison with x */
+ double8 abs_x = fabs(x);
+ double8 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rte(double8 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_float8(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtp(double8 x) /* round toward +infinity */
+{
+ float8 r = convert_float8(x);
+ double8 y = convert_double8(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x)); /* r fell below x: step up */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float8 convert_float8_rtn(double8 x) /* round toward -infinity */
+{
+ float8 r = convert_float8(x);
+ double8 y = convert_double8(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x)); /* r rose above x: step down */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtz(double16 x) /* round toward zero */
+{
+ float16 r = convert_float16(x);
+ double16 y = convert_double16(r); /* r mapped back to the source type for comparison with x */
+ double16 abs_x = fabs(x);
+ double16 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rte(double16 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_float16(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtp(double16 x) /* round toward +infinity */
+{
+ float16 r = convert_float16(x);
+ double16 y = convert_double16(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x)); /* r fell below x: step up */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+float16 convert_float16_rtn(double16 x) /* round toward -infinity */
+{
+ float16 r = convert_float16(x);
+ double16 y = convert_double16(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x)); /* r rose above x: step down */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtz(double x) /* round toward zero */
+{
+ double r = convert_double(x);
+ double y = convert_double(r); /* r mapped back to the source type for comparison with x */
+ double abs_x = fabs(x);
+ double abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rte(double x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_double(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtp(double x) /* round toward +infinity */
+{
+ double r = convert_double(x);
+ double y = convert_double(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double)INFINITY), convert_long(y < x)); /* r fell below x: step up */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double convert_double_rtn(double x) /* round toward -infinity */
+{
+ double r = convert_double(x);
+ double y = convert_double(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x)); /* r rose above x: step down */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtz(double2 x) /* round toward zero */
+{
+ double2 r = convert_double2(x);
+ double2 y = convert_double2(r); /* r mapped back to the source type for comparison with x */
+ double2 abs_x = fabs(x);
+ double2 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rte(double2 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_double2(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtp(double2 x) /* round toward +infinity */
+{
+ double2 r = convert_double2(x);
+ double2 y = convert_double2(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x)); /* r fell below x: step up */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double2 convert_double2_rtn(double2 x) /* round toward -infinity */
+{
+ double2 r = convert_double2(x);
+ double2 y = convert_double2(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x)); /* r rose above x: step down */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtz(double3 x) /* round toward zero */
+{
+ double3 r = convert_double3(x);
+ double3 y = convert_double3(r); /* r mapped back to the source type for comparison with x */
+ double3 abs_x = fabs(x);
+ double3 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rte(double3 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_double3(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtp(double3 x) /* round toward +infinity */
+{
+ double3 r = convert_double3(x);
+ double3 y = convert_double3(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x)); /* r fell below x: step up */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double3 convert_double3_rtn(double3 x) /* round toward -infinity */
+{
+ double3 r = convert_double3(x);
+ double3 y = convert_double3(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x)); /* r rose above x: step down */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtz(double4 x) /* round toward zero */
+{
+ double4 r = convert_double4(x);
+ double4 y = convert_double4(r); /* r mapped back to the source type for comparison with x */
+ double4 abs_x = fabs(x);
+ double4 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rte(double4 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_double4(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtp(double4 x) /* round toward +infinity */
+{
+ double4 r = convert_double4(x);
+ double4 y = convert_double4(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x)); /* r fell below x: step up */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double4 convert_double4_rtn(double4 x) /* round toward -infinity */
+{
+ double4 r = convert_double4(x);
+ double4 y = convert_double4(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x)); /* r rose above x: step down */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtz(double8 x) /* round toward zero */
+{
+ double8 r = convert_double8(x);
+ double8 y = convert_double8(r); /* r mapped back to the source type for comparison with x */
+ double8 abs_x = fabs(x);
+ double8 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rte(double8 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_double8(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtp(double8 x) /* round toward +infinity */
+{
+ double8 r = convert_double8(x);
+ double8 y = convert_double8(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x)); /* r fell below x: step up */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double8 convert_double8_rtn(double8 x) /* round toward -infinity */
+{
+ double8 r = convert_double8(x);
+ double8 y = convert_double8(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x)); /* r rose above x: step down */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtz(double16 x) /* round toward zero */
+{
+ double16 r = convert_double16(x);
+ double16 y = convert_double16(r); /* r mapped back to the source type for comparison with x */
+ double16 abs_x = fabs(x);
+ double16 abs_y = fabs(y);
+ return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x)); /* |r| overshot |x|: step toward 0 */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rte(double16 x) /* returns the unsuffixed conversion unchanged */
+{
+ return convert_double16(x);
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtp(double16 x) /* round toward +infinity */
+{
+ double16 r = convert_double16(x);
+ double16 y = convert_double16(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x)); /* r fell below x: step up */
+}
+#endif
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD
+double16 convert_double16_rtn(double16 x) /* round toward -infinity */
+{
+ double16 r = convert_double16(x);
+ double16 y = convert_double16(r); /* r mapped back to the source type for comparison with x */
+ return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x)); /* r rose above x: step down */
+}
+#endif
+
+
+#endif // ASW
diff --git a/src/builtins/cross.cl b/src/builtins/cross.cl
new file mode 100644
index 0000000..a3e019f
--- /dev/null
+++ b/src/builtins/cross.cl
@@ -0,0 +1,59 @@
+/******************************************************************************
+ * Copyright (c) 2011-2013, Peter Collingbourne <peter@pcc.me.uk>
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+_CLC_OVERLOAD _CLC_DEF float3 cross(float3 p0, float3 p1)
+{
+ /* Lanes are (y*z' - z*y', z*x' - x*z', x*y' - y*x'),
+  * written with swizzles instead of per-lane arithmetic. */
+ return p0.yzx*p1.zxy - p0.zxy*p1.yzx;
+}
+
+_CLC_OVERLOAD _CLC_DEF float4 cross(float4 p0, float4 p1)
+{
+ /* xyz lanes: (y*z' - z*y', z*x' - x*z', x*y' - y*x'), via swizzles;
+  * the w lane of the result is always 0.f. */
+ float3 xyz = p0.yzx*p1.zxy - p0.zxy*p1.yzx;
+ return (float4)(xyz, 0.f);
+}
+
+_CLC_OVERLOAD _CLC_DEF double3 cross(double3 p0, double3 p1)
+{
+ /* Lanes are (y*z' - z*y', z*x' - x*z', x*y' - y*x'),
+  * written with swizzles instead of per-lane arithmetic. */
+ return p0.yzx*p1.zxy - p0.zxy*p1.yzx;
+}
+
+_CLC_OVERLOAD _CLC_DEF double4 cross(double4 p0, double4 p1)
+{
+ /* xyz lanes: (y*z' - z*y', z*x' - x*z', x*y' - y*x'), via swizzles;
+  * the w lane of the result is always 0. */
+ double3 xyz = p0.yzx*p1.zxy - p0.zxy*p1.yzx;
+ return (double4)(xyz, 0.);
+}
diff --git a/src/builtins/degrees.cl b/src/builtins/degrees.cl
new file mode 100644
index 0000000..329e0f1
--- /dev/null
+++ b/src/builtins/degrees.cl
@@ -0,0 +1,41 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/* Emit the degrees()/radians() overload pair for one type. */
+#define IMPLEMENTATION(gentype) \
+_CLC_OVERLOAD _CLC_DEF gentype degrees(gentype radians) { return radians * (gentype)180.0 * (gentype)M_1_PI; } \
+_CLC_OVERLOAD _CLC_DEF gentype radians(gentype degrees) { return degrees * (gentype)M_PI / (gentype)180.0; }
+
+/* Instantiate the pair for _VEC_TYPE(type, N) with N = 3, 4, 8 and 16. */
+#define EXPAND_SIZES(type) \
+ IMPLEMENTATION(_VEC_TYPE(type,3)) IMPLEMENTATION(_VEC_TYPE(type,4)) \
+ IMPLEMENTATION(_VEC_TYPE(type,8)) IMPLEMENTATION(_VEC_TYPE(type,16))
+
+EXPAND_SIZES(float)
+EXPAND_SIZES(double)
diff --git a/src/builtins/dot.cl b/src/builtins/dot.cl
new file mode 100644
index 0000000..0b16d66
--- /dev/null
+++ b/src/builtins/dot.cl
@@ -0,0 +1,41 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+_CLC_OVERLOAD _CLC_DEF float dot(float3 p0, float3 p1) /* dot product of two float3 values */
+{ return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z; } /* sum of the x, y and z lane products */
+
+_CLC_OVERLOAD _CLC_DEF float dot(float4 p0, float4 p1) /* dot product of two float4 values */
+{ return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z + p0.w*p1.w; } /* sum of the x, y, z and w lane products */
+
+_CLC_OVERLOAD _CLC_DEF double dot(double3 p0, double3 p1) /* dot product of two double3 values */
+{ return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z; } /* sum of the x, y and z lane products */
+
+_CLC_OVERLOAD _CLC_DEF double dot(double4 p0, double4 p1) /* dot product of two double4 values */
+{ return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z + p0.w*p1.w; } /* sum of the x, y, z and w lane products */
+
diff --git a/src/builtins/fract.cl b/src/builtins/fract.cl
new file mode 100644
index 0000000..11f08e8
--- /dev/null
+++ b/src/builtins/fract.cl
@@ -0,0 +1,93 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/* Scalar fract() body: stores floor(x) in *ptr; NaN returns x; +/-inf returns a same-signed zero (x - floor(x) is NaN there and fmin would yield the_max). */
+#define SCALAR(type, the_max) \
+{ type the_floor = floor(x); *ptr = the_floor; \
+ if (isnan(x)) return x; \
+ if (isinf(x)) return copysign((type) 0, x); \
+ return fmin(x - the_floor, (type) (the_max)); \
+}
+
+/* Vector fract() body: stores floor(x) in *ptr; NaN lanes return x; +/-inf lanes return a same-signed zero (x - floor(x) is NaN there and fmin would yield the_max). */
+#define BODY(type, the_max) \
+{ type the_floor = floor(x); *ptr = the_floor; \
+ type result = fmin(x - the_floor, (type) (the_max)); \
+ result = select(result, copysign((type) 0, x), isinf(x)); \
+ return select(result, x, isnan(x)); \
+}
+
+_CLC_OVERLOAD _CLC_DEF float fract(float x, global float * ptr) SCALAR(float, 0x1.fffffep-1f) /* float overloads, one per pointer address space; 0x1.fffffep-1f (1 - 2^-24) is the largest float below 1.0 */
+_CLC_OVERLOAD _CLC_DEF float fract(float x, local float * ptr) SCALAR(float, 0x1.fffffep-1f)
+_CLC_OVERLOAD _CLC_DEF float fract(float x, private float * ptr) SCALAR(float, 0x1.fffffep-1f)
+
+_CLC_OVERLOAD _CLC_DEF float2 fract(float2 x, global float2 * ptr) BODY(float2, 0x1.fffffep-1f) /* vector widths use BODY instead of SCALAR */
+_CLC_OVERLOAD _CLC_DEF float2 fract(float2 x, local float2 * ptr) BODY(float2, 0x1.fffffep-1f)
+_CLC_OVERLOAD _CLC_DEF float2 fract(float2 x, private float2 * ptr) BODY(float2, 0x1.fffffep-1f)
+
+_CLC_OVERLOAD _CLC_DEF float3 fract(float3 x, global float3 * ptr) BODY(float3, 0x1.fffffep-1f)
+_CLC_OVERLOAD _CLC_DEF float3 fract(float3 x, local float3 * ptr) BODY(float3, 0x1.fffffep-1f)
+_CLC_OVERLOAD _CLC_DEF float3 fract(float3 x, private float3 * ptr) BODY(float3, 0x1.fffffep-1f)
+
+_CLC_OVERLOAD _CLC_DEF float4 fract(float4 x, global float4 * ptr) BODY(float4, 0x1.fffffep-1f)
+_CLC_OVERLOAD _CLC_DEF float4 fract(float4 x, local float4 * ptr) BODY(float4, 0x1.fffffep-1f)
+_CLC_OVERLOAD _CLC_DEF float4 fract(float4 x, private float4 * ptr) BODY(float4, 0x1.fffffep-1f)
+
+_CLC_OVERLOAD _CLC_DEF float8 fract(float8 x, global float8 * ptr) BODY(float8, 0x1.fffffep-1f)
+_CLC_OVERLOAD _CLC_DEF float8 fract(float8 x, local float8 * ptr) BODY(float8, 0x1.fffffep-1f)
+_CLC_OVERLOAD _CLC_DEF float8 fract(float8 x, private float8 * ptr) BODY(float8, 0x1.fffffep-1f)
+
+_CLC_OVERLOAD _CLC_DEF float16 fract(float16 x, global float16 * ptr) BODY(float16, 0x1.fffffep-1f)
+_CLC_OVERLOAD _CLC_DEF float16 fract(float16 x, local float16 * ptr) BODY(float16, 0x1.fffffep-1f)
+_CLC_OVERLOAD _CLC_DEF float16 fract(float16 x, private float16 * ptr) BODY(float16, 0x1.fffffep-1f)
+
+_CLC_OVERLOAD _CLC_DEF double fract(double x, global double * ptr) SCALAR(double, 0x1.fffffffffffffp-1) /* double overloads, one per pointer address space; 0x1.fffffffffffffp-1 (1 - 2^-53) is the largest double below 1.0 */
+_CLC_OVERLOAD _CLC_DEF double fract(double x, local double * ptr) SCALAR(double, 0x1.fffffffffffffp-1)
+_CLC_OVERLOAD _CLC_DEF double fract(double x, private double * ptr) SCALAR(double, 0x1.fffffffffffffp-1)
+
+_CLC_OVERLOAD _CLC_DEF double2 fract(double2 x, global double2 * ptr) BODY(double2, 0x1.fffffffffffffp-1) /* vector widths use BODY instead of SCALAR */
+_CLC_OVERLOAD _CLC_DEF double2 fract(double2 x, local double2 * ptr) BODY(double2, 0x1.fffffffffffffp-1)
+_CLC_OVERLOAD _CLC_DEF double2 fract(double2 x, private double2 * ptr) BODY(double2, 0x1.fffffffffffffp-1)
+
+_CLC_OVERLOAD _CLC_DEF double3 fract(double3 x, global double3 * ptr) BODY(double3, 0x1.fffffffffffffp-1)
+_CLC_OVERLOAD _CLC_DEF double3 fract(double3 x, local double3 * ptr) BODY(double3, 0x1.fffffffffffffp-1)
+_CLC_OVERLOAD _CLC_DEF double3 fract(double3 x, private double3 * ptr) BODY(double3, 0x1.fffffffffffffp-1)
+
+_CLC_OVERLOAD _CLC_DEF double4 fract(double4 x, global double4 * ptr) BODY(double4, 0x1.fffffffffffffp-1)
+_CLC_OVERLOAD _CLC_DEF double4 fract(double4 x, local double4 * ptr) BODY(double4, 0x1.fffffffffffffp-1)
+_CLC_OVERLOAD _CLC_DEF double4 fract(double4 x, private double4 * ptr) BODY(double4, 0x1.fffffffffffffp-1)
+
+_CLC_OVERLOAD _CLC_DEF double8 fract(double8 x, global double8 * ptr) BODY(double8, 0x1.fffffffffffffp-1)
+_CLC_OVERLOAD _CLC_DEF double8 fract(double8 x, local double8 * ptr) BODY(double8, 0x1.fffffffffffffp-1)
+_CLC_OVERLOAD _CLC_DEF double8 fract(double8 x, private double8 * ptr) BODY(double8, 0x1.fffffffffffffp-1)
+
+_CLC_OVERLOAD _CLC_DEF double16 fract(double16 x, global double16 * ptr) BODY(double16, 0x1.fffffffffffffp-1)
+_CLC_OVERLOAD _CLC_DEF double16 fract(double16 x, local double16 * ptr) BODY(double16, 0x1.fffffffffffffp-1)
+_CLC_OVERLOAD _CLC_DEF double16 fract(double16 x, private double16 * ptr) BODY(double16, 0x1.fffffffffffffp-1)
+
diff --git a/src/builtins/frexp.cl b/src/builtins/frexp.cl
new file mode 100644
index 0000000..e02cf90
--- /dev/null
+++ b/src/builtins/frexp.cl
@@ -0,0 +1,76 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+_CLC_OVERLOAD _CLC_DEF float frexp(float x, global int * ptr) SCALAR_BODY(float, __builtin_frexpf, int) /* frexp: one overload per address space of ptr (global/local/private) */
+_CLC_OVERLOAD _CLC_DEF float frexp(float x, local int * ptr) SCALAR_BODY(float, __builtin_frexpf, int)
+_CLC_OVERLOAD _CLC_DEF float frexp(float x, private int * ptr) SCALAR_BODY(float, __builtin_frexpf, int)
+/* Vector widths 2..16: VECTOR_BODY is handed the scalar builtin; presumably it applies it per element (see the macro definition). */
+_CLC_OVERLOAD _CLC_DEF float2 frexp(float2 x, global int2 * ptr) VECTOR_BODY(float, 2, __builtin_frexpf, int)
+_CLC_OVERLOAD _CLC_DEF float2 frexp(float2 x, local int2 * ptr) VECTOR_BODY(float, 2, __builtin_frexpf, int)
+_CLC_OVERLOAD _CLC_DEF float2 frexp(float2 x, private int2 * ptr) VECTOR_BODY(float, 2, __builtin_frexpf, int)
+
+_CLC_OVERLOAD _CLC_DEF float3 frexp(float3 x, global int3 * ptr) VECTOR_BODY(float, 3, __builtin_frexpf, int)
+_CLC_OVERLOAD _CLC_DEF float3 frexp(float3 x, local int3 * ptr) VECTOR_BODY(float, 3, __builtin_frexpf, int)
+_CLC_OVERLOAD _CLC_DEF float3 frexp(float3 x, private int3 * ptr) VECTOR_BODY(float, 3, __builtin_frexpf, int)
+
+_CLC_OVERLOAD _CLC_DEF float4 frexp(float4 x, global int4 * ptr) VECTOR_BODY(float, 4, __builtin_frexpf, int)
+_CLC_OVERLOAD _CLC_DEF float4 frexp(float4 x, local int4 * ptr) VECTOR_BODY(float, 4, __builtin_frexpf, int)
+_CLC_OVERLOAD _CLC_DEF float4 frexp(float4 x, private int4 * ptr) VECTOR_BODY(float, 4, __builtin_frexpf, int)
+
+_CLC_OVERLOAD _CLC_DEF float8 frexp(float8 x, global int8 * ptr) VECTOR_BODY(float, 8, __builtin_frexpf, int)
+_CLC_OVERLOAD _CLC_DEF float8 frexp(float8 x, local int8 * ptr) VECTOR_BODY(float, 8, __builtin_frexpf, int)
+_CLC_OVERLOAD _CLC_DEF float8 frexp(float8 x, private int8 * ptr) VECTOR_BODY(float, 8, __builtin_frexpf, int)
+
+_CLC_OVERLOAD _CLC_DEF float16 frexp(float16 x, global int16 * ptr) VECTOR_BODY(float, 16, __builtin_frexpf, int)
+_CLC_OVERLOAD _CLC_DEF float16 frexp(float16 x, local int16 * ptr) VECTOR_BODY(float, 16, __builtin_frexpf, int)
+_CLC_OVERLOAD _CLC_DEF float16 frexp(float16 x, private int16 * ptr) VECTOR_BODY(float, 16, __builtin_frexpf, int)
+/* Double-precision overloads: same layout, with __builtin_frexp in place of __builtin_frexpf. */
+_CLC_OVERLOAD _CLC_DEF double frexp(double x, global int * ptr) SCALAR_BODY(double, __builtin_frexp, int)
+_CLC_OVERLOAD _CLC_DEF double frexp(double x, local int * ptr) SCALAR_BODY(double, __builtin_frexp, int)
+_CLC_OVERLOAD _CLC_DEF double frexp(double x, private int * ptr) SCALAR_BODY(double, __builtin_frexp, int)
+
+_CLC_OVERLOAD _CLC_DEF double2 frexp(double2 x, global int2 * ptr) VECTOR_BODY(double, 2, __builtin_frexp, int)
+_CLC_OVERLOAD _CLC_DEF double2 frexp(double2 x, local int2 * ptr) VECTOR_BODY(double, 2, __builtin_frexp, int)
+_CLC_OVERLOAD _CLC_DEF double2 frexp(double2 x, private int2 * ptr) VECTOR_BODY(double, 2, __builtin_frexp, int)
+
+_CLC_OVERLOAD _CLC_DEF double3 frexp(double3 x, global int3 * ptr) VECTOR_BODY(double, 3, __builtin_frexp, int)
+_CLC_OVERLOAD _CLC_DEF double3 frexp(double3 x, local int3 * ptr) VECTOR_BODY(double, 3, __builtin_frexp, int)
+_CLC_OVERLOAD _CLC_DEF double3 frexp(double3 x, private int3 * ptr) VECTOR_BODY(double, 3, __builtin_frexp, int)
+
+_CLC_OVERLOAD _CLC_DEF double4 frexp(double4 x, global int4 * ptr) VECTOR_BODY(double, 4, __builtin_frexp, int)
+_CLC_OVERLOAD _CLC_DEF double4 frexp(double4 x, local int4 * ptr) VECTOR_BODY(double, 4, __builtin_frexp, int)
+_CLC_OVERLOAD _CLC_DEF double4 frexp(double4 x, private int4 * ptr) VECTOR_BODY(double, 4, __builtin_frexp, int)
+
+_CLC_OVERLOAD _CLC_DEF double8 frexp(double8 x, global int8 * ptr) VECTOR_BODY(double, 8, __builtin_frexp, int)
+_CLC_OVERLOAD _CLC_DEF double8 frexp(double8 x, local int8 * ptr) VECTOR_BODY(double, 8, __builtin_frexp, int)
+_CLC_OVERLOAD _CLC_DEF double8 frexp(double8 x, private int8 * ptr) VECTOR_BODY(double, 8, __builtin_frexp, int)
+
+_CLC_OVERLOAD _CLC_DEF double16 frexp(double16 x, global int16 * ptr) VECTOR_BODY(double, 16, __builtin_frexp, int)
+_CLC_OVERLOAD _CLC_DEF double16 frexp(double16 x, local int16 * ptr) VECTOR_BODY(double, 16, __builtin_frexp, int)
+_CLC_OVERLOAD _CLC_DEF double16 frexp(double16 x, private int16 * ptr) VECTOR_BODY(double, 16, __builtin_frexp, int)
diff --git a/src/builtins/hadd.cl b/src/builtins/hadd.cl
new file mode 100644
index 0000000..c96324f
--- /dev/null
+++ b/src/builtins/hadd.cl
@@ -0,0 +1,44 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Peter Collingbourne <peter@pcc.me.uk>
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/* EXPAND_SIZES(type): hadd/rhadd for vector widths 2..16 of one element type; presumably invoked per integer type by _EXPAND_INTEGER_TYPES(). */
+#define EXPAND_SIZES(type)             \
+    IMPLEMENTATION(_VEC_TYPE(type,2))  \
+    IMPLEMENTATION(_VEC_TYPE(type,3))  \
+    IMPLEMENTATION(_VEC_TYPE(type,4))  \
+    IMPLEMENTATION(_VEC_TYPE(type,8))  \
+    IMPLEMENTATION(_VEC_TYPE(type,16))
+/* Halve the sum without forming it, using x+y == 2*(x&y) + (x^y) == 2*(x|y) - (x^y): hadd rounds down, rhadd rounds up. */
+#define IMPLEMENTATION(gentype)                                  \
+    _CLC_OVERLOAD _CLC_DEF gentype hadd(gentype x, gentype y)    \
+    { return (x & y) + ((x ^ y) >> (gentype)1); }                \
+    _CLC_OVERLOAD _CLC_DEF gentype rhadd(gentype x, gentype y)   \
+    { return (x | y) - ((x ^ y) >> (gentype)1); }
+_EXPAND_INTEGER_TYPES()
diff --git a/src/builtins/length.cl b/src/builtins/length.cl
new file mode 100644
index 0000000..2cfefa1
--- /dev/null
+++ b/src/builtins/length.cl
@@ -0,0 +1,109 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/* length(float2): Euclidean norm. The smaller component is divided by the
+ * larger one so that the squared ratio r*r is at most 1 and cannot overflow. */
+_CLC_OVERLOAD _CLC_DEF float length(float2 p)
+{
+    float r;
+    p = fabs(p);
+    /* An infinite component makes the length +inf; without this early exit
+     * (inf, inf) would form the ratio inf/inf and return NaN. */
+    if (isinf(p.x)) return p.x;
+    if (isinf(p.y)) return p.y;
+    if (p.x > p.y) { r = p.y/p.x; return p.x * sqrt(1+r*r); }
+    if (p.y != 0)  { r = p.x/p.y; return p.y * sqrt(1+r*r); }
+    /* Here p.y == 0 and p.x is either 0 or NaN (every comparison above was
+     * false), so returning p.x yields 0 for the zero vector and keeps NaN. */
+    return p.x;
+}
+
+/* length(double2): Euclidean norm. The smaller component is divided by the
+ * larger one so that the squared ratio r*r is at most 1 and cannot overflow. */
+_CLC_OVERLOAD _CLC_DEF double length(double2 p)
+{
+    double r;
+    p = fabs(p);
+    /* An infinite component makes the length +inf; without this early exit
+     * (inf, inf) would form the ratio inf/inf and return NaN. */
+    if (isinf(p.x)) return p.x;
+    if (isinf(p.y)) return p.y;
+    if (p.x > p.y) { r = p.y/p.x; return p.x * sqrt(1+r*r); }
+    if (p.y != 0)  { r = p.x/p.y; return p.y * sqrt(1+r*r); }
+    /* Here p.y == 0 and p.x is either 0 or NaN (every comparison above was
+     * false), so returning p.x yields 0 for the zero vector and keeps NaN. */
+    return p.x;
+}
+
+/* length(float3): Euclidean norm; rescales by the largest component whenever squaring could overflow or underflow. */
+_CLC_OVERLOAD _CLC_DEF float length(float3 p)
+{
+    p = fabs(p);
+    float max_term = max(p.x, max(p.y, p.z));
+    if (max_term == 0 || isinf(max_term)) return max_term;          /* rescaling would form 0/0 or inf/inf */
+    if (max_term > 0x1p-50f && max_term < 1) return fast_length(p); /* squares neither overflow nor underflow here */
+    return max_term * fast_length(p / max_term);                    /* largest component becomes 1 before squaring */
+}
+
+/* length(double3): Euclidean norm; rescales by the largest component whenever squaring could overflow or underflow. */
+_CLC_OVERLOAD _CLC_DEF double length(double3 p)
+{
+    p = fabs(p);
+    double max_term = max(p.x, max(p.y, p.z));
+    if (max_term == 0 || isinf(max_term)) return max_term;          /* rescaling would form 0/0 or inf/inf */
+    if (max_term > 0x1p-500 && max_term < 1) return fast_length(p); /* squares neither overflow nor underflow here */
+    return max_term * fast_length(p / max_term);                    /* largest component becomes 1 before squaring */
+}
+
+/* length(float4): Euclidean norm; rescales by the largest component whenever squaring could overflow or underflow. */
+_CLC_OVERLOAD _CLC_DEF float length(float4 p)
+{
+    p = fabs(p);
+    float max_term = max(max(p.x, p.y), max(p.z, p.w));
+    if (max_term == 0 || isinf(max_term)) return max_term;          /* rescaling would form 0/0 or inf/inf */
+    if (max_term > 0x1p-50f && max_term < 1) return fast_length(p); /* squares neither overflow nor underflow here */
+    return max_term * fast_length(p / max_term);                    /* largest component becomes 1 before squaring */
+}
+
+/* length(double4): Euclidean norm; rescales by the largest component whenever squaring could overflow or underflow. */
+_CLC_OVERLOAD _CLC_DEF double length(double4 p)
+{
+    p = fabs(p);
+    double max_term = max(max(p.x, p.y), max(p.z, p.w));
+    if (max_term == 0 || isinf(max_term)) return max_term;          /* rescaling would form 0/0 or inf/inf */
+    if (max_term > 0x1p-500 && max_term < 1) return fast_length(p); /* squares neither overflow nor underflow here */
+    return max_term * fast_length(p / max_term);                    /* largest component becomes 1 before squaring */
+}
+
+/* fast_length: plain sqrt(dot(p,p)) with no rescaling, for 2-, 3- and 4-component float and double vectors. */
+#define FAST_LENGTH(stype, vtype) \
+    _CLC_OVERLOAD _CLC_DEF stype fast_length(vtype p) { return sqrt(dot(p,p)); }
+FAST_LENGTH(float, float2)   FAST_LENGTH(float, float3)   FAST_LENGTH(float, float4)
+FAST_LENGTH(double, double2) FAST_LENGTH(double, double3) FAST_LENGTH(double, double4)
+#undef FAST_LENGTH
diff --git a/src/builtins/lgamma_r.cl b/src/builtins/lgamma_r.cl
new file mode 100644
index 0000000..aa3d487
--- /dev/null
+++ b/src/builtins/lgamma_r.cl
@@ -0,0 +1,80 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+_CLC_PROTECTED float lgammaf_r(float x, int * ptr); /* scalar float implementation; only declared here, defined outside this file */
+_CLC_PROTECTED double builtin_lgamma_r(double x, int * ptr); /* scalar double implementation; only declared here, defined outside this file */
+/* lgamma_r overloads: one per address space of ptr and per vector width, each built on the two scalar routines declared above. */
+_CLC_OVERLOAD _CLC_DEF float lgamma_r(float x, global int * ptr) SCALAR_BODY(float, lgammaf_r, int)
+_CLC_OVERLOAD _CLC_DEF float lgamma_r(float x, local int * ptr) SCALAR_BODY(float, lgammaf_r, int)
+_CLC_OVERLOAD _CLC_DEF float lgamma_r(float x, private int * ptr) SCALAR_BODY(float, lgammaf_r, int)
+
+_CLC_OVERLOAD _CLC_DEF float2 lgamma_r(float2 x, global int2 * ptr) VECTOR_BODY(float, 2, lgammaf_r, int)
+_CLC_OVERLOAD _CLC_DEF float2 lgamma_r(float2 x, local int2 * ptr) VECTOR_BODY(float, 2, lgammaf_r, int)
+_CLC_OVERLOAD _CLC_DEF float2 lgamma_r(float2 x, private int2 * ptr) VECTOR_BODY(float, 2, lgammaf_r, int)
+
+_CLC_OVERLOAD _CLC_DEF float3 lgamma_r(float3 x, global int3 * ptr) VECTOR_BODY(float, 3, lgammaf_r, int)
+_CLC_OVERLOAD _CLC_DEF float3 lgamma_r(float3 x, local int3 * ptr) VECTOR_BODY(float, 3, lgammaf_r, int)
+_CLC_OVERLOAD _CLC_DEF float3 lgamma_r(float3 x, private int3 * ptr) VECTOR_BODY(float, 3, lgammaf_r, int)
+
+_CLC_OVERLOAD _CLC_DEF float4 lgamma_r(float4 x, global int4 * ptr) VECTOR_BODY(float, 4, lgammaf_r, int)
+_CLC_OVERLOAD _CLC_DEF float4 lgamma_r(float4 x, local int4 * ptr) VECTOR_BODY(float, 4, lgammaf_r, int)
+_CLC_OVERLOAD _CLC_DEF float4 lgamma_r(float4 x, private int4 * ptr) VECTOR_BODY(float, 4, lgammaf_r, int)
+
+_CLC_OVERLOAD _CLC_DEF float8 lgamma_r(float8 x, global int8 * ptr) VECTOR_BODY(float, 8, lgammaf_r, int)
+_CLC_OVERLOAD _CLC_DEF float8 lgamma_r(float8 x, local int8 * ptr) VECTOR_BODY(float, 8, lgammaf_r, int)
+_CLC_OVERLOAD _CLC_DEF float8 lgamma_r(float8 x, private int8 * ptr) VECTOR_BODY(float, 8, lgammaf_r, int)
+
+_CLC_OVERLOAD _CLC_DEF float16 lgamma_r(float16 x, global int16 * ptr) VECTOR_BODY(float, 16, lgammaf_r, int)
+_CLC_OVERLOAD _CLC_DEF float16 lgamma_r(float16 x, local int16 * ptr) VECTOR_BODY(float, 16, lgammaf_r, int)
+_CLC_OVERLOAD _CLC_DEF float16 lgamma_r(float16 x, private int16 * ptr) VECTOR_BODY(float, 16, lgammaf_r, int)
+/* Double-precision overloads: same layout, routed through builtin_lgamma_r. */
+_CLC_OVERLOAD _CLC_DEF double lgamma_r(double x, global int * ptr) SCALAR_BODY(double, builtin_lgamma_r, int)
+_CLC_OVERLOAD _CLC_DEF double lgamma_r(double x, local int * ptr) SCALAR_BODY(double, builtin_lgamma_r, int)
+_CLC_OVERLOAD _CLC_DEF double lgamma_r(double x, private int * ptr) SCALAR_BODY(double, builtin_lgamma_r, int)
+
+_CLC_OVERLOAD _CLC_DEF double2 lgamma_r(double2 x, global int2 * ptr) VECTOR_BODY(double, 2, builtin_lgamma_r, int)
+_CLC_OVERLOAD _CLC_DEF double2 lgamma_r(double2 x, local int2 * ptr) VECTOR_BODY(double, 2, builtin_lgamma_r, int)
+_CLC_OVERLOAD _CLC_DEF double2 lgamma_r(double2 x, private int2 * ptr) VECTOR_BODY(double, 2, builtin_lgamma_r, int)
+
+_CLC_OVERLOAD _CLC_DEF double3 lgamma_r(double3 x, global int3 * ptr) VECTOR_BODY(double, 3, builtin_lgamma_r, int)
+_CLC_OVERLOAD _CLC_DEF double3 lgamma_r(double3 x, local int3 * ptr) VECTOR_BODY(double, 3, builtin_lgamma_r, int)
+_CLC_OVERLOAD _CLC_DEF double3 lgamma_r(double3 x, private int3 * ptr) VECTOR_BODY(double, 3, builtin_lgamma_r, int)
+
+_CLC_OVERLOAD _CLC_DEF double4 lgamma_r(double4 x, global int4 * ptr) VECTOR_BODY(double, 4, builtin_lgamma_r, int)
+_CLC_OVERLOAD _CLC_DEF double4 lgamma_r(double4 x, local int4 * ptr) VECTOR_BODY(double, 4, builtin_lgamma_r, int)
+_CLC_OVERLOAD _CLC_DEF double4 lgamma_r(double4 x, private int4 * ptr) VECTOR_BODY(double, 4, builtin_lgamma_r, int)
+
+_CLC_OVERLOAD _CLC_DEF double8 lgamma_r(double8 x, global int8 * ptr) VECTOR_BODY(double, 8, builtin_lgamma_r, int)
+_CLC_OVERLOAD _CLC_DEF double8 lgamma_r(double8 x, local int8 * ptr) VECTOR_BODY(double, 8, builtin_lgamma_r, int)
+_CLC_OVERLOAD _CLC_DEF double8 lgamma_r(double8 x, private int8 * ptr) VECTOR_BODY(double, 8, builtin_lgamma_r, int)
+
+_CLC_OVERLOAD _CLC_DEF double16 lgamma_r(double16 x, global int16 * ptr) VECTOR_BODY(double, 16, builtin_lgamma_r, int)
+_CLC_OVERLOAD _CLC_DEF double16 lgamma_r(double16 x, local int16 * ptr) VECTOR_BODY(double, 16, builtin_lgamma_r, int)
+_CLC_OVERLOAD _CLC_DEF double16 lgamma_r(double16 x, private int16 * ptr) VECTOR_BODY(double, 16, builtin_lgamma_r, int)
+
diff --git a/src/builtins/mad_sat.cl b/src/builtins/mad_sat.cl
new file mode 100644
index 0000000..ac79a86
--- /dev/null
+++ b/src/builtins/mad_sat.cl
@@ -0,0 +1,37 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+TERNARY_VEC_DEF(char, char, mad_sat, mad_sat) /* mad_sat for each integer element type; presumably TERNARY_VEC_DEF builds the vector overloads from scalar mad_sat (confirm in cpu.h) */
+TERNARY_VEC_DEF(uchar, uchar, mad_sat, mad_sat)
+TERNARY_VEC_DEF(short, short, mad_sat, mad_sat)
+TERNARY_VEC_DEF(ushort, ushort,mad_sat, mad_sat)
+TERNARY_VEC_DEF(int, int, mad_sat, mad_sat)
+TERNARY_VEC_DEF(uint, uint, mad_sat, mad_sat)
+TERNARY_VEC_DEF(long, long, mad_sat, mad_sat)
+TERNARY_VEC_DEF(ulong, ulong, mad_sat, mad_sat)
diff --git a/src/builtins/math.cl b/src/builtins/math.cl
new file mode 100644
index 0000000..02db08b
--- /dev/null
+++ b/src/builtins/math.cl
@@ -0,0 +1,151 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/* UNARY(fn): define scalar helpers fn##f (float) and fn##d (double) over __builtin_##fn, then hand them to UNARY_VEC_DEF. */
+#define UNARY(fn) \
+    _CLC_PROTECTED _CLC_INLINE float  fn##f(float x)  { return (float)__builtin_##fn(x); } \
+    _CLC_PROTECTED _CLC_INLINE double fn##d(double x) { return __builtin_##fn(x); } \
+    UNARY_VEC_DEF(float,  float,  fn, fn##f) \
+    UNARY_VEC_DEF(double, double, fn, fn##d)
+/* UNARY_ALT(utype, fn): scalar helpers fn##f / fn##d return utype straight from __builtin_##fn; overloads use utype as well. */
+#define UNARY_ALT(utype, fn) \
+    _CLC_PROTECTED _CLC_INLINE utype fn##f(float x)  { return __builtin_##fn(x); } \
+    _CLC_PROTECTED _CLC_INLINE utype fn##d(double x) { return __builtin_##fn(x); } \
+    UNARY_VEC_DEF(float,  utype, fn, fn##f) \
+    UNARY_VEC_DEF(double, utype, fn, fn##d)
+/* UNARY_NO_BUILTIN(fn): no scalar helpers are generated; UNARY_VEC_DEF is given fn itself as the scalar name. */
+#define UNARY_NO_BUILTIN(fn) \
+    UNARY_VEC_DEF(float,  float,  fn, fn) \
+    UNARY_VEC_DEF(double, double, fn, fn)
+/* BINARY(fn): scalar helpers fn##f / fn##d over __builtin_##fn. NOTE(review): BINARY_VEC_DEF receives plain fn, not the helpers (UNARY passes fn##f) - confirm intended. */
+#define BINARY(fn) \
+    _CLC_PROTECTED _CLC_INLINE float  fn##f(float x, float y)   { return (float)__builtin_##fn(x,y); } \
+    _CLC_PROTECTED _CLC_INLINE double fn##d(double x, double y) { return __builtin_##fn(x,y); } \
+    BINARY_VEC_DEF(float,  float,  fn, fn) \
+    BINARY_VEC_DEF(double, double, fn, fn)
+/* BINARY_NO_BUILTIN(fn): no scalar helpers are generated; BINARY_VEC_DEF is given fn itself as the scalar name. */
+#define BINARY_NO_BUILTIN(fn) \
+    BINARY_VEC_DEF(float,  float,  fn, fn) \
+    BINARY_VEC_DEF(double, double, fn, fn)
+/* TERNARY(fn): scalar helpers fn##f / fn##d over __builtin_##fn. NOTE(review): TERNARY_VEC_DEF receives plain fn, not the helpers (UNARY passes fn##f) - confirm intended. */
+#define TERNARY(fn) \
+    _CLC_PROTECTED _CLC_INLINE float  fn##f(float x, float y, float z)    { return (float)__builtin_##fn(x,y,z); } \
+    _CLC_PROTECTED _CLC_INLINE double fn##d(double x, double y, double z) { return __builtin_##fn(x,y,z); } \
+    TERNARY_VEC_DEF(float,  float,  fn, fn) \
+    TERNARY_VEC_DEF(double, double, fn, fn)
+/* TERNARY_NO_BUILTIN(fn): no scalar helpers are generated; TERNARY_VEC_DEF is given fn itself as the scalar name. */
+#define TERNARY_NO_BUILTIN(fn) \
+    TERNARY_VEC_DEF(float,  float,  fn, fn) \
+    TERNARY_VEC_DEF(double, double, fn, fn)
+/*-------------------------------------------------------------------------
+* Instantiations of the math builtins (definitions generated by the macros above)
+*------------------------------------------------------------------------*/
+UNARY(acos)
+UNARY(acosh)
+UNARY_NO_BUILTIN(acospi)
+UNARY(asin)
+UNARY(asinh)
+UNARY_NO_BUILTIN(asinpi)
+UNARY(atan)
+BINARY_NO_BUILTIN(atan2pi)
+UNARY(atanh)
+UNARY_NO_BUILTIN(atanpi)
+BINARY(atan2)
+UNARY(cbrt)
+UNARY(ceil)
+UNARY(cos)
+BINARY(copysign)
+UNARY(cosh)
+UNARY_NO_BUILTIN(cospi)
+UNARY(erf)
+UNARY(erfc)
+UNARY(exp)
+UNARY(exp2)
+UNARY_NO_BUILTIN(exp10)
+UNARY(expm1)
+UNARY(fabs)
+BINARY(fdim)
+UNARY(floor)
+TERNARY(fma)
+BINARY(fmax)
+BINARY(fmin)
+BINARY(fmod)
+BINARY(hypot)
+/* ilogb yields int rather than the argument type, hence UNARY_ALT with utype = int. */
+UNARY_ALT(int, ilogb)
+/* ldexp: scalar names are ldexpf (float) and ldexp (double); presumably the int macro argument is the type of the second operand. */
+BINARY_VEC_DEF_ALT(float, float, int, ldexp, ldexpf)
+BINARY_VEC_DEF_ALT(double, double, int, ldexp, ldexp)
+
+UNARY(lgamma)
+UNARY(log)
+UNARY(log2)
+UNARY(log10)
+UNARY(log1p)
+UNARY(logb)
+TERNARY_NO_BUILTIN(mad)
+BINARY_NO_BUILTIN(maxmag)
+BINARY_NO_BUILTIN(minmag)
+/* nan: instantiated with an unsigned integer first type (uint/ulong) and a floating second type (float/double). */
+UNARY_VEC_DEF(uint, float, nan, nan)
+UNARY_VEC_DEF(ulong, double, nan, nan)
+
+BINARY(nextafter)
+BINARY(pow)
+/* pown: NOTE(review): float uses powf (generated by BINARY(pow)) but double names builtin_pow, not powd - confirm builtin_pow is provided elsewhere. */
+BINARY_VEC_DEF_ALT(float, float, int, pown, powf)
+BINARY_VEC_DEF_ALT(double, double, int, pown, builtin_pow)
+
+BINARY_NO_BUILTIN(powr)
+BINARY(remainder)
+UNARY(rint)
+/* rootn relies on builtin_rootnf / builtin_rootn, neither of which is declared in this file. */
+BINARY_VEC_DEF_ALT(float, float, int, rootn, builtin_rootnf)
+BINARY_VEC_DEF_ALT(double, double, int, rootn, builtin_rootn)
+
+UNARY(round)
+UNARY_NO_BUILTIN(rsqrt)
+UNARY(sin)
+UNARY(sinh)
+UNARY_NO_BUILTIN(sinpi)
+UNARY(sqrt)
+UNARY(tan)
+UNARY(tanh)
+UNARY_NO_BUILTIN(tanpi)
+UNARY(tgamma)
+UNARY(trunc)
+
+/*-------------------------------------------------------------------------
+* Half functions:
+*------------------------------------------------------------------------*/
+
+BINARY_NO_BUILTIN(half_divide)
+UNARY_NO_BUILTIN(half_recip)
+
+
diff --git a/src/builtins/max.cl b/src/builtins/max.cl
new file mode 100644
index 0000000..9605490
--- /dev/null
+++ b/src/builtins/max.cl
@@ -0,0 +1,46 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/* EXPAND_SIZES(type): min/max overloads for the 3-, 4-, 8- and 16-wide vectors of one element type. */
+#define EXPAND_SIZES(type) \
+    IMPLEMENTATION(_VEC_TYPE(type,3),  type) \
+    IMPLEMENTATION(_VEC_TYPE(type,4),  type) \
+    IMPLEMENTATION(_VEC_TYPE(type,8),  type) \
+    IMPLEMENTATION(_VEC_TYPE(type,16), type)
+/* Vector/vector forms pick y where it compares below (min) or above (max) x; vector/scalar forms widen y to vtype and reuse them. */
+#define IMPLEMENTATION(vtype, stype) \
+    _CLC_OVERLOAD _CLC_DEF vtype min(vtype x, vtype y) \
+    { return y < x ? y : x; } \
+    _CLC_OVERLOAD _CLC_DEF vtype max(vtype x, vtype y) \
+    { return y > x ? y : x; } \
+    _CLC_OVERLOAD _CLC_DEF vtype min(vtype x, stype y) \
+    { return min(x, (vtype)y); } \
+    _CLC_OVERLOAD _CLC_DEF vtype max(vtype x, stype y) \
+    { return max(x, (vtype)y); }
+_EXPAND_TYPES()
diff --git a/src/builtins/misc.cl b/src/builtins/misc.cl
new file mode 100644
index 0000000..aba5efa
--- /dev/null
+++ b/src/builtins/misc.cl
@@ -0,0 +1,36 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Peter Collingbourne <peter@pcc.me.uk>
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+/* Fence entry points: every variant ignores its flags argument and forwards to __mfence(). */
+_CLC_PROTECTED void __mfence(void);
+//_CLC_PROTECTED void barrier(cl_mem_fence_flags flags) { }
+_CLC_PROTECTED void mem_fence(cl_mem_fence_flags flags)       { (void)flags; __mfence(); }
+_CLC_PROTECTED void read_mem_fence(cl_mem_fence_flags flags)  { (void)flags; __mfence(); }
+_CLC_PROTECTED void write_mem_fence(cl_mem_fence_flags flags) { (void)flags; __mfence(); }
diff --git a/src/builtins/mix.cl b/src/builtins/mix.cl
new file mode 100644
index 0000000..9f339aa
--- /dev/null
+++ b/src/builtins/mix.cl
@@ -0,0 +1,42 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+#define EXPAND_SIZES(type) /* widths 3, 4, 8, 16 only. NOTE(review): width 2 and scalar mix are not generated here - confirm they are provided elsewhere */ \
+ IMPLEMENTATION (_VEC_TYPE(type,3), type) \
+ IMPLEMENTATION (_VEC_TYPE(type,4), type) \
+ IMPLEMENTATION (_VEC_TYPE(type,8), type) \
+ IMPLEMENTATION (_VEC_TYPE(type,16), type) \
+
+#define IMPLEMENTATION(gentype, sgentype) /* mix(x, y, a) = x + (y-x)*a, with a vector weight or a scalar weight */ \
+_CLC_OVERLOAD _CLC_DEF gentype mix(gentype x, gentype y, gentype a) \
+ { return x + (y-x) * a; } \
+_CLC_OVERLOAD _CLC_DEF gentype mix(gentype x, gentype y, sgentype a) \
+ { return x + (y-x) * (gentype)a; } /* the scalar weight is converted to the vector type first */ \
+
+_EXPAND_TYPES() /* defined outside this file; presumably invokes EXPAND_SIZES once per element type - confirm in clc.h */
diff --git a/src/builtins/modf.cl b/src/builtins/modf.cl
new file mode 100644
index 0000000..cf0aae7
--- /dev/null
+++ b/src/builtins/modf.cl
@@ -0,0 +1,81 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+_CLC_PROTECTED float modff(float x, float * iptr); // scalar helpers: declared here, defined elsewhere
+_CLC_PROTECTED double builtin_modf(double x, double * iptr);
+/* modf(x, ptr) for float and double, scalar and widths 2-16, with one overload per address space of ptr. */
+/* SCALAR_BODY / VECTOR_BODY are defined outside this file; presumably they call the helper per element and store through ptr (confirm in clc.h). */
+_CLC_OVERLOAD _CLC_DEF float modf(float x, global float * ptr) SCALAR_BODY(float, modff, float)
+_CLC_OVERLOAD _CLC_DEF float modf(float x, local float * ptr) SCALAR_BODY(float, modff, float)
+_CLC_OVERLOAD _CLC_DEF float modf(float x, private float * ptr) SCALAR_BODY(float, modff, float)
+
+_CLC_OVERLOAD _CLC_DEF float2 modf(float2 x, global float2 * ptr) VECTOR_BODY(float, 2, modff, float)
+_CLC_OVERLOAD _CLC_DEF float2 modf(float2 x, local float2 * ptr) VECTOR_BODY(float, 2, modff, float)
+_CLC_OVERLOAD _CLC_DEF float2 modf(float2 x, private float2 * ptr) VECTOR_BODY(float, 2, modff, float)
+
+_CLC_OVERLOAD _CLC_DEF float3 modf(float3 x, global float3 * ptr) VECTOR_BODY(float, 3, modff, float)
+_CLC_OVERLOAD _CLC_DEF float3 modf(float3 x, local float3 * ptr) VECTOR_BODY(float, 3, modff, float)
+_CLC_OVERLOAD _CLC_DEF float3 modf(float3 x, private float3 * ptr) VECTOR_BODY(float, 3, modff, float)
+
+_CLC_OVERLOAD _CLC_DEF float4 modf(float4 x, global float4 * ptr) VECTOR_BODY(float, 4, modff, float)
+_CLC_OVERLOAD _CLC_DEF float4 modf(float4 x, local float4 * ptr) VECTOR_BODY(float, 4, modff, float)
+_CLC_OVERLOAD _CLC_DEF float4 modf(float4 x, private float4 * ptr) VECTOR_BODY(float, 4, modff, float)
+
+_CLC_OVERLOAD _CLC_DEF float8 modf(float8 x, global float8 * ptr) VECTOR_BODY(float, 8, modff, float)
+_CLC_OVERLOAD _CLC_DEF float8 modf(float8 x, local float8 * ptr) VECTOR_BODY(float, 8, modff, float)
+_CLC_OVERLOAD _CLC_DEF float8 modf(float8 x, private float8 * ptr) VECTOR_BODY(float, 8, modff, float)
+
+_CLC_OVERLOAD _CLC_DEF float16 modf(float16 x, global float16 * ptr) VECTOR_BODY(float, 16, modff, float)
+_CLC_OVERLOAD _CLC_DEF float16 modf(float16 x, local float16 * ptr) VECTOR_BODY(float, 16, modff, float)
+_CLC_OVERLOAD _CLC_DEF float16 modf(float16 x, private float16 * ptr) VECTOR_BODY(float, 16, modff, float)
+
+_CLC_OVERLOAD _CLC_DEF double modf(double x, global double * ptr) SCALAR_BODY(double, builtin_modf, double)
+_CLC_OVERLOAD _CLC_DEF double modf(double x, local double * ptr) SCALAR_BODY(double, builtin_modf, double)
+_CLC_OVERLOAD _CLC_DEF double modf(double x, private double * ptr) SCALAR_BODY(double, builtin_modf, double)
+
+_CLC_OVERLOAD _CLC_DEF double2 modf(double2 x, global double2 * ptr) VECTOR_BODY(double, 2, builtin_modf, double)
+_CLC_OVERLOAD _CLC_DEF double2 modf(double2 x, local double2 * ptr) VECTOR_BODY(double, 2, builtin_modf, double)
+_CLC_OVERLOAD _CLC_DEF double2 modf(double2 x, private double2 * ptr) VECTOR_BODY(double, 2, builtin_modf, double)
+
+_CLC_OVERLOAD _CLC_DEF double3 modf(double3 x, global double3 * ptr) VECTOR_BODY(double, 3, builtin_modf, double)
+_CLC_OVERLOAD _CLC_DEF double3 modf(double3 x, local double3 * ptr) VECTOR_BODY(double, 3, builtin_modf, double)
+_CLC_OVERLOAD _CLC_DEF double3 modf(double3 x, private double3 * ptr) VECTOR_BODY(double, 3, builtin_modf, double)
+
+_CLC_OVERLOAD _CLC_DEF double4 modf(double4 x, global double4 * ptr) VECTOR_BODY(double, 4, builtin_modf, double)
+_CLC_OVERLOAD _CLC_DEF double4 modf(double4 x, local double4 * ptr) VECTOR_BODY(double, 4, builtin_modf, double)
+_CLC_OVERLOAD _CLC_DEF double4 modf(double4 x, private double4 * ptr) VECTOR_BODY(double, 4, builtin_modf, double)
+
+_CLC_OVERLOAD _CLC_DEF double8 modf(double8 x, global double8 * ptr) VECTOR_BODY(double, 8, builtin_modf, double)
+_CLC_OVERLOAD _CLC_DEF double8 modf(double8 x, local double8 * ptr) VECTOR_BODY(double, 8, builtin_modf, double)
+_CLC_OVERLOAD _CLC_DEF double8 modf(double8 x, private double8 * ptr) VECTOR_BODY(double, 8, builtin_modf, double)
+
+_CLC_OVERLOAD _CLC_DEF double16 modf(double16 x, global double16 * ptr) VECTOR_BODY(double, 16, builtin_modf, double)
+_CLC_OVERLOAD _CLC_DEF double16 modf(double16 x, local double16 * ptr) VECTOR_BODY(double, 16, builtin_modf, double)
+_CLC_OVERLOAD _CLC_DEF double16 modf(double16 x, private double16 * ptr) VECTOR_BODY(double, 16, builtin_modf, double)
+
diff --git a/src/builtins/mul_hi.cl b/src/builtins/mul_hi.cl
new file mode 100644
index 0000000..5b3368e
--- /dev/null
+++ b/src/builtins/mul_hi.cl
@@ -0,0 +1,102 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Peter Collingbourne <peter@pcc.me.uk>
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+//FOIL-based 64-bit mul_hi
+//
+// Summary: mul_hi(long x, long y) returns the upper 64 bits of the 128-bit
+// product. Treat it as (a+b) * (c+d), where a and c are the high-order parts
+// of x and y (weighted by 2^32) and b and d are the low-order parts, and expand
+// with FOIL: x*y = F*2^64 + (O+I)*2^32 + L, with F=a*c, O=a*d, I=b*c, L=b*d.
+
+_CLC_OVERLOAD _CLC_DEF long mul_hi(long x, long y){
+ long f, o, i;
+ ulong l;
+
+ //Split x and y into 32-bit halves held in 64-bit variables. The high halves
+ //keep the sign of x/y; the low halves are masked, so they are non-negative.
+ long x_hi = x >> 32;
+ long x_lo = x & UINT_MAX;
+ long y_hi = y >> 32;
+ long y_lo = y & UINT_MAX;
+
+ //Multiply the components. F, O and I fit in a long; L needs unsigned math.
+ f = x_hi * y_hi;
+ o = x_hi * y_lo;
+ i = x_lo * y_hi;
+ l = (ulong)x_lo * (ulong)y_lo; //up to (2^32-1)^2, which overflows a long
+
+ //Now add the components back together in the following steps:
+ //F: doesn't need to be modified
+ //O/I: Need to be added together.
+ //L: Shift right by 32-bits, then add into the sum of O and I
+ //Once O/I/L are summed up, then shift the sum by 32-bits and add to F.
+ //
+ //hadd(a, b) yields (a + b) >> 1 without overflowing, which keeps the top bit
+ //of the O/I/L sum; having shifted by 1 already, we shift by 31, not 32.
+ return (long)(f + (hadd(o, (i + (long)((ulong)l>>32))) >> 31));
+}
+
+_CLC_OVERLOAD _CLC_DEF ulong mul_hi(ulong x, ulong y)
+{
+ //Split both operands into 32-bit halves. Each half lives in a 64-bit
+ //variable, so the product of any two halves fits in a ulong.
+ const ulong x_upper = x >> 32;
+ const ulong x_lower = x & UINT_MAX;
+ const ulong y_upper = y >> 32;
+ const ulong y_lower = y & UINT_MAX;
+
+ //Partial products of (x_upper + x_lower) * (y_upper + y_lower), named
+ //after the FOIL terms: First, Outer, Inner, Last.
+ const ulong first = x_upper * y_upper;
+ const ulong outer = x_upper * y_lower;
+ const ulong inner = x_lower * y_upper;
+ const ulong last = x_lower * y_lower;
+
+ //Recombine the terms:
+ // first: lands entirely in the upper 64 bits, used as is.
+ // outer/inner: weighted by 2^32, so only the part of their sum above
+ // bit 32 contributes.
+ // last: only its upper 32 bits matter, as a carry into outer + inner.
+ //
+ //hadd(a, b) is (a + b) >> 1 without losing the carry out of the addition,
+ //so one bit of the shift is already done: shift by 31 instead of 32.
+ const ulong cross = hadd(outer, inner + (last >> 32));
+
+ return first + (cross >> 31);
+}
+
+BINARY_VEC_DEF(char, char, mul_hi, mul_hi) /* macro defined outside this file; presumably builds the vector mul_hi overloads from the scalar one - confirm */
+BINARY_VEC_DEF(uchar, uchar, mul_hi, mul_hi) /* NOTE(review): only the long/ulong scalar mul_hi are defined in this file; the narrower scalars presumably come from elsewhere */
+BINARY_VEC_DEF(short, short, mul_hi, mul_hi)
+BINARY_VEC_DEF(ushort, ushort,mul_hi, mul_hi)
+BINARY_VEC_DEF(int, int, mul_hi, mul_hi)
+BINARY_VEC_DEF(uint, uint, mul_hi, mul_hi)
+BINARY_VEC_DEF(long, long, mul_hi, mul_hi)
+BINARY_VEC_DEF(ulong, ulong, mul_hi, mul_hi)
diff --git a/src/builtins/relationals.cl b/src/builtins/relationals.cl
new file mode 100644
index 0000000..a1d6830
--- /dev/null
+++ b/src/builtins/relationals.cl
@@ -0,0 +1,64 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+UNARY_VEC_DEF(float, int, isnan, -isnan) /* macro defined outside this file; the leading '-' presumably maps scalar true (1) to vector true (-1) - confirm */
+UNARY_VEC_DEF(double, long, isnan, -isnan) /* float inputs pair with int results, double inputs with long results */
+
+UNARY_VEC_DEF(float, int, isfinite, -isfinite)
+UNARY_VEC_DEF(double, long, isfinite, -isfinite)
+
+UNARY_VEC_DEF(float, int, isinf, -isinf)
+UNARY_VEC_DEF(double, long, isinf, -isinf)
+
+UNARY_VEC_DEF(float, int, isnormal, -isnormal)
+UNARY_VEC_DEF(double, long, isnormal, -isnormal)
+
+UNARY_VEC_DEF(float, int, signbit, -signbit)
+UNARY_VEC_DEF(double, long, signbit, -signbit)
+
+BINARY_VEC_DEF(float, int, isequal, -isequal) /* two-operand predicates, same negation pattern */
+BINARY_VEC_DEF(double, long, isequal, -isequal)
+
+BINARY_VEC_DEF(float, int, isnotequal, -isnotequal)
+BINARY_VEC_DEF(double, long, isnotequal, -isnotequal)
+
+BINARY_VEC_DEF(float, int, isless, -isless)
+BINARY_VEC_DEF(double, long, isless, -isless)
+
+BINARY_VEC_DEF(float, int, islessequal, -islessequal)
+BINARY_VEC_DEF(double, long, islessequal, -islessequal)
+
+BINARY_VEC_DEF(float, int, isgreater, -isgreater)
+BINARY_VEC_DEF(double, long, isgreater, -isgreater)
+
+BINARY_VEC_DEF(float, int, isgreaterequal, -isgreaterequal)
+BINARY_VEC_DEF(double, long, isgreaterequal, -isgreaterequal)
+
+BINARY_VEC_DEF(float, int, islessgreater, -islessgreater)
+BINARY_VEC_DEF(double, long, islessgreater, -islessgreater)
diff --git a/src/builtins/remquo.cl b/src/builtins/remquo.cl
new file mode 100644
index 0000000..1bc5094
--- /dev/null
+++ b/src/builtins/remquo.cl
@@ -0,0 +1,127 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+#define REMQUO_SCALAR_BODY(type, op, ptr_type) /* body of a scalar overload: op writes into a function-scope temporary, which is then stored through ptr */ \
+{ \
+ ptr_type temp; \
+ type result = op(x, y, &temp); \
+ *ptr = temp; \
+ return result; \
+} \
+
+#define REMQUO_VECTOR_BODY_2(op, ptr_type) /* one op call per element; itemp is addressed as an array of ptr_type */ \
+ temp.s0 = op(x.s0 ,y.s0, &(((ptr_type*)&itemp)[0])); \
+ temp.s1 = op(x.s1 ,y.s1, &(((ptr_type*)&itemp)[1])); \
+
+#define REMQUO_VECTOR_BODY_3(op, ptr_type) /* each wider helper reuses the next narrower one and adds the remaining elements */ \
+ REMQUO_VECTOR_BODY_2(op, ptr_type) \
+ temp.s2 = op(x.s2 ,y.s2, &(((ptr_type*)&itemp)[2])); \
+
+#define REMQUO_VECTOR_BODY_4(op, ptr_type) \
+ REMQUO_VECTOR_BODY_3(op, ptr_type) \
+ temp.s3 = op(x.s3 ,y.s3, &(((ptr_type*)&itemp)[3])); \
+
+#define REMQUO_VECTOR_BODY_8(op, ptr_type) \
+ REMQUO_VECTOR_BODY_4(op, ptr_type) \
+ temp.s4 = op(x.s4 ,y.s4, &(((ptr_type*)&itemp)[4])); \
+ temp.s5 = op(x.s5 ,y.s5, &(((ptr_type*)&itemp)[5])); \
+ temp.s6 = op(x.s6 ,y.s6, &(((ptr_type*)&itemp)[6])); \
+ temp.s7 = op(x.s7 ,y.s7, &(((ptr_type*)&itemp)[7])); \
+
+#define REMQUO_VECTOR_BODY_16(op, ptr_type) \
+ REMQUO_VECTOR_BODY_8(op, ptr_type) \
+ temp.s8 = op(x.s8 ,y.s8, &(((ptr_type*)&itemp)[8])); \
+ temp.s9 = op(x.s9 ,y.s9, &(((ptr_type*)&itemp)[9])); \
+ temp.sa = op(x.sa ,y.sa, &(((ptr_type*)&itemp)[10])); \
+ temp.sb = op(x.sb ,y.sb, &(((ptr_type*)&itemp)[11])); \
+ temp.sc = op(x.sc ,y.sc, &(((ptr_type*)&itemp)[12])); \
+ temp.sd = op(x.sd ,y.sd, &(((ptr_type*)&itemp)[13])); \
+ temp.se = op(x.se ,y.se, &(((ptr_type*)&itemp)[14])); \
+ temp.sf = op(x.sf ,y.sf, &(((ptr_type*)&itemp)[15])); \
+
+#define REMQUO_VECTOR_BODY(prim_type, num, op, ptr_type) /* body of a vector overload: pastes num to pick the per-width helper, then stores itemp through ptr */ \
+{ \
+ prim_type##num temp; \
+ ptr_type##num itemp; \
+ REMQUO_VECTOR_BODY_##num(op, ptr_type)\
+ *ptr = itemp; \
+ return temp; \
+} \
+
+
+_CLC_PROTECTED float remquof(float x, float y, int * ptr); // scalar helpers: declared here, defined elsewhere
+_CLC_PROTECTED double builtin_remquo(double x, double y, int * ptr);
+
+_CLC_OVERLOAD _CLC_DEF float remquo(float x, float y, global int * ptr) REMQUO_SCALAR_BODY(float, remquof, int) // one overload per address space of ptr, for scalar and widths 2-16
+_CLC_OVERLOAD _CLC_DEF float remquo(float x, float y, local int * ptr) REMQUO_SCALAR_BODY(float, remquof, int)
+_CLC_OVERLOAD _CLC_DEF float remquo(float x, float y, private int * ptr) REMQUO_SCALAR_BODY(float, remquof, int)
+
+_CLC_OVERLOAD _CLC_DEF float2 remquo(float2 x, float2 y, global int2 * ptr) REMQUO_VECTOR_BODY(float, 2, remquof, int)
+_CLC_OVERLOAD _CLC_DEF float2 remquo(float2 x, float2 y, local int2 * ptr) REMQUO_VECTOR_BODY(float, 2, remquof, int)
+_CLC_OVERLOAD _CLC_DEF float2 remquo(float2 x, float2 y, private int2 * ptr) REMQUO_VECTOR_BODY(float, 2, remquof, int)
+
+_CLC_OVERLOAD _CLC_DEF float3 remquo(float3 x, float3 y, global int3 * ptr) REMQUO_VECTOR_BODY(float, 3, remquof, int)
+_CLC_OVERLOAD _CLC_DEF float3 remquo(float3 x, float3 y, local int3 * ptr) REMQUO_VECTOR_BODY(float, 3, remquof, int)
+_CLC_OVERLOAD _CLC_DEF float3 remquo(float3 x, float3 y, private int3 * ptr) REMQUO_VECTOR_BODY(float, 3, remquof, int)
+
+_CLC_OVERLOAD _CLC_DEF float4 remquo(float4 x, float4 y, global int4 * ptr) REMQUO_VECTOR_BODY(float, 4, remquof, int)
+_CLC_OVERLOAD _CLC_DEF float4 remquo(float4 x, float4 y, local int4 * ptr) REMQUO_VECTOR_BODY(float, 4, remquof, int)
+_CLC_OVERLOAD _CLC_DEF float4 remquo(float4 x, float4 y, private int4 * ptr) REMQUO_VECTOR_BODY(float, 4, remquof, int)
+
+_CLC_OVERLOAD _CLC_DEF float8 remquo(float8 x, float8 y, global int8 * ptr) REMQUO_VECTOR_BODY(float, 8, remquof, int)
+_CLC_OVERLOAD _CLC_DEF float8 remquo(float8 x, float8 y, local int8 * ptr) REMQUO_VECTOR_BODY(float, 8, remquof, int)
+_CLC_OVERLOAD _CLC_DEF float8 remquo(float8 x, float8 y, private int8 * ptr) REMQUO_VECTOR_BODY(float, 8, remquof, int)
+
+_CLC_OVERLOAD _CLC_DEF float16 remquo(float16 x, float16 y, global int16 * ptr) REMQUO_VECTOR_BODY(float, 16, remquof, int)
+_CLC_OVERLOAD _CLC_DEF float16 remquo(float16 x, float16 y, local int16 * ptr) REMQUO_VECTOR_BODY(float, 16, remquof, int)
+_CLC_OVERLOAD _CLC_DEF float16 remquo(float16 x, float16 y, private int16 * ptr) REMQUO_VECTOR_BODY(float, 16, remquof, int)
+
+_CLC_OVERLOAD _CLC_DEF double remquo(double x, double y, global int * ptr) REMQUO_SCALAR_BODY(double, builtin_remquo, int) // double variants still store through int pointers
+_CLC_OVERLOAD _CLC_DEF double remquo(double x, double y, local int * ptr) REMQUO_SCALAR_BODY(double, builtin_remquo, int)
+_CLC_OVERLOAD _CLC_DEF double remquo(double x, double y, private int * ptr) REMQUO_SCALAR_BODY(double, builtin_remquo, int)
+
+_CLC_OVERLOAD _CLC_DEF double2 remquo(double2 x, double2 y, global int2 * ptr) REMQUO_VECTOR_BODY(double, 2, builtin_remquo, int)
+_CLC_OVERLOAD _CLC_DEF double2 remquo(double2 x, double2 y, local int2 * ptr) REMQUO_VECTOR_BODY(double, 2, builtin_remquo, int)
+_CLC_OVERLOAD _CLC_DEF double2 remquo(double2 x, double2 y, private int2 * ptr) REMQUO_VECTOR_BODY(double, 2, builtin_remquo, int)
+
+_CLC_OVERLOAD _CLC_DEF double3 remquo(double3 x, double3 y, global int3 * ptr) REMQUO_VECTOR_BODY(double, 3, builtin_remquo, int)
+_CLC_OVERLOAD _CLC_DEF double3 remquo(double3 x, double3 y, local int3 * ptr) REMQUO_VECTOR_BODY(double, 3, builtin_remquo, int)
+_CLC_OVERLOAD _CLC_DEF double3 remquo(double3 x, double3 y, private int3 * ptr) REMQUO_VECTOR_BODY(double, 3, builtin_remquo, int)
+
+_CLC_OVERLOAD _CLC_DEF double4 remquo(double4 x, double4 y, global int4 * ptr) REMQUO_VECTOR_BODY(double, 4, builtin_remquo, int)
+_CLC_OVERLOAD _CLC_DEF double4 remquo(double4 x, double4 y, local int4 * ptr) REMQUO_VECTOR_BODY(double, 4, builtin_remquo, int)
+_CLC_OVERLOAD _CLC_DEF double4 remquo(double4 x, double4 y, private int4 * ptr) REMQUO_VECTOR_BODY(double, 4, builtin_remquo, int)
+
+_CLC_OVERLOAD _CLC_DEF double8 remquo(double8 x, double8 y, global int8 * ptr) REMQUO_VECTOR_BODY(double, 8, builtin_remquo, int)
+_CLC_OVERLOAD _CLC_DEF double8 remquo(double8 x, double8 y, local int8 * ptr) REMQUO_VECTOR_BODY(double, 8, builtin_remquo, int)
+_CLC_OVERLOAD _CLC_DEF double8 remquo(double8 x, double8 y, private int8 * ptr) REMQUO_VECTOR_BODY(double, 8, builtin_remquo, int)
+
+_CLC_OVERLOAD _CLC_DEF double16 remquo(double16 x, double16 y, global int16 * ptr) REMQUO_VECTOR_BODY(double, 16, builtin_remquo, int)
+_CLC_OVERLOAD _CLC_DEF double16 remquo(double16 x, double16 y, local int16 * ptr) REMQUO_VECTOR_BODY(double, 16, builtin_remquo, int)
+_CLC_OVERLOAD _CLC_DEF double16 remquo(double16 x, double16 y, private int16 * ptr) REMQUO_VECTOR_BODY(double, 16, builtin_remquo, int)
diff --git a/src/builtins/rotate.cl b/src/builtins/rotate.cl
new file mode 100644
index 0000000..fc894b0
--- /dev/null
+++ b/src/builtins/rotate.cl
@@ -0,0 +1,58 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+/*-----------------------------------------------------------------------------
+* Template for the scalar types that do not use rotl (utype: unsigned variant)
+*----------------------------------------------------------------------------*/
+#define SCALAR(type, utype) \
+_CLC_OVERLOAD _CLC_DEF type rotate(type v, type i) \
+{ \
+ const uint width = sizeof(v) * 8; /* size of the operand in bits */ \
+ i &= width - 1; /* only the low bits of the count are used */ \
+ /* a zero count leaves v unchanged (and avoids a full-width right shift) */ \
+ if (i != 0) { v = (v << i) | ((utype)v >> (width-i)); } \
+ return v; \
+}\
+
+SCALAR(uchar, uchar) /* one scalar rotate() per listed type, paired with its unsigned counterpart; no SCALAR(uint, ...) instantiation appears here */
+SCALAR(char, uchar)
+SCALAR(ushort, ushort)
+SCALAR(short, ushort)
+SCALAR(ulong, ulong)
+SCALAR(long, ulong)
+SCALAR(int, uint)
+
+BINARY_VEC_DEF(char, char, rotate, rotate) /* macro defined outside this file; presumably builds the vector rotate overloads from the scalar one - confirm */
+BINARY_VEC_DEF(uchar, uchar, rotate, rotate)
+BINARY_VEC_DEF(short, short, rotate, rotate)
+BINARY_VEC_DEF(ushort, ushort,rotate, rotate)
+BINARY_VEC_DEF(int, int, rotate, rotate)
+BINARY_VEC_DEF(uint, uint, rotate, rotate) /* NOTE(review): relies on a scalar uint rotate that is not defined in this file */
+BINARY_VEC_DEF(long, long, rotate, rotate)
+BINARY_VEC_DEF(ulong, ulong, rotate, rotate)
diff --git a/src/builtins/select.cl b/src/builtins/select.cl
new file mode 100644
index 0000000..52a078c
--- /dev/null
+++ b/src/builtins/select.cl
@@ -0,0 +1,53 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+#define DECLARATION(type, itype, utype) /* despite its name this defines select() twice: for a signed and for an unsigned condition type */ \
+_CLC_OVERLOAD _CLC_DEF type select(type a, type b, itype c) { return c ? b : a; } /* OpenCL C applies ?: component-wise when c is a vector */\
+_CLC_OVERLOAD _CLC_DEF type select(type a, type b, utype c) { return c ? b : a; }
+
+#define SELECT_EXPAND_SIZES(type,itype,utype) /* vector widths 2, 3, 4, 8 and 16; no scalar overload is generated here */ \
+ DECLARATION(_VEC_TYPE(type,2), _VEC_TYPE(itype,2), _VEC_TYPE(utype,2)) \
+ DECLARATION(_VEC_TYPE(type,3), _VEC_TYPE(itype,3), _VEC_TYPE(utype,3)) \
+ DECLARATION(_VEC_TYPE(type,4), _VEC_TYPE(itype,4), _VEC_TYPE(utype,4)) \
+ DECLARATION(_VEC_TYPE(type,8), _VEC_TYPE(itype,8), _VEC_TYPE(utype,8)) \
+ DECLARATION(_VEC_TYPE(type,16), _VEC_TYPE(itype,16), _VEC_TYPE(utype,16)) \
+
+#define SELECT_EXPAND_TYPES /* each data type is paired with the signed and unsigned integer types of the same element width */ \
+ SELECT_EXPAND_SIZES(char, char, uchar) \
+ SELECT_EXPAND_SIZES(uchar, char, uchar) \
+ SELECT_EXPAND_SIZES(short, short, ushort) \
+ SELECT_EXPAND_SIZES(ushort, short, ushort) \
+ SELECT_EXPAND_SIZES(int, int, uint) \
+ SELECT_EXPAND_SIZES(uint, int, uint) \
+ SELECT_EXPAND_SIZES(long, long, ulong) \
+ SELECT_EXPAND_SIZES(ulong, long, ulong) \
+ SELECT_EXPAND_SIZES(float, int, uint) \
+ SELECT_EXPAND_SIZES(double, long, ulong)
+
+SELECT_EXPAND_TYPES
diff --git a/src/builtins/shuffle.cl b/src/builtins/shuffle.cl
new file mode 100644
index 0000000..3ec3b56
--- /dev/null
+++ b/src/builtins/shuffle.cl
@@ -0,0 +1,215 @@
+/******************************************************************************
+ * Copyright (c) 2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "clc.h"
+/* TEMPLATE2(res_elemt, val_vnum, mask_elemt): defines shuffle() and shuffle2() overloads that return a res_elemt##2, reading elements from val_vnum-component source vector(s) as directed by a mask_elemt##2 mask. */
+#define TEMPLATE2(res_elemt, val_vnum, mask_elemt) \
+_CLC_OVERLOAD _CLC_DEF res_elemt##2 shuffle(res_elemt##val_vnum val, mask_elemt##2 mask) \
+{ \
+ res_elemt##2 result; \
+ res_elemt *p = (res_elemt*)&val; /* index the by-value source vector as an array of elements */\
+ result.s0 = p[mask.s0 & vec_step(val)-1]; /* '-' binds tighter than '&': the index is mask.s0 & (vec_step(val)-1) */\
+ result.s1 = p[mask.s1 & vec_step(val)-1]; \
+ return result; \
+}\
+_CLC_OVERLOAD _CLC_DEF res_elemt##2 shuffle2(res_elemt##val_vnum val1, res_elemt##val_vnum val2, mask_elemt##2 mask) \
+{ \
+ res_elemt##2 result; \
+ res_elemt *p1 = (res_elemt*)&val1; \
+ res_elemt *p2 = (res_elemt*)&val2; \
+ result.s0 = mask.s0 & vec_step(val1) ? p2[mask.s0 & vec_step(val1)-1] : /* mask bit vec_step(val1) set: read from val2, otherwise from val1 */\
+ p1[mask.s0 & vec_step(val1)-1]; \
+ result.s1 = mask.s1 & vec_step(val1) ? p2[mask.s1 & vec_step(val1)-1] : \
+ p1[mask.s1 & vec_step(val1)-1]; \
+ return result; \
+}
+/* TEMPLATE4(res_elemt, val_vnum, mask_elemt): defines shuffle() and shuffle2() overloads that return a res_elemt##4, reading elements from val_vnum-component source vector(s) as directed by a mask_elemt##4 mask. */
+#define TEMPLATE4(res_elemt, val_vnum, mask_elemt) \
+_CLC_OVERLOAD _CLC_DEF res_elemt##4 shuffle(res_elemt##val_vnum val, mask_elemt##4 mask) \
+{ \
+ res_elemt##4 result; \
+ res_elemt *p = (res_elemt*)&val; \
+ result.s0 = p[mask.s0 & vec_step(val)-1]; /* index is mask.s0 & (vec_step(val)-1), since '-' binds tighter than '&' */\
+ result.s1 = p[mask.s1 & vec_step(val)-1]; \
+ result.s2 = p[mask.s2 & vec_step(val)-1]; \
+ result.s3 = p[mask.s3 & vec_step(val)-1]; \
+ return result; \
+} \
+_CLC_OVERLOAD _CLC_DEF res_elemt##4 shuffle2(res_elemt##val_vnum val1, res_elemt##val_vnum val2, mask_elemt##4 mask) \
+{ \
+ res_elemt##4 result; \
+ res_elemt *p1= (res_elemt*)&val1; \
+ res_elemt *p2 = (res_elemt*)&val2; \
+ result.s0 = mask.s0 & vec_step(val1) ? p2[mask.s0 & vec_step(val1)-1] : /* mask bit vec_step(val1) set: read from val2, otherwise from val1 */\
+ p1[mask.s0 & vec_step(val1)-1]; \
+ result.s1 = mask.s1 & vec_step(val1) ? p2[mask.s1 & vec_step(val1)-1] : \
+ p1[mask.s1 & vec_step(val1)-1]; \
+ result.s2 = mask.s2 & vec_step(val1) ? p2[mask.s2 & vec_step(val1)-1] : \
+ p1[mask.s2 & vec_step(val1)-1]; \
+ result.s3 = mask.s3 & vec_step(val1) ? p2[mask.s3 & vec_step(val1)-1] : \
+ p1[mask.s3 & vec_step(val1)-1]; \
+ return result; \
+}
+/* TEMPLATE8(res_elemt, val_vnum, mask_elemt): defines shuffle() and shuffle2() overloads that return a res_elemt##8, reading elements from val_vnum-component source vector(s) as directed by a mask_elemt##8 mask. */
+#define TEMPLATE8(res_elemt, val_vnum, mask_elemt) \
+_CLC_OVERLOAD _CLC_DEF res_elemt##8 shuffle(res_elemt##val_vnum val, mask_elemt##8 mask) \
+{ \
+ res_elemt##8 result; \
+ res_elemt *p = (res_elemt*)&val; \
+ result.s0 = p[mask.s0 & vec_step(val)-1]; /* index is mask.s0 & (vec_step(val)-1), since '-' binds tighter than '&' */\
+ result.s1 = p[mask.s1 & vec_step(val)-1]; \
+ result.s2 = p[mask.s2 & vec_step(val)-1]; \
+ result.s3 = p[mask.s3 & vec_step(val)-1]; \
+ result.s4 = p[mask.s4 & vec_step(val)-1]; \
+ result.s5 = p[mask.s5 & vec_step(val)-1]; \
+ result.s6 = p[mask.s6 & vec_step(val)-1]; \
+ result.s7 = p[mask.s7 & vec_step(val)-1]; \
+ return result; \
+} \
+_CLC_OVERLOAD _CLC_DEF res_elemt##8 shuffle2(res_elemt##val_vnum val1, res_elemt##val_vnum val2, mask_elemt##8 mask) \
+{ \
+ res_elemt##8 result; \
+ res_elemt *p1= (res_elemt*)&val1; \
+ res_elemt *p2 = (res_elemt*)&val2; \
+ result.s0 = mask.s0 & vec_step(val1) ? p2[mask.s0 & vec_step(val1)-1] : /* mask bit vec_step(val1) set: read from val2, otherwise from val1 */\
+ p1[mask.s0 & vec_step(val1)-1]; \
+ result.s1 = mask.s1 & vec_step(val1) ? p2[mask.s1 & vec_step(val1)-1] : \
+ p1[mask.s1 & vec_step(val1)-1]; \
+ result.s2 = mask.s2 & vec_step(val1) ? p2[mask.s2 & vec_step(val1)-1] : \
+ p1[mask.s2 & vec_step(val1)-1]; \
+ result.s3 = mask.s3 & vec_step(val1) ? p2[mask.s3 & vec_step(val1)-1] : \
+ p1[mask.s3 & vec_step(val1)-1]; \
+ result.s4 = mask.s4 & vec_step(val1) ? p2[mask.s4 & vec_step(val1)-1] : \
+ p1[mask.s4 & vec_step(val1)-1]; \
+ result.s5 = mask.s5 & vec_step(val1) ? p2[mask.s5 & vec_step(val1)-1] : \
+ p1[mask.s5 & vec_step(val1)-1]; \
+ result.s6 = mask.s6 & vec_step(val1) ? p2[mask.s6 & vec_step(val1)-1] : \
+ p1[mask.s6 & vec_step(val1)-1]; \
+ result.s7 = mask.s7 & vec_step(val1) ? p2[mask.s7 & vec_step(val1)-1] : \
+ p1[mask.s7 & vec_step(val1)-1]; \
+ return result; \
+}
+/* TEMPLATE16(res_elemt, val_vnum, mask_elemt): defines shuffle() and shuffle2() overloads that return a res_elemt##16, reading elements from val_vnum-component source vector(s) as directed by a mask_elemt##16 mask. */
+#define TEMPLATE16(res_elemt, val_vnum, mask_elemt) \
+_CLC_OVERLOAD _CLC_DEF res_elemt##16 shuffle(res_elemt##val_vnum val, mask_elemt##16 mask) \
+{ \
+ res_elemt##16 result; \
+ res_elemt *p = (res_elemt*)&val; \
+ result.s0 = p[mask.s0 & vec_step(val)-1]; /* index is mask.s0 & (vec_step(val)-1), since '-' binds tighter than '&' */\
+ result.s1 = p[mask.s1 & vec_step(val)-1]; \
+ result.s2 = p[mask.s2 & vec_step(val)-1]; \
+ result.s3 = p[mask.s3 & vec_step(val)-1]; \
+ result.s4 = p[mask.s4 & vec_step(val)-1]; \
+ result.s5 = p[mask.s5 & vec_step(val)-1]; \
+ result.s6 = p[mask.s6 & vec_step(val)-1]; \
+ result.s7 = p[mask.s7 & vec_step(val)-1]; \
+ result.s8 = p[mask.s8 & vec_step(val)-1]; \
+ result.s9 = p[mask.s9 & vec_step(val)-1]; \
+ result.sa = p[mask.sa & vec_step(val)-1]; \
+ result.sb = p[mask.sb & vec_step(val)-1]; \
+ result.sc = p[mask.sc & vec_step(val)-1]; \
+ result.sd = p[mask.sd & vec_step(val)-1]; \
+ result.se = p[mask.se & vec_step(val)-1]; \
+ result.sf = p[mask.sf & vec_step(val)-1]; \
+ return result; \
+} \
+_CLC_OVERLOAD _CLC_DEF res_elemt##16 shuffle2(res_elemt##val_vnum val1, res_elemt##val_vnum val2, mask_elemt##16 mask) \
+{ \
+ res_elemt##16 result; \
+ res_elemt *p1= (res_elemt*)&val1; \
+ res_elemt *p2 = (res_elemt*)&val2; \
+ result.s0 = mask.s0 & vec_step(val1) ? p2[mask.s0 & vec_step(val1)-1] : /* mask bit vec_step(val1) set: read from val2, otherwise from val1 */\
+ p1[mask.s0 & vec_step(val1)-1]; \
+ result.s1 = mask.s1 & vec_step(val1) ? p2[mask.s1 & vec_step(val1)-1] : \
+ p1[mask.s1 & vec_step(val1)-1]; \
+ result.s2 = mask.s2 & vec_step(val1) ? p2[mask.s2 & vec_step(val1)-1] : \
+ p1[mask.s2 & vec_step(val1)-1]; \
+ result.s3 = mask.s3 & vec_step(val1) ? p2[mask.s3 & vec_step(val1)-1] : \
+ p1[mask.s3 & vec_step(val1)-1]; \
+ result.s4 = mask.s4 & vec_step(val1) ? p2[mask.s4 & vec_step(val1)-1] : \
+ p1[mask.s4 & vec_step(val1)-1]; \
+ result.s5 = mask.s5 & vec_step(val1) ? p2[mask.s5 & vec_step(val1)-1] : \
+ p1[mask.s5 & vec_step(val1)-1]; \
+ result.s6 = mask.s6 & vec_step(val1) ? p2[mask.s6 & vec_step(val1)-1] : \
+ p1[mask.s6 & vec_step(val1)-1]; \
+ result.s7 = mask.s7 & vec_step(val1) ? p2[mask.s7 & vec_step(val1)-1] : \
+ p1[mask.s7 & vec_step(val1)-1]; \
+ result.s8 = mask.s8 & vec_step(val1) ? p2[mask.s8 & vec_step(val1)-1] : \
+ p1[mask.s8 & vec_step(val1)-1]; \
+ result.s9 = mask.s9 & vec_step(val1) ? p2[mask.s9 & vec_step(val1)-1] : \
+ p1[mask.s9 & vec_step(val1)-1]; \
+ result.sa = mask.sa & vec_step(val1) ? p2[mask.sa & vec_step(val1)-1] : \
+ p1[mask.sa & vec_step(val1)-1]; \
+ result.sb = mask.sb & vec_step(val1) ? p2[mask.sb & vec_step(val1)-1] : \
+ p1[mask.sb & vec_step(val1)-1]; \
+ result.sc = mask.sc & vec_step(val1) ? p2[mask.sc & vec_step(val1)-1] : \
+ p1[mask.sc & vec_step(val1)-1]; \
+ result.sd = mask.sd & vec_step(val1) ? p2[mask.sd & vec_step(val1)-1] : \
+ p1[mask.sd & vec_step(val1)-1]; \
+ result.se = mask.se & vec_step(val1) ? p2[mask.se & vec_step(val1)-1] : \
+ p1[mask.se & vec_step(val1)-1]; \
+ result.sf = mask.sf & vec_step(val1) ? p2[mask.sf & vec_step(val1)-1] : \
+ p1[mask.sf & vec_step(val1)-1]; \
+ return result; \
+}
+
+/* CROSS_SIZE(type1, type2): instantiates all 16 pairings of result width (TEMPLATE2/4/8/16) with source width (2, 4, 8, 16) for data element type1 and mask element type2; only power-of-two source widths are generated, which the `& vec_step(val)-1` index masking in the templates relies on. */
+#define CROSS_SIZE(type1, type2) \
+TEMPLATE2(type1, 2, type2) \
+TEMPLATE2(type1, 4, type2) \
+TEMPLATE2(type1, 8, type2) \
+TEMPLATE2(type1, 16, type2) \
+TEMPLATE4(type1, 2, type2) \
+TEMPLATE4(type1, 4, type2) \
+TEMPLATE4(type1, 8, type2) \
+TEMPLATE4(type1, 16, type2) \
+TEMPLATE8(type1, 2, type2) \
+TEMPLATE8(type1, 4, type2) \
+TEMPLATE8(type1, 8, type2) \
+TEMPLATE8(type1, 16, type2) \
+TEMPLATE16(type1, 2, type2) \
+TEMPLATE16(type1, 4, type2) \
+TEMPLATE16(type1, 8, type2) \
+TEMPLATE16(type1, 16, type2) \
+
+#define CROSS_MASKTYPE(type) /* pairs data element `type` with each of the four unsigned mask element types */\
+CROSS_SIZE(type, uchar) \
+CROSS_SIZE(type, ushort) \
+CROSS_SIZE(type, uint) \
+CROSS_SIZE(type, ulong) \
+
+CROSS_MASKTYPE(char) /* generate shuffle()/shuffle2() for every data element type, one invocation per type */
+CROSS_MASKTYPE(uchar)
+CROSS_MASKTYPE(short)
+CROSS_MASKTYPE(ushort)
+CROSS_MASKTYPE(int)
+CROSS_MASKTYPE(uint)
+CROSS_MASKTYPE(long)
+CROSS_MASKTYPE(ulong)
+CROSS_MASKTYPE(float)
+CROSS_MASKTYPE(double)
diff --git a/src/builtins/sign.cl b/src/builtins/sign.cl
new file mode 100644
index 0000000..e440f2f
--- /dev/null
+++ b/src/builtins/sign.cl
@@ -0,0 +1,43 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+/* EXPAND_SIZES(type): instantiates IMPLEMENTATION (defined further down in this file) for the 3-, 4-, 8- and 16-component vectors of `type`; the scalar and 2-component forms are not generated in this file. */
+#define EXPAND_SIZES(type) \
+ IMPLEMENTATION (_VEC_TYPE(type,3)) \
+ IMPLEMENTATION (_VEC_TYPE(type,4)) \
+ IMPLEMENTATION (_VEC_TYPE(type,8)) \
+ IMPLEMENTATION (_VEC_TYPE(type,16)) \
+
+#define IMPLEMENTATION(gentype) /* sign(x): 1.0 where x > 0, -1.0 where x < 0, 0.0 where x is NaN, otherwise x itself */\
+_CLC_OVERLOAD _CLC_DEF gentype sign(gentype x) \
+{ return x > (gentype)0.0 ? (gentype) 1.0 : \
+ x < (gentype)0.0 ? (gentype)-1.0 : \
+ isnan(x) ? (gentype) 0.0 : x; }/* the final `x` arm is only reached for zeros, so +0.0 and -0.0 are returned unchanged */\
+
+EXPAND_SIZES(float)
+EXPAND_SIZES(double)
diff --git a/src/builtins/sincos.cl b/src/builtins/sincos.cl
new file mode 100644
index 0000000..1552f6b
--- /dev/null
+++ b/src/builtins/sincos.cl
@@ -0,0 +1,128 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+/* Scalar helpers declared here and defined outside this file: each takes x plus two output pointers (sinval, cosval); the wrappers below call them once per element. */
+_CLC_PROTECTED void sincosf(float x, float * sinval, float * cosval);
+_CLC_PROTECTED void builtin_sincos(double x, double * sinval, double * cosval);
+/* SINCOS_SCALAR_BODY(type, op): complete function body for a scalar sincos(); it uses the enclosing function's parameters `x` and `cosval` by name, so it only works after a signature that declares them. */
+#define SINCOS_SCALAR_BODY(type, op) \
+{ \
+ type sin_val; \
+ type cos_val; \
+ op(x, &sin_val, &cos_val); /* op writes into local temporaries, never through the caller's address-space-qualified pointer */\
+ *cosval = cos_val; /* cosine goes out through the pointer argument */\
+ return sin_val; /* sine is the return value */\
+} \
+
+#define SINCOS_VECTOR_BODY_2(prim_type, op) /* calls op on elements 0-1 of x, writing into sin_val/cos_val viewed as prim_type arrays; x, sin_val and cos_val come from the expansion site */\
+ op(x.s0, &(((prim_type*)&sin_val)[0]), &(((prim_type*)&cos_val)[0])); \
+ op(x.s1, &(((prim_type*)&sin_val)[1]), &(((prim_type*)&cos_val)[1])); \
+
+#define SINCOS_VECTOR_BODY_3(prim_type, op) /* elements 0-1 via the _2 body, then element 2 */\
+ SINCOS_VECTOR_BODY_2(prim_type, op) \
+ op(x.s2, &(((prim_type*)&sin_val)[2]), &(((prim_type*)&cos_val)[2])); \
+
+#define SINCOS_VECTOR_BODY_4(prim_type, op) /* elements 0-2 via the _3 body, then element 3 */\
+ SINCOS_VECTOR_BODY_3(prim_type, op) \
+ op(x.s3, &(((prim_type*)&sin_val)[3]), &(((prim_type*)&cos_val)[3])); \
+
+#define SINCOS_VECTOR_BODY_8(prim_type, op) /* elements 0-3 via the _4 body, then elements 4-7 */\
+ SINCOS_VECTOR_BODY_4(prim_type, op) \
+ op(x.s4, &(((prim_type*)&sin_val)[4]), &(((prim_type*)&cos_val)[4])); \
+ op(x.s5, &(((prim_type*)&sin_val)[5]), &(((prim_type*)&cos_val)[5])); \
+ op(x.s6, &(((prim_type*)&sin_val)[6]), &(((prim_type*)&cos_val)[6])); \
+ op(x.s7, &(((prim_type*)&sin_val)[7]), &(((prim_type*)&cos_val)[7])); \
+
+#define SINCOS_VECTOR_BODY_16(prim_type, op) /* elements 0-7 via the _8 body, then elements 8-15 (components s8, s9, sa..sf) */\
+ SINCOS_VECTOR_BODY_8(prim_type, op) \
+ op(x.s8, &(((prim_type*)&sin_val)[8]), &(((prim_type*)&cos_val)[8])); \
+ op(x.s9, &(((prim_type*)&sin_val)[9]), &(((prim_type*)&cos_val)[9])); \
+ op(x.sa, &(((prim_type*)&sin_val)[10]), &(((prim_type*)&cos_val)[10])); \
+ op(x.sb, &(((prim_type*)&sin_val)[11]), &(((prim_type*)&cos_val)[11])); \
+ op(x.sc, &(((prim_type*)&sin_val)[12]), &(((prim_type*)&cos_val)[12])); \
+ op(x.sd, &(((prim_type*)&sin_val)[13]), &(((prim_type*)&cos_val)[13])); \
+ op(x.se, &(((prim_type*)&sin_val)[14]), &(((prim_type*)&cos_val)[14])); \
+ op(x.sf, &(((prim_type*)&sin_val)[15]), &(((prim_type*)&cos_val)[15])); \
+
+#define SINCOS_VECTOR_BODY(prim_type, num, op) /* complete function body for a num-wide vector sincos(); uses the enclosing function's `x` and `cosval` */\
+{ \
+ prim_type##num sin_val; \
+ prim_type##num cos_val; \
+ SINCOS_VECTOR_BODY_##num(prim_type, op)/* token-pastes num to pick the per-width element loop, which fills sin_val and cos_val */\
+ *cosval = cos_val; /* whole cosine vector is stored through the pointer in one assignment */\
+ return sin_val; \
+} \
+
+_CLC_OVERLOAD _CLC_INLINE float sincos(float x, global float * cosval) SINCOS_SCALAR_BODY(float, sincosf) /* float sincos: one overload per address space of cosval (global, local, private), all built on sincosf */
+_CLC_OVERLOAD _CLC_INLINE float sincos(float x, local float * cosval) SINCOS_SCALAR_BODY(float, sincosf)
+_CLC_OVERLOAD _CLC_INLINE float sincos(float x, private float * cosval) SINCOS_SCALAR_BODY(float, sincosf)
+/* NOTE(review): the three scalar float overloads above use _CLC_INLINE, while every other sincos overload in this file uses _CLC_DEF -- confirm this is intentional. */
+_CLC_OVERLOAD _CLC_DEF float2 sincos(float2 x, global float2 * cosval) SINCOS_VECTOR_BODY(float, 2, sincosf)
+_CLC_OVERLOAD _CLC_DEF float2 sincos(float2 x, local float2 * cosval) SINCOS_VECTOR_BODY(float, 2, sincosf)
+_CLC_OVERLOAD _CLC_DEF float2 sincos(float2 x, private float2 * cosval) SINCOS_VECTOR_BODY(float, 2, sincosf)
+
+_CLC_OVERLOAD _CLC_DEF float3 sincos(float3 x, global float3 * cosval) SINCOS_VECTOR_BODY(float, 3, sincosf)
+_CLC_OVERLOAD _CLC_DEF float3 sincos(float3 x, local float3 * cosval) SINCOS_VECTOR_BODY(float, 3, sincosf)
+_CLC_OVERLOAD _CLC_DEF float3 sincos(float3 x, private float3 * cosval) SINCOS_VECTOR_BODY(float, 3, sincosf)
+
+_CLC_OVERLOAD _CLC_DEF float4 sincos(float4 x, global float4 * cosval) SINCOS_VECTOR_BODY(float, 4, sincosf)
+_CLC_OVERLOAD _CLC_DEF float4 sincos(float4 x, local float4 * cosval) SINCOS_VECTOR_BODY(float, 4, sincosf)
+_CLC_OVERLOAD _CLC_DEF float4 sincos(float4 x, private float4 * cosval) SINCOS_VECTOR_BODY(float, 4, sincosf)
+
+_CLC_OVERLOAD _CLC_DEF float8 sincos(float8 x, global float8 * cosval) SINCOS_VECTOR_BODY(float, 8, sincosf)
+_CLC_OVERLOAD _CLC_DEF float8 sincos(float8 x, local float8 * cosval) SINCOS_VECTOR_BODY(float, 8, sincosf)
+_CLC_OVERLOAD _CLC_DEF float8 sincos(float8 x, private float8 * cosval) SINCOS_VECTOR_BODY(float, 8, sincosf)
+
+_CLC_OVERLOAD _CLC_DEF float16 sincos(float16 x, global float16 * cosval) SINCOS_VECTOR_BODY(float, 16, sincosf)
+_CLC_OVERLOAD _CLC_DEF float16 sincos(float16 x, local float16 * cosval) SINCOS_VECTOR_BODY(float, 16, sincosf)
+_CLC_OVERLOAD _CLC_DEF float16 sincos(float16 x, private float16 * cosval) SINCOS_VECTOR_BODY(float, 16, sincosf)
+/* double sincos: scalar plus 2-, 3-, 4-, 8- and 16-wide vectors, each with one overload per address space of cosval (global, local, private); every element is computed by builtin_sincos. */
+_CLC_OVERLOAD _CLC_DEF double sincos(double x, global double * cosval) SINCOS_SCALAR_BODY(double, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double sincos(double x, local double * cosval) SINCOS_SCALAR_BODY(double, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double sincos(double x, private double * cosval) SINCOS_SCALAR_BODY(double, builtin_sincos)
+
+_CLC_OVERLOAD _CLC_DEF double2 sincos(double2 x, global double2 * cosval) SINCOS_VECTOR_BODY(double, 2, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double2 sincos(double2 x, local double2 * cosval) SINCOS_VECTOR_BODY(double, 2, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double2 sincos(double2 x, private double2 * cosval) SINCOS_VECTOR_BODY(double, 2, builtin_sincos)
+
+_CLC_OVERLOAD _CLC_DEF double3 sincos(double3 x, global double3 * cosval) SINCOS_VECTOR_BODY(double, 3, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double3 sincos(double3 x, local double3 * cosval) SINCOS_VECTOR_BODY(double, 3, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double3 sincos(double3 x, private double3 * cosval) SINCOS_VECTOR_BODY(double, 3, builtin_sincos)
+
+_CLC_OVERLOAD _CLC_DEF double4 sincos(double4 x, global double4 * cosval) SINCOS_VECTOR_BODY(double, 4, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double4 sincos(double4 x, local double4 * cosval) SINCOS_VECTOR_BODY(double, 4, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double4 sincos(double4 x, private double4 * cosval) SINCOS_VECTOR_BODY(double, 4, builtin_sincos)
+
+_CLC_OVERLOAD _CLC_DEF double8 sincos(double8 x, global double8 * cosval) SINCOS_VECTOR_BODY(double, 8, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double8 sincos(double8 x, local double8 * cosval) SINCOS_VECTOR_BODY(double, 8, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double8 sincos(double8 x, private double8 * cosval) SINCOS_VECTOR_BODY(double, 8, builtin_sincos)
+
+_CLC_OVERLOAD _CLC_DEF double16 sincos(double16 x, global double16 * cosval) SINCOS_VECTOR_BODY(double, 16, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double16 sincos(double16 x, local double16 * cosval) SINCOS_VECTOR_BODY(double, 16, builtin_sincos)
+_CLC_OVERLOAD _CLC_DEF double16 sincos(double16 x, private double16 * cosval) SINCOS_VECTOR_BODY(double, 16, builtin_sincos)
+
diff --git a/src/builtins/smoothstep.cl b/src/builtins/smoothstep.cl
new file mode 100644
index 0000000..96e3d2a
--- /dev/null
+++ b/src/builtins/smoothstep.cl
@@ -0,0 +1,77 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+_CLC_OVERLOAD _CLC_DEF float smoothstep(float edge0, float edge1, float x)
+{ /* Cubic ramp t*t*(3 - 2t), where t is (x-edge0)/(edge1-edge0) clamped to [0, 1]; edge0 == edge1 is not guarded against. */
+ const float ramp = clamp((float)((x - edge0) / (edge1 - edge0)), 0.0f, 1.0f);
+ return (ramp * ramp) * (3.0f - (2.0f * ramp));
+}
+
+_CLC_OVERLOAD _CLC_DEF double smoothstep(double edge0, double edge1, double x)
+{ /* Cubic ramp t*t*(3 - 2t), where t is (x-edge0)/(edge1-edge0) clamped to [0, 1]; edge0 == edge1 is not guarded against. */
+ const double ramp = clamp((double)((x - edge0) / (edge1 - edge0)), 0.0, 1.0);
+ return (ramp * ramp) * (3.0 - (2.0 * ramp));
+}
+/* FLOAT_TEMPLATE(N): defines two float##N smoothstep() overloads, one with vector edges and one with scalar float edges; both compute t = clamp((x-edge0)/(edge1-edge0), 0, 1) and return t*t*(3 - 2t). */
+#define FLOAT_TEMPLATE(N) \
+_CLC_OVERLOAD _CLC_DEF float##N smoothstep(float##N edge0, float##N edge1, float##N x) \
+{\
+ float##N t = clamp((x-edge0)/(edge1-edge0), 0.0f, 1.0f); \
+ return t*t*(3.0f - 2.0f * t); \
+}\
+_CLC_OVERLOAD _CLC_DEF float##N smoothstep(float edge0, float edge1, float##N x) \
+{\
+ float##N t = clamp((x-edge0)/(edge1-edge0), 0.0f, 1.0f); /* scalar edges are mixed directly into the vector arithmetic; there is no explicit widening cast */\
+ return t*t*(3.0f - 2.0f * t);\
+}\
+
+/* DOUBLE_TEMPLATE(N): defines two double##N smoothstep() overloads, one with vector edges and one with scalar double edges; both compute t = clamp((x-edge0)/(edge1-edge0), 0, 1) and return t*t*(3 - 2t). */
+#define DOUBLE_TEMPLATE(N) \
+_CLC_OVERLOAD _CLC_DEF double##N smoothstep(double##N edge0, double##N edge1, double##N x) \
+{\
+ double##N t = clamp((x-edge0)/(edge1-edge0), 0.0, 1.0); \
+ return t*t*(3.0 - 2.0 * t);\
+}\
+_CLC_OVERLOAD _CLC_DEF double##N smoothstep(double edge0, double edge1, double##N x) \
+{\
+ double##N t = clamp((x-edge0)/(edge1-edge0), 0.0, 1.0); /* scalar edges are mixed directly into the vector arithmetic; there is no explicit widening cast */\
+ return t*t*(3.0 - 2.0 * t);\
+}
+/* Generate the float vector overloads of smoothstep() for widths 2, 3, 4, 8 and 16. */
+FLOAT_TEMPLATE(2)
+FLOAT_TEMPLATE(3)
+FLOAT_TEMPLATE(4)
+FLOAT_TEMPLATE(8)
+FLOAT_TEMPLATE(16)
+/* Generate the double vector overloads of smoothstep() for the same widths. */
+DOUBLE_TEMPLATE(2)
+DOUBLE_TEMPLATE(3)
+DOUBLE_TEMPLATE(4)
+DOUBLE_TEMPLATE(8)
+DOUBLE_TEMPLATE(16)
diff --git a/src/builtins/step.cl b/src/builtins/step.cl
new file mode 100644
index 0000000..daecefd
--- /dev/null
+++ b/src/builtins/step.cl
@@ -0,0 +1,43 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+/* EXPAND_SIZES(type): instantiates IMPLEMENTATION (defined further down in this file) for the 3-, 4-, 8- and 16-component vectors of `type`, passing `type` itself as the scalar edge type; the scalar and 2-component forms are not generated in this file. */
+#define EXPAND_SIZES(type) \
+ IMPLEMENTATION(_VEC_TYPE(type,3), type) \
+ IMPLEMENTATION(_VEC_TYPE(type,4), type) \
+ IMPLEMENTATION(_VEC_TYPE(type,8), type) \
+ IMPLEMENTATION(_VEC_TYPE(type,16), type) \
+
+#define IMPLEMENTATION(gentype, sgentype) /* two step() overloads per vector type: edge as a gentype vector, and edge as an sgentype scalar */\
+_CLC_OVERLOAD _CLC_DEF gentype step(gentype edge, gentype x) \
+ { return x < edge ? (gentype)0.0 : (gentype)1.0 ; } /* 0.0 where x < edge, 1.0 otherwise */\
+_CLC_OVERLOAD _CLC_DEF gentype step(sgentype edge, gentype x) \
+ { return x < (gentype)edge ? (gentype)0.0 : (gentype)1.0 ; } /* the scalar edge is cast to gentype before the comparison */\
+
+EXPAND_SIZES(float)
+EXPAND_SIZES(double)
diff --git a/src/builtins/sub_sat.cl b/src/builtins/sub_sat.cl
new file mode 100644
index 0000000..78442f0
--- /dev/null
+++ b/src/builtins/sub_sat.cl
@@ -0,0 +1,37 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+/* Instantiate sub_sat for all eight integer element types through BINARY_VEC_DEF; that macro is not defined in this file (presumably it comes from cpu.h -- confirm), and each call passes the element type twice and the name sub_sat twice. */
+BINARY_VEC_DEF(char, char, sub_sat, sub_sat)
+BINARY_VEC_DEF(uchar, uchar, sub_sat, sub_sat)
+BINARY_VEC_DEF(short, short, sub_sat, sub_sat)
+BINARY_VEC_DEF(ushort, ushort,sub_sat, sub_sat)
+BINARY_VEC_DEF(int, int, sub_sat, sub_sat)
+BINARY_VEC_DEF(uint, uint, sub_sat, sub_sat)
+BINARY_VEC_DEF(long, long, sub_sat, sub_sat)
+BINARY_VEC_DEF(ulong, ulong, sub_sat, sub_sat)
diff --git a/src/builtins/upsample.cl b/src/builtins/upsample.cl
new file mode 100644
index 0000000..8415a33
--- /dev/null
+++ b/src/builtins/upsample.cl
@@ -0,0 +1,56 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cpu.h"
+
+/*-----------------------------------------------------------------------------
+* Expand vector type implementations
+*----------------------------------------------------------------------------*/
+#define TEMPLATE(xtype,ytype,restype) /* upsample() for a vector with .lo/.hi halves: upsample each half with the next-narrower overload and join the two results */\
+_CLC_OVERLOAD _CLC_DEF restype upsample(xtype x, ytype y) \
+{ return (restype)(upsample(x.lo,y.lo), upsample(x.hi,y.hi)); }
+/* TEMPLATE3(xtype, ytype, restype): upsample() for 3-component vectors, built from three element-wise upsample() calls on s0, s1 and s2 instead of .lo/.hi halves. */
+#define TEMPLATE3(xtype,ytype,restype) \
+_CLC_OVERLOAD _CLC_DEF restype upsample(xtype x, ytype y) \
+{ return (restype)(upsample(x.s0,y.s0), upsample(x.s1,y.s1), upsample(x.s2,y.s2)); }
+/* EXPAND_SIZES(xtype, ytype, restype): emits upsample() for widths 2, 3, 4, 8 and 16 in ascending order, so each .lo/.hi overload finds the half-width one already defined; the width-2 and width-3 cases call a scalar upsample() that this file does not define (presumably provided via cpu.h -- confirm). */
+#define EXPAND_SIZES(xtype, ytype, restype)\
+ TEMPLATE(_VEC_TYPE(xtype,2), _VEC_TYPE(ytype,2), _VEC_TYPE(restype,2))\
+ TEMPLATE3(_VEC_TYPE(xtype,3), _VEC_TYPE(ytype,3), _VEC_TYPE(restype,3))\
+ TEMPLATE(_VEC_TYPE(xtype,4), _VEC_TYPE(ytype,4), _VEC_TYPE(restype,4))\
+ TEMPLATE(_VEC_TYPE(xtype,8), _VEC_TYPE(ytype,8), _VEC_TYPE(restype,8))\
+ TEMPLATE(_VEC_TYPE(xtype,16), _VEC_TYPE(ytype,16), _VEC_TYPE(restype,16))\
+
+#define _EXPAND_UPSAMPLE_TYPES() /* each row is (x element, y element, result element): y is always unsigned, and the result is the next wider integer type with x's signedness */\
+ EXPAND_SIZES(char, uchar, short) \
+ EXPAND_SIZES(uchar, uchar, ushort) \
+ EXPAND_SIZES(short, ushort, int) \
+ EXPAND_SIZES(ushort, ushort, uint) \
+ EXPAND_SIZES(int, uint, long) \
+ EXPAND_SIZES(uint, uint, ulong) \
+
+_EXPAND_UPSAMPLE_TYPES()
diff --git a/src/builtins/vload.cl b/src/builtins/vload.cl
new file mode 100644
index 0000000..2cd9a3a
--- /dev/null
+++ b/src/builtins/vload.cl
@@ -0,0 +1,127 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Peter Collingbourne <peter@pcc.me.uk>
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "clc.h"
+
+/*
+ * VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE)
+ *
+ * Defines vload3, vload4, vload8 and vload16 for one element type and one
+ * address space. vloadN(offset, x) gathers the N consecutive elements
+ * x[N*offset] .. x[N*offset + N-1] into a PRIM_TYPE##N; for the power-of-two
+ * widths the multiplication by N is written as a left shift of offset.
+ *
+ * NOTE(review): no vload2 is generated here; presumably it is provided
+ * elsewhere (e.g. through clc.h) -- confirm.
+ */
+#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ return (PRIM_TYPE##3)(x[3*offset] , x[3*offset+1], x[3*offset+2]); \
+ } \
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ return (PRIM_TYPE##4)(x[(offset<<2)], x[1+(offset<<2)], x[2+(offset<<2)], x[3+(offset<<2)]); \
+ } \
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ return (PRIM_TYPE##8)(x[(offset<<3)], x[1+(offset<<3)], x[2+(offset<<3)], x[3+(offset<<3)],\
+ x[4+(offset<<3)], x[5+(offset<<3)], x[6+(offset<<3)], x[7+(offset<<3)]); \
+ } \
+ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
+ return (PRIM_TYPE##16)(x[(offset<<4)], x[1+(offset<<4)], x[2+(offset<<4)], x[3+(offset<<4)],\
+ x[4+(offset<<4)], x[5+(offset<<4)], x[6+(offset<<4)], x[7+(offset<<4)], \
+ x[8+(offset<<4)], x[9+(offset<<4)], x[10+(offset<<4)], x[11+(offset<<4)], \
+ x[12+(offset<<4)], x[13+(offset<<4)], x[14+(offset<<4)], x[15+(offset<<4)]); \
+ } \
+
+/*
+ * VLOAD_ADDR_SPACES: instantiate the loaders of one element type for the
+ * __private, __local, __constant and __global address spaces.
+ */
+#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \
+ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) \
+
+/*
+ * VLOAD_TYPES: the element types that receive loaders -- the 8/16/32/64-bit
+ * signed and unsigned integers plus float and double.
+ */
+#define VLOAD_TYPES() \
+ VLOAD_ADDR_SPACES(char) \
+ VLOAD_ADDR_SPACES(uchar) \
+ VLOAD_ADDR_SPACES(short) \
+ VLOAD_ADDR_SPACES(ushort) \
+ VLOAD_ADDR_SPACES(int) \
+ VLOAD_ADDR_SPACES(uint) \
+ VLOAD_ADDR_SPACES(long) \
+ VLOAD_ADDR_SPACES(ulong) \
+ VLOAD_ADDR_SPACES(float) \
+ VLOAD_ADDR_SPACES(double)\
+
+/* Emit every vload overload described above. */
+VLOAD_TYPES()
+
+/*
+ * VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE)
+ *
+ * Defines vstore3, vstore4, vstore8 and vstore16 for one element type and one
+ * address space. vstoreN(vec, offset, mem) writes the N components of vec, in
+ * order, to mem[N*offset] .. mem[N*offset + N-1]; for the power-of-two widths
+ * the multiplication by N is written as a left shift of offset.
+ *
+ * NOTE(review): vstore3/vstore4 are declared with _CLC_INLINE while
+ * vstore8/vstore16 use _CLC_DEF; whether that difference is intentional cannot
+ * be told from this file -- confirm. No vstore2 is generated here either.
+ */
+#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
+ _CLC_OVERLOAD _CLC_INLINE void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+ mem[3*offset] = vec.s0; \
+ mem[(3*offset)+1] = vec.s1; \
+ mem[(3*offset)+2] = vec.s2; \
+ } \
+ _CLC_OVERLOAD _CLC_INLINE void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+ mem[offset<<2] = vec.s0; \
+ mem[1+(offset<<2)] = vec.s1; \
+ mem[2+(offset<<2)] = vec.s2; \
+ mem[3+(offset<<2)] = vec.s3; \
+ } \
+ _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+ mem[(offset<<3)] = vec.s0; \
+ mem[1+(offset<<3)] = vec.s1; \
+ mem[2+(offset<<3)] = vec.s2; \
+ mem[3+(offset<<3)] = vec.s3; \
+ mem[4+(offset<<3)] = vec.s4; \
+ mem[5+(offset<<3)] = vec.s5; \
+ mem[6+(offset<<3)] = vec.s6; \
+ mem[7+(offset<<3)] = vec.s7; \
+ } \
+ _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
+ mem[(offset<<4)] = vec.s0; \
+ mem[1+(offset<<4)] = vec.s1; \
+ mem[2+(offset<<4)] = vec.s2; \
+ mem[3+(offset<<4)] = vec.s3; \
+ mem[4+(offset<<4)] = vec.s4; \
+ mem[5+(offset<<4)] = vec.s5; \
+ mem[6+(offset<<4)] = vec.s6; \
+ mem[7+(offset<<4)] = vec.s7; \
+ mem[8+(offset<<4)] = vec.s8; \
+ mem[9+(offset<<4)] = vec.s9; \
+ mem[10+(offset<<4)] = vec.sa; \
+ mem[11+(offset<<4)] = vec.sb; \
+ mem[12+(offset<<4)] = vec.sc; \
+ mem[13+(offset<<4)] = vec.sd; \
+ mem[14+(offset<<4)] = vec.se; \
+ mem[15+(offset<<4)] = vec.sf; \
+ } \
+
+/*
+ * VSTORE_ADDR_SPACES: instantiate the stores of one element type for the
+ * writable address spaces (__private, __local, __global); __constant is
+ * not instantiated.
+ */
+#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \
+ VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \
+ VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \
+ VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) \
+
+/*
+ * VSTORE_TYPES: the element types that receive stores -- the same list as
+ * used for the loaders above.
+ */
+#define VSTORE_TYPES() \
+ VSTORE_ADDR_SPACES(char) \
+ VSTORE_ADDR_SPACES(uchar) \
+ VSTORE_ADDR_SPACES(short) \
+ VSTORE_ADDR_SPACES(ushort) \
+ VSTORE_ADDR_SPACES(int) \
+ VSTORE_ADDR_SPACES(uint) \
+ VSTORE_ADDR_SPACES(long) \
+ VSTORE_ADDR_SPACES(ulong) \
+ VSTORE_ADDR_SPACES(float) \
+ VSTORE_ADDR_SPACES(double) \
+
+/* Emit every vstore overload described above. */
+VSTORE_TYPES()
diff --git a/src/core/commandqueue.cpp b/src/core/commandqueue.cpp
new file mode 100644
index 0000000..662dad1
--- /dev/null
+++ b/src/core/commandqueue.cpp
@@ -0,0 +1,1018 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file commandqueue.cpp
+ * \brief Command queue
+ */
+
+#include "commandqueue.h"
+#include "context.h"
+#include "deviceinterface.h"
+#include "propertylist.h"
+#include "events.h"
+
+#include <cstring>
+#include <cstdlib>
+#include <ctime>
+#include <iostream>
+#include <stdio.h>
+
+using namespace Coal;
+
+#define OOO_QUEUE_PUSH_EVENTS_THRESHOLD 64
+
+/******************************************************************************
+* CommandQueue::CommandQueue
+*
+* Creates a queue for \a device inside context \a ctx. The outcome is written
+* to \a errcode_ret: CL_INVALID_DEVICE when the context does not own the
+* device, otherwise the result of checkProperties().
+******************************************************************************/
+CommandQueue::CommandQueue(Context *ctx,
+                           DeviceInterface *device,
+                           cl_command_queue_properties properties,
+                           cl_int *errcode_ret)
+: Object(Object::T_CommandQueue, ctx), p_device(device),
+  p_num_events_in_queue(0), p_num_events_on_device(0),
+  p_num_events_completed(0),
+  p_properties(properties), p_flushed(true)
+{
+    // Synchronisation primitives guarding the event lists
+    pthread_mutex_init(&p_event_list_mutex, 0);
+    pthread_cond_init(&p_event_list_cond, 0);
+
+    if (ctx->hasDevice(device))
+    {
+        // The device is part of the context: bring it up, then validate the
+        // requested queue properties against it.
+        p_device->init();
+        *errcode_ret = checkProperties();
+    }
+    else
+    {
+        // A queue can only target a device owned by its context
+        *errcode_ret = CL_INVALID_DEVICE;
+    }
+}
+
+/******************************************************************************
+* CommandQueue::~CommandQueue()
+*
+* Releases the events still parked on the released list, then tears down the
+* synchronisation primitives.
+******************************************************************************/
+CommandQueue::~CommandQueue()
+{
+    // Must happen first: cleanReleasedEvents() takes p_event_list_mutex
+    cleanReleasedEvents();
+
+    pthread_mutex_destroy(&p_event_list_mutex);
+    pthread_cond_destroy(&p_event_list_cond);
+}
+
+/******************************************************************************
+* cl_int CommandQueue::info
+*
+* Answers a queue information query for CL_QUEUE_CONTEXT, CL_QUEUE_DEVICE,
+* CL_QUEUE_REFERENCE_COUNT or CL_QUEUE_PROPERTIES.
+*
+* param_name           which piece of information is requested
+* param_value_size     size in bytes of the buffer behind param_value
+* param_value          destination buffer, may be NULL to query the size only
+* param_value_size_ret if not NULL, receives the size of the answer
+*
+* Returns CL_SUCCESS, or CL_INVALID_VALUE for an unknown param_name or a
+* destination buffer that is too small.
+******************************************************************************/
+cl_int CommandQueue::info(cl_command_queue_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const
+{
+ void *value = 0;
+ size_t value_length = 0;
+
+ // Scratch storage for the answer. SIMPLE_ASSIGN is defined outside this
+ // file section; presumably it stores the value in the union member named
+ // after the type and points value/value_length at it -- TODO confirm.
+ union {
+ cl_uint cl_uint_var;
+ cl_device_id cl_device_id_var;
+ cl_context cl_context_var;
+ cl_command_queue_properties cl_command_queue_properties_var;
+ };
+
+ switch (param_name)
+ {
+ case CL_QUEUE_CONTEXT:
+ SIMPLE_ASSIGN(cl_context, parent());
+ break;
+
+ case CL_QUEUE_DEVICE:
+ SIMPLE_ASSIGN(cl_device_id, p_device);
+ break;
+
+ case CL_QUEUE_REFERENCE_COUNT:
+ SIMPLE_ASSIGN(cl_uint, references());
+ break;
+
+ case CL_QUEUE_PROPERTIES:
+ SIMPLE_ASSIGN(cl_command_queue_properties, p_properties);
+ break;
+
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ // A destination buffer was given but cannot hold the answer
+ if (param_value && param_value_size < value_length)
+ return CL_INVALID_VALUE;
+
+ if (param_value_size_ret)
+ *param_value_size_ret = value_length;
+
+ if (param_value)
+ std::memcpy(param_value, value, value_length);
+
+ return CL_SUCCESS;
+}
+
+/******************************************************************************
+* cl_int CommandQueue::setProperty
+*
+* Enables (enable != 0) or disables (enable == 0) the bits of \a properties on
+* this queue.
+*
+* properties     bit-field of the properties to switch on or off
+* enable         CL_TRUE to set the bits, CL_FALSE to clear them
+* old_properties if not NULL, receives the properties in effect before the
+*                call
+*
+* Returns the result of checkProperties() for the new value. When that result
+* is not CL_SUCCESS the previous properties are restored, so a rejected
+* request leaves the queue unchanged.
+******************************************************************************/
+cl_int CommandQueue::setProperty(cl_command_queue_properties properties,
+                                 cl_bool enable,
+                                 cl_command_queue_properties *old_properties)
+{
+    const cl_command_queue_properties previous = p_properties;
+
+    if (old_properties)
+        *old_properties = previous;
+
+    if (enable)
+        p_properties |= properties;
+    else
+        p_properties &= ~properties;
+
+    // checkProperties() validates p_properties itself, so the new value has
+    // to be installed before it can be checked; undo it when it is rejected.
+    const cl_int rs = checkProperties();
+
+    if (rs != CL_SUCCESS)
+        p_properties = previous;
+
+    return rs;
+}
+
+/******************************************************************************
+* cl_int CommandQueue::checkProperties
+*
+* Validates p_properties. Returns CL_INVALID_VALUE when a bit other than the
+* out-of-order and profiling flags is set, the device's error code when the
+* device query fails, CL_INVALID_QUEUE_PROPERTIES when the device does not
+* report a requested bit, and CL_SUCCESS otherwise.
+******************************************************************************/
+cl_int CommandQueue::checkProperties() const
+{
+    // The only bits a queue may carry
+    const cl_command_queue_properties known_bits =
+        CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE;
+
+    if ((p_properties & ~known_bits) != 0)
+        return CL_INVALID_VALUE;
+
+    // Ask the device which of them it handles
+    cl_command_queue_properties device_bits = known_bits;
+
+    const cl_int rs = p_device->info(CL_DEVICE_QUEUE_PROPERTIES,
+                                     sizeof(cl_command_queue_properties),
+                                     &device_bits,
+                                     0);
+
+    if (rs != CL_SUCCESS)
+        return rs;
+
+    if ((p_properties & ~device_bits) != 0)
+        return CL_INVALID_QUEUE_PROPERTIES;
+
+    return CL_SUCCESS;
+}
+
+/******************************************************************************
+* void CommandQueue::flush()
+*
+* Blocks until p_flushed is set, then releases the events waiting on the
+* released list.
+******************************************************************************/
+void CommandQueue::flush()
+{
+    pthread_mutex_lock(&p_event_list_mutex);
+
+    // Sleep on the queue's condition variable until the flag is raised
+    while (!p_flushed)
+    {
+        pthread_cond_wait(&p_event_list_cond, &p_event_list_mutex);
+    }
+
+    pthread_mutex_unlock(&p_event_list_mutex);
+
+    cleanReleasedEvents();
+}
+
+/******************************************************************************
+* void CommandQueue::finish()
+*
+* Blocks until the queue holds no more events, then releases the events
+* waiting on the released list.
+******************************************************************************/
+void CommandQueue::finish()
+{
+    // pushEventsOnDevice does not necessarily remove completed events, so
+    // sweep them first; otherwise the wait below might never end.
+    cleanEvents();
+
+    pthread_mutex_lock(&p_event_list_mutex);
+
+    // Completed events are taken out of the queue, so "everything is done"
+    // is the same as "the queue is empty".
+    while (p_num_events_in_queue != 0)
+    {
+        pthread_cond_wait(&p_event_list_cond, &p_event_list_mutex);
+    }
+
+    pthread_mutex_unlock(&p_event_list_mutex);
+
+    cleanReleasedEvents();
+}
+
+/******************************************************************************
+* cl_int CommandQueue::queueEvent(Event *event)
+*
+* Appends \a event to the queue and tries to dispatch work to the device.
+* Returns the device's error code when initEventDeviceData() fails (the event
+* is then not queued), CL_SUCCESS otherwise.
+******************************************************************************/
+cl_int CommandQueue::queueEvent(Event *event)
+{
+    // Give the device a chance to attach its own data to the event (for
+    // instance, a pointer at which memory would be mapped)
+    const cl_int rs = p_device->initEventDeviceData(event);
+
+    if (rs != CL_SUCCESS)
+        return rs;
+
+    // Enqueue at the tail; the queue now has unsubmitted work
+    pthread_mutex_lock(&p_event_list_mutex);
+    p_events.push_back(event);
+    ++p_num_events_in_queue;
+    p_flushed = false;
+    pthread_mutex_unlock(&p_event_list_mutex);
+
+    // Record the queue time when profiling is on
+    if ((p_properties & CL_QUEUE_PROFILING_ENABLE) != 0)
+        event->updateTiming(Event::Queue);
+
+    // Dispatch whatever is ready, then release finished events
+    pushEventsOnDevice();
+    cleanReleasedEvents();
+
+    return CL_SUCCESS;
+}
+
+/******************************************************************************
+* void CommandQueue::releaseEvent()
+*
+* Parks \a e on the released list; the actual release happens later in
+* cleanReleasedEvents().
+******************************************************************************/
+void CommandQueue::releaseEvent(Event *e)
+{
+    pthread_mutex_lock(&p_event_list_mutex);
+
+    p_released_events.push_back(e);
+
+    pthread_mutex_unlock(&p_event_list_mutex);
+}
+
+/******************************************************************************
+* void CommandQueue::cleanEvents()
+*
+* Moves completed events from p_events to p_released_events and updates the
+* queue counters. The sweep is skipped while few events have completed and the
+* device still has work (see the early-out below). Wakes the threads waiting
+* on the queue once it is empty, and deletes the queue when its reference
+* count is zero.
+******************************************************************************/
+void CommandQueue::cleanEvents()
+{
+ bool is_inorder =
+ (p_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) == 0;
+
+ pthread_mutex_lock(&p_event_list_mutex);
+
+ // No need to cleanEvents() every time an event finishes, so that we can
+ // save on the event traversal time. 16 is a number that can be tuned
+ // (e.g. using ooo example).
+ // The sweep only runs when at least 16 completions are pending, or the
+ // device is idle, or every queued event has completed.
+ if (p_num_events_completed < 16 && p_num_events_on_device > 0 &&
+ p_num_events_in_queue - p_num_events_completed > 0)
+ {
+ pthread_mutex_unlock(&p_event_list_mutex);
+ return;
+ }
+
+ std::list<Event *>::iterator it = p_events.begin(), oldit;
+
+ while (it != p_events.end())
+ {
+ Event *event = *it;
+
+ if (event->status() == Event::Complete)
+ {
+ // We cannot be deleted from inside us
+ event->setReleaseParent(false);
+ // Advance before erasing so that 'it' stays valid
+ oldit = it;
+ ++it;
+
+ p_num_events_in_queue -= 1;
+ p_num_events_completed -= 1;
+ p_events.erase(oldit);
+ // put Completed events into another list
+ // let main thread release/delete them
+ p_released_events.push_back(event);
+ }
+ else if (is_inorder)
+ {
+ // In Order Queue events are dispatched and completed in Order
+ break;
+ }
+ else
+ {
+ ++it;
+ }
+ }
+
+ // We have cleared the list, so wake up the sleeping threads
+ if (p_num_events_in_queue == 0)
+ pthread_cond_broadcast(&p_event_list_cond);
+
+ pthread_mutex_unlock(&p_event_list_mutex);
+
+ // Check now if we have to be deleted
+ // NOTE(review): finish() keeps using members of this object after calling
+ // cleanEvents(); confirm that the reference count cannot be zero on that
+ // path, otherwise the queue would be used after deletion.
+ if (references() == 0)
+ {
+ delete this;
+ }
+}
+
+/******************************************************************************
+* void CommandQueue::cleanReleasedEvents()
+*
+* Calls clReleaseEvent() on every event parked on the released list and
+* empties that list.
+*
+* !!! Can only be called by the main thread!!! new/delete, malloc/free are not
+* thread safe on ARM, so let main thread handle them SOLELY!
+******************************************************************************/
+void CommandQueue::cleanReleasedEvents()
+{
+    pthread_mutex_lock(&p_event_list_mutex);
+
+    // Release from the head, dropping each entry once it has been released
+    for (; !p_released_events.empty(); p_released_events.pop_front())
+    {
+        clReleaseEvent((cl_event)p_released_events.front());
+    }
+
+    pthread_mutex_unlock(&p_event_list_mutex);
+}
+
+/******************************************************************************
+* void CommandQueue::pushEventsOnDevice()
+*
+* Walks p_events and hands every event that is ready to run to the device,
+* while moving completed events to the released list and keeping p_flushed
+* and the queue counters up to date.
+*
+* Who is calling this function:
+* (ready_event, one_event_completed_on_device)
+* (not NULL, * ): worker thread, push till this one ready event
+* ( NULL, true ): worker thread, one completes, push rest on this queue
+* ( NULL, false): main thread, queued a new event, push this queue
+******************************************************************************/
+void CommandQueue::pushEventsOnDevice(Event *ready_event,
+ bool one_event_completed_on_device)
+{
+ int non_complete_events_traversed = 0;
+ bool is_ooo = (p_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0;
+ bool do_profile = (p_properties & CL_QUEUE_PROFILING_ENABLE) != 0;
+
+ pthread_mutex_lock(&p_event_list_mutex);
+
+ // The caller reports that one event of this queue has just finished on
+ // the device: account for it before deciding what to push.
+ if (one_event_completed_on_device)
+ {
+ p_num_events_on_device -= 1;
+ p_num_events_completed += 1;
+ }
+
+ // No need to push more events on Device if 1) device has already got
+ // enough to work on, and 2) not pushing won't cause starvation of this
+ // commandqueue. Not pushing can save p_event_list traversal time.
+ // 2 is a QoS number, set to 2 for the time being
+ // imagine there are multiple commandqueues on same device
+ if(is_ooo && ready_event == NULL &&
+ p_num_events_on_device > 2 && p_device->gotEnoughToWorkOn())
+ {
+ pthread_mutex_unlock(&p_event_list_mutex);
+ return;
+ }
+
+ // Explore the events in p_events and push on the device all of them that
+ // are :
+ //
+ // - Not already pushed (in Event::Queued state)
+ // - Not after a barrier, except if we begin with a barrier
+ // - If we are in-order, only the first event in Event::Queued state can
+ // be pushed
+
+ std::list<Event *>::iterator it = p_events.begin();
+ std::list<Event *>::iterator oldit;
+ bool first = true;
+
+ // We assume that we will flush the command queue (submit all the events)
+ // This will be changed in the while() when we know that not all events
+ // are submitted.
+ p_flushed = true;
+
+ while (it != p_events.end())
+ {
+ Event *event = *it;
+
+ // If the event is completed, remove it
+ if (event->status() == Event::Complete)
+ {
+ event->setReleaseParent(false);
+ // Advance before erasing so that 'it' stays valid
+ oldit = it;
+ ++it;
+
+ p_num_events_completed -= 1;
+ p_num_events_in_queue -= 1;
+ p_events.erase(oldit);
+ // put Completed events into another list
+ // let main thread release/delete them
+ p_released_events.push_back(event);
+ continue;
+ }
+
+ // If OOO queue threshold is met, skip examining the rest of events
+ // (the limit only applies when no specific ready_event was given)
+ if(ready_event == NULL &&
+ non_complete_events_traversed > OOO_QUEUE_PUSH_EVENTS_THRESHOLD)
+ break;
+ non_complete_events_traversed += 1;
+
+ // We cannot do out-of-order, so we can only push the first event.
+ if (!is_ooo && !first)
+ {
+ p_flushed = false; // There are remaining events.
+ break;
+ }
+
+ // Stop if we encounter a barrier that isn't the first event in the list.
+ if (event->type() == Event::Barrier && !first)
+ {
+ // We have events to wait, stop
+ p_flushed = false;
+ break;
+ }
+
+ // Completed events and first barriers are out, it remains real events
+ // that have to block in-order execution.
+ first = false;
+
+ // If the event is not "pushable" (in Event::Queued state), skip it
+ // It is either Submitted or Running.
+ if (event->status() != Event::Queued)
+ {
+ // Intended event is scheduled, skip the rest in queue
+ if (event == ready_event) break;
+
+ ++it;
+ continue;
+ }
+
+ // Check that all the waiting-on events of this event are finished
+ if (! event->waitEventsAllCompleted())
+ {
+ p_flushed = false;
+ // If we encounter a WaitForEvents event that is not "finished",
+ // don't push events after it.
+ if (event->type() == Event::WaitForEvents)
+ break;
+
+ // The event has its dependencies not already met.
+ ++it;
+ continue;
+ }
+
+ if (event->isInstantaneous())
+ {
+ // Set the event as completed. This will call pushEventsOnDevice,
+ // again, so release the lock to avoid a deadlock. We also return
+ // because the recursive call will continue our work.
+ pthread_mutex_unlock(&p_event_list_mutex);
+ event->setStatus(Event::Complete);
+ return;
+ }
+
+ // The event can be pushed, if we need to
+ if (do_profile) event->updateTiming(Event::Submit);
+
+ // Note that 'it' is not advanced here: the event is examined once more
+ // on the next iteration, where it is no longer Queued and is skipped.
+ event->setStatus(Event::Submitted);
+ p_num_events_on_device += 1;
+ p_device->pushEvent(event);
+ }
+
+ // When called for one specific event the walk may have stopped early, so
+ // the queue only counts as flushed if it is empty.
+ if (ready_event != NULL && p_flushed)
+ p_flushed = (p_num_events_in_queue == 0);
+
+ // Wake up the threads sleeping in flush()
+ if (p_flushed)
+ pthread_cond_broadcast(&p_event_list_cond);
+
+ pthread_mutex_unlock(&p_event_list_mutex);
+}
+
+/******************************************************************************
+* Event **CommandQueue::events(unsigned int &count)
+*
+* Returns a snapshot of the events currently in the queue.
+*
+* count                    receives the number of entries in the returned array
+* include_completed_events when false, events whose status is Event::Complete
+*                          are left out of the snapshot
+*
+* The array is allocated with malloc() and every event stored in it has been
+* retained with reference(). NULL is returned, with count set to 0, when the
+* queue is empty or the allocation fails.
+******************************************************************************/
+Event **CommandQueue::events(unsigned int &count,
+                             bool include_completed_events)
+{
+    Event **result = NULL;
+    unsigned int index = 0;
+
+    pthread_mutex_lock(&p_event_list_mutex);
+
+    // Size the array from the list that is actually walked below, so the copy
+    // can never run past the allocation.
+    const size_t capacity = p_events.size();
+
+    if (capacity > 0)
+        result = (Event **)std::malloc(capacity * sizeof(Event *));
+
+    // result stays NULL for an empty queue or a failed allocation; in both
+    // cases nothing is copied and count ends up as 0.
+    if (result != NULL)
+    {
+        std::list<Event *>::iterator it;
+
+        for (it = p_events.begin(); it != p_events.end(); ++it)
+        {
+            Event *e = *it;
+
+            if (!include_completed_events && e->status() == Event::Complete)
+                continue;
+
+            // Retain the event on behalf of the caller
+            result[index] = e;
+            result[index]->reference();
+            ++index;
+        }
+    }
+
+    count = index;
+
+    // Now result contains an immutable list of events. Even if the events
+    // become completed in another thread while result is used, the events
+    // are retained and so guaranteed to remain valid.
+    pthread_mutex_unlock(&p_event_list_mutex);
+
+    return result;
+}
+
+/******************************************************************************
+* Event::Event
+*
+* Creates an event in state \a status belonging to queue \a parent (which may
+* be NULL) and registers it as a dependent of the events in
+* \a event_wait_list.
+*
+* \a errcode_ret is only written on failure: CL_INVALID_EVENT_WAIT_LIST for an
+* inconsistent list/count pair or a NULL entry, and
+* CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST when a listed event has a
+* negative status.
+******************************************************************************/
+Event::Event(CommandQueue *parent,
+             Status status,
+             cl_uint num_events_in_wait_list,
+             const Event **event_wait_list,
+             cl_int *errcode_ret)
+: Object(Object::T_Event, parent),
+  p_status(status), p_device_data(0)
+{
+    // Synchronisation primitives protecting the event state
+    pthread_cond_init(&p_state_change_cond, 0);
+    pthread_mutex_init(&p_state_mutex, 0);
+
+    std::memset(&p_timing, 0, sizeof(p_timing));
+
+    // The list and its length must be both present or both absent
+    const bool has_list = (event_wait_list != 0);
+    const bool has_count = (num_events_in_wait_list != 0);
+
+    if (has_list != has_count)
+    {
+        *errcode_ret = CL_INVALID_EVENT_WAIT_LIST;
+        return;
+    }
+
+    // Reject NULL entries and events that already failed
+    for (cl_uint i = 0; i < num_events_in_wait_list; ++i)
+    {
+        if (event_wait_list[i] == 0)
+        {
+            *errcode_ret = CL_INVALID_EVENT_WAIT_LIST;
+            return;
+        }
+
+        if (event_wait_list[i]->status() < 0)
+        {
+            *errcode_ret = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+            return;
+        }
+    }
+
+    // Dependencies are only tracked for events that live in a queue
+    if (!parent || num_events_in_wait_list == 0)
+        return;
+
+    pthread_mutex_lock(&p_state_mutex);
+
+    for (cl_uint i = 0; i < num_events_in_wait_list; ++i)
+    {
+        // addDependentEvent() refuses events that are already COMPLETE;
+        // those must not end up in our wait list.
+        if (event_wait_list[i]->addDependentEvent(this))
+            p_wait_events.push_back(event_wait_list[i]);
+    }
+
+    pthread_mutex_unlock(&p_state_mutex);
+}
+
+/******************************************************************************
+* void Event::freeDeviceData()
+*
+* Asks the device of the parent queue to free the device data attached to
+* this event. Does nothing for an event without a parent or without device
+* data.
+******************************************************************************/
+void Event::freeDeviceData()
+{
+    if (!parent() || !p_device_data)
+        return;
+
+    // Look the device up through the queue's info interface
+    CommandQueue *queue = (CommandQueue *)parent();
+    DeviceInterface *device = 0;
+
+    queue->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *), &device, 0);
+
+    device->freeEventDeviceData(this);
+}
+
+/******************************************************************************
+* Event::~Event()
+*
+* Destroys the synchronisation primitives created by the constructor.
+******************************************************************************/
+Event::~Event()
+{
+    pthread_mutex_destroy(&p_state_mutex);
+
+    pthread_cond_destroy(&p_state_change_cond);
+}
+
+/******************************************************************************
+* bool Event::isInstantaneous()
+*
+* Tells whether this event has no work for the execution device, in which
+* case it is completed as soon as it would have been submitted. That is the
+* case for Marker, User, Barrier and WaitForEvents events.
+******************************************************************************/
+bool Event::isInstantaneous() const
+{
+    bool instantaneous = false;
+
+    switch (type())
+    {
+        case Marker:
+        case User:
+        case Barrier:
+        case WaitForEvents:
+            instantaneous = true;
+            break;
+
+        default:
+            break;
+    }
+
+    return instantaneous;
+}
+
+/******************************************************************************
+* int Event::setStatusHelper
+*
+* Stores the new status, wakes the threads blocked in waitForStatus() and runs
+* the callbacks registered for that status. Returns the number of dependent
+* events recorded at the time of the change.
+******************************************************************************/
+int Event::setStatusHelper(Status status)
+{
+ int num_dependent_events;
+
+ // TODO: If status < 0, terminate all the events depending on us.
+ pthread_mutex_lock(&p_state_mutex);
+ p_status = status;
+ num_dependent_events = p_dependent_events.size();
+
+ pthread_cond_broadcast(&p_state_change_cond);
+
+ // Call the callbacks
+ std::multimap<Status, CallbackData>::const_iterator it;
+ std::pair<std::multimap<Status, CallbackData>::const_iterator,
+ std::multimap<Status, CallbackData>::const_iterator> ret;
+
+ // A status that is not positive is looked up under Complete, so the
+ // callbacks registered for Complete also run for error codes.
+ ret = p_callbacks.equal_range(status > 0 ? status : Complete);
+
+ // The callbacks run while p_state_mutex is held: a callback that calls a
+ // method taking this mutex (status(), reference(), ...) would deadlock.
+ for (it=ret.first; it!=ret.second; ++it)
+ {
+ const CallbackData &data = (*it).second;
+ data.callback((cl_event)this, p_status, data.user_data);
+ }
+
+ pthread_mutex_unlock(&p_state_mutex);
+
+ return num_dependent_events;
+}
+
+/******************************************************************************
+* void Event::setStatus
+*
+* Changes the status of the event. For user events, and for queued events
+* that become Complete, the dependent events are notified and the affected
+* command queues are asked to push more work to the device. Any other change
+* only updates the status and runs the callbacks.
+******************************************************************************/
+void Event::setStatus(Status status)
+{
+ if (type() == Event::User || (parent() && status == Complete))
+ {
+ // Saved up front: see the lifetime remark below
+ CommandQueue *cq = (CommandQueue *) parent();
+
+ int num_dependent_events = setStatusHelper(status);
+ /*---------------------------------------------------------------------
+ * From this point on, the event could be dereferenced to 0 and deleted!
+ * Thus we cannot call flushQueues(). Need to save these queues.
+ *--------------------------------------------------------------------*/
+
+ /*---------------------------------------------------------------------
+ * Notify dependent events, remove dependence, and push them if possible
+ *--------------------------------------------------------------------*/
+ for (int i = 0; i < num_dependent_events; i += 1)
+ {
+ Event *d_event = p_dependent_events[i];
+ CommandQueue *q = (CommandQueue *) d_event->parent();
+ if (d_event->removeWaitEvent(this) && q != NULL) // order!
+ {
+ // The completion is reported to our own queue at most once:
+ // cq is cleared as soon as that queue has been told.
+ q->pushEventsOnDevice(d_event, (cq == q));
+ if (cq == q) cq = NULL;
+ }
+ }
+
+ /*---------------------------------------------------------------------
+ * Inform our parent to push other events to the device if haven't done
+ * so already. UserEvent's parent is NULL.
+ *--------------------------------------------------------------------*/
+ if (cq != NULL) cq->pushEventsOnDevice(NULL, true);
+ }
+ else
+ setStatusHelper(status);
+}
+
+/******************************************************************************
+* bool Event::addDependentEvent
+*
+* Records \a event as depending on this event and retains this event on its
+* behalf. Returns false, without recording anything, when this event is
+* already Complete.
+******************************************************************************/
+bool Event::addDependentEvent(Event *event)
+{
+    bool added = false;
+
+    pthread_mutex_lock(&p_state_mutex);
+
+    if (p_status != Event::Complete)
+    {
+        p_dependent_events.push_back(event);
+        // Retain this event. Object::reference() is called directly because
+        // Event::reference() would take p_state_mutex a second time.
+        Object::reference();
+        added = true;
+    }
+
+    pthread_mutex_unlock(&p_state_mutex);
+
+    return added;
+}
+
+/******************************************************************************
+* bool Event::removeWaitEvent
+*
+* Removes \a event from the events this event is waiting for and, when
+* \a event belongs to a queue, hands it to that queue for release. Returns
+* true when nothing is left to wait for.
+******************************************************************************/
+bool Event::removeWaitEvent(Event *event)
+{
+    pthread_mutex_lock(&p_state_mutex);
+
+    p_wait_events.remove(event);
+    const bool nothing_left = p_wait_events.empty();
+
+    pthread_mutex_unlock(&p_state_mutex);
+
+    // The release itself is deferred to the queue owning the event.
+    // NOTE(review): an event without a parent queue is not released here;
+    // confirm that its reference is dropped elsewhere.
+    CommandQueue *owner = (CommandQueue *) event->parent();
+
+    if (owner != NULL)
+        owner->releaseEvent(event);
+
+    return nothing_left;
+}
+
+/******************************************************************************
+* bool Event::waitEventsAllCompleted
+*
+* Returns true when the list of events this event waits for is empty.
+******************************************************************************/
+bool Event::waitEventsAllCompleted()
+{
+// YUAN TODO: p_wait_events is always shrinking, is lock necessary?
+// it is a little bit faster without having to lock!!!
+// The locked variant below is the one that gets compiled (#if 1); the
+// unlocked variant is kept in the #else branch for reference only.
+#if 1
+ bool empty;
+
+ pthread_mutex_lock(&p_state_mutex);
+ empty = p_wait_events.empty();
+ pthread_mutex_unlock(&p_state_mutex);
+
+ return empty;
+#else
+ return p_wait_events.empty();
+#endif
+}
+
+/******************************************************************************
+* void Event::reference, dereference
+*
+* Wrappers around Object::reference() and Object::dereference() that run
+* under p_state_mutex, because the main thread and the worker threads may all
+* update the reference count of an event.
+******************************************************************************/
+void Event::reference()
+{
+    pthread_mutex_lock(&p_state_mutex);
+
+    Object::reference();
+
+    pthread_mutex_unlock(&p_state_mutex);
+}
+
+bool Event::dereference()
+{
+    pthread_mutex_lock(&p_state_mutex);
+
+    // Forward the base-class result to the caller
+    const bool result = Object::dereference();
+
+    pthread_mutex_unlock(&p_state_mutex);
+
+    return result;
+}
+
+/******************************************************************************
+* void Event::setDeviceData
+*
+* Stores the opaque pointer \a data with the event; it is handed back by
+* deviceData(). No lock is taken.
+******************************************************************************/
+void Event::setDeviceData(void *data)
+{
+    this->p_device_data = data;
+}
+
+/******************************************************************************
+* void Event::updateTiming
+*
+* Records the current time, in microseconds, as the time at which the event
+* reached the stage \a timing. Only the first call for a given stage has an
+* effect. The monotonic clock is used when available, the realtime clock
+* otherwise; when neither can be read the stage is left unset.
+******************************************************************************/
+void Event::updateTiming(Timing timing)
+{
+    if (timing >= Max)
+        return;
+
+    pthread_mutex_lock(&p_state_mutex);
+
+    // Don't update more than one time (NDRangeKernel for example)
+    if (p_timing[timing])
+    {
+        pthread_mutex_unlock(&p_state_mutex);
+        return;
+    }
+
+    struct timespec tp;
+
+    if (clock_gettime(CLOCK_MONOTONIC, &tp) != 0 &&
+        clock_gettime(CLOCK_REALTIME, &tp) != 0)
+    {
+        // No clock could be read: tp holds no valid time, record nothing
+        pthread_mutex_unlock(&p_state_mutex);
+        return;
+    }
+
+    // Widen before multiplying: time_t can be 32 bits wide, in which case
+    // seconds * 1000000 overflows after about 35 minutes on the clock.
+    cl_ulong rs = (cl_ulong)tp.tv_sec * 1000000;  // seconds -> microseconds
+    rs += (cl_ulong)tp.tv_nsec / 1000;            // nanoseconds -> microseconds
+
+    p_timing[timing] = rs;
+
+    pthread_mutex_unlock(&p_state_mutex);
+}
+
+/******************************************************************************
+* Event::Status Event::status() const
+*
+* Returns the current status of the event, read under p_state_mutex.
+******************************************************************************/
+Event::Status Event::status() const
+{
+    // HACK : the method is const but locking needs a non-const mutex
+    Event *self = const_cast<Event *>(this);
+
+    pthread_mutex_lock(&self->p_state_mutex);
+    const Status current = p_status;
+    pthread_mutex_unlock(&self->p_state_mutex);
+
+    return current;
+}
+
+/******************************************************************************
+* void Event::waitForStatus(Status status)
+*
+* Blocks the caller until the event has status \a status, or until its
+* status is no longer positive.
+******************************************************************************/
+void Event::waitForStatus(Status status)
+{
+    pthread_mutex_lock(&p_state_mutex);
+
+    for (;;)
+    {
+        // Done when the requested status is reached or the status has dropped
+        // to zero or below
+        if (p_status == status || p_status <= 0)
+            break;
+
+        pthread_cond_wait(&p_state_change_cond, &p_state_mutex);
+    }
+
+    pthread_mutex_unlock(&p_state_mutex);
+}
+
+/******************************************************************************
+* void *Event::deviceData()
+*
+* Returns the opaque pointer stored by setDeviceData(), or 0 when none has
+* been set. No lock is taken.
+******************************************************************************/
+void *Event::deviceData()
+{
+    return this->p_device_data;
+}
+
+/******************************************************************************
+* void Event::setCallback
+*
+* Registers \a callback, together with \a user_data, to be run by
+* setStatusHelper() when the event changes to the status
+* \a command_exec_callback_type.
+*
+* NOTE(review): nothing is called here when the event already has that
+* status; confirm that callers do not rely on that.
+******************************************************************************/
+void Event::setCallback(cl_int command_exec_callback_type,
+                        event_callback callback,
+                        void *user_data)
+{
+    CallbackData entry;
+    entry.callback = callback;
+    entry.user_data = user_data;
+
+    const std::pair<Status, CallbackData> keyed(
+        (Status)command_exec_callback_type, entry);
+
+    pthread_mutex_lock(&p_state_mutex);
+    p_callbacks.insert(keyed);
+    pthread_mutex_unlock(&p_state_mutex);
+}
+
+/******************************************************************************
+* cl_int Event::info
+*
+* Answers an event information query for CL_EVENT_COMMAND_QUEUE,
+* CL_EVENT_CONTEXT, CL_EVENT_COMMAND_TYPE, CL_EVENT_COMMAND_EXECUTION_STATUS
+* or CL_EVENT_REFERENCE_COUNT.
+*
+* param_name           which piece of information is requested
+* param_value_size     size in bytes of the buffer behind param_value
+* param_value          destination buffer, may be NULL to query the size only
+* param_value_size_ret if not NULL, receives the size of the answer
+*
+* Returns CL_SUCCESS, or CL_INVALID_VALUE for an unknown param_name or a
+* destination buffer that is too small.
+******************************************************************************/
+cl_int Event::info(cl_event_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const
+{
+ void *value = 0;
+ size_t value_length = 0;
+
+ // Scratch storage for the answer. SIMPLE_ASSIGN is defined outside this
+ // file section; presumably it stores the value in the union member named
+ // after the type and points value/value_length at it -- TODO confirm.
+ union {
+ cl_command_queue cl_command_queue_var;
+ cl_context cl_context_var;
+ cl_command_type cl_command_type_var;
+ cl_int cl_int_var;
+ cl_uint cl_uint_var;
+ };
+
+ switch (param_name)
+ {
+ case CL_EVENT_COMMAND_QUEUE:
+ SIMPLE_ASSIGN(cl_command_queue, parent());
+ break;
+
+ case CL_EVENT_CONTEXT:
+ // A queued event takes the context from its queue; an event without
+ // a queue only has one when it is a user event.
+ if (parent())
+ {
+ SIMPLE_ASSIGN(cl_context, parent()->parent());
+ }
+ else
+ {
+ if (type() == User)
+ SIMPLE_ASSIGN(cl_context, ((UserEvent *)this)->context())
+ else
+ SIMPLE_ASSIGN(cl_context, 0);
+ }
+ break;
+
+ case CL_EVENT_COMMAND_TYPE:
+ SIMPLE_ASSIGN(cl_command_type, type());
+ break;
+
+ // avoid status() call, if called from callbacks, we deadlock on mutex
+ // (p_status is therefore read here without taking p_state_mutex)
+ case CL_EVENT_COMMAND_EXECUTION_STATUS:
+ SIMPLE_ASSIGN(cl_int, p_status);
+ break;
+
+ case CL_EVENT_REFERENCE_COUNT:
+ SIMPLE_ASSIGN(cl_uint, references());
+ break;
+
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ // A destination buffer was given but cannot hold the answer
+ if (param_value && param_value_size < value_length)
+ return CL_INVALID_VALUE;
+
+ if (param_value_size_ret)
+ *param_value_size_ret = value_length;
+
+ if (param_value)
+ std::memcpy(param_value, value, value_length);
+
+ return CL_SUCCESS;
+}
+
+/******************************************************************************
+* cl_int Event::profilingInfo(
+*
+* Answers a profiling query (queued, submit, start or end time) for this
+* event. The times are recorded in microseconds by updateTiming() and are
+* multiplied by 1000 here before being returned.
+*
+* param_name           which timestamp is requested
+* param_value_size     size in bytes of the buffer behind param_value
+* param_value          destination buffer, may be NULL to query the size only
+* param_value_size_ret if not NULL, receives the size of the answer
+*
+* Returns CL_PROFILING_INFO_NOT_AVAILABLE for user events, for queues without
+* CL_QUEUE_PROFILING_ENABLE and for events that are not Complete;
+* CL_INVALID_VALUE for an unknown param_name or a destination buffer that is
+* too small; CL_SUCCESS otherwise.
+******************************************************************************/
+cl_int Event::profilingInfo(cl_profiling_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const
+{
+ if (type() == Event::User)
+ return CL_PROFILING_INFO_NOT_AVAILABLE;
+
+ // Check that the Command Queue has profiling enabled
+ // NOTE(review): parent() is used without a NULL check; this assumes that
+ // only user events lack a parent queue -- confirm.
+ cl_command_queue_properties queue_props;
+ cl_int rs;
+
+ rs = ((CommandQueue *)parent())->info(CL_QUEUE_PROPERTIES,
+ sizeof(cl_command_queue_properties),
+ &queue_props, 0);
+
+ if (rs != CL_SUCCESS)
+ return rs;
+
+ if ((queue_props & CL_QUEUE_PROFILING_ENABLE) == 0)
+ return CL_PROFILING_INFO_NOT_AVAILABLE;
+
+ // avoid status() call, if called from callbacks, we deadlock on mutex
+ if (p_status != Event::Complete)
+ return CL_PROFILING_INFO_NOT_AVAILABLE;
+
+ void *value = 0;
+ size_t value_length = 0;
+ // Scratch storage used by SIMPLE_ASSIGN, which is defined outside this
+ // file section; presumably it stores the value here and points
+ // value/value_length at it -- TODO confirm.
+ cl_ulong cl_ulong_var;
+
+ switch (param_name)
+ {
+ case CL_PROFILING_COMMAND_QUEUED:
+ SIMPLE_ASSIGN(cl_ulong, 1000*p_timing[Queue]);
+ break;
+
+ case CL_PROFILING_COMMAND_SUBMIT:
+ SIMPLE_ASSIGN(cl_ulong, 1000*p_timing[Submit]);
+ break;
+
+ case CL_PROFILING_COMMAND_START:
+ SIMPLE_ASSIGN(cl_ulong, 1000*p_timing[Start]);
+ break;
+
+ case CL_PROFILING_COMMAND_END:
+ SIMPLE_ASSIGN(cl_ulong, 1000*p_timing[End]);
+ break;
+
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ // A destination buffer was given but cannot hold the answer
+ if (param_value && param_value_size < value_length)
+ return CL_INVALID_VALUE;
+
+ if (param_value_size_ret)
+ *param_value_size_ret = value_length;
+
+ if (param_value)
+ std::memcpy(param_value, value, value_length);
+
+ return CL_SUCCESS;
+}
+
diff --git a/src/core/commandqueue.h b/src/core/commandqueue.h
new file mode 100644
index 0000000..7d2c65e
--- /dev/null
+++ b/src/core/commandqueue.h
@@ -0,0 +1,494 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file commandqueue.h
+ * \brief Command queue and base class for events
+ */
+
+#ifndef __COMMANDQUEUE_H__
+#define __COMMANDQUEUE_H__
+
+#include "object.h"
+
+#include <CL/cl.h>
+#include <pthread.h>
+
+#include <map>
+#include <list>
+#include <vector>
+
+namespace Coal
+{
+
+class Context;
+class DeviceInterface;
+class Event;
+
+/**
+ * \brief Command queue
+ *
+ * This class holds a list of events that will be pushed on a given device.
+ *
+ * More details are given on the \ref events page.
+ */
+class CommandQueue : public Object
+{
+ public:
+ CommandQueue(Context *ctx,
+ DeviceInterface *device,
+ cl_command_queue_properties properties,
+ cl_int *errcode_ret);
+ ~CommandQueue();
+
+ /**
+ * \brief Queue an event
+ * \param event event to be queued
+ * \return \c CL_SUCCESS if success, otherwise an error code
+ */
+ cl_int queueEvent(Event *event);
+
+ /**
+ * \brief Information about the command queue
+ * \copydetails Coal::DeviceInterface::info
+ */
+ cl_int info(cl_command_queue_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ /**
+ * \brief Set properties of the command queue
+ * \note This function is deprecated and only there for OpenCL 1.0
+ * compatibility
+ * \param properties property to enable or disable
+ * \param enable true to enable the property, false to disable it
+ * \param old_properties old value of the properties, ignored if NULL
+ * \return \c CL_SUCCESS if all is good, an error code if \p properties is
+ * invalid
+ */
+ cl_int setProperty(cl_command_queue_properties properties,
+ cl_bool enable,
+ cl_command_queue_properties *old_properties);
+
+ /**
+ * \brief Check the properties given
+ * \return \c CL_SUCCESS if they are valid, an error code otherwise
+ */
+ cl_int checkProperties() const;
+
+ /**
+ * \brief Push events on the device
+ *
+ * This function implements a big part of what is described in
+ * \ref events .
+ *
+ * It is called by \c Coal::Event::setStatus() when an event is
+ * completed, or by \c queueEvent(). Its purpose is to explore the list
+ * of queued events (\c p_events) and to call
+ * \c Coal::DeviceInterface::pushEvent() for each event meeting its push
+ * conditions.
+ *
+ * \param ready_event an event that is known to be pushable: push the
+ * events in the queue up to this point and skip the events after it.
+ *
+ * \param one_event_completed_on_device can be used to differentiate
+ * whether this function is called by a worker thread when an event is
+ * completed, or by the main thread's queueEvent().
+ *
+ * \section conditions Conditions
+ *
+ * If the command queue has the \c CL_OUT_OF_ORDER_EXEC_MODE_ENABLE
+ * property disabled, an event can be pushed only if all the previous
+ * ones in the list are completed with success. This way, an event
+ * must be completed before any other can be pushed. This ensures
+ * in-order execution.
+ *
+ * If this property is enabled, more complex heuristics are used.
+ *
+ * The event list \c p_events is explored from top to bottom. At each
+ * loop iteration, checks are performed to see if the event can be pushed.
+ *
+ * - When a \c Coal::BarrierEvent is encountered, no more events can be
+ * pushed, except if the \c Coal::BarrierEvent is the first in the list,
+ * as that means there are no other events that can be pushed, so the
+ * barrier can go away
+ * - All events that are already pushed or finished are skipped
+ * - The wait list of the event is then explored to ensure that all its
+ * dependencies are met.
+ * - Finally, if the event passes all the tests, it is either pushed on
+ * the device, or simply set to \c Coal::Event::Complete if it's a
+ * dummy event (see \c Coal::Event::isInstantaneous()).
+ */
+ void pushEventsOnDevice(Event *ready_event = NULL,
+ bool one_event_completed_on_device = false);
+
+ /**
+ * \brief Push an event onto the \c p_released_events list
+ *
+ * The main thread performs the actual release later (see
+ * \c cleanReleasedEvents()).
+ */
+ void releaseEvent(Event *e);
+
+ /**
+ * \brief Remove completed events from the event list
+ *
+ * This function is called periodically to clean the event list from
+ * completed events.
+ *
+ * This has to be done outside of \c pushEventsOnDevice() because deleting
+ * an event may \c dereference() this command queue, and also delete it,
+ * which would produce crashes.
+ */
+ void cleanEvents();
+
+ /**
+ * \brief Release events on the released event list
+ *
+ * This function is called periodically to release the events on the
+ * released events list. This is only performed on the main thread
+ * because deleting/freeing memory from worker thread has caused
+ * weird memory problems on ARM.
+ *
+ */
+ void cleanReleasedEvents();
+
+ /**
+ * \brief Flush the command queue
+ *
+ * Pushes all the events on the device, and then returns. The events
+ * don't need to be completed after this call.
+ */
+ void flush();
+
+ /**
+ * \brief Finish the command queue
+ *
+ * Pushes the events like \c flush() but also waits for them to be
+ * completed before returning.
+ */
+ void finish();
+
+ /**
+ * \brief Return all the events in the command queue
+ * \note Retains all the events
+ * \param count number of events in the event queue
+ * \param include_completed_events defaults to true
+ * \return events currently in the event queue
+ */
+ Event **events(unsigned int &count,
+ bool include_completed_events = true);
+
+ private:
+ DeviceInterface *p_device;
+ cl_int p_num_events_in_queue;
+ cl_int p_num_events_on_device;
+ cl_int p_num_events_completed;
+ cl_command_queue_properties p_properties;
+
+ std::list<Event *> p_events;
+ std::list<Event *> p_released_events;
+ pthread_mutex_t p_event_list_mutex;
+ pthread_cond_t p_event_list_cond;
+ bool p_flushed;
+};
+
+/**
+ * \brief Base class for all events
+ *
+ * This class contains logic common to all the events.
+ *
+ * Besides handling OpenCL-specific stuff, \c Coal::Event objects do nothing
+ * implementation-wise. They do not compile kernels, copy data around, etc.
+ * They only contain static and immutable data that is then used by the devices
+ * to actually implement the event.
+ */
+class Event : public Object
+{
+ public:
+ /**
+ * \brief Event type
+ *
+ * This allows objects using \c Coal::Event to know which event it is,
+ * and to cast it to the correct sub-class.
+ */
+ enum Type
+ {
+ NDRangeKernel = CL_COMMAND_NDRANGE_KERNEL,
+ TaskKernel = CL_COMMAND_TASK,
+ NativeKernel = CL_COMMAND_NATIVE_KERNEL,
+ ReadBuffer = CL_COMMAND_READ_BUFFER,
+ WriteBuffer = CL_COMMAND_WRITE_BUFFER,
+ CopyBuffer = CL_COMMAND_COPY_BUFFER,
+ ReadImage = CL_COMMAND_READ_IMAGE,
+ WriteImage = CL_COMMAND_WRITE_IMAGE,
+ CopyImage = CL_COMMAND_COPY_IMAGE,
+ CopyImageToBuffer = CL_COMMAND_COPY_IMAGE_TO_BUFFER,
+ CopyBufferToImage = CL_COMMAND_COPY_BUFFER_TO_IMAGE,
+ MapBuffer = CL_COMMAND_MAP_BUFFER,
+ MapImage = CL_COMMAND_MAP_IMAGE,
+ UnmapMemObject = CL_COMMAND_UNMAP_MEM_OBJECT,
+ Marker = CL_COMMAND_MARKER,
+ AcquireGLObjects = CL_COMMAND_ACQUIRE_GL_OBJECTS,
+ ReleaseGLObjects = CL_COMMAND_RELEASE_GL_OBJECTS,
+ ReadBufferRect = CL_COMMAND_READ_BUFFER_RECT,
+ WriteBufferRect = CL_COMMAND_WRITE_BUFFER_RECT,
+ CopyBufferRect = CL_COMMAND_COPY_BUFFER_RECT,
+ User = CL_COMMAND_USER,
+ Barrier,
+ WaitForEvents
+ };
+
+ /**
+ * \brief Event status
+ */
+ enum Status
+ {
+ Queued = CL_QUEUED, /*!< \brief Simply queued in a command queue */
+ Submitted = CL_SUBMITTED, /*!< \brief Submitted to a device */
+ Running = CL_RUNNING, /*!< \brief Running on the device */
+ Complete = CL_COMPLETE /*!< \brief Completed */
+ };
+
+ /**
+ * \brief Function that can be called when an event changes status
+ */
+ typedef void (CL_CALLBACK *event_callback)(cl_event, cl_int, void *);
+
+ /**
+ * Structure used internally by \c Coal::Event to store for each event
+ * status the callbacks to call with the corresponding \c user_data.
+ */
+ struct CallbackData
+ {
+ event_callback callback; /*!< Function to call */
+ void *user_data; /*!< Pointer to pass as its third argument */
+ };
+
+ /**
+ * \brief Timing counters of an event
+ */
+ enum Timing
+ {
+ Queue, /*!< Time when the event was queued */
+ Submit, /*!< Time when the event was submitted to the device */
+ Start, /*!< Time when its execution began on the device */
+ End, /*!< Time when its execution finished */
+ Max /*!< Number of items in this enum */
+ };
+
+ public:
+ /**
+ * \brief Constructor
+ * \param parent parent \c Coal::CommandQueue
+ * \param status \c Status the event has when it is created
+ * \param num_events_in_wait_list number of events to wait on
+ * \param event_wait_list list of events to wait on
+ * \param errcode_ret return value
+ */
+ Event(CommandQueue *parent,
+ Status status,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ void freeDeviceData(); /*!< \brief Call \c Coal::DeviceInterface::freeEventDeviceData() */
+ virtual ~Event(); /*!< \brief Destructor */
+
+ /**
+ * \brief Type of the event
+ * \return type of the event
+ */
+ virtual Type type() const = 0;
+
+ /**
+ * \brief Dummy event
+ *
+ * A dummy event is an event that doesn't have to be pushed on a device,
+ * it is only a hint for \c Coal::CommandQueue
+ *
+ * \return true if the event is dummy
+ */
+ bool isInstantaneous() const;
+
+ /**
+ * \brief Set the event status
+ *
+ * This function calls the event callbacks, and
+ * \c Coal::CommandQueue::pushEventsOnDevice() if \p status is
+ * \c Complete .
+ *
+ * \param status new status of the event
+ */
+ void setStatus(Status status);
+
+ /**
+ * \brief Increase Event reference count
+ *
+ * This function uses a mutex to protect the reference count
+ * \c update in the underlying object.
+ */
+ void reference();
+
+ /**
+ * \brief Decrease Event reference count
+ *
+ * This function uses a mutex to protect the reference count
+ * \c update in the underlying object.
+ *
+ * \return true if the reference count is decreased to 0
+ */
+ bool dereference();
+
+ /**
+ * \brief Set device-specific data
+ * \param data device-specific data
+ */
+ void setDeviceData(void *data);
+
+ /**
+ * \brief Update timing info
+ *
+ * This function reads current system time and puts it in \c p_timing
+ *
+ * \param timing timing event having just finished
+ */
+ void updateTiming(Timing timing);
+
+ /**
+ * \brief Status
+ * \return status of the event
+ */
+ Status status() const;
+
+ /**
+ * \brief Wait for a specified status
+ *
+ * This function blocks until the event's status is set to \p status
+ * by another thread.
+ *
+ * \param status the status the event must have for the function to return
+ */
+ void waitForStatus(Status status);
+
+ /**
+ * \brief Device-specific data
+ * \return data set using \c setDeviceData()
+ */
+ void *deviceData();
+
+ /**
+ * \brief Add a callback for this event
+ * \param command_exec_callback_type status the event must have in order
+ * to have the callback called
+ * \param callback callback function
+ * \param user_data user data given to the callback
+ */
+ void setCallback(cl_int command_exec_callback_type,
+ event_callback callback,
+ void *user_data);
+
+ /**
+ * \brief Info about the event
+ * \copydetails Coal::DeviceInterface::info
+ */
+ cl_int info(cl_event_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ /**
+ * \brief Profiling info
+ * \copydetails Coal::DeviceInterface::info
+ */
+ cl_int profilingInfo(cl_profiling_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ /**
+ * \brief Call \c Coal::CommandQueue::pushEventsOnDevice() for each command queue
+ * in which this event is queued or each queue with an event waiting on this event
+ */
+ void flushQueues();
+
+
+ /**
+ * \brief Add event to p_dependent_events, which will be notified when
+ * the current event completes.
+ * \param event the event to be notified
+ * \return false if the current event is already complete, in which case
+ * \p event is not added
+ */
+ bool addDependentEvent(Event *event);
+
+ /**
+ * \brief Remove event from p_wait_events, which should be waited on
+ * before the current event can start.
+ * \param event the event to be removed from p_wait_events
+ * \return true when p_wait_events becomes empty, indicating that the
+ * current event is ready to be pushed
+ */
+ bool removeWaitEvent(Event *event);
+
+ /**
+ * \brief Check if there are no more events to wait on before current
+ * event can start.
+ */
+ bool waitEventsAllCompleted();
+
+ private:
+ /**
+ * \brief Helper function for setStatus()
+ * \return number of dependent events
+ */
+ int setStatusHelper(Status status);
+
+ private:
+ pthread_cond_t p_state_change_cond;
+ pthread_mutex_t p_state_mutex;
+
+ Status p_status;
+ void *p_device_data;
+ std::multimap<Status, CallbackData> p_callbacks;
+
+ // One counter per Timing value. These are 32-bit; profilingInfo()
+ // multiplies them by 1000 before reporting them as cl_ulong.
+ cl_uint p_timing[Max];
+
+ // p_wait_events: I must wait until these events complete
+ // p_dependent_events: when I complete, I should notify these events
+ std::list<const Event *> p_wait_events;
+ std::vector<Event *> p_dependent_events;
+};
+
+}
+
+// Empty struct deriving from Coal::CommandQueue, declared in the global
+// namespace. NOTE(review): presumably the type behind the opaque
+// cl_command_queue handle declared in CL/cl.h - confirm.
+struct _cl_command_queue : public Coal::CommandQueue
+{};
+
+// Empty struct deriving from Coal::Event, declared in the global namespace.
+// NOTE(review): presumably the type behind the opaque cl_event handle
+// declared in CL/cl.h - confirm.
+struct _cl_event : public Coal::Event
+{};
+
+#endif
diff --git a/src/core/compiler.cpp b/src/core/compiler.cpp
new file mode 100644
index 0000000..d4d5240
--- /dev/null
+++ b/src/core/compiler.cpp
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file compiler.cpp
+ * \brief Compiler wrapper around Clang
+ */
+
+#include "compiler.h"
+#include "deviceinterface.h"
+
+#include <cstring>
+#include <cstdio>
+#include <string>
+#include <sstream>
+#include <iostream>
+#include <clang/Frontend/CompilerInvocation.h>
+#include <clang/Frontend/TextDiagnosticPrinter.h>
+#include <clang/Frontend/LangStandard.h>
+#include <clang/Basic/Diagnostic.h>
+#include <clang/CodeGen/CodeGenAction.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Support/Host.h>
+#include <llvm/Support/MemoryBuffer.h> // ASW
+#include <llvm/IR/Module.h>
+#include <llvm/IR/LLVMContext.h>
+#include <sys/stat.h>
+
+// Defined outside this file. When SHAMROCK_BUILD is not defined,
+// Compiler::compile() adds the returned string as an angled include path.
+std::string get_ocl_dsp();
+
+using namespace Coal;
+
+// Bind the compiler to \p device. No module exists until compile() succeeds,
+// optimization is on by default, and the log stream writes into p_log.
+Compiler::Compiler(DeviceInterface *device)
+    : p_device(device),
+      p_module(NULL),
+      p_optimize(true),
+      p_log_stream(p_log),
+      p_log_printer(NULL)
+{
+}
+
+// Intentionally empty: neither p_module nor p_log_printer is released here.
+// NOTE(review): p_log_printer is handed to p_compiler.createDiagnostics() in
+// compile(); confirm that call takes ownership of it.
+Compiler::~Compiler()
+{
+
+}
+
+/**
+ * Compile OpenCL C \p source into an LLVM module using Clang.
+ *
+ * \param options space-separated build options (-I, -D, -cl-*, -w, -Werror,
+ *        -cl-std=CL1.1); they are stored and returned by options()
+ * \param source  buffer holding the program source
+ * \return 0 (false) on success, in which case module() returns the result;
+ *         CL_INVALID_BUILD_OPTIONS if an unrecognized option is met;
+ *         1 (true) if the diagnostics engine cannot be created or the
+ *         Clang action fails (details are then available through log()).
+ */
+int Compiler::compile(const std::string &options,
+                      llvm::MemoryBuffer *source)
+{
+    /* Set options */
+    p_options = options;
+
+    clang::CodeGenOptions &codegen_opts = p_compiler.getCodeGenOpts();
+    clang::DiagnosticOptions &diag_opts = p_compiler.getDiagnosticOpts();
+    clang::FrontendOptions &frontend_opts = p_compiler.getFrontendOpts();
+    clang::HeaderSearchOptions &header_opts = p_compiler.getHeaderSearchOpts();
+    clang::LangOptions &lang_opts = p_compiler.getLangOpts();
+    clang::TargetOptions &target_opts = p_compiler.getTargetOpts();
+    clang::PreprocessorOptions &prep_opts = p_compiler.getPreprocessorOpts();
+    clang::CompilerInvocation &invocation = p_compiler.getInvocation();
+
+    // Set codegen options
+    codegen_opts.setDebugInfo(clang::CodeGenOptions::NoDebugInfo);
+    codegen_opts.AsmVerbose = true;
+    codegen_opts.CodeModel = "default";
+
+    // level 3 is too much for the pocl transformations.
+    codegen_opts.OptimizationLevel = 2;
+
+    // Set diagnostic options
+    diag_opts.Pedantic = true;
+    diag_opts.ShowColumn = true;
+    diag_opts.ShowLocation = true;
+    diag_opts.ShowCarets = false;
+    diag_opts.ShowFixits = true;
+    diag_opts.ShowColors = false;
+    diag_opts.ErrorLimit = 19;
+    diag_opts.MessageLength = 0;
+
+    // Set frontend options
+    frontend_opts.ProgramAction = clang::frontend::EmitLLVMOnly;
+    frontend_opts.DisableFree = true;
+
+    // Set header search options
+    header_opts.Verbose = false;
+    header_opts.UseBuiltinIncludes = false;
+    header_opts.UseStandardSystemIncludes = false;
+    header_opts.UseStandardCXXIncludes = false;
+
+    // Set preprocessor options
+    prep_opts.RetainRemappedFileBuffers = true;
+    //prep_opts.ImplicitPCHInclude = "/usr/share/ti/opencl/clc.h";
+    prep_opts.Includes.push_back("clc.h");
+    prep_opts.Includes.push_back(p_device->builtinsHeader());
+
+    // Set lang options
+    lang_opts.NoBuiltin = true;
+    lang_opts.OpenCL = true;
+    lang_opts.CPlusPlus = false;
+
+    // Set target options
+    cl_device_type devtype;
+    p_device->info(CL_DEVICE_TYPE, sizeof(devtype), &devtype, 0);
+
+    if (devtype == CL_DEVICE_TYPE_CPU) {
+        // Originally: target_opts.Triple = llvm::sys::getHostTriple();
+        target_opts.Triple = llvm::sys::getDefaultTargetTriple();
+    }
+    else // devtype != CL_DEVICE_TYPE_CPU
+    {
+        // For 6X, use the 'spir' target, since it implements opencl specs
+        target_opts.Triple = "spir-unknown-unknown-unknown";
+
+        // Currently, llp6x does not handle fused multiply and add
+        // llvm intrinsics (llvm.fmuladd.*). Disable generating these
+        // intrinsics using clang -ffp-contract=off option
+        codegen_opts.setFPContractMode(clang::CodeGenOptions::FPC_Off);
+    }
+
+    // Parse the user options
+    std::istringstream options_stream(options);
+    std::string token;
+    bool Werror = false, inI = false, inD = false;
+
+#ifndef SHAMROCK_BUILD
+    // Add opencl-headers' package default install include path as location to search
+    std::string header_path(get_ocl_dsp());
+#else  // TODO: /usr/include/CL is where opencl headers go, but use ENV vars?
+    std::string header_path("/usr/include/CL");
+#endif
+    header_opts.AddPath(header_path, clang::frontend::Angled, false, false);
+
+
+    while (options_stream >> token)
+    {
+        if (inI)
+        {
+            // token is an include path
+            header_opts.AddPath(token, clang::frontend::Angled, false, false);
+            inI = false;
+            continue;
+        }
+        else if (inD)
+        {
+            // token is name or name=value
+            prep_opts.addMacroDef(token);
+            inD = false;
+            continue;
+        }
+
+        //Handle -I xxx or -Ixxx. Assuming no other -I option prefix
+        if (token == "-I")
+        {
+            inI = true;
+        }
+        else if (token.compare(0,2,"-I") == 0)
+        {
+            header_opts.AddPath(token.substr(2), clang::frontend::Angled, false,
+                                false);
+        }
+        //Handle -D xxx or -Dxxx. Assuming no other -D option prefix
+        else if (token == "-D")
+        {
+            inD = true;
+        }
+        else if (token.compare(0,2,"-D") == 0) //Handle -Dxxx (no space between)
+        {
+            prep_opts.addMacroDef(token.substr(2));
+        }
+        else if (token == "-cl-single-precision-constant")
+        {
+            lang_opts.SinglePrecisionConstants = true;
+        }
+        else if (token == "-cl-opt-disable")
+        {
+            p_optimize = false;
+            codegen_opts.OptimizationLevel = 0;
+        }
+        else if (token == "-cl-mad-enable")
+        {
+            codegen_opts.LessPreciseFPMAD = true;
+        }
+        else if (token == "-cl-unsafe-math-optimizations")
+        {
+            codegen_opts.UnsafeFPMath = true;
+        }
+        else if (token == "-cl-finite-math-only")
+        {
+            codegen_opts.NoInfsFPMath = true;
+            codegen_opts.NoNaNsFPMath = true;
+        }
+        else if (token == "-cl-fast-relaxed-math")
+        {
+            codegen_opts.UnsafeFPMath = true;
+            codegen_opts.NoInfsFPMath = true;
+            codegen_opts.NoNaNsFPMath = true;
+            lang_opts.FastRelaxedMath = true;
+        }
+        else if (token == "-w")
+        {
+            diag_opts.IgnoreWarnings = true;
+        }
+        else if (token == "-Werror")
+        {
+            Werror = true;
+        }
+        else if (token == "-cl-std=CL1.1")
+        {
+            // Accepted, nothing to configure
+        }
+        else
+        {
+            return CL_INVALID_BUILD_OPTIONS;
+        }
+    }
+
+    add_macrodefs_for_supported_opencl_extensions(prep_opts);
+
+    // Set invocation options
+    //invocation.setLangDefaults(lang_opts,clang::IK_OpenCL);
+    invocation.setLangDefaults(lang_opts,clang::IK_OpenCL, clang::LangStandard::lang_opencl12);
+
+    // Create the diagnostics engine
+    p_log_printer = new clang::TextDiagnosticPrinter(p_log_stream, &diag_opts);
+    p_compiler.createDiagnostics(p_log_printer);
+
+    // This function returns 0/false on success, so a missing diagnostics
+    // engine must be reported with a non-zero value.
+    if (!p_compiler.hasDiagnostics())
+        return true;
+
+    p_compiler.getDiagnostics().setWarningsAsErrors(Werror);
+
+    // Feed the compiler with source
+    frontend_opts.Inputs.push_back(clang::FrontendInputFile("program.cl", clang::IK_OpenCL));
+
+    // Wrap the caller's text in a buffer of our own and map it onto the
+    // virtual input file "program.cl"
+    const llvm::StringRef s_data(source->getBuffer());
+    const llvm::StringRef s_name("<source>");
+    llvm::MemoryBuffer *buffer =
+                llvm::MemoryBuffer::getMemBuffer(s_data, s_name);
+
+    prep_opts.addRemappedFile("program.cl", buffer);
+
+    //timespec t0, t1;
+    //clock_gettime(CLOCK_MONOTONIC, &t0);
+    // Compile
+
+    // NOTE(review): Act is never deleted, on either the success or the
+    // failure path. Left unchanged here; confirm whether it can be released
+    // once takeModule() has been called.
+    clang::CodeGenAction *Act = new clang::EmitLLVMOnlyAction(&llvm::getGlobalContext());
+    if (!p_compiler.ExecuteAction(*Act))
+    {
+        // Make sure everything written to the log stream has reached p_log
+        // before log() is read here or by the caller
+        p_log_stream.flush();
+
+        // DEBUG
+        std::cout << log() << std::endl;
+        return true;
+    }
+
+    //clock_gettime(CLOCK_MONOTONIC, &t1);
+    //printf("clang time: %6.4f secs\n",
+       //(float)t1.tv_sec-t0.tv_sec+(t1.tv_nsec-t0.tv_nsec)/1e9);
+
+    p_log_stream.flush();
+    p_module = Act->takeModule();
+
+    // uncomment to debug the llvm IR
+    // p_module->dump();
+
+    return false;
+}
+
+// Query the device to get list of supported OpenCL extensions. Standard
+// requires that each supported extension has a macro definition with the
+// same name as the extension, so every whitespace-separated name in the
+// device's CL_DEVICE_EXTENSIONS string is added to \p prep_opts as a macro.
+// If the device query fails or reports an empty string, nothing is added.
+void Compiler::add_macrodefs_for_supported_opencl_extensions
+                                     (clang::PreprocessorOptions &prep_opts)
+{
+    // First query: only the length of the extensions string
+    size_t size = 0;
+    if (p_device->info(CL_DEVICE_EXTENSIONS, 0, NULL, &size) != CL_SUCCESS ||
+        size == 0)
+        return;
+
+    // Zero-filled storage with one spare byte, so the text is always
+    // NUL-terminated whatever the device writes into the first 'size' bytes.
+    // (The previous fill value CHAR_MIN is only 0 where char is unsigned.)
+    std::string extensions(size + 1, '\0');
+
+    if (p_device->info(CL_DEVICE_EXTENSIONS, size, &extensions[0], NULL)
+            != CL_SUCCESS)
+        return;
+
+    // Create macro definitions from the extension names; c_str() makes the
+    // stream stop at the first NUL instead of reading the padding
+    std::istringstream extensions_stream(extensions.c_str());
+    std::string token;
+
+    while (extensions_stream >> token)
+        prep_opts.addMacroDef(token);
+}
+
+// Return the accumulated log text (p_log). The log stream is not flushed
+// here; compile() flushes it after a successful Clang run.
+const std::string &Compiler::log() const
+{
+ return p_log;
+}
+
+// Return the option string stored by the most recent compile() call.
+const std::string &Compiler::options() const
+{
+ return p_options;
+}
+
+// Return whether optimization is enabled: true by default, false once
+// compile() has seen -cl-opt-disable.
+bool Compiler::optimize() const
+{
+ return p_optimize;
+}
+
+// Return the module taken from Clang by compile(); 0 until a compilation
+// has succeeded.
+llvm::Module *Compiler::module() const
+{
+ return p_module;
+}
+
+// Add \p log to the end of the text returned by log().
+void Compiler::appendLog(const std::string &log)
+{
+    p_log.append(log);
+}
diff --git a/src/core/compiler.h b/src/core/compiler.h
new file mode 100644
index 0000000..58788e6
--- /dev/null
+++ b/src/core/compiler.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file compiler.h
+ * \brief Compiler wrapped around Clang
+ */
+
+#ifndef __COMPILER_H__
+#define __COMPILER_H__
+
+#include <string>
+
+#include <clang/Frontend/CompilerInstance.h>
+#include <llvm/Support/raw_ostream.h>
+
+namespace llvm
+{
+ class MemoryBuffer;
+ class Module;
+}
+
+namespace clang
+{
+ class TextDiagnosticPrinter;
+}
+
+namespace Coal
+{
+
+class DeviceInterface;
+
+/**
+ * \brief Compiler using Clang
+ *
+ * This class builds a Clang instance, runs it and then retains compilation logs
+ * and produced data.
+ */
+class Compiler
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param device \c Coal::DeviceInterface for which code will be compiled
+ */
+ Compiler(DeviceInterface *device);
+ ~Compiler();
+
+ /**
+ * \brief Compile \p source to produce a LLVM module
+ * \param options options given to the compiler, described in the OpenCL spec
+ * \param source source to be compiled
+ * \return 0 (false) if the compilation is successful, a non-zero value
+ * otherwise: \c CL_INVALID_BUILD_OPTIONS for an unrecognized
+ * option, 1 (true) if the Clang action fails
+ * \sa module()
+ * \sa log()
+ */
+ int compile(const std::string &options, llvm::MemoryBuffer *source);
+
+ /**
+ * \brief Compilation log
+ * \note \c appendLog() can also be used to append custom info at the end
+ * of the log, for instance to keep compilation and linking logs
+ * in the same place
+ * \return log
+ */
+ const std::string &log() const;
+
+ /**
+ * \brief Options given at \c compile()
+ * \return options used during compilation
+ */
+ const std::string &options() const;
+
+ /**
+ * \brief Optimization enabled
+ * \return false if -cl-opt-disable was given in the options, true otherwise
+ */
+ bool optimize() const;
+
+ /**
+ * \brief LLVM module generated
+ * \return LLVM module generated by the compilation, 0 if an error occurred
+ */
+ llvm::Module *module() const;
+
+ /**
+ * \brief Append a string to the log
+ *
+ * This function can be used to append linking or code-gen logs to the
+ * internal compilation log kept by this class
+ *
+ * \param log log to be appended
+ */
+ void appendLog(const std::string &log);
+
+ private:
+ DeviceInterface *p_device;
+ clang::CompilerInstance p_compiler;
+ llvm::Module *p_module;
+ bool p_optimize;
+
+ std::string p_log, p_options;
+ llvm::raw_string_ostream p_log_stream;
+ clang::TextDiagnosticPrinter *p_log_printer;
+
+ void add_macrodefs_for_supported_opencl_extensions
+ (clang::PreprocessorOptions &prep_opts);
+
+};
+
+}
+
+#endif
diff --git a/src/core/config.h b/src/core/config.h
new file mode 100644
index 0000000..e1e401b
--- /dev/null
+++ b/src/core/config.h
@@ -0,0 +1,9 @@
+#ifndef __CONFIG_H__
+#define __CONFIG_H__
+
+/* LLVM version string */
+#define LLVM_VERSION "3.5.0svn"
+/* Coal version string (empty in this copy) */
+#define COAL_VERSION ""
+
+/* Maximum number of work dimensions, going by the macro name */
+#define MAX_WORK_DIMS 3
+
+#endif
diff --git a/src/core/config.h.cmake b/src/core/config.h.cmake
new file mode 100644
index 0000000..ccf87b7
--- /dev/null
+++ b/src/core/config.h.cmake
@@ -0,0 +1,9 @@
+#ifndef __CONFIG_H__
+#define __CONFIG_H__
+
+/* NOTE(review): the @...@ placeholders are presumably substituted by CMake
+ * when config.h is generated from this template - confirm in CMakeLists.txt */
+#define LLVM_VERSION "@LLVM_VERSION@"
+#define COAL_VERSION "@Coal_VERSION@"
+
+/* Maximum number of work dimensions, going by the macro name */
+#define MAX_WORK_DIMS 3
+
+#endif
diff --git a/src/core/context.cpp b/src/core/context.cpp
new file mode 100644
index 0000000..e9129ff
--- /dev/null
+++ b/src/core/context.cpp
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file context.cpp
+ * \brief Context
+ */
+
+#include "context.h"
+#include "deviceinterface.h"
+#include "propertylist.h"
+#include "platform.h"
+
+#include <cstring>
+#include <cstdlib>
+
+#include <llvm/Support/TargetSelect.h>
+
+using namespace Coal;
+
+/**
+ * \brief No-op notification callback
+ *
+ * Installed by the \c Context constructor when the caller does not supply a
+ * \c pfn_notify, so that the stored callback pointer is never null.
+ */
+static void default_pfn_notify(const char *, const void *, size_t, void *)
+{
+    // Intentionally empty: every argument is ignored
+}
+
+/**
+ * \brief Create a context for the given devices
+ *
+ * Parses \p properties (only \c CL_CONTEXT_PLATFORM is accepted, and at most
+ * once), keeps a private copy of the list, checks that the requested
+ * platform is \c the_platform and that every device is non-null and reports
+ * itself as available.
+ *
+ * \param properties zero-terminated property list, may be 0
+ * \param num_devices number of entries in \p devices
+ * \param devices devices to attach to the context
+ * \param pfn_notify error callback; a no-op is installed when it is 0
+ * \param user_data opaque pointer stored alongside \p pfn_notify
+ * \param errcode_ret receives \c CL_SUCCESS or the reason of the failure;
+ *        it is dereferenced unconditionally and must not be null
+ */
+Context::Context(const cl_context_properties *properties,
+                 cl_uint num_devices,
+                 const cl_device_id *devices,
+                 void (CL_CALLBACK *pfn_notify)(const char *, const void *,
+                                                size_t, void *),
+                 void *user_data,
+                 cl_int *errcode_ret)
+: Object(Object::T_Context, 0), p_properties(0), p_pfn_notify(pfn_notify),
+  p_user_data(user_data), p_devices(0), p_num_devices(0), p_props_len(0),
+  p_platform(&the_platform)
+{
+    // Start from a defined state: without this the success path leaves
+    // *errcode_ret untouched when the device loop below never runs.
+    *errcode_ret = CL_SUCCESS;
+
+    if (!p_pfn_notify)
+        p_pfn_notify = &default_pfn_notify;
+
+    // Initialize LLVM, this can be done more than one time per program
+    llvm::InitializeNativeTarget();
+    llvm::InitializeNativeTargetAsmPrinter();
+    llvm::InitializeNativeTargetAsmParser();
+
+    // Explore the properties
+    if (properties)
+    {
+        const unsigned char *props = (const unsigned char *)properties;
+        cl_context_properties prop;
+        size_t props_len = 0;
+
+// Read one value of the given type from the list and account for its size
+#define GET_PROP(type, var) \
+        do { \
+            var = *(const type *)props; \
+            props += sizeof(type); \
+            props_len += sizeof(type); \
+        } while (0)
+
+        int propset = 0;
+        while (true)
+        {
+            GET_PROP(cl_context_properties, prop);
+
+            // A zero name terminates the list (and is part of the copy)
+            if (!prop)
+                break;
+
+            switch (prop)
+            {
+                case CL_CONTEXT_PLATFORM:
+                    if (!propset)
+                    {
+                        GET_PROP(cl_platform_id, p_platform);
+                        propset = 1;
+                    }
+                    else
+                    {
+                        // The same property given twice
+                        *errcode_ret = CL_INVALID_PROPERTY;
+                        return;
+                    }
+                    break;
+
+                default:
+                    *errcode_ret = CL_INVALID_PROPERTY;
+                    return;
+            }
+        }
+
+#undef GET_PROP
+
+        // properties may be allocated on the stack of the client application
+        // copy it into a real buffer
+        p_properties = (cl_context_properties *)std::malloc(props_len);
+        p_props_len = props_len;
+
+        if (!p_properties)
+        {
+            *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+            return;
+        }
+
+        std::memcpy((void *)p_properties, (const void *)properties, props_len);
+    }
+
+    // Verify that the platform is good
+    if (p_platform != &the_platform)
+    {
+        *errcode_ret = CL_INVALID_PLATFORM;
+        return;
+    }
+
+    // Explore the devices
+    p_devices = (DeviceInterface **)std::malloc(num_devices * sizeof(DeviceInterface *));
+    p_num_devices = num_devices;
+
+    // malloc(0) is allowed to return a null pointer; that is not an
+    // out-of-memory condition.
+    if (!p_devices && num_devices != 0)
+    {
+        *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+        return;
+    }
+
+    for (cl_uint i=0; i<num_devices; ++i)
+    {
+        cl_device_id device = devices[i];
+
+        if (device == 0)
+        {
+            *errcode_ret = CL_INVALID_DEVICE;
+            return;
+        }
+
+        // Verify that the device is available
+        cl_bool device_available;
+
+        *errcode_ret = device->info(CL_DEVICE_AVAILABLE,
+                                    sizeof(device_available),
+                                    &device_available,
+                                    0);
+
+        if (*errcode_ret != CL_SUCCESS)
+            return;
+
+        if (!device_available)
+        {
+            *errcode_ret = CL_DEVICE_NOT_AVAILABLE;
+            return;
+        }
+
+        // Add the device to the list
+        p_devices[i] = (DeviceInterface *)device;
+    }
+}
+
+/**
+ * \brief Release the private copies of the property list and device array
+ */
+Context::~Context()
+{
+    // free() accepts a null pointer, so no guard is required
+    std::free((void *)p_devices);
+    std::free((void *)p_properties);
+}
+
+/**
+ * \brief Query a piece of information about this context
+ *
+ * \param param_name information to retrieve
+ * \param param_value_size size in bytes of the buffer at \p param_value
+ * \param param_value destination buffer, may be 0 to only query the size
+ * \param param_value_size_ret if non-null, receives the size of the value
+ * \return \c CL_SUCCESS, or \c CL_INVALID_VALUE when \p param_name is
+ *         unknown or the destination buffer is too small
+ */
+cl_int Context::info(cl_context_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const
+{
+ void *value = 0;
+ size_t value_length = 0;
+
+ // NOTE(review): SIMPLE_ASSIGN and MEM_ASSIGN are macros defined outside
+ // this file; presumably they fill `value`/`value_length`, using the
+ // anonymous union below as storage for scalar results - confirm.
+ union {
+ cl_uint cl_uint_var;
+ };
+
+ switch (param_name)
+ {
+ case CL_CONTEXT_REFERENCE_COUNT:
+ SIMPLE_ASSIGN(cl_uint, references());
+ break;
+
+ case CL_CONTEXT_NUM_DEVICES:
+ SIMPLE_ASSIGN(cl_uint, p_num_devices);
+ break;
+
+ case CL_CONTEXT_DEVICES:
+ MEM_ASSIGN(p_num_devices * sizeof(DeviceInterface *), p_devices);
+ break;
+
+ case CL_CONTEXT_PROPERTIES:
+ MEM_ASSIGN(p_props_len, p_properties);
+ break;
+
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ // A destination buffer was given but it cannot hold the value
+ if (param_value && param_value_size < value_length)
+ return CL_INVALID_VALUE;
+
+ if (param_value_size_ret)
+ *param_value_size_ret = value_length;
+
+ if (param_value && value_length /* CONTEXT_PROPERTIES can be of length 0 */)
+ std::memcpy(param_value, value, value_length);
+
+ return CL_SUCCESS;
+}
+
+/**
+ * \brief Tell whether \p device is one of the devices of this context
+ * \param device device to look for
+ * \return true if \p device appears in the device array, false otherwise
+ */
+bool Context::hasDevice(DeviceInterface *device) const
+{
+    unsigned int idx = 0;
+
+    while (idx < p_num_devices)
+    {
+        if (p_devices[idx] == device)
+            return true;
+
+        ++idx;
+    }
+
+    return false;
+}
diff --git a/src/core/context.h b/src/core/context.h
new file mode 100644
index 0000000..4712d25
--- /dev/null
+++ b/src/core/context.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file context.h
+ * \brief OpenCL context
+ */
+
+#ifndef __CONTEXT_H__
+#define __CONTEXT_H__
+
+#include "object.h"
+
+#include <CL/cl.h>
+
+namespace Coal
+{
+
+class DeviceInterface;
+
+/**
+ * \brief OpenCL context
+ *
+ * This class is the root of all OpenCL objects, except \c Coal::DeviceInterface.
+ */
+class Context : public Object
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param properties properties of the context
+ * \param num_devices number of devices that will be used
+ * \param devices \c Coal::DeviceInterface to be used
+ * \param pfn_notify function to call when an error arises, to give
+ * more detail
+ * \param user_data user data to pass to \p pfn_notify
+ * \param errcode_ret return code
+ */
+ Context(const cl_context_properties *properties,
+ cl_uint num_devices,
+ const cl_device_id *devices,
+ void (CL_CALLBACK *pfn_notify)(const char *, const void *,
+ size_t, void *),
+ void *user_data,
+ cl_int *errcode_ret);
+ ~Context();
+
+ /**
+ * \brief Info about the context
+ * \copydetails Coal::DeviceInterface::info
+ */
+ cl_int info(cl_context_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ /**
+ * \brief Check that this context contains a given \p device
+ * \param device device to check
+ * \return whether this context contains \p device
+ */
+ bool hasDevice(DeviceInterface *device) const;
+
+ private:
+ // Heap copy of the property list given to the constructor (0 if none)
+ cl_context_properties *p_properties;
+ // Notification callback; never null after construction
+ void (CL_CALLBACK *p_pfn_notify)(const char *, const void *,
+ size_t, void *);
+ // Opaque pointer supplied with the callback
+ void *p_user_data;
+
+ // Heap array of the devices attached to this context
+ DeviceInterface **p_devices;
+ // Number of entries in p_devices; size in bytes of p_properties
+ unsigned int p_num_devices, p_props_len;
+ // Platform the context was created for
+ cl_platform_id p_platform;
+};
+
+}
+
+// Global-namespace struct named _cl_context that simply derives from
+// Coal::Context and adds nothing.
+// NOTE(review): presumably this is what the cl_context handle type of
+// <CL/cl.h> points to - confirm.
+struct _cl_context : public Coal::Context
+{};
+
+#endif
diff --git a/src/core/cpu/buffer.cpp b/src/core/cpu/buffer.cpp
new file mode 100644
index 0000000..9125872
--- /dev/null
+++ b/src/core/cpu/buffer.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/buffer.cpp
+ * \brief CPU buffer
+ */
+
+#include "buffer.h"
+#include "device.h"
+
+#include "../memobject.h"
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+using namespace Coal;
+
+/**
+ * \brief Bind a CPU-side buffer to a memory object
+ *
+ * No memory is allocated here. For a sub-buffer the data pointer is derived
+ * from the parent's CPU buffer plus the sub-buffer offset; for a
+ * \c CL_MEM_USE_HOST_PTR object the host pointer is used directly. In every
+ * other case the pointer stays null until \c allocate() is called.
+ *
+ * \param device device for which the buffer is created
+ * \param buffer memory object described by this buffer
+ * \param rs if non-null, receives \c CL_SUCCESS (this constructor has no
+ *        failure path at the moment)
+ */
+CPUBuffer::CPUBuffer(CPUDevice *device, MemObject *buffer, cl_int *rs)
+: DeviceBuffer(), p_device(device), p_buffer(buffer), p_data(0),
+  p_data_malloced(false)
+{
+    // The documented contract is "CL_SUCCESS if all is good": always give the
+    // caller a defined value instead of leaving *rs untouched.
+    if (rs)
+        *rs = CL_SUCCESS;
+
+    if (buffer->type() == MemObject::SubBuffer)
+    {
+        // We need to create this CPUBuffer based on the CPUBuffer of the
+        // parent buffer
+        SubBuffer *subbuf = (SubBuffer *)buffer;
+        MemObject *parent = subbuf->parent();
+        CPUBuffer *parentcpubuf = (CPUBuffer *)parent->deviceBuffer(device);
+
+        char *tmp_data = (char *)parentcpubuf->data();
+        tmp_data += subbuf->offset();
+
+        p_data = (void *)tmp_data;
+    }
+    else if (buffer->flags() & CL_MEM_USE_HOST_PTR)
+    {
+        // We use the host ptr, we are already allocated
+        p_data = buffer->host_ptr();
+    }
+
+    // NOTE: This function can also reject Image buffers by setting a value
+    //       != CL_SUCCESS in rs.
+}
+
+/**
+ * \brief Free the storage, but only if \c allocate() obtained it with malloc
+ */
+CPUBuffer::~CPUBuffer()
+{
+    // Host pointers and sub-buffer views are not owned by this object
+    if (!p_data_malloced)
+        return;
+
+    std::free((void *)p_data);
+}
+
+/**
+ * \brief Address of the buffer's storage
+ * \return the data pointer, 0 while nothing is bound or allocated
+ */
+void *CPUBuffer::data() const
+{
+    void *const storage = p_data;
+    return storage;
+}
+
+/**
+ * \brief Pointer handed out as the native global pointer
+ * \return the same address as \c data()
+ */
+void *CPUBuffer::nativeGlobalPointer() const
+{
+    void *const native = data();
+    return native;
+}
+
+/**
+ * \brief Make sure the buffer has storage and fill it if requested
+ *
+ * Storage is malloc'ed only when no data pointer is set yet. When the memory
+ * object is not a sub-buffer and carries \c CL_MEM_COPY_HOST_PTR, the host
+ * data is copied in. The memory object is then told that this device buffer
+ * is allocated.
+ *
+ * \return false if the memory object has size 0 or malloc fails, true
+ *         otherwise
+ */
+bool CPUBuffer::allocate()
+{
+    const size_t bytes = p_buffer->size();
+
+    // Something went wrong...
+    if (bytes == 0)
+        return false;
+
+    if (p_data == 0)
+    {
+        // No host pointer and no parent view: we own the storage
+        void *storage = std::malloc(bytes);
+
+        if (storage == 0)
+            return false;
+
+        p_data = storage;
+        p_data_malloced = true;
+    }
+
+    const bool is_sub_buffer = (p_buffer->type() == MemObject::SubBuffer);
+
+    if (!is_sub_buffer && (p_buffer->flags() & CL_MEM_COPY_HOST_PTR))
+        std::memcpy(p_data, p_buffer->host_ptr(), bytes);
+
+    // Say to the memobject that we are allocated
+    p_buffer->deviceAllocated(this);
+
+    return true;
+}
+
+/**
+ * \brief Device this buffer was created for
+ * \return the device given to the constructor
+ */
+DeviceInterface *CPUBuffer::device() const
+{
+    DeviceInterface *owner = p_device;
+    return owner;
+}
+
+/**
+ * \brief Whether the buffer currently has a data pointer
+ * \return true as soon as the data pointer is non-null
+ */
+bool CPUBuffer::allocated() const
+{
+    if (p_data == 0)
+        return false;
+
+    return true;
+}
diff --git a/src/core/cpu/buffer.h b/src/core/cpu/buffer.h
new file mode 100644
index 0000000..d88c9e5
--- /dev/null
+++ b/src/core/cpu/buffer.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file buffer.h
+ * \brief CPU buffer
+ */
+
+#ifndef __CPU_BUFFER_H__
+#define __CPU_BUFFER_H__
+
+#include "../deviceinterface.h"
+
+namespace Coal
+{
+
+class CPUDevice;
+class MemObject;
+
+/**
+ * \brief CPU implementation of \c Coal::MemObject
+ *
+ * This class is responsible of the actual allocation of buffer objects, using
+ * \c malloc() or by reusing a given \c host_ptr.
+ */
+class CPUBuffer : public DeviceBuffer
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param device Device for which the buffer is allocated
+ * \param buffer \c Coal::MemObject holding information about the buffer
+ * \param rs return code (\c CL_SUCCESS if all is good)
+ */
+ CPUBuffer(CPUDevice *device, MemObject *buffer, cl_int *rs);
+ ~CPUBuffer();
+
+ bool allocate(); /*!< \brief Obtain storage if needed; false on failure */
+ DeviceInterface *device() const; /*!< \brief Device given at construction */
+ void *data() const; /*!< \brief Pointer to the buffer's data */
+ void *nativeGlobalPointer() const; /*!< \brief Same address as \c data() */
+ bool allocated() const; /*!< \brief True once the data pointer is non-null */
+
+ private:
+ CPUDevice *p_device; /*!< \brief Device given at construction */
+ MemObject *p_buffer; /*!< \brief Memory object this buffer backs */
+ void *p_data; /*!< \brief Storage: parent view, host pointer or malloc'ed block */
+ bool p_data_malloced; /*!< \brief True when \c p_data came from malloc and must be freed */
+};
+
+}
+
+#endif
diff --git a/src/core/cpu/builtins.cpp b/src/core/cpu/builtins.cpp
new file mode 100644
index 0000000..137d34e
--- /dev/null
+++ b/src/core/cpu/builtins.cpp
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/builtins.cpp
+ * \brief Native OpenCL C built-in functions
+ *
+ * All these built-ins are directly called by kernels. When the LLVM JIT
+ * sees a function name it doesn't know, it calls \c getBuiltin() with this
+ * name as parameter. This function then returns the address of an actual
+ * function implementation, that finally gets called by the kernel when
+ * it is run.
+ */
+
+#include "builtins.h"
+#include "kernel.h"
+#include "buffer.h"
+
+#include "../events.h"
+#include "../memobject.h"
+
+#include <sys/mman.h>
+#include <signal.h>
+
+#include <llvm/IR/Function.h>
+
+#include <iostream>
+#include <cstring>
+#include <cmath>
+#include <boost/math/special_functions.hpp>
+
+#include <stdio.h>
+
+using namespace Coal;
+
+/**
+ * \brief Address of a pixel inside linear image storage
+ * \param base address of the first byte of the image
+ * \param x column, in pixels
+ * \param y row
+ * \param z slice
+ * \param row_pitch size of a row, in bytes
+ * \param slice_pitch size of a slice, in bytes
+ * \param bytes_per_pixel size of a pixel, in bytes
+ * \return \p base advanced by z*slice_pitch + y*row_pitch + x*bytes_per_pixel
+ */
+unsigned char *imageData(unsigned char *base, size_t x, size_t y, size_t z,
+                         size_t row_pitch, size_t slice_pitch,
+                         unsigned int bytes_per_pixel)
+{
+    const size_t byte_offset = (z * slice_pitch) +
+                               (y * row_pitch) +
+                               (x * bytes_per_pixel);
+
+    return base + byte_offset;
+}
+
+/*
+ * TLS-related functions
+ */
+__thread Coal::CPUKernelWorkGroup *g_work_group; /*!< \brief \c Coal::CPUKernelWorkGroup currently running on this thread */
+__thread void *work_items_data; /*!< \brief Space allocated for work-items stacks, see \ref barrier */
+__thread size_t work_items_size; /*!< \brief Size of \c work_items_data, see \ref barrier */
+
+/**
+ * \brief Record the work-group being run by the calling thread
+ * \param current work-group stored in the thread-local \c g_work_group
+ */
+void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current)
+{
+    // Thread-local: every thread keeps its own value
+    g_work_group = current;
+}
+
+/**
+ * \brief Thread-local work-item stack area recorded by \c setWorkItemsData()
+ * \param size receives the recorded size
+ * \return the recorded pointer
+ */
+void *getWorkItemsData(size_t &size)
+{
+    void *const region = work_items_data;
+
+    size = work_items_size;
+    return region;
+}
+
+/**
+ * \brief Record the work-item stack area of the calling thread
+ * \param ptr start of the area
+ * \param size size associated with \p ptr
+ */
+void setWorkItemsData(void *ptr, size_t size)
+{
+    work_items_size = size;
+    work_items_data = ptr;
+}
+
+/*
+ * Actual built-ins implementations
+ */
+/**
+ * \brief Number of dimensions in use for this work-group
+ * \return the stored work dimension
+ */
+cl_uint CPUKernelWorkGroup::getWorkDim() const
+{
+    const cl_uint dims = p_work_dim;
+    return dims;
+}
+
+/**
+ * \brief Global ID of the current work-item along a dimension
+ * \param dimindx dimension index, valid from 0 to getWorkDim() - 1
+ * \return start offset of this work-group plus the local ID of the current
+ *         work-item, or 0 when \p dimindx is out of range
+ */
+size_t CPUKernelWorkGroup::getGlobalId(cl_uint dimindx) const
+{
+    // dimindx == p_work_dim is already out of range; with '>' it slipped
+    // through and indexed one element too far.
+    if (dimindx >= p_work_dim)
+        return 0;
+
+    return p_global_id_start_offset[dimindx] + p_current_context->local_id[dimindx];
+}
+
+/**
+ * \brief Global work size along a dimension
+ * \param dimindx dimension index, valid from 0 to getWorkDim() - 1
+ * \return global work size of the event, or 1 when \p dimindx is out of
+ *         range
+ */
+size_t CPUKernelWorkGroup::getGlobalSize(cl_uint dimindx) const
+{
+    // '>=': the dimension equal to the work dimension does not exist
+    if (dimindx >= p_work_dim)
+        return 1;
+
+    return p_event->global_work_size(dimindx);
+}
+
+/**
+ * \brief Local work size along a dimension
+ * \param dimindx dimension index, valid from 0 to getWorkDim() - 1
+ * \return local work size of the event, or 1 when \p dimindx is out of
+ *         range
+ */
+size_t CPUKernelWorkGroup::getLocalSize(cl_uint dimindx) const
+{
+    // '>=': the dimension equal to the work dimension does not exist
+    if (dimindx >= p_work_dim)
+        return 1;
+
+    return p_event->local_work_size(dimindx);
+}
+
+/**
+ * \brief Local ID of the current work-item along a dimension
+ * \param dimindx dimension index, valid from 0 to getWorkDim() - 1
+ * \return local ID stored in the current context, or 0 when \p dimindx is
+ *         out of range
+ */
+size_t CPUKernelWorkGroup::getLocalID(cl_uint dimindx) const
+{
+    // '>=' keeps the index strictly inside the used part of local_id
+    if (dimindx >= p_work_dim)
+        return 0;
+
+    return p_current_context->local_id[dimindx];
+}
+
+/**
+ * \brief Number of work-groups along a dimension
+ * \param dimindx dimension index, valid from 0 to getWorkDim() - 1
+ * \return global work size divided by local work size, or 1 when
+ *         \p dimindx is out of range
+ */
+size_t CPUKernelWorkGroup::getNumGroups(cl_uint dimindx) const
+{
+    // '>=': the dimension equal to the work dimension does not exist
+    if (dimindx >= p_work_dim)
+        return 1;
+
+    return (p_event->global_work_size(dimindx) /
+            p_event->local_work_size(dimindx));
+}
+
+/**
+ * \brief Index of this work-group along a dimension
+ * \param dimindx dimension index, valid from 0 to getWorkDim() - 1
+ * \return stored group index, or 0 when \p dimindx is out of range
+ */
+size_t CPUKernelWorkGroup::getGroupID(cl_uint dimindx) const
+{
+    // '>=' keeps the index strictly inside the used part of p_index
+    if (dimindx >= p_work_dim)
+        return 0;
+
+    return p_index[dimindx];
+}
+
+/**
+ * \brief Global work offset along a dimension
+ * \param dimindx dimension index, valid from 0 to getWorkDim() - 1
+ * \return global work offset of the event, or 0 when \p dimindx is out of
+ *         range
+ */
+size_t CPUKernelWorkGroup::getGlobalOffset(cl_uint dimindx) const
+{
+    // '>=': the dimension equal to the work dimension does not exist
+    if (dimindx >= p_work_dim)
+        return 0;
+
+    return p_event->global_work_offset(dimindx);
+}
+
+/**
+ * \brief Work-group barrier implemented with user-level context switches
+ *
+ * On the first call for this work-group, a per-thread memory area holding
+ * one \c Context header plus one stack per work-item is obtained (reused
+ * from thread-local storage when it is large enough, mmap'ed otherwise) and
+ * the calling work-item becomes context 0. Every call then switches to the
+ * next work-item, creating its context with \c makecontext() the first time
+ * it is reached.
+ *
+ * \param flags memory-fence flags; not used by this implementation
+ */
+void CPUKernelWorkGroup::barrier(unsigned int flags)
+{
+    p_had_barrier = true;
+
+    // Allocate or reuse TLS memory for the stacks (it isn't freed between
+    // the work groups, and even the kernels, so if we need less space than
+    // allocated, it's good)
+    if (!p_contexts)
+    {
+        if (p_current_work_item != 0)
+        {
+            // Completely abnormal, it means that not every work-items
+            // encounter the barrier
+            std::cerr << "*** Not every work-items of "
+                      << p_kernel->function()->getName().str()
+                      << " calls barrier(); !" << std::endl;
+            return;
+        }
+
+        // Allocate or reuse the stacks
+        size_t contexts_size;
+        p_contexts = getWorkItemsData(contexts_size);
+        size_t needed_size = p_num_work_items * (p_stack_size + sizeof(Context));
+
+        if (!p_contexts || contexts_size < needed_size)
+        {
+            // We must allocate a new space
+            if (p_contexts)
+                munmap(p_contexts, contexts_size);
+
+            p_contexts = mmap(0, needed_size, PROT_EXEC | PROT_READ | PROT_WRITE, /* People say a stack must be executable */
+                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+            if (p_contexts == MAP_FAILED)
+            {
+                // Nothing usable is mapped anymore: forget the old area and
+                // give up instead of dereferencing MAP_FAILED below.
+                p_contexts = 0;
+                setWorkItemsData(0, 0);
+
+                std::cerr << "*** Unable to allocate the work-item stacks of "
+                          << p_kernel->function()->getName().str()
+                          << std::endl;
+                return;
+            }
+
+            // Remember the size that was really mapped. Storing the stale
+            // contexts_size made the area look too small on every later
+            // call, so it was never reused nor correctly unmapped.
+            setWorkItemsData(p_contexts, needed_size);
+        }
+        else
+        {
+            // The area comes from a previous work-group: unlike a fresh
+            // mapping it is not zeroed, so clear the flags that tell
+            // whether a work-item context has been created.
+            for (unsigned int i=0; i<p_num_work_items; ++i)
+                getContextAddr(i)->initialized = 0;
+        }
+
+        // Now that we have a real main context, initialize it
+        p_current_context = getContextAddr(0);
+        p_current_context->initialized = 1;
+        std::memset(p_current_context->local_id, 0, p_work_dim * sizeof(size_t));
+
+        getcontext(&p_current_context->context);
+    }
+
+    // Take the next context
+    p_current_work_item++;
+    if (p_current_work_item == p_num_work_items) p_current_work_item = 0;
+
+    Context *next = getContextAddr(p_current_work_item);
+    Context *main = getContextAddr(0);  // The context not created with makecontext
+
+    // If the next context isn't initialized, initialize it.
+    // Note: initialized == 0 either because mmap zeroes the memory or
+    // because the flags were cleared above when the area was reused
+    if (next->initialized == 0)
+    {
+        next->initialized = 1;
+
+        // local-id of next is the one of the current context, but incVec'ed
+        std::memcpy(next->local_id, p_current_context->local_id,
+                    MAX_WORK_DIMS * sizeof(size_t));
+
+        incVec(p_work_dim, next->local_id, p_max_local_id);
+
+        // Initialize the next context
+        if (getcontext(&next->context) != 0)
+            return;
+
+        // Get its stack. It is located a next + sizeof(Context)
+        char *stack = (char *)next;
+        stack += sizeof(Context);
+
+        next->context.uc_link = &main->context;
+        next->context.uc_stack.ss_sp = stack;
+        next->context.uc_stack.ss_size = p_stack_size;
+
+        // Tell it to run the kernel function
+        makecontext(&next->context, (void (*)())p_kernel_func_addr, 1, p_args);
+    }
+
+    // Switch to the next context
+    ucontext_t *cur = &p_current_context->context;
+    p_current_context = next;
+
+    swapcontext(cur, &next->context);
+
+    // When we return here, it means that all the other work items encountered
+    // a barrier and that we returned to this one. We can continue.
+}
+
+/**
+ * \brief Report on standard output that a kernel uses an unknown built-in
+ * \param name name of the function that could not be resolved
+ */
+void CPUKernelWorkGroup::builtinNotFound(const std::string &name) const
+{
+    std::cout << "OpenCL: Non-existant builtin function " << name << std::endl;
+
+    const std::string kernel_name = p_kernel->function()->getName().str();
+
+    std::cout << " found in " << kernel_name << '.' << std::endl;
+}
+
+/*
+ * Built-in functions
+ */
+
+/** \brief Built-in \c get_global_id: forwards to the work-group of this thread */
+static size_t get_global_id(cl_uint dimindx)
+{
+    CPUKernelWorkGroup *group = g_work_group;
+
+    return group->getGlobalId(dimindx);
+}
+
+/** \brief Built-in \c get_work_dim: forwards to the work-group of this thread */
+static cl_uint get_work_dim()
+{
+    CPUKernelWorkGroup *group = g_work_group;
+
+    return group->getWorkDim();
+}
+
+/** \brief Built-in \c get_global_size: forwards to the work-group of this thread */
+static size_t get_global_size(uint dimindx)
+{
+    CPUKernelWorkGroup *group = g_work_group;
+
+    return group->getGlobalSize(dimindx);
+}
+
+/** \brief Built-in \c get_local_size: forwards to the work-group of this thread */
+static size_t get_local_size(uint dimindx)
+{
+    CPUKernelWorkGroup *group = g_work_group;
+
+    return group->getLocalSize(dimindx);
+}
+
+/** \brief Built-in \c get_local_id: forwards to the work-group of this thread */
+static size_t get_local_id(uint dimindx)
+{
+    CPUKernelWorkGroup *group = g_work_group;
+
+    return group->getLocalID(dimindx);
+}
+
+/** \brief Built-in \c get_num_groups: forwards to the work-group of this thread */
+static size_t get_num_groups(uint dimindx)
+{
+    CPUKernelWorkGroup *group = g_work_group;
+
+    return group->getNumGroups(dimindx);
+}
+
+/** \brief Built-in \c get_group_id: forwards to the work-group of this thread */
+static size_t get_group_id(uint dimindx)
+{
+    CPUKernelWorkGroup *group = g_work_group;
+
+    return group->getGroupID(dimindx);
+}
+
+/** \brief Built-in \c get_global_offset: forwards to the work-group of this thread */
+static size_t get_global_offset(uint dimindx)
+{
+    CPUKernelWorkGroup *group = g_work_group;
+
+    return group->getGlobalOffset(dimindx);
+}
+
+/** \brief Built-in \c barrier: forwards to the work-group of this thread */
+static void barrier(unsigned int flags)
+{
+    CPUKernelWorkGroup *group = g_work_group;
+
+    group->barrier(flags);
+}
+
+// Images
+
+/** \brief Width of \p image; resolved for \c __cpu_get_image_width */
+static int get_image_width(Image2D *image)
+{
+    const int width = image->width();
+
+    return width;
+}
+
+/** \brief Height of \p image; resolved for \c __cpu_get_image_height */
+static int get_image_height(Image2D *image)
+{
+    const int height = image->height();
+
+    return height;
+}
+
+/**
+ * \brief Depth of \p image; resolved for \c __cpu_get_image_depth
+ * \return the depth of a 3D image, 1 for any other memory object type
+ */
+static int get_image_depth(Image3D *image)
+{
+    const bool is_3d = (image->type() == MemObject::Image3D);
+
+    return is_3d ? image->depth() : 1;
+}
+
+/** \brief Channel data type of \p image; resolved for \c __cpu_get_image_channel_data_type */
+static int get_image_channel_data_type(Image2D *image)
+{
+    const int data_type = image->format().image_channel_data_type;
+
+    return data_type;
+}
+
+/** \brief Channel order of \p image; resolved for \c __cpu_get_image_channel_order */
+static int get_image_channel_order(Image2D *image)
+{
+    const int channel_order = image->format().image_channel_order;
+
+    return channel_order;
+}
+
+/**
+ * \brief Pixel address plus format of \p image; resolved for \c __cpu_image_data
+ * \param image image to address
+ * \param x,y,z pixel coordinates
+ * \param order receives the channel order of the image
+ * \param type receives the channel data type of the image
+ * \return what the current work-group's \c getImageData() returns
+ */
+static void *image_data(Image2D *image, int x, int y, int z, int *order, int *type)
+{
+    *order = image->format().image_channel_order;
+    *type = image->format().image_channel_data_type;
+
+    void *pixel = g_work_group->getImageData(image, x, y, z);
+
+    return pixel;
+}
+
+/** \brief Whether \p image is a 3D image; resolved for \c __cpu_is_image_3d */
+static bool is_image_3d(Image3D *image)
+{
+    return image->type() == MemObject::Image3D;
+}
+
+/** \brief Write a float color at (x, y, z); resolved for \c __cpu_write_imagef */
+static void write_imagef(Image2D *image, int x, int y, int z, float *color)
+{
+    CPUKernelWorkGroup *group = g_work_group;
+
+    group->writeImage(image, x, y, z, color);
+}
+
+/** \brief Write a signed integer color at (x, y, z); resolved for \c __cpu_write_imagei */
+static void write_imagei(Image2D *image, int x, int y, int z, int32_t *color)
+{
+    CPUKernelWorkGroup *group = g_work_group;
+
+    group->writeImage(image, x, y, z, color);
+}
+
+/** \brief Write an unsigned integer color at (x, y, z); resolved for \c __cpu_write_imageui */
+static void write_imageui(Image2D *image, int x, int y, int z, uint32_t *color)
+{
+    CPUKernelWorkGroup *group = g_work_group;
+
+    group->writeImage(image, x, y, z, color);
+}
+
+/** \brief Read a float color at integer coordinates; resolved for \c __cpu_read_imagefi */
+static void read_imagefi(float *result, Image2D *image, int x, int y, int z,
+                         int32_t sampler)
+{
+    CPUKernelWorkGroup *group = g_work_group;
+
+    group->readImage(result, image, x, y, z, sampler);
+}
+
+/** \brief Read a signed integer color at integer coordinates; resolved for \c __cpu_read_imageii */
+static void read_imageii(int32_t *result, Image2D *image, int x, int y, int z,
+                         int32_t sampler)
+{
+    CPUKernelWorkGroup *group = g_work_group;
+
+    group->readImage(result, image, x, y, z, sampler);
+}
+
+/** \brief Read an unsigned integer color at integer coordinates; resolved for \c __cpu_read_imageuii */
+static void read_imageuii(uint32_t *result, Image2D *image, int x, int y, int z,
+                          int32_t sampler)
+{
+    CPUKernelWorkGroup *group = g_work_group;
+
+    group->readImage(result, image, x, y, z, sampler);
+}
+
+/** \brief Read a float color at float coordinates; resolved for \c __cpu_read_imageff */
+static void read_imageff(float *result, Image2D *image, float x, float y,
+                         float z, int32_t sampler)
+{
+    CPUKernelWorkGroup *group = g_work_group;
+
+    group->readImage(result, image, x, y, z, sampler);
+}
+
+/** \brief Read a signed integer color at float coordinates; resolved for \c __cpu_read_imageif */
+static void read_imageif(int32_t *result, Image2D *image, float x, float y,
+                         float z, int32_t sampler)
+{
+    CPUKernelWorkGroup *group = g_work_group;
+
+    group->readImage(result, image, x, y, z, sampler);
+}
+
+/** \brief Read an unsigned integer color at float coordinates; resolved for \c __cpu_read_imageuif */
+static void read_imageuif(uint32_t *result, Image2D *image, float x, float y,
+                          float z, int32_t sampler)
+{
+    CPUKernelWorkGroup *group = g_work_group;
+
+    group->readImage(result, image, x, y, z, sampler);
+}
+
+/**
+ * \brief Empty function standing in for the ARM ABI exception-handling
+ *        routines (\c __aeabi_unwind_cpp_pr0, \c _pr1 and \c _pr2)
+ */
+static void dummy_fxn(void)
+{
+    // Nothing to do
+}
+
+
+/*
+ * Bridge between LLVM and us
+ */
+/**
+ * \brief Empty function whose address \c getBuiltin() returns for names it
+ *        does not know
+ */
+static void unimplemented_stub()
+{
+    // Nothing to do
+}
+
+/**
+ * \brief Resolve the name of a built-in to the address of its implementation
+ *
+ * \param name symbol requested by the kernel code
+ * \return address of the matching native function. For an unknown name the
+ *         current work-group's \c builtinNotFound() is called and the
+ *         address of an empty stub is returned.
+ */
+void *getBuiltin(const std::string &name)
+{
+    struct BuiltinEntry
+    {
+        const char *symbol;
+        void *address;
+    };
+
+    static const BuiltinEntry known_builtins[] = {
+        // Work-item functions
+        { "get_global_id", (void *)&get_global_id },
+        { "get_work_dim", (void *)&get_work_dim },
+        { "get_global_size", (void *)&get_global_size },
+        { "get_local_size", (void *)&get_local_size },
+        { "get_local_id", (void *)&get_local_id },
+        { "get_num_groups", (void *)&get_num_groups },
+        { "get_group_id", (void *)&get_group_id },
+        { "get_global_offset", (void *)&get_global_offset },
+        { "barrier", (void *)&barrier },
+
+        // Images
+        { "__cpu_get_image_width", (void *)&get_image_width },
+        { "__cpu_get_image_height", (void *)&get_image_height },
+        { "__cpu_get_image_depth", (void *)&get_image_depth },
+        { "__cpu_get_image_channel_data_type", (void *)&get_image_channel_data_type },
+        { "__cpu_get_image_channel_order", (void *)&get_image_channel_order },
+        { "__cpu_image_data", (void *)&image_data },
+        { "__cpu_is_image_3d", (void *)&is_image_3d },
+        { "__cpu_write_imagef", (void *)&write_imagef },
+        { "__cpu_write_imagei", (void *)&write_imagei },
+        { "__cpu_write_imageui", (void *)&write_imageui },
+        { "__cpu_read_imagefi", (void *)&read_imagefi },
+        { "__cpu_read_imageii", (void *)&read_imageii },
+        { "__cpu_read_imageuii", (void *)&read_imageuii },
+        { "__cpu_read_imageff", (void *)&read_imageff },
+        { "__cpu_read_imageif", (void *)&read_imageif },
+        { "__cpu_read_imageuif", (void *)&read_imageuif },
+
+        // Debugging and ARM ABI exception-handling placeholders
+        { "debug", (void *)&printf },
+        { "__aeabi_unwind_cpp_pr0", (void *)&dummy_fxn },
+        { "__aeabi_unwind_cpp_pr1", (void *)&dummy_fxn },
+        { "__aeabi_unwind_cpp_pr2", (void *)&dummy_fxn },
+
+        // Math library disambiguation for OpenCL double functions of the same name.
+        { "builtin_sincos", (void *)&sincos },
+        { "builtin_lgamma_r", (void *)&lgamma_r },
+        { "builtin_modf", (void *)&modf },
+        { "builtin_remquo", (void *)&remquo },
+        { "builtin_pow", (void *)&pow },
+        { "builtin_exp10f", (void *)&exp10f },
+        { "builtin_exp10", (void *)&exp10 },
+
+#if 0
+        // Other misc functions Khronos tests say are builtins, though not in the spec!
+        { "memcpy", (void *)&memcpy },
+#endif
+    };
+
+    const size_t num_builtins = sizeof(known_builtins) / sizeof(known_builtins[0]);
+
+    for (size_t i = 0; i < num_builtins; ++i)
+    {
+        if (name == known_builtins[i].symbol)
+            return known_builtins[i].address;
+    }
+
+    // Function not found
+    g_work_group->builtinNotFound(name);
+
+    return (void *)&unimplemented_stub;
+}
diff --git a/src/core/cpu/builtins.h b/src/core/cpu/builtins.h
new file mode 100644
index 0000000..69143ea
--- /dev/null
+++ b/src/core/cpu/builtins.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file builtins.h
+ * \brief CPU built-in functions
+ */
+#ifndef __BUILTINS_H__
+#define __BUILTINS_H__
+
+#include <string>
+
+namespace Coal {
+ class CPUKernelWorkGroup;
+}
+
+/**
+ * \brief Set the current kernel work-group of this thread
+ * \param current \c Coal::CPUKernelWorkGroup to be set in \c g_work_group.
+ */
+void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current);
+
+/**
+ * \brief Return the address of a built-in function given its name
+ * \param name name of the built-in whose address is requested
+ */
+void *getBuiltin(const std::string &name);
+
+/**
+ * \brief Work-item stacks
+ * \see \ref barrier
+ * \param size size of the allocated space for stacks
+ * \return address of the allocated space for stacks
+ */
+void *getWorkItemsData(size_t &size);
+
+/**
+ * \brief Set work-item stacks
+ * \see \ref barrier
+ * \param ptr address of allocated space for stacks
+ * \param size size of the allocated space for stacks
+ */
+void setWorkItemsData(void *ptr, size_t size);
+
+/**
+ * \brief Increment a n-component vector given a maximum value
+ *
+ * The vector behaves like a counter whose element 0 varies fastest: element 0
+ * is incremented first and, when it goes above its maximum, it is reset to 0
+ * and the increment carries into the next element.
+ *
+ * For example, if \p dims is \c 3, \p vec starts at <tt>{0, 0, 0}</tt> and
+ * \p maxs is <tt>{2, 3, 1}</tt>, repeatedly calling this function with the
+ * same vector will produce the following results :
+ *
+ * \code
+ * {1, 0, 0}
+ * {2, 0, 0}
+ * {0, 1, 0}
+ * {1, 1, 0}
+ * {2, 1, 0}
+ * {0, 2, 0}
+ * {1, 2, 0}
+ * {2, 2, 0}
+ * ...
+ * \endcode
+ *
+ * Until \p vec reaches <tt>{2, 3, 1}</tt>. The call after that resets every
+ * element to 0 and reports the overflow.
+ *
+ * \param dims number of elements in the vectors
+ * \param vec vector whose elements will be incremented
+ * \param maxs vector containing a maximum value above which each corresponding
+ *        element of \p vec cannot go.
+ * \return false if the increment was ok, true if \p vec was already at its
+ *         maximum value and couldn't be further incremented (all its elements
+ *         are then back to 0). Always false when \p dims is 0.
+ */
+template<typename T>
+bool incVec(unsigned long dims, T *vec, T *maxs)
+{
+    // The index has the same type as dims so the comparison can never wrap.
+    for (unsigned long i = 0; i < dims; ++i)
+    {
+        vec[i] += 1;
+
+        // No carry needed: the increment is complete.
+        if (!(vec[i] > maxs[i]))
+            return false;
+
+        // This element overflowed: reset it and carry into the next one.
+        vec[i] = 0;
+    }
+
+    // Every element carried (or there was nothing to increment).
+    return dims != 0;
+}
+
+/**
+ * \brief Address of a pixel in an image
+ *
+ * This function is heavily used when Clover needs to address a pixel or a byte
+ * in a rectangular or three-dimensional image or buffer.
+ *
+ * \param base address of the first pixel in the image (address of the image itself)
+ * \param x X coordinate, cannot be bigger or equal to \c width
+ * \param y Y coordinate, cannot be bigger or equal to \c height
+ * \param z Z coordinate, cannot be bigger or equal to \c depth (1 for 2D arrays)
+ * \param row_pitch size in bytes of a row of pixels in the image
+ * \param slice_pitch size in bytes of a slice in a 3D array
+ * \param bytes_per_pixel bytes per pixel (1 for simple buffers), used when
+ * coordinates are in pixels and not in bytes.
+ */
+unsigned char *imageData(unsigned char *base, size_t x, size_t y, size_t z,
+ size_t row_pitch, size_t slice_pitch,
+ unsigned int bytes_per_pixel);
+
+#endif
+
diff --git a/src/core/cpu/device.cpp b/src/core/cpu/device.cpp
new file mode 100644
index 0000000..eb3fcb1
--- /dev/null
+++ b/src/core/cpu/device.cpp
@@ -0,0 +1,675 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/device.cpp
+ * \brief CPU Device
+ */
+
+#include "device.h"
+#include "buffer.h"
+#include "kernel.h"
+#include "program.h"
+#include "worker.h"
+#include "builtins.h"
+
+#include <core/config.h>
+#include "../propertylist.h"
+#include "../commandqueue.h"
+#include "../events.h"
+#include "../memobject.h"
+#include "../kernel.h"
+#include "../program.h"
+#include "../util.h"
+
+#include <cstring>
+#include <cstdlib>
+#include <unistd.h>
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+using namespace Coal;
+
+#if !(defined(DSPC868X) || defined(SHAMROCK_BUILD))
+#include "../dsp/shmem.h"
+// unsigned arm_speed();
+#endif
+
+#define ONE_GIGABYTE (1 << 30)
+
+/**
+ * \brief Build the CPU device and probe the host system
+ *
+ * Reads the number of online processors from \c sysconf(), then scans
+ * \c /proc/cpuinfo for the clock speed ("cpu MHz") and the device name
+ * ("model name" or "Processor"). When no clock speed is found there, the
+ * cpufreq \c cpuinfo_max_freq file is tried, and 1000 MHz is used as a last
+ * resort. No thread is created here, see \c init().
+ */
+CPUDevice::CPUDevice()
+: DeviceInterface(), p_cores(0), p_num_events(0), p_workers(0), p_stop(false),
+  p_initialized(false)
+{
+    // sysconf() returns -1 on error; storing that in the unsigned p_cores
+    // would yield a gigantic core count, so fall back to a single core.
+    long online = sysconf(_SC_NPROCESSORS_ONLN);
+    p_cores = (online > 0) ? (unsigned int)online : 1;
+    p_cpu_mhz = 0.0f;
+
+    std::filebuf fb;
+    fb.open("/proc/cpuinfo", std::ios::in);
+    std::istream is(&fb);
+
+    // Each record is "key : value". Looping on the stream state (rather than
+    // on eof() alone) also terminates when the stream fails without reaching
+    // the end of the file.
+    std::string key;
+
+    while (std::getline(is, key, ':'))
+    {
+        std::string value;
+
+        is.ignore(1);               // Skip the character following the ':'
+        std::getline(is, value);
+
+        if (key.compare(0, 7, "cpu MHz") == 0)
+        {
+            std::istringstream ss(value);
+            ss >> p_cpu_mhz;
+        }
+
+        if (key.compare(0, 10, "model name") == 0)
+            p_device_name = value;
+
+        if (key.compare(0, 9, "Processor") == 0)
+            p_device_name = value;
+    }
+
+    if (p_cpu_mhz == 0.0f)
+    {
+        std::string file("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq");
+        std::ifstream fs(file.c_str());
+        // The value read is divided by 1000 before being stored as MHz
+        if (fs) { fs >> p_cpu_mhz; p_cpu_mhz /= 1000; }
+    }
+
+    if (p_cpu_mhz == 0.0f) p_cpu_mhz = 1000.0;
+
+#if !defined(DSPC868X)
+    // p_cpu_mhz = arm_speed();
+#endif
+}
+
+
+/**
+ * \brief Create the locking machinery and the worker threads
+ *
+ * Calling this function again once it has succeeded is a no-op. One worker
+ * thread is requested per CPU reported by \c numCPUs(). If fewer threads can
+ * be created, \c p_cores is lowered to the number actually running so that
+ * the destructor only joins threads that exist. If nothing could be set up,
+ * the device is left uninitialized.
+ */
+void CPUDevice::init()
+{
+    if (p_initialized) return;
+
+    // Storage for the worker thread handles
+    p_workers = (pthread_t *)std::malloc(numCPUs() * sizeof(pthread_t));
+
+    if (!p_workers)
+        return;
+
+    // Initialize the locking machinery before any worker can use it
+    pthread_cond_init(&p_events_cond, 0);
+    pthread_mutex_init(&p_events_mutex, 0);
+
+    // Create worker threads, stopping at the first failure
+    unsigned int started = 0;
+
+    while (started < numCPUs() &&
+           pthread_create(&p_workers[started], 0, &worker, this) == 0)
+    {
+        ++started;
+    }
+
+    if (started == 0)
+    {
+        // No worker at all: undo everything and stay uninitialized
+        pthread_mutex_destroy(&p_events_mutex);
+        pthread_cond_destroy(&p_events_cond);
+        std::free((void *)p_workers);
+        p_workers = 0;
+        return;
+    }
+
+    // Only account for the workers that really exist
+    p_cores = started;
+    p_initialized = true;
+}
+
+/**
+ * \brief Stop the worker threads and release what \c init() created
+ *
+ * Nothing is done when the device was never initialized.
+ */
+CPUDevice::~CPUDevice()
+{
+    if (p_initialized)
+    {
+        // Raise the stop flag under the lock and wake every waiting worker
+        pthread_mutex_lock(&p_events_mutex);
+        p_stop = true;
+        pthread_cond_broadcast(&p_events_cond);
+        pthread_mutex_unlock(&p_events_mutex);
+
+        // Wait for each worker to return
+        unsigned int w = 0;
+
+        while (w < numCPUs())
+        {
+            pthread_join(p_workers[w], 0);
+            ++w;
+        }
+
+        // Release the thread table, then the locking machinery
+        std::free((void *)p_workers);
+        pthread_mutex_destroy(&p_events_mutex);
+        pthread_cond_destroy(&p_events_cond);
+    }
+}
+
+/**
+ * \brief Create the CPU-side buffer object backing \p buffer
+ * \param buffer memory object the device buffer is created for
+ * \param rs forwarded to the \c CPUBuffer constructor
+ * \return the newly allocated \c CPUBuffer
+ */
+DeviceBuffer *CPUDevice::createDeviceBuffer(MemObject *buffer, cl_int *rs)
+{
+    DeviceBuffer *created = (DeviceBuffer *)new CPUBuffer(this, buffer, rs);
+
+    return created;
+}
+
+/**
+ * \brief Create the CPU-side program object for \p program
+ * \param program program the device program is created for
+ * \return the newly allocated \c CPUProgram
+ */
+DeviceProgram *CPUDevice::createDeviceProgram(Program *program)
+{
+    DeviceProgram *created = (DeviceProgram *)new CPUProgram(this, program);
+
+    return created;
+}
+
+/**
+ * \brief Create the CPU-side kernel object for \p kernel
+ * \param kernel kernel the device kernel is created for
+ * \param function LLVM function forwarded to the \c CPUKernel constructor
+ * \return the newly allocated \c CPUKernel
+ */
+DeviceKernel *CPUDevice::createDeviceKernel(Kernel *kernel,
+                                            llvm::Function *function)
+{
+    DeviceKernel *created = (DeviceKernel *)new CPUKernel(this, kernel, function);
+
+    return created;
+}
+
+/**
+ * \brief Prepare the device-specific part of an event
+ *
+ * - Map-buffer events receive the address of the mapped region.
+ * - Map-image events receive the address of the origin pixel and the pitches
+ *   of the image.
+ * - Kernel events get the program's JIT initialized and a \c CPUKernelEvent
+ *   attached as device data.
+ * - Every other event type needs nothing.
+ *
+ * \param event event to prepare
+ * \return \c CL_SUCCESS, or \c CL_INVALID_PROGRAM_EXECUTABLE when the JIT
+ *         cannot be initialized for a kernel event
+ */
+cl_int CPUDevice::initEventDeviceData(Event *event)
+{
+    switch (event->type())
+    {
+        case Event::MapBuffer:
+        {
+            MapBufferEvent *map_event = (MapBufferEvent *)event;
+            CPUBuffer *cpu_buffer =
+                (CPUBuffer *)map_event->buffer()->deviceBuffer(this);
+            unsigned char *base = (unsigned char *)cpu_buffer->data();
+
+            // The mapping starts offset() bytes after the buffer start
+            map_event->setPtr((void *)(base + map_event->offset()));
+            break;
+        }
+        case Event::MapImage:
+        {
+            MapImageEvent *map_event = (MapImageEvent *)event;
+            Image2D *img = (Image2D *)map_event->buffer();
+            CPUBuffer *cpu_buffer = (CPUBuffer *)img->deviceBuffer(this);
+            unsigned char *base = (unsigned char *)cpu_buffer->data();
+
+            // Address of the pixel located at the origin of the mapping
+            unsigned char *origin = imageData(base,
+                                              map_event->origin(0),
+                                              map_event->origin(1),
+                                              map_event->origin(2),
+                                              img->row_pitch(),
+                                              img->slice_pitch(),
+                                              img->pixel_size());
+
+            map_event->setPtr((void *)origin);
+            map_event->setRowPitch(img->row_pitch());
+            map_event->setSlicePitch(img->slice_pitch());
+            break;
+        }
+        case Event::UnmapMemObject:
+            // Nothing to do
+            break;
+
+        case Event::NDRangeKernel:
+        case Event::TaskKernel:
+        {
+            KernelEvent *kernel_event = (KernelEvent *)event;
+            Program *parent = (Program *)kernel_event->kernel()->parent();
+            CPUProgram *cpu_program =
+                (CPUProgram *)parent->deviceDependentProgram(this);
+
+            // The JIT of the CPU program must be ready before running
+            if (!cpu_program->initJIT())
+                return CL_INVALID_PROGRAM_EXECUTABLE;
+
+            // Attach the device-specific data to the event
+            kernel_event->setDeviceData(
+                (void *)new CPUKernelEvent(this, kernel_event));
+            break;
+        }
+        default:
+            break;
+    }
+
+    return CL_SUCCESS;
+}
+
+/**
+ * \brief Release the device-specific part of an event
+ *
+ * Only kernel events own device data (the \c CPUKernelEvent attached by
+ * \c initEventDeviceData()); other event types have nothing to free.
+ *
+ * \param event event whose device data is released
+ */
+void CPUDevice::freeEventDeviceData(Event *event)
+{
+    switch (event->type())
+    {
+        case Event::NDRangeKernel:
+        case Event::TaskKernel:
+        {
+            CPUKernelEvent *cpu_e = (CPUKernelEvent *)event->deviceData();
+
+            // Deleting a null pointer is a no-op, no guard needed
+            delete cpu_e;
+            break;
+        }
+        default:
+            break;
+    }
+}
+
+/**
+ * \brief Queue an event for the worker threads
+ * \param event event appended to the device's event list
+ */
+void CPUDevice::pushEvent(Event *event)
+{
+    pthread_mutex_lock(&p_events_mutex);
+
+    // The count is tracked separately from the list (see the note kept from
+    // the original: faster than list::size())
+    p_events.push_back(event);
+    ++p_num_events;
+
+    // Wake the threads waiting on the condition so one can pick the event up
+    pthread_cond_broadcast(&p_events_cond);
+
+    pthread_mutex_unlock(&p_events_mutex);
+}
+
+/**
+ * \brief Wait for an event to run and return it
+ *
+ * Blocks until an event is queued or the device is being stopped. The event
+ * at the front of the list is returned. It is removed from the list unless it
+ * is a kernel event whose \c CPUKernelEvent::reserve() returns false.
+ *
+ * \param stop set to true when the device is stopping; left untouched
+ *        otherwise
+ * \return the event to run, or 0 when the device is stopping
+ */
+Event *CPUDevice::getEvent(bool &stop)
+{
+    pthread_mutex_lock(&p_events_mutex);
+
+    // Sleep until there is something queued or a stop is requested
+    while (!p_stop && p_num_events == 0)
+        pthread_cond_wait(&p_events_cond, &p_events_mutex);
+
+    Event *next = 0;
+
+    if (p_stop)
+    {
+        stop = true;
+    }
+    else
+    {
+        next = p_events.front();
+
+        // Non-kernel events are single-shot. Kernel events stay queued until
+        // reserve() reports the last slot.
+        bool remove_it = true;
+
+        switch (next->type())
+        {
+            case Event::NDRangeKernel:
+            case Event::TaskKernel:
+                remove_it = ((CPUKernelEvent *)next->deviceData())->reserve();
+                break;
+
+            default:
+                break;
+        }
+
+        if (remove_it)
+        {
+            p_events.pop_front();
+            p_num_events--;
+        }
+    }
+
+    pthread_mutex_unlock(&p_events_mutex);
+
+    return next;
+}
+
+/******************************************************************************
+* Device's decision about whether CommandQueue should push more events over
+* This number could be tuned (e.g. using ooo example). Note that p_num_events
+* are in device's queue, but not yet executed.
+******************************************************************************/
+/**
+ * \brief Tell whether at least one event is still waiting in the device queue
+ * \return true when \c p_num_events is not zero
+ */
+bool CPUDevice::gotEnoughToWorkOn()
+{
+    const bool has_pending = (p_num_events != 0);
+
+    return has_pending;
+}
+
+/**
+ * \brief Number of CPU cores recorded for this device
+ * \return value of \c p_cores
+ */
+unsigned int CPUDevice::numCPUs() const
+{
+    return this->p_cores;
+}
+
+/**
+ * \brief Clock speed recorded for this device, in MHz
+ * \return value of \c p_cpu_mhz
+ */
+float CPUDevice::cpuMhz() const
+{
+    return this->p_cpu_mhz;
+}
+
+// TYPE_MAX(type): value of an integer type with all of its bits set.
+//
+// Worked example for a one-byte type, from inner parentheses to outer ones :
+//
+// sizeof * 8 => 8
+// -1 => 7
+// 1 << $ => 10000000
+// -1 => 01111111
+// *2 => 11111110
+// +1 => 11111111
+//
+// A simple way to do this is (1 << (sizeof(type) * 8)) - 1, but it overflows
+// the type (for int8, 1 << $ = 100000000 = 256 > 255)
+//
+// NOTE(review): the walkthrough describes an unsigned type; presumably the
+// macro is only meant for unsigned types - confirm before using it on a
+// signed one.
+#define TYPE_MAX(type) ((((type)1 << ((sizeof(type) * 8) - 1)) - 1) * 2 + 1)
+
+/**
+ * \brief Answer a device-information query for the CPU device
+ *
+ * A value is selected according to \p param_name. Most values are constants;
+ * a few come from \c numCPUs(), \c cpuMhz(), the device name or
+ * \c /proc/meminfo. The selected value is then copied to \p param_value.
+ * NOTE(review): \c SIMPLE_ASSIGN and \c STRING_ASSIGN are defined elsewhere;
+ * presumably they set \c value and \c value_length - confirm.
+ *
+ * \param param_name property being queried
+ * \param param_value_size size in bytes of the buffer at \p param_value
+ * \param param_value destination buffer, may be null to only query the size
+ * \param param_value_size_ret if not null, receives the size of the value
+ * \return \c CL_SUCCESS, or \c CL_INVALID_VALUE when \p param_name is not
+ * handled or when \p param_value is given but too small
+ */
+cl_int CPUDevice::info(cl_device_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const
+{
+ void *value = 0;
+ size_t value_length = 0;
+
+ // Scratch storage for the selected value, one member per possible type
+ union {
+ cl_device_type cl_device_type_var;
+ cl_uint cl_uint_var;
+ size_t size_t_var;
+ cl_ulong cl_ulong_var;
+ cl_bool cl_bool_var;
+ cl_device_fp_config cl_device_fp_config_var;
+ cl_device_mem_cache_type cl_device_mem_cache_type_var;
+ cl_device_local_mem_type cl_device_local_mem_type_var;
+ cl_device_exec_capabilities cl_device_exec_capabilities_var;
+ cl_command_queue_properties cl_command_queue_properties_var;
+ cl_platform_id cl_platform_id_var;
+ size_t work_dims[MAX_WORK_DIMS];
+ };
+
+ switch (param_name)
+ {
+ case CL_DEVICE_TYPE:
+ SIMPLE_ASSIGN(cl_device_type, CL_DEVICE_TYPE_CPU);
+ break;
+
+ case CL_DEVICE_VENDOR_ID:
+ SIMPLE_ASSIGN(cl_uint, 0);
+ break;
+
+ case CL_DEVICE_MAX_COMPUTE_UNITS:
+ SIMPLE_ASSIGN(cl_uint, numCPUs());
+ break;
+
+ case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:
+ SIMPLE_ASSIGN(cl_uint, MAX_WORK_DIMS);
+ break;
+
+ case CL_DEVICE_MAX_WORK_GROUP_SIZE:
+ SIMPLE_ASSIGN(size_t, ONE_GIGABYTE);
+ break;
+
+ case CL_DEVICE_MAX_WORK_ITEM_SIZES:
+ // Array-valued answer: filled by hand instead of using SIMPLE_ASSIGN
+ for (int i=0; i<MAX_WORK_DIMS; ++i)
+ {
+ work_dims[i] = ONE_GIGABYTE;
+ }
+ value_length = MAX_WORK_DIMS * sizeof(size_t);
+ value = &work_dims;
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR:
+ SIMPLE_ASSIGN(cl_uint, 16);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT:
+ SIMPLE_ASSIGN(cl_uint, 8);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT:
+ SIMPLE_ASSIGN(cl_uint, 4);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT:
+ SIMPLE_ASSIGN(cl_uint, 4);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_MAX_CLOCK_FREQUENCY:
+ SIMPLE_ASSIGN(cl_uint, cpuMhz());
+ break;
+
+ case CL_DEVICE_ADDRESS_BITS:
+ SIMPLE_ASSIGN(cl_uint, 8*sizeof(void *));
+ break;
+
+ case CL_DEVICE_MAX_READ_IMAGE_ARGS:
+ SIMPLE_ASSIGN(cl_uint, 0); //images not supported
+ break;
+
+ case CL_DEVICE_MAX_WRITE_IMAGE_ARGS:
+ SIMPLE_ASSIGN(cl_uint, 0); // images not supported
+ break;
+
+ case CL_DEVICE_IMAGE2D_MAX_WIDTH:
+ SIMPLE_ASSIGN(size_t, 0); // images not supported
+ break;
+
+ case CL_DEVICE_IMAGE2D_MAX_HEIGHT:
+ SIMPLE_ASSIGN(size_t, 0); //images not supported
+ break;
+
+ case CL_DEVICE_IMAGE3D_MAX_WIDTH:
+ SIMPLE_ASSIGN(size_t, 0); //images not supported
+ break;
+
+ case CL_DEVICE_IMAGE3D_MAX_HEIGHT:
+ SIMPLE_ASSIGN(size_t, 0); //images not supported
+ break;
+
+ case CL_DEVICE_IMAGE3D_MAX_DEPTH:
+ SIMPLE_ASSIGN(size_t, 0); //images not supported
+ break;
+
+ case CL_DEVICE_IMAGE_SUPPORT:
+ SIMPLE_ASSIGN(cl_bool, CL_FALSE); //images not supported
+ break;
+
+ case CL_DEVICE_MAX_PARAMETER_SIZE:
+ SIMPLE_ASSIGN(size_t, 65536);
+ break;
+
+ case CL_DEVICE_MAX_SAMPLERS:
+ SIMPLE_ASSIGN(cl_uint, 0); //images not supported
+ break;
+
+ case CL_DEVICE_MEM_BASE_ADDR_ALIGN:
+ SIMPLE_ASSIGN(cl_uint, 1024 /* sizeof(long16)*8) */); // 128 byte
+ break;
+
+ case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE:
+ SIMPLE_ASSIGN(cl_uint, 16);
+ break;
+
+ case CL_DEVICE_SINGLE_FP_CONFIG:
+ // TODO: Check what an x86 SSE engine can support.
+ // Currently not supporting CL_FP_DENORM
+ SIMPLE_ASSIGN(cl_device_fp_config,
+ CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST);
+ break;
+
+ case CL_DEVICE_DOUBLE_FP_CONFIG:
+ // These are minimally required to be supported by the OCL spec:
+ SIMPLE_ASSIGN(cl_device_fp_config,
+ CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO |
+ CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM);
+ break;
+
+ case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE:
+ SIMPLE_ASSIGN(cl_device_mem_cache_type,
+ CL_READ_WRITE_CACHE);
+ break;
+
+ case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE:
+ // TODO: Get this information from the processor
+ SIMPLE_ASSIGN(cl_uint, 16);
+ break;
+
+ case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE:
+ // TODO: Get this information from the processor
+ SIMPLE_ASSIGN(cl_ulong, 512*1024*1024);
+ break;
+
+ case CL_DEVICE_GLOBAL_MEM_SIZE:
+ // parse /proc/meminfo to get the value
+ SIMPLE_ASSIGN(cl_ulong, parse_file_line_value("/proc/meminfo",
+ "MemTotal:", 512*1024) * 1024);
+ break;
+
+ case CL_DEVICE_MAX_MEM_ALLOC_SIZE:
+ case CL_DEVICE_LOCAL_MEM_SIZE:
+ case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:
+ // TODO: 1 Gio seems to be enough for software acceleration
+
+#if defined(__arm__)
+ SIMPLE_ASSIGN(cl_ulong, 512*1024*1024);
+#else
+ SIMPLE_ASSIGN(cl_ulong, 1*1024*1024*1024);
+#endif
+ break;
+
+ case CL_DEVICE_MAX_CONSTANT_ARGS:
+ SIMPLE_ASSIGN(cl_uint, 65536);
+ break;
+
+ case CL_DEVICE_LOCAL_MEM_TYPE:
+ SIMPLE_ASSIGN(cl_device_local_mem_type, CL_GLOBAL);
+ break;
+
+
+ case CL_DEVICE_ERROR_CORRECTION_SUPPORT:
+ SIMPLE_ASSIGN(cl_bool, CL_FALSE);
+ break;
+
+ case CL_DEVICE_PROFILING_TIMER_RESOLUTION:
+ // TODO
+ SIMPLE_ASSIGN(size_t, 1000); // 1000 nanoseconds = 1 microsecond
+ break;
+
+ case CL_DEVICE_ENDIAN_LITTLE:
+ SIMPLE_ASSIGN(cl_bool, CL_TRUE);
+ break;
+
+ case CL_DEVICE_AVAILABLE:
+ SIMPLE_ASSIGN(cl_bool, CL_TRUE);
+ break;
+
+ case CL_DEVICE_COMPILER_AVAILABLE:
+ SIMPLE_ASSIGN(cl_bool, CL_TRUE);
+ break;
+
+ case CL_DEVICE_EXECUTION_CAPABILITIES:
+ SIMPLE_ASSIGN(cl_device_exec_capabilities, CL_EXEC_KERNEL |
+ CL_EXEC_NATIVE_KERNEL);
+ break;
+
+ case CL_DEVICE_QUEUE_PROPERTIES:
+ SIMPLE_ASSIGN(cl_command_queue_properties,
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+ CL_QUEUE_PROFILING_ENABLE);
+ break;
+
+ case CL_DEVICE_NAME:
+ // The name read in the constructor, terminating NUL included
+ value_length = p_device_name.size() + 1;
+ value = const_cast<char*>(p_device_name.c_str());
+ break;
+
+ case CL_DEVICE_VENDOR:
+ STRING_ASSIGN("Generic");
+ break;
+
+ case CL_DRIVER_VERSION:
+ STRING_ASSIGN("" COAL_VERSION);
+ break;
+
+ case CL_DEVICE_PROFILE:
+ STRING_ASSIGN("FULL_PROFILE");
+ break;
+
+ case CL_DEVICE_VERSION:
+ STRING_ASSIGN("OpenCL 1.1 " COAL_VERSION);
+ break;
+
+ case CL_DEVICE_EXTENSIONS:
+ STRING_ASSIGN("cl_khr_global_int32_base_atomics"
+ " cl_khr_global_int32_extended_atomics"
+ " cl_khr_local_int32_base_atomics"
+ " cl_khr_local_int32_extended_atomics"
+ " cl_khr_byte_addressable_store"
+
+ " cl_khr_fp64"
+ " cl_khr_int64_base_atomics"
+ " cl_khr_int64_extended_atomics")
+
+ break;
+
+ case CL_DEVICE_PLATFORM:
+ SIMPLE_ASSIGN(cl_platform_id, 0);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF:
+ SIMPLE_ASSIGN(cl_uint, 0);
+ break;
+
+ case CL_DEVICE_HOST_UNIFIED_MEMORY:
+ SIMPLE_ASSIGN(cl_bool, CL_TRUE);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR:
+ SIMPLE_ASSIGN(cl_uint, 16);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT:
+ SIMPLE_ASSIGN(cl_uint, 8);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT:
+ SIMPLE_ASSIGN(cl_uint, 4);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT:
+ SIMPLE_ASSIGN(cl_uint, 4);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF:
+ SIMPLE_ASSIGN(cl_uint, 0);
+ break;
+
+ case CL_DEVICE_OPENCL_C_VERSION:
+ STRING_ASSIGN("OpenCL C 1.1 LLVM " LLVM_VERSION);
+ break;
+
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ // A destination buffer was supplied but cannot hold the value
+ if (param_value && param_value_size < value_length)
+ return CL_INVALID_VALUE;
+
+ if (param_value_size_ret)
+ *param_value_size_ret = value_length;
+
+ if (param_value)
+ std::memcpy(param_value, value, value_length);
+
+ return CL_SUCCESS;
+}
+
+#if !defined(DSPC868X)
+#if 0 // /dev/mem is no longer available
+/**
+ * \brief Compute the ARM clock speed from the SECPLLCTL0 register value
+ *
+ * Maps the page at 0x02620000, reads the word at offset 0x370 and decodes a
+ * pre-divider (bits 0-5), a multiplier (bits 6-18) and an output divider
+ * (bits 19-22), each stored minus one. The product of the reference clock
+ * and the multiplier is computed in 64 bits because it does not fit in 32
+ * bits once the multiplier reaches 35.
+ *
+ * \return speed in MHz
+ */
+unsigned arm_speed()
+{
+    const unsigned TETRIS_PLL = 125000000;
+    const unsigned pagesize = 0x1000;
+
+    shmem_persistent page;
+    page.configure(0x02620000, pagesize);
+    char *host_msmc = (char*)page.map(0x02620000, pagesize);
+    unsigned SECPLLCTL0 = *(unsigned*)(host_msmc + 0x370);
+    page.unmap(host_msmc, pagesize);
+
+    unsigned prediv = 1 + (SECPLLCTL0 & 0x3F);
+    unsigned mult = 1 + ((SECPLLCTL0 >> 6) & 0x1FFF);
+    unsigned output_div = 1 + ((SECPLLCTL0 >> 19) & 0xF);
+
+    // Widen before multiplying to avoid wrapping around 2^32
+    unsigned long long speed =
+        (unsigned long long)TETRIS_PLL * mult / prediv / output_div;
+
+    return (unsigned)(speed / 1000000);
+}
+#endif
+#endif
+
diff --git a/src/core/cpu/device.h b/src/core/cpu/device.h
new file mode 100644
index 0000000..a0ad6ef
--- /dev/null
+++ b/src/core/cpu/device.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/device.h
+ * \brief CPU device
+ */
+
+#ifndef __CPU_DEVICE_H__
+#define __CPU_DEVICE_H__
+
+#include "../deviceinterface.h"
+
+#include <pthread.h>
+#include <list>
+#include <string>
+
+namespace Coal
+{
+
+class MemObject;
+class Event;
+class Program;
+class Kernel;
+
+/**
+ * \brief CPU device
+ *
+ * This class is the base of all the CPU-accelerated OpenCL processing. It
+ * creates and manages subclasses such as \c Coal::DeviceBuffer,
+ * \c Coal::DeviceProgram and \c Coal::DeviceKernel.
+ *
+ * This class and the aforementioned ones work together to compile and run
+ * kernels using the LLVM JIT, manage buffers, provide built-in functions
+ * and do all of this in a multithreaded fashion using worker threads.
+ *
+ * \see \ref events
+ */
+class CPUDevice : public DeviceInterface
+{
+ public:
+ /**
+ * \brief Build the device and collect information about the host
+ *
+ * The core count, the clock speed and the device name are read
+ * here; no thread is created until \c init() is called.
+ */
+ CPUDevice();
+ /** \brief Stop and join the worker threads if \c init() was called */
+ ~CPUDevice();
+
+ /**
+ * \brief Initialize the CPU device
+ *
+ * This function sets up the locking machinery and creates the
+ * worker threads, one per CPU reported by \c numCPUs(). The
+ * information about the host system used by \c numCPUs() and
+ * \c cpuMhz() is gathered by the constructor, not here.
+ */
+ void init();
+
+ /** \brief Answer a device-information query, see the definition */
+ cl_int info(cl_device_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ // Factories returning the CPU implementations (CPUBuffer, CPUProgram,
+ // CPUKernel) of the device-dependent objects
+ DeviceBuffer *createDeviceBuffer(MemObject *buffer, cl_int *rs);
+ DeviceProgram *createDeviceProgram(Program *program);
+ DeviceKernel *createDeviceKernel(Kernel *kernel,
+ llvm::Function *function);
+
+ // Set up / release the device-specific data attached to an event
+ cl_int initEventDeviceData(Event *event);
+ void freeEventDeviceData(Event *event);
+
+ void pushEvent(Event *event); /*!< \brief Queue an event and wake the threads waiting for one */
+ Event *getEvent(bool &stop); /*!< \brief Wait for an event; \p stop is set to true when the device is stopping */
+ bool gotEnoughToWorkOn(); /*!< \brief True when at least one event is queued */
+
+ unsigned int numCPUs() const; /*!< \brief Number of logical CPU cores on the system */
+ float cpuMhz() const; /*!< \brief Speed of the CPU in Mhz */
+
+ /** \brief Returns the fixed string "cpu.h" */
+ std::string builtinsHeader(void) const { return "cpu.h"; }
+
+ private:
+ unsigned int p_cores, p_num_events; // core count; number of events in p_events
+ float p_cpu_mhz; // clock speed in MHz
+ std::string p_device_name; // name read from /proc/cpuinfo
+ pthread_t *p_workers; // worker thread handles, allocated by init()
+
+ std::list<Event *> p_events; // events waiting to be run
+ pthread_cond_t p_events_cond; // signalled on new events and on stop
+ pthread_mutex_t p_events_mutex; // held around p_events, p_num_events and p_stop updates
+ bool p_stop, p_initialized; // stop requested; init() completed
+};
+
+}
+
+#endif
diff --git a/src/core/cpu/kernel.cpp b/src/core/cpu/kernel.cpp
new file mode 100644
index 0000000..ef09f6b
--- /dev/null
+++ b/src/core/cpu/kernel.cpp
@@ -0,0 +1,734 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/kernel.cpp
+ * \brief CPU kernel
+ */
+
+#include "kernel.h"
+#include "device.h"
+#include "buffer.h"
+#include "program.h"
+#include "builtins.h"
+
+#include "../kernel.h"
+#include "../memobject.h"
+#include "../events.h"
+#include "../program.h"
+
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Module.h>
+#include <llvm/ExecutionEngine/ExecutionEngine.h>
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <sys/mman.h>
+
+using namespace Coal;
+
+/**
+ * \brief Create the CPU kernel and eagerly build its call stub
+ *
+ * The stub is a function of the form
+ *
+ *   void stub(void *args) {
+ *       kernel(*(int *)((char *)args + 0),
+ *              *(float **)((char *)args + sizeof(int)),
+ *              *(sampler_t *)((char *)args + sizeof(int) + sizeof(float *)));
+ *   }
+ *
+ * which, in LLVM, is expressed as:
+ *
+ *   @stub(i8* args) {
+ *       kernel(
+ *           load(i32* bitcast(i8* getelementptr(i8* args, i64 0), i32*)),
+ *           load(float** bitcast(i8* getelementptr(i8* args, i64 4), float**)),
+ *           ...
+ *       );
+ *   }
+ *
+ * The stub is named "_stub<kernel name>", is added to the module that owns
+ * \p function and is stored in \c p_call_function.
+ *
+ * \param device device on which the kernel will be run
+ * \param kernel \c Coal::Kernel object holding information about this kernel
+ * \param function \c llvm::Function of the kernel itself
+ */
+CPUKernel::CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function)
+: DeviceKernel(), p_device(device), p_kernel(kernel), p_function(function),
+  p_call_function(0)
+{
+    pthread_mutex_init(&p_call_function_mutex, 0);
+
+    // Build the stub name in a std::string: the previous malloc()/sprintf()
+    // buffer was never freed, and the string cleans up after itself once the
+    // name has been handed to LLVM.
+    const std::string stub_name =
+        std::string("_stub") + kernel->p_name.c_str();
+
+    llvm::FunctionType *kernel_function_type = function->getFunctionType();
+    llvm::FunctionType *stub_function_type = llvm::FunctionType::get(
+        function->getReturnType(),
+        llvm::Type::getInt8PtrTy(function->getContext()),
+        false);
+    llvm::Function *stub_function = llvm::Function::Create(
+        stub_function_type,
+        llvm::Function::InternalLinkage,
+        stub_name,
+        function->getParent());
+
+    // Insert a basic block
+    llvm::BasicBlock *basic_block = llvm::BasicBlock::Create(
+        function->getContext(),
+        "",
+        stub_function);
+
+    // Create the function arguments
+    llvm::Argument &stub_arg = stub_function->getArgumentList().front();
+    llvm::SmallVector<llvm::Value *, 8> args;
+    size_t args_offset = 0;
+
+    for (unsigned int i=0; i<kernel_function_type->getNumParams(); ++i)
+    {
+        llvm::Type *param_type = kernel_function_type->getParamType(i);
+        llvm::Type *param_type_ptr = param_type->getPointerTo(); // We'll use pointers to the value
+        const Kernel::Arg *arg = p_kernel->arg(i);
+
+        // Calculate the size of the arg
+        size_t arg_size = arg->valueSize() * arg->vecDim();
+
+        // Get where to place this argument
+        size_t arg_offset = typeOffset(args_offset, arg_size);
+
+        // %1 = getelementptr(args, $arg_offset);
+        llvm::Value *getelementptr = llvm::GetElementPtrInst::CreateInBounds(
+            &stub_arg,
+            llvm::ConstantInt::get(stub_function->getContext(),
+                                   llvm::APInt(64, arg_offset)),
+            "",
+            basic_block);
+
+        // %2 = bitcast(%1, $param_type_ptr)
+        llvm::Value *bitcast = new llvm::BitCastInst(
+            getelementptr,
+            param_type_ptr,
+            "",
+            basic_block);
+
+        // %3 = load(%2)
+        llvm::Value *load = new llvm::LoadInst(
+            bitcast,
+            "",
+            false,
+            arg_size,   // We ensure that an argument is always aligned on its size, it enables things like fast movaps
+            basic_block);
+
+        // We have the value, send it to the function
+        args.push_back(load);
+    }
+
+    // Create the call instruction
+    llvm::CallInst *call_inst = llvm::CallInst::Create(
+        function,
+        args,
+        "",
+        basic_block);
+    call_inst->setCallingConv(function->getCallingConv());
+    call_inst->setTailCall();
+
+    // Create a return instruction to end the stub
+    llvm::ReturnInst::Create(
+        function->getContext(),
+        basic_block);
+
+    // Retain the stub: callFunction() returns it as-is
+    p_call_function = stub_function;
+}
+
+// Drop the generated stub (when there is one) and release the mutex that
+// guards it.
+CPUKernel::~CPUKernel()
+{
+    llvm::Function *stub = p_call_function;
+
+    if (stub != 0)
+    {
+        stub->eraseFromParent();
+    }
+
+    pthread_mutex_destroy(&p_call_function_mutex);
+}
+
+/**
+ * \brief Maximum work-group size usable with this kernel
+ *
+ * Simply forwards the CL_DEVICE_MAX_WORK_GROUP_SIZE property of the device.
+ *
+ * \return the device limit, or 0 if the device query fails (the value used
+ *         to be left uninitialized in that case)
+ */
+size_t CPUKernel::workGroupSize()
+{
+    size_t max_size = 0;
+    size_t size_ret = 0;
+
+    cl_int rs = p_device->info(CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_size),
+                               &max_size, &size_ret);
+
+    if (rs != CL_SUCCESS)
+        return 0;
+
+    return max_size;
+}
+
+// Amount of local memory used by the kernel.
+// Not computed yet: the function currently always reports 0.
+cl_ulong CPUKernel::localMemSize() const
+{
+ return 0; // TODO
+}
+
+// Amount of private memory used by the kernel.
+// Not computed yet: the function currently always reports 0.
+cl_ulong CPUKernel::privateMemSize() const
+{
+ return 0; // TODO
+}
+
+// The preferred multiple is the number of CPUs reported by the device.
+size_t CPUKernel::preferredWorkGroupSizeMultiple() const
+{
+    return p_device->numCPUs();
+}
+
+/**
+ * \brief Integer power: \p base raised to the power \p e
+ *
+ * Computed by repeated multiplication, starting from 1 so that an exponent
+ * of 0 yields 1 (it used to yield \p base). No overflow check is performed.
+ *
+ * \param base value to raise
+ * \param e exponent
+ * \return \p base multiplied by itself \p e times
+ */
+template<typename T>
+T k_exp(T base, unsigned int e)
+{
+    T rs = 1;
+
+    for (unsigned int i=0; i<e; ++i)
+        rs *= base;
+
+    return rs;
+}
+
+/**
+ * \brief Guess a work-group size for one dimension of an NDRange
+ *
+ * Tries to split \p global_work_size in a number of work-groups that is a
+ * divisor of it, starting the search at the number of CPUs of the device.
+ *
+ * \param num_dims number of dimensions of the NDRange
+ * \param dim dimension being considered (currently unused)
+ * \param global_work_size global size in this dimension
+ * \return local work size to use in this dimension
+ */
+size_t CPUKernel::guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
+                                     size_t global_work_size) const
+{
+    unsigned int cpus = p_device->numCPUs();
+
+    // Don't break in too small parts
+    // NOTE(review): as written this keeps the range in a single work-group
+    // when it is *larger* than 64 items, which looks inverted with regard to
+    // the comment above. Left untouched because it changes scheduling;
+    // confirm the intent.
+    if (k_exp(global_work_size, num_dims) > 64)
+        return global_work_size;
+
+    // Find the divisor of global_work_size the closest to cpus but >= than it.
+    // Start at 1 if the device reports no core, to avoid a modulo by zero.
+    unsigned int divisor = (cpus != 0) ? cpus : 1;
+
+    while ((global_work_size % divisor) != 0)
+    {
+        // Don't let the loop go up to global_work_size, the overhead would be
+        // too huge
+        if (divisor > global_work_size || divisor > cpus * 32)
+        {
+            divisor = 1;    // Not parallel but has no CommandQueue overhead
+            break;
+        }
+
+        // Try the next candidate. This step was missing, so the loop never
+        // ended when global_work_size was not a multiple of cpus.
+        ++divisor;
+    }
+
+    // Return the size
+    return global_work_size / divisor;
+}
+
+// LLVM function given to the constructor (the kernel itself, not the stub).
+llvm::Function *CPUKernel::function() const
+{
+ return p_function;
+}
+
+// Device-independent Coal::Kernel object given to the constructor.
+Kernel *CPUKernel::kernel() const
+{
+ return p_kernel;
+}
+
+// CPU device given to the constructor.
+CPUDevice *CPUKernel::device() const
+{
+ return p_device;
+}
+
+// Round k up to the nearest power of two (0 is rounded up to 1).
+// From Wikipedia : http://www.wikipedia.org/wiki/Power_of_two#Algorithm_to_round_up_to_power_of_two
+template <class T>
+T next_power_of_two(T k)
+{
+    if (k == 0)
+        return 1;
+
+    // Copy the highest set bit of (k - 1) into every lower position...
+    k--;
+
+    int shift = 1;
+
+    while (shift < sizeof(T)*8)
+    {
+        k = k | k >> shift;
+        shift <<= 1;
+    }
+
+    // ...so that adding one gives the next power of two
+    return k+1;
+}
+
+/**
+ * \brief Compute the aligned position of a value in the argument block
+ *
+ * The size of the value is rounded up to a power of two and the value is
+ * aligned on that rounded size.
+ *
+ * \param offset in: first free byte; out: first free byte after the value
+ *               (aligned position + rounded size)
+ * \param type_len size in bytes of the value to place
+ * \return aligned offset at which the value must be stored
+ */
+size_t CPUKernel::typeOffset(size_t &offset, size_t type_len)
+{
+    // Align offset to type_len, rounded up to a power of two
+    type_len = next_power_of_two(type_len);
+    const size_t mask = ~(type_len - 1);
+
+    // Round up to the next multiple of type_len. The former loop tested
+    // "rs & mask != rs", which parses as "rs & (mask != rs)" and therefore
+    // only made the offset even instead of aligning it.
+    size_t rs = (offset + type_len - 1) & mask;
+
+    // Where to try to place the next value
+    offset = rs + type_len;
+
+    return rs;
+}
+
+/**
+ * \brief Stub function used to run the kernel
+ *
+ * Returns the cached stub when there is one (the constructor already builds
+ * one and stores it in \c p_call_function). Otherwise, a stub of the form
+ *
+ *   void stub(void *args) {
+ *       kernel(*(int *)((char *)args + 0),
+ *              *(float **)((char *)args + sizeof(int)),
+ *              *(sampler_t *)((char *)args + sizeof(int) + sizeof(float *)));
+ *   }
+ *
+ * is generated. In LLVM, it is expressed as:
+ *
+ *   @stub(i8* args) {
+ *       kernel(
+ *           load(i32* bitcast(i8* getelementptr(i8* args, i64 0), i32*)),
+ *           load(float** bitcast(i8* getelementptr(i8* args, i64 4), float**)),
+ *           ...
+ *       );
+ *   }
+ *
+ * \return the stub, or NULL if the kernel function cannot be found in the
+ *         execution engine
+ */
+llvm::Function *CPUKernel::callFunction()
+{
+    pthread_mutex_lock(&p_call_function_mutex);
+
+    // If we can reuse the same function between work groups, do it
+    if (p_call_function)
+    {
+        llvm::Function *rs = p_call_function;
+        pthread_mutex_unlock(&p_call_function_mutex);
+
+        return rs;
+    }
+
+    // Look the kernel up by name in the execution engine of its program
+    const char *fn_name = kernel()->p_name.c_str();
+    Program *p = (Program *)kernel()->parent();
+    CPUProgram *prog = (CPUProgram *)(p->deviceDependentProgram(device()));
+    llvm::Function *t_function = prog->jit()->FindFunctionNamed(fn_name);
+
+    if (!t_function)
+    {
+        // Nothing to call: bail out before touching the module. Building a
+        // call instruction on a null callee would crash.
+        pthread_mutex_unlock(&p_call_function_mutex);
+
+        return 0;
+    }
+
+    llvm::FunctionType *kernel_function_type = p_function->getFunctionType();
+    llvm::FunctionType *stub_function_type = llvm::FunctionType::get(
+        p_function->getReturnType(),
+        llvm::Type::getInt8PtrTy(p_function->getContext()),
+        false);
+    llvm::Function *stub_function = llvm::Function::Create(
+        stub_function_type,
+        llvm::Function::InternalLinkage,
+        "stub",
+        p_function->getParent());
+
+    // Insert a basic block
+    llvm::BasicBlock *basic_block = llvm::BasicBlock::Create(
+        p_function->getContext(),
+        "",
+        stub_function);
+
+    // Create the function arguments
+    llvm::Argument &stub_arg = stub_function->getArgumentList().front();
+    llvm::SmallVector<llvm::Value *, 8> args;
+    size_t args_offset = 0;
+
+    for (unsigned int i=0; i<kernel_function_type->getNumParams(); ++i)
+    {
+        llvm::Type *param_type = kernel_function_type->getParamType(i);
+        llvm::Type *param_type_ptr = param_type->getPointerTo(); // We'll use pointers to the value
+        const Kernel::Arg *arg = p_kernel->arg(i);
+
+        // Calculate the size of the arg
+        size_t arg_size = arg->valueSize() * arg->vecDim();
+
+        // Get where to place this argument
+        size_t arg_offset = typeOffset(args_offset, arg_size);
+
+        // %1 = getelementptr(args, $arg_offset);
+        llvm::Value *getelementptr = llvm::GetElementPtrInst::CreateInBounds(
+            &stub_arg,
+            llvm::ConstantInt::get(stub_function->getContext(),
+                                   llvm::APInt(64, arg_offset)),
+            "",
+            basic_block);
+
+        // %2 = bitcast(%1, $param_type_ptr)
+        llvm::Value *bitcast = new llvm::BitCastInst(
+            getelementptr,
+            param_type_ptr,
+            "",
+            basic_block);
+
+        // %3 = load(%2)
+        llvm::Value *load = new llvm::LoadInst(
+            bitcast,
+            "",
+            false,
+            arg_size,   // We ensure that an argument is always aligned on its size, it enables things like fast movaps
+            basic_block);
+
+        // We have the value, send it to the function
+        args.push_back(load);
+    }
+
+    // Create the call instruction
+    llvm::CallInst *call_inst = llvm::CallInst::Create(
+        t_function,
+        args,
+        "",
+        basic_block);
+    call_inst->setCallingConv(p_function->getCallingConv());
+    call_inst->setTailCall();
+
+    // Create a return instruction to end the stub
+    llvm::ReturnInst::Create(
+        p_function->getContext(),
+        basic_block);
+
+    // Retain the function if it can be reused
+    p_call_function = stub_function;
+
+    pthread_mutex_unlock(&p_call_function_mutex);
+
+    return stub_function;
+}
+
+/*
+ * CPUKernelEvent
+ */
+
+// Set up the bookkeeping used to hand out the work-groups of \p event:
+// the index of the next work-group, the highest index in each dimension and
+// the total number of work-groups.
+CPUKernelEvent::CPUKernelEvent(CPUDevice *device, KernelEvent *event)
+: p_device(device), p_event(event), p_current_wg(0), p_finished_wg(0),
+  p_kernel_args(0)
+{
+    pthread_mutex_init(&p_mutex, 0);
+
+    // The first work-group handed out is (0, 0, ..., 0)
+    std::memset(p_current_work_group, 0, event->work_dim() * sizeof(size_t));
+
+    // Highest work-group index per dimension (hence the "- 1"), and the
+    // product of the per-dimension counts
+    p_num_wg = 1;
+
+    for (cl_uint d = 0; d < event->work_dim(); ++d)
+    {
+        p_max_work_groups[d] =
+            (event->global_work_size(d) / event->local_work_size(d)) - 1;
+
+        p_num_wg *= p_max_work_groups[d] + 1;
+    }
+}
+
+// Release the mutex and the cached argument block, if any.
+CPUKernelEvent::~CPUKernelEvent()
+{
+    pthread_mutex_destroy(&p_mutex);
+
+    // free() accepts a null pointer, so no guard is needed
+    std::free(p_kernel_args);
+}
+
+// Lock the event and tell whether the next work-group to be taken is the
+// last one. The lock is released by takeInstance().
+bool CPUKernelEvent::reserve()
+{
+    pthread_mutex_lock(&p_mutex);
+
+    const bool next_is_last = (p_current_wg == p_num_wg - 1);
+
+    return next_is_last;
+}
+
+// True once as many work-groups have finished as the event contains.
+bool CPUKernelEvent::finished()
+{
+    pthread_mutex_lock(&p_mutex);
+    const bool all_done = (p_finished_wg == p_num_wg);
+    pthread_mutex_unlock(&p_mutex);
+
+    return all_done;
+}
+
+// Count one more finished work-group, under the event mutex.
+void CPUKernelEvent::workGroupFinished()
+{
+    pthread_mutex_lock(&p_mutex);
+    p_finished_wg += 1;
+    pthread_mutex_unlock(&p_mutex);
+}
+
+// Create the work-group at the current index, advance to the next index and
+// release the lock taken by reserve().
+CPUKernelWorkGroup *CPUKernelEvent::takeInstance()
+{
+    CPUKernel *cpu_kernel = (CPUKernel *)p_event->deviceKernel();
+
+    // The work-group is given the index before it is advanced below
+    CPUKernelWorkGroup *instance =
+        new CPUKernelWorkGroup(cpu_kernel, p_event, this, p_current_work_group);
+
+    // Move on to the next work-group
+    incVec(p_event->work_dim(), p_current_work_group, p_max_work_groups);
+    ++p_current_wg;
+
+    // Release event
+    pthread_mutex_unlock(&p_mutex);
+
+    return instance;
+}
+
+// Argument block stored by cacheKernelArgs(), or 0 if none was stored yet
+// (the constructor initializes it to 0).
+void *CPUKernelEvent::kernelArgs() const
+{
+ return p_kernel_args;
+}
+
+// Remember a pre-built argument block. The destructor releases the stored
+// pointer with std::free(), so \p args must come from malloc(). A previously
+// stored pointer is overwritten without being freed.
+void CPUKernelEvent::cacheKernelArgs(void *args)
+{
+ p_kernel_args = args;
+}
+
+/*
+ * CPUKernelWorkGroup
+ */
+
+// Record the index of this work-group and derive, for each dimension, the
+// highest local id and the global id of its first work-item.
+CPUKernelWorkGroup::CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event,
+                                       CPUKernelEvent *cpu_event,
+                                       const size_t *work_group_index)
+: p_kernel(kernel), p_cpu_event(cpu_event), p_event(event),
+  p_work_dim(event->work_dim()), p_contexts(0), p_stack_size(8192 /* TODO */),
+  p_had_barrier(false)
+{
+    // Keep a private copy of the work-group index
+    std::memcpy(p_index, work_group_index, p_work_dim * sizeof(size_t));
+
+    // Number of work-items = product of the local sizes
+    p_num_work_items = 1;
+
+    for (unsigned int d = 0; d < p_work_dim; ++d)
+    {
+        // Local ids go from 0 to n-1, not 1 to n
+        p_max_local_id[d] = event->local_work_size(d) - 1;
+        p_num_work_items *= event->local_work_size(d);
+
+        // Global id of the first work-item of the group
+        p_global_id_start_offset[d] = event->global_work_offset(d)
+                                    + (p_index[d] * event->local_work_size(d));
+    }
+}
+
+// Destroying the work-group is what reports it as finished to the
+// CPUKernelEvent it belongs to.
+CPUKernelWorkGroup::~CPUKernelWorkGroup()
+{
+ p_cpu_event->workGroupFinished();
+}
+
+// Build (or fetch from the event cache) the block of memory holding the
+// kernel arguments, laid out with CPUKernel::typeOffset().
+//
+// Buffers allocated for __local arguments are appended to locals_to_free.
+// The block is cached in the CPUKernelEvent when the kernel has no __local
+// argument. Returns NULL if the block cannot be allocated.
+void *CPUKernelWorkGroup::callArgs(std::vector<void *> &locals_to_free)
+{
+    Kernel *k = p_kernel->kernel();
+    void *cached = p_cpu_event->kernelArgs();
+
+    // A cached block can be shared as long as no per-work-group local
+    // buffer has to be stored in it
+    if (cached && !k->hasLocals())
+        return cached;
+
+    // First pass: compute the size of the block
+    size_t block_size = 0;
+
+    for (unsigned int n = 0; n < k->numArgs(); ++n)
+    {
+        const Kernel::Arg *a = k->arg(n);
+        CPUKernel::typeOffset(block_size, a->valueSize() * a->vecDim());
+    }
+
+    void *block = std::malloc(block_size);
+
+    if (!block)
+        return NULL;
+
+    // Second pass: fill the block
+    CPUDevice *dev = p_kernel->device();
+    size_t cursor = 0;
+
+    for (unsigned int n = 0; n < k->numArgs(); ++n)
+    {
+        const Kernel::Arg *a = k->arg(n);
+        size_t len = a->valueSize() * a->vecDim();
+        size_t pos = CPUKernel::typeOffset(cursor, len);
+
+        // Slot of this argument in the block
+        unsigned char *slot = (unsigned char *)block + pos;
+
+        // Some kinds of arguments need more than a plain copy
+        switch (a->kind())
+        {
+            case Kernel::Arg::Buffer:
+            {
+                MemObject *mem = *(MemObject **)a->data();
+
+                if (a->file() == Kernel::Arg::Local)
+                {
+                    // __local: give the kernel a freshly allocated buffer
+                    void *scratch = std::malloc(a->allocAtKernelRuntime());
+                    locals_to_free.push_back(scratch);
+                    *(void **)slot = scratch;
+                }
+                else if (!mem)
+                {
+                    // A NULL buffer is passed as-is
+                    *(void **)slot = NULL;
+                }
+                else
+                {
+                    // Pass the address of the data of the CPU buffer,
+                    // after having made sure it is allocated
+                    CPUBuffer *cpu_mem = (CPUBuffer *)mem->deviceBuffer(dev);
+
+                    mem->allocate(dev);
+                    *(void **)slot = cpu_mem->data();
+                }
+
+                break;
+            }
+            case Kernel::Arg::Image2D:
+            case Kernel::Arg::Image3D:
+            {
+                // The image must be allocated before the kernel uses it
+                Image2D *img = *(Image2D **)a->data();
+                img->allocate(dev);
+
+                // Fall through to the memcpy
+            }
+            default:
+                // Plain copy of the argument's data
+                std::memcpy(slot, a->data(), len);
+                break;
+        }
+    }
+
+    // Cache the arguments if we can do so
+    if (!k->hasLocals())
+        p_cpu_event->cacheKernelArgs(block);
+
+    return block;
+}
+
+/**
+ * \brief Run all the work-items of this work-group
+ *
+ * Resolves the address of the stub function, builds the argument block and
+ * calls the stub once per work-item. If a work-item called \c barrier(),
+ * the remaining work-items are resumed one by one with \c swapcontext().
+ *
+ * \return true if success, false if the stub or its address cannot be
+ *         obtained or if the argument block cannot be allocated
+ */
+bool CPUKernelWorkGroup::run()
+{
+    // Get the kernel function to call
+    std::vector<void *> locals_to_free;
+    llvm::Function *kernel_func = p_kernel->callFunction();
+
+    if (!kernel_func)
+        return false;
+
+    Program *p = (Program *)p_kernel->kernel()->parent();
+    CPUProgram *prog = (CPUProgram *)(p->deviceDependentProgram(p_kernel->device()));
+
+    // Make object usable for execution: (only applies to MCJIT):
+    prog->jit()->finalizeObject();
+
+    std::string kname = kernel_func->getName().str();
+
+    // Both lookups of the original code are kept, in the same order; the
+    // address actually used is the one returned by getFunctionAddress()
+    p_kernel_func_addr =
+        (void(*)(void *))prog->jit()->getPointerToFunction(kernel_func);
+    p_kernel_func_addr =
+        (void(*)(void *))prog->jit()->getFunctionAddress(kname);
+
+    // Calling through a null address would crash
+    if (!p_kernel_func_addr)
+        return false;
+
+    // Get the arguments
+    p_args = callArgs(locals_to_free);
+
+    // callArgs() returns NULL when it cannot allocate the block. The stub
+    // reads every argument from it, so don't run a kernel that has arguments
+    // without one.
+    if (!p_args && p_kernel->kernel()->numArgs() > 0)
+        return false;
+
+    // Tell the builtins this thread will run a kernel work group
+    setThreadLocalWorkGroup(this);
+
+    // Initialize the dummy context used by the builtins before a call to barrier()
+    p_current_work_item = 0;
+    p_current_context = &p_dummy_context;
+
+    std::memset(p_dummy_context.local_id, 0, p_work_dim * sizeof(size_t));
+
+    do
+    {
+        // Simply call the "call function", it and the builtins will do the rest
+        p_kernel_func_addr(p_args);
+    } while (!p_had_barrier &&
+             !incVec(p_work_dim, p_dummy_context.local_id, p_max_local_id));
+
+    // If no barrier() call was made, all is fine. If not, only the first
+    // work-item has currently finished. We must let the others run.
+    if (p_had_barrier)
+    {
+        Context *main_context = p_current_context; // After the first swapcontext,
+                                                   // we will not be able to trust
+                                                   // p_current_context anymore.
+
+        // We'll call swapcontext for each remaining work-item. They will
+        // finish, and when they'll do so, this main context will be resumed, so
+        // it's easy (i starts from 1 because the main context already finished)
+        for (unsigned int i=1; i<p_num_work_items; ++i)
+        {
+            Context *ctx = getContextAddr(i);
+            swapcontext(&main_context->context, &ctx->context);
+        }
+    }
+
+    // Free the allocated locals. The argument block is only freed here when
+    // the kernel has locals: otherwise callArgs() cached it in the event.
+    if (p_kernel->kernel()->hasLocals())
+    {
+        for (size_t i=0; i<locals_to_free.size(); ++i)
+        {
+            std::free(locals_to_free[i]);
+        }
+
+        std::free(p_args);
+    }
+
+    return true;
+}
+
+// Address of the index-th Context stored in p_contexts. Consecutive entries
+// are p_stack_size + sizeof(Context) bytes apart.
+CPUKernelWorkGroup::Context *CPUKernelWorkGroup::getContextAddr(unsigned int index)
+{
+    size_t byte_offset = p_stack_size + sizeof(Context);
+    byte_offset *= index;
+
+    char *base = (char *)p_contexts;
+
+    return (Context *)(base + byte_offset);
+}
diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h
new file mode 100644
index 0000000..ab4d1ac
--- /dev/null
+++ b/src/core/cpu/kernel.h
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/kernel.h
+ * \brief CPU kernel
+ */
+
+#ifndef __CPU_KERNEL_H__
+#define __CPU_KERNEL_H__
+
+#include "../deviceinterface.h"
+#include <core/config.h>
+
+#include <llvm/ExecutionEngine/GenericValue.h>
+#include <vector>
+#include <string>
+
+#include <ucontext.h>
+#include <pthread.h>
+#include <stdint.h>
+
+namespace llvm
+{
+ class Function;
+}
+
+namespace Coal
+{
+
+class CPUDevice;
+class Kernel;
+class KernelEvent;
+class Image2D;
+class Image3D;
+
+/**
+ * \brief CPU kernel
+ *
+ * This class holds passive information about a kernel (\c Coal::Kernel object
+ * and device on which it is run) and provides the \c callFunction() function.
+ *
+ * This function is described at the end of \ref llvm .
+ *
+ * \see Coal::CPUKernelWorkGroup
+ */
+class CPUKernel : public DeviceKernel
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param device device on which the kernel will be run
+ * \param kernel \c Coal::Kernel object holding information about this
+ * kernel
+ * \param function \c llvm::Function to run
+ */
+ CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function);
+ ~CPUKernel();
+
+ size_t workGroupSize(); /*!< \brief CL_DEVICE_MAX_WORK_GROUP_SIZE of the device */
+ cl_ulong localMemSize() const; /*!< \brief Local memory used by the kernel (currently always 0) */
+ cl_ulong privateMemSize() const; /*!< \brief Private memory used by the kernel (currently always 0) */
+ size_t preferredWorkGroupSizeMultiple() const; /*!< \brief Number of CPUs of the device */
+ /**
+ * \brief Guess a local work size for one dimension
+ * \param num_dims number of dimensions of the NDRange
+ * \param dim dimension considered
+ * \param global_work_size global work size in this dimension
+ * \return local work size to use in this dimension
+ */
+ size_t guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
+ size_t global_work_size) const;
+
+ Kernel *kernel() const; /*!< \brief \c Coal::Kernel object this kernel will run */
+ CPUDevice *device() const; /*!< \brief device on which the kernel will be run */
+
+ llvm::Function *function() const; /*!< \brief \c llvm::Function representing the kernel but <strong>not to be run</strong> */
+ llvm::Function *callFunction(); /*!< \brief stub function used to run the kernel, see \ref llvm */
+
+ /**
+ * \brief Calculate where to place a value in an array
+ *
+ * This function is used to calculate where to place a value in an
+ * array given its size, properly aligning it.
+ *
+ * This function is called repeatedly to obtain the aligned position of
+ * each value that must be placed in the array.
+ *
+ * \code
+ * size_t array_len = 0, array_offset = 0;
+ * void *array;
+ *
+ * // First, get the array size given alignment constraints
+ * typeOffset(array_len, sizeof(int));
+ * typeOffset(array_len, sizeof(float));
+ * typeOffset(array_len, sizeof(void *));
+ *
+ * // Then, allocate memory
+ * array = malloc(array_len);
+ *
+ * // Finally, place the arguments, passing the same sizes in the same order
+ * *(int *)((char *)array + typeOffset(array_offset, sizeof(int))) = 1337;
+ * *(float *)((char *)array + typeOffset(array_offset, sizeof(float))) = 3.1415f;
+ * *(void **)((char *)array + typeOffset(array_offset, sizeof(void *))) = array;
+ * \endcode
+ *
+ * \param offset first free position in the array. This variable is
+ * updated to point just after the value
+ * (<tt>type_len + padding</tt> further).
+ * \param type_len size in bytes of the value that will be stored
+ * \return aligned offset at which the value will be stored
+ */
+ static size_t typeOffset(size_t &offset, size_t type_len);
+
+ private:
+ CPUDevice *p_device;
+ Kernel *p_kernel;
+ // p_function is the kernel itself, p_call_function the generated stub
+ llvm::Function *p_function, *p_call_function;
+ // Locked by callFunction() while it reads or builds p_call_function
+ pthread_mutex_t p_call_function_mutex;
+};
+
+class CPUKernelEvent;
+
+/**
+ * \brief CPU kernel work-group
+ *
+ * This class represents a group of work-items that will be run. It is the
+ * one that actually runs the kernel for each of its work-items.
+ *
+ * \see \ref llvm
+ * \nosubgrouping
+ */
+class CPUKernelWorkGroup
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param kernel kernel to run
+ * \param event event containing information about the kernel run
+ * \param cpu_event CPU-specific information and cache about \p event
+ * \param work_group_index index of this work-group in the kernel
+ */
+ CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event,
+ CPUKernelEvent *cpu_event,
+ const size_t *work_group_index);
+ ~CPUKernelWorkGroup();
+
+ /**
+ * \brief Build a structure of arguments
+ *
+ * As C doesn't support calling functions with variable arguments
+ * unknown at the compilation, this function builds the list of
+ * arguments in memory. This array will then be passed to a LLVM stub
+ * function reading it and passing its values to the actual kernel.
+ *
+ * \see \ref llvm
+ * \param locals_to_free if this kernel takes \c __local arguments, they
+ * must be \c malloc()'ed for every work-group.
+ * They are placed in this vector to be
+ * \c free()'ed at the end of \c run().
+ * \return address of a memory location containing the arguments
+ */
+ void *callArgs(std::vector<void *> &locals_to_free);
+
+ /**
+ * \brief Run the work-group
+ *
+ * This function is the core of CPU-acceleration. It runs the work-items
+ * of this work-group given the correct arguments.
+ *
+ * \see \ref llvm
+ * \see \ref barrier
+ * \see callArgs()
+ * \return true if success, false in case of an error
+ */
+ bool run();
+
+ /**
+ * \name Native implementation of built-in OpenCL C functions
+ * @{
+ */
+ size_t getGlobalId(cl_uint dimindx) const;
+ cl_uint getWorkDim() const;
+ size_t getGlobalSize(cl_uint dimindx) const;
+ size_t getLocalSize(cl_uint dimindx) const;
+ size_t getLocalID(cl_uint dimindx) const;
+ size_t getNumGroups(cl_uint dimindx) const;
+ size_t getGroupID(cl_uint dimindx) const;
+ size_t getGlobalOffset(cl_uint dimindx) const;
+
+ void barrier(unsigned int flags);
+
+ void *getImageData(Image2D *image, int x, int y, int z) const;
+
+ void writeImage(Image2D *image, int x, int y, int z, float *color) const;
+ void writeImage(Image2D *image, int x, int y, int z, int32_t *color) const;
+ void writeImage(Image2D *image, int x, int y, int z, uint32_t *color) const;
+
+ void readImage(float *result, Image2D *image, int x, int y, int z,
+ uint32_t sampler) const;
+ void readImage(int32_t *result, Image2D *image, int x, int y, int z,
+ uint32_t sampler) const;
+ void readImage(uint32_t *result, Image2D *image, int x, int y, int z,
+ uint32_t sampler) const;
+
+ void readImage(float *result, Image2D *image, float x, float y, float z,
+ uint32_t sampler) const;
+ void readImage(int32_t *result, Image2D *image, float x, float y, float z,
+ uint32_t sampler) const;
+ void readImage(uint32_t *result, Image2D *image, float x, float y, float z,
+ uint32_t sampler) const;
+ /**
+ * @}
+ */
+
+ /**
+ * \brief Function called when a built-in name cannot be found
+ */
+ void builtinNotFound(const std::string &name) const;
+
+ private:
+ // Type-generic helpers declared for the image built-ins above
+ // (their definitions are not part of kernel.cpp)
+ template<typename T>
+ void writeImageImpl(Image2D *image, int x, int y, int z, T *color) const;
+ template<typename T>
+ void readImageImplI(T *result, Image2D *image, int x, int y, int z,
+ uint32_t sampler) const;
+ template<typename T>
+ void readImageImplF(T *result, Image2D *image, float x, float y, float z,
+ uint32_t sampler) const;
+ template<typename T>
+ void linear3D(T *result, float a, float b, float c,
+ int i0, int j0, int k0, int i1, int j1, int k1,
+ Image3D *image) const;
+ template<typename T>
+ void linear2D(T *result, float a, float b, float c, int i0, int j0,
+ int i1, int j1, Image2D *image) const;
+
+ private:
+ CPUKernel *p_kernel;
+ CPUKernelEvent *p_cpu_event;
+ KernelEvent *p_event;
+ cl_uint p_work_dim;
+ // Per dimension: index of this work-group, highest local id (local size
+ // minus one) and global id of the first work-item of the group
+ size_t p_index[MAX_WORK_DIMS],
+ p_max_local_id[MAX_WORK_DIMS],
+ p_global_id_start_offset[MAX_WORK_DIMS];
+
+ // Address of the stub and argument block passed to it, both set by run()
+ void (*p_kernel_func_addr)(void *);
+ void *p_args;
+
+ // Machinery to have barrier() working
+ struct Context
+ {
+ size_t local_id[MAX_WORK_DIMS];
+ ucontext_t context;
+ unsigned int initialized;
+ };
+
+ // Context number \p index in p_contexts; entries are
+ // p_stack_size + sizeof(Context) bytes apart
+ Context *getContextAddr(unsigned int index);
+
+ Context *p_current_context;
+ Context p_dummy_context; // context used by run() as long as barrier() was not called
+ void *p_contexts; // initialized to 0 by the constructor; not allocated in kernel.cpp
+ size_t p_stack_size; // 8192 for now (TODO in the constructor)
+ unsigned int p_num_work_items, p_current_work_item;
+ bool p_had_barrier;
+};
+
+/**
+ * \brief CPU-specific information about a kernel event
+ *
+ * This class, put in a \c Coal::KernelEvent device-data field
+ * (see \c Coal::Event::setDeviceData()), is responsible for dispatching the
+ * \c Coal::CPUKernelWorkGroup objects between the CPU worker threads.
+ */
+class CPUKernelEvent
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param device device running the kernel
+ * \param event \c Coal::KernelEvent holding device-agnostic data
+ * about the event
+ */
+ CPUKernelEvent(CPUDevice *device, KernelEvent *event);
+ ~CPUKernelEvent();
+
+ bool reserve(); /*!< \brief Locks the event. Returns true if the next work-group to be taken is the last one */
+ bool finished(); /*!< \brief All the work groups have finished */
+ CPUKernelWorkGroup *takeInstance(); /*!< \brief Must be called exactly one time after reserve(). Unlocks the event */
+
+ void *kernelArgs() const; /*!< \brief Return the cached kernel arguments */
+ void cacheKernelArgs(void *args); /*!< \brief Cache pre-built kernel arguments, freed with \c free() by the destructor */
+
+ void workGroupFinished(); /*!< \brief A work-group has just finished */
+
+ private:
+ CPUDevice *p_device;
+ KernelEvent *p_event;
+ // Index of the next work-group to hand out, and highest index per dimension
+ size_t p_current_work_group[MAX_WORK_DIMS],
+ p_max_work_groups[MAX_WORK_DIMS];
+ // Work-groups handed out so far, work-groups finished, total to run
+ size_t p_current_wg, p_finished_wg, p_num_wg;
+ pthread_mutex_t p_mutex;
+ void *p_kernel_args;
+};
+
+}
+
+#endif
diff --git a/src/core/cpu/program.cpp b/src/core/cpu/program.cpp
new file mode 100644
index 0000000..7eb632c
--- /dev/null
+++ b/src/core/cpu/program.cpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/program.cpp
+ * \brief CPU program
+ */
+
+#include "program.h"
+#include "device.h"
+#include "kernel.h"
+#include "builtins.h"
+
+#include "../program.h"
+
+#include <llvm/PassManager.h>
+#include <llvm/Analysis/Passes.h>
+#include <llvm/IR/Verifier.h>
+#include <llvm/Transforms/Scalar.h>
+#include <llvm/Transforms/IPO.h>
+#include <llvm/ExecutionEngine/ExecutionEngine.h>
+#include <llvm/ExecutionEngine/MCJIT.h>
+#include <llvm/ExecutionEngine/SectionMemoryManager.h>
+#include <llvm/ExecutionEngine/Interpreter.h>
+#include <llvm/Support/ErrorHandling.h>
+
+#include <string>
+#include <iostream>
+
+using namespace Coal;
+using namespace llvm;
+
+ // Memory manager handed to MCJIT. It only customises symbol lookup, see
+ // getSymbolAddress() below.
+ class ClientMemoryManager : public SectionMemoryManager
+ {
+ public:
+     ClientMemoryManager() {}
+     virtual ~ClientMemoryManager() {}
+
+     /// Returns the (host) address of the symbol called \p Name.
+     virtual uint64_t getSymbolAddress(const std::string &Name);
+
+ private:
+     // Not copyable.
+     ClientMemoryManager(const ClientMemoryManager&) LLVM_DELETED_FUNCTION;
+     void operator=(const ClientMemoryManager&) LLVM_DELETED_FUNCTION;
+ };
+
+ /**
+  * \brief Resolve a symbol referenced by JIT-compiled code
+  *
+  * The lookup of the base class is tried first; when it returns 0 the name
+  * is looked up with getBuiltin(). If both fail the process is terminated
+  * through llvm::report_fatal_error().
+  *
+  * \param Name name of the symbol to resolve
+  * \return non-zero host address of the symbol
+  */
+ uint64_t ClientMemoryManager::getSymbolAddress(const std::string &Name)
+ {
+     uint64_t addr = RTDyldMemoryManager::getSymbolAddress(Name);
+
+     if (!addr)
+     {
+         // Go through uintptr_t: converting a 32-bit pointer directly to a
+         // 64-bit integer is implementation-defined and sign-extends with
+         // GCC, which would corrupt addresses above 2 GiB on 32-bit hosts.
+         addr = (uint64_t)(uintptr_t)getBuiltin(Name);
+     }
+
+     if (!addr)
+         report_fatal_error("OpenCL program references external function '" + Name +
+                            "' which could not be resolved!");
+
+     return addr;
+ }
+
+ /**
+  * \brief Constructor
+  *
+  * No JIT and no module exist yet: both pointers start out null so that
+  * initJIT() can reliably detect a program that was never built.
+  *
+  * \param device CPU device to which this program is attached
+  * \param program \c Coal::Program that will be run
+  */
+ CPUProgram::CPUProgram(CPUDevice *device, Program *program)
+ : DeviceProgram(), p_device(device), p_program(program), p_jit(0),
+   p_module(0)
+ {
+ }
+
+ /**
+  * \brief Destructor
+  *
+  * Destroys the execution engine, if one was created, after detaching the
+  * module from it.
+  */
+ CPUProgram::~CPUProgram()
+ {
+     // Nothing to release when initJIT() never produced an engine
+     if (!p_jit)
+         return;
+
+     // Detach the module first so that deleting the engine does not
+     // delete the module too
+     p_jit->removeModule(p_module);
+     delete p_jit;
+ }
+
+ /**
+  * \brief Whether the standard library has to be linked into the program
+  * \return always true for the CPU device
+  */
+ bool CPUProgram::linkStdLib() const
+ {
+     const bool link_needed = true;
+
+     return link_needed;
+ }
+
+ /**
+  * \brief Populate a pass manager with the CPU optimization pipeline
+  *
+  * \param manager pass manager receiving the passes
+  * \param optimize when false, no pass is added at all
+  * \param hasBarrier not used by this implementation
+  */
+ void CPUProgram::createOptimizationPasses(llvm::PassManager *manager,
+                                           bool optimize, bool hasBarrier)
+ {
+     if (!optimize)
+         return;
+
+     llvm::PassManager &passes = *manager;
+
+     /*
+      * Pass list inspired by code from "The LLVM Compiler Infrastructure".
+      * The order in which the passes are added is significant.
+      */
+     passes.add(llvm::createDeadArgEliminationPass());
+     passes.add(llvm::createInstructionCombiningPass());
+     passes.add(llvm::createFunctionInliningPass());
+     passes.add(llvm::createPruneEHPass());              // Remove dead EH info.
+     passes.add(llvm::createGlobalOptimizerPass());
+     passes.add(llvm::createGlobalDCEPass());            // Remove dead functions.
+     passes.add(llvm::createArgumentPromotionPass());
+     passes.add(llvm::createInstructionCombiningPass());
+     passes.add(llvm::createJumpThreadingPass());
+     passes.add(llvm::createScalarReplAggregatesPass());
+     passes.add(llvm::createFunctionAttrsPass());        // Add nocapture.
+     passes.add(llvm::createGlobalsModRefPass());        // IP alias analysis.
+     passes.add(llvm::createLICMPass());                 // Hoist loop invariants.
+     passes.add(llvm::createGVNPass());                  // Remove redundancies.
+     passes.add(llvm::createMemCpyOptPass());            // Remove dead memcpys.
+     passes.add(llvm::createDeadStoreEliminationPass());
+     passes.add(llvm::createInstructionCombiningPass());
+     passes.add(llvm::createJumpThreadingPass());
+     passes.add(llvm::createCFGSimplificationPass());
+ }
+
+ /**
+  * \brief "Build" the program for the CPU device
+  *
+  * There is no device-specific build step: the module is only remembered
+  * so that initJIT() can use it later.
+  *
+  * \param module LLVM module of the program
+  * \param binary_str not used by this implementation
+  * \return always true
+  */
+ bool CPUProgram::build(llvm::Module *module, std::string *binary_str)
+ {
+     (void)binary_str;
+
+     p_module = module;
+     return true;
+ }
+
+ /**
+  * \brief Create the MCJIT execution engine of this program, once
+  *
+  * \return true when an engine exists on return (already present or just
+  *         created), false when there is no module or creation failed
+  */
+ bool CPUProgram::initJIT()
+ {
+     // An engine created by an earlier call is reused
+     if (p_jit)
+         return true;
+
+     // Without a module there is nothing to run
+     if (!p_module)
+         return false;
+
+     std::string error_text;
+     llvm::EngineBuilder builder(p_module);
+
+     builder.setErrorStr(&error_text);
+     builder.setUseMCJIT(true);
+     builder.setMCJITMemoryManager(new ClientMemoryManager());
+
+     p_jit = builder.create();
+
+     if (p_jit)
+         return true;
+
+     std::cout << "Unable to create a JIT: " << error_text << std::endl;
+     return false;
+ }
+
+ /**
+  * \brief Current LLVM execution engine
+  * \return the stored engine pointer, null until initJIT() has succeeded
+  */
+ llvm::ExecutionEngine *CPUProgram::jit() const
+ {
+     llvm::ExecutionEngine *engine = p_jit;
+
+     return engine;
+ }
diff --git a/src/core/cpu/program.h b/src/core/cpu/program.h
new file mode 100644
index 0000000..0a08d61
--- /dev/null
+++ b/src/core/cpu/program.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/program.h
+ * \brief CPU program
+ */
+
+#ifndef __CPU_PROGRAM_H__
+#define __CPU_PROGRAM_H__
+
+#include "../deviceinterface.h"
+
+namespace llvm
+{
+ class ExecutionEngine;
+ class Module;
+}
+
+namespace Coal
+{
+
+class CPUDevice;
+class Program;
+
+/**
+ * \brief CPU program
+ *
+ * This class implements the \c Coal::DeviceProgram interface for CPU
+ * acceleration.
+ *
+ * Its main purpose is to initialize a \c llvm::JIT object to run LLVM bitcode,
+ * in \c initJIT().
+ */
+class CPUProgram : public DeviceProgram
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param device CPU device to which this program is attached
+ * \param program \c Coal::Program that will be run
+ */
+ CPUProgram(CPUDevice *device, Program *program);
+ ~CPUProgram();
+
+ // NOTE(review): llvm::PassManager and std::string are used below but are
+ // neither forward-declared nor included by this header; presumably they
+ // come in through ../deviceinterface.h - confirm.
+ bool linkStdLib() const;
+ void createOptimizationPasses(llvm::PassManager *manager,
+ bool optimize, bool hasBarrier=false);
+ bool build(llvm::Module *module, std::string *binary_str);
+
+ /**
+ * \brief Initialize an LLVM JIT
+ *
+ * This function creates a \c llvm::JIT object to run this program on
+ * the CPU. A few implementation details :
+ *
+ * - The JIT is set not to resolve unknown symbols using \c dlsym().
+ * This way, a malicious kernel cannot execute arbitrary code on
+ * the host by declaring \c libc functions and calling them.
+ * - All the unknown function names are passed to \c getBuiltin() to
+ * get native built-in implementations.
+ *
+ * NOTE(review): the memory manager in program.cpp asks
+ * \c RTDyldMemoryManager::getSymbolAddress() first and only then falls
+ * back to \c getBuiltin(); the first point above may therefore no
+ * longer hold - confirm against the LLVM version in use.
+ *
+ * \return true if success, false otherwise
+ */
+ bool initJIT();
+ llvm::ExecutionEngine *jit() const; /*!< \brief Current LLVM execution engine */
+
+ private:
+ CPUDevice *p_device; // device given to the constructor
+ Program *p_program; // program given to the constructor
+
+ llvm::ExecutionEngine *p_jit; // engine created by initJIT()
+ llvm::Module *p_module; // module recorded by build()
+};
+
+}
+
+#endif
diff --git a/src/core/cpu/sampler.cpp b/src/core/cpu/sampler.cpp
new file mode 100644
index 0000000..893e66e
--- /dev/null
+++ b/src/core/cpu/sampler.cpp
@@ -0,0 +1,769 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/sampler.cpp
+ * \brief OpenCL C image access functions
+ *
+ * It is recommended to compile this file using Clang as it supports the
+ * \c __builtin_shufflevector() built-in function, providing SSE or
+ * NEON-accelerated code.
+ */
+
+#include "../memobject.h"
+#include "../sampler.h"
+#include "kernel.h"
+#include "buffer.h"
+#include "builtins.h"
+
+#include <cstdlib>
+#include <cmath>
+// ASW #include <immintrin.h>
+
+using namespace Coal;
+
+/*
+ * Helper functions
+ */
+
+ /* Limit a to the range [b, c]; the lower bound is tested first. */
+ static int clamp(int a, int b, int c)
+ {
+     if (a < b)
+         return b;
+
+     if (a > c)
+         return c;
+
+     return a;
+ }
+
+ /* Smaller of two integers (b when they are equal). */
+ static int min(int a, int b)
+ {
+     if (a < b)
+         return a;
+
+     return b;
+ }
+
+ /* Larger of two integers (b when they are equal). */
+ static int max(int a, int b)
+ {
+     if (a > b)
+         return a;
+
+     return b;
+ }
+
+ /* Fractional part of x measured from floor(x), so never negative. */
+ static float frac(float x)
+ {
+     const float whole = std::floor(x);
+
+     return x - whole;
+ }
+
+ /*
+  * Round x to the nearest integer value.
+  *
+  * The mirrored-repeat addressing code computes fabs(s - 2 * round(0.5 * s))
+  * and needs rounding to nearest; truncating toward zero yields coordinates
+  * outside [0, 1] as soon as frac(0.5 * s) exceeds 0.5. Exact ties are
+  * rounded up, which gives the same fabs() result as round-half-to-even in
+  * that formula.
+  */
+ static float round(float x)
+ {
+     return std::floor(x + 0.5f);
+ }
+
+ /*
+  * Apply the addressing mode of a sampler to integer pixel coordinates.
+  *
+  * - CLK_ADDRESS_CLAMP_TO_EDGE moves the coordinates onto the nearest valid
+  *   pixel.
+  * - CLK_ADDRESS_CLAMP collapses every out-of-range coordinate onto -1 or
+  *   the image dimension, both of which select the border color.
+  * - Other modes leave the coordinates untouched.
+  *
+  * x, y and z are updated in place (z only for 3D images).
+  *
+  * Returns true when the resulting location lies outside the image, in
+  * which case the caller must use the border color instead of reading
+  * pixel data. For 2D images only z == 1 is treated as outside, because the
+  * z values callers pass for 2D reads are not visible from here.
+  */
+ static bool handle_address_mode(Image2D *image, int &x, int &y, int &z,
+                                 uint32_t sampler)
+ {
+     bool is_3d = (image->type() == MemObject::Image3D);
+     int w = image->width(),
+         h = image->height(),
+         d = (is_3d ? ((Image3D *)image)->depth() : 1);
+     uint32_t mode = sampler & 0xf0;
+
+     if (mode == CLK_ADDRESS_CLAMP_TO_EDGE)
+     {
+         x = clamp(x, 0, w - 1);
+         y = clamp(y, 0, h - 1);
+         if (is_3d) z = clamp(z, 0, d - 1);
+     }
+     else if (mode == CLK_ADDRESS_CLAMP)
+     {
+         // Negative coordinates must reach the border as well, so the
+         // lower bound is -1 and not 0
+         x = clamp(x, -1, w);
+         y = clamp(y, -1, h);
+         if (is_3d) z = clamp(z, -1, d);
+     }
+
+     bool z_outside = (is_3d ? (z < 0 || z >= d) : (z == d));
+
+     return (x < 0 || x >= w || y < 0 || y >= h || z_outside);
+ }
+
+/*
+ * Macros or functions used to accelerate the functions
+ */
+#ifndef __has_builtin
+ #define __has_builtin(x) 0
+#endif
+
+ /*
+  * Portable 4-lane shuffle: every selector in 0..3 picks an element of a,
+  * every selector in 4..7 picks element (selector - 4) of b. The lanes of
+  * rs are written in order 0 to 3.
+  */
+ static void slow_shuffle4(uint32_t *rs, uint32_t *a, uint32_t *b,
+                           int x, int y, int z, int w)
+ {
+     const int selectors[4] = { x, y, z, w };
+
+     for (int lane = 0; lane < 4; ++lane)
+     {
+         const int sel = selectors[lane];
+
+         if (sel < 4)
+             rs[lane] = a[sel];
+         else
+             rs[lane] = b[sel - 4];
+     }
+ }
+
+ /*
+  * Store `channels` float components into an image pixel.
+  *
+  * dest     pixel storage inside the image
+  * data     components to store, already in storage order
+  * type     channel data type of the image
+  * channels number of components to store
+  *
+  * CL_FLOAT data is copied as is; normalized types are scaled by the
+  * largest representable value. Other channel types are ignored.
+  * Values are neither rounded nor saturated here.
+  */
+ static void convert_to_format(void *dest, float *data,
+                               cl_channel_type type, unsigned int channels)
+ {
+     if (type == CL_FLOAT)
+     {
+         std::memcpy(dest, data, channels * sizeof(float));
+         return;
+     }
+
+     for (unsigned int i=0; i<channels; ++i)
+     {
+         switch (type)
+         {
+             case CL_SNORM_INT8:
+                 // 127, not 128: 1.0f must map to the largest int8_t value
+                 // instead of overflowing it
+                 ((int8_t *)dest)[i] = data[i] * 127.0f;
+                 break;
+             case CL_SNORM_INT16:
+                 ((int16_t *)dest)[i] = data[i] * 32767.0f;
+                 break;
+             case CL_UNORM_INT8:
+                 ((uint8_t *)dest)[i] = data[i] * 255.0f;
+                 break;
+             case CL_UNORM_INT16:
+                 ((uint16_t *)dest)[i] = data[i] * 65535.0f;
+                 break;
+             default:
+                 break;
+         }
+     }
+ }
+
+ /*
+  * Load `channels` components of an image pixel as floats.
+  *
+  * data     receives the components, in storage order
+  * source   pixel storage inside the image
+  * type     channel data type of the image
+  * channels number of components to load
+  *
+  * CL_FLOAT data is copied as is. Normalized types are divided by the
+  * largest representable value of their storage type; signed values are
+  * limited to -1.0 because the most negative integer would otherwise map
+  * slightly below it. Other channel types are ignored.
+  */
+ static void convert_from_format(float *data, void *source,
+                                 cl_channel_type type, unsigned int channels)
+ {
+     if (type == CL_FLOAT)
+     {
+         std::memcpy(data, source, channels * sizeof(float));
+         return;
+     }
+
+     for (unsigned int i=0; i<channels; ++i)
+     {
+         switch (type)
+         {
+             case CL_SNORM_INT8:
+                 data[i] = (float)((int8_t *)source)[i] / 127.0f;
+                 if (data[i] < -1.0f) data[i] = -1.0f;
+                 break;
+             case CL_SNORM_INT16:
+                 data[i] = (float)((int16_t *)source)[i] / 32767.0f;
+                 if (data[i] < -1.0f) data[i] = -1.0f;
+                 break;
+             case CL_UNORM_INT8:
+                 // Inverse of the * 255.0f used when writing
+                 data[i] = (float)((uint8_t *)source)[i] / 255.0f;
+                 break;
+             case CL_UNORM_INT16:
+                 // Inverse of the * 65535.0f used when writing
+                 data[i] = (float)((uint16_t *)source)[i] / 65535.0f;
+                 break;
+             default:
+                 break;
+         }
+     }
+ }
+
+ /*
+  * Store `channels` signed integer components into an image pixel.
+  * 32-bit data is copied as is, 8 and 16-bit data is narrowed component by
+  * component; any other channel type leaves dest untouched.
+  */
+ static void convert_to_format(void *dest, int *data,
+                               cl_channel_type type, unsigned int channels)
+ {
+     if (type == CL_SIGNED_INT32)
+     {
+         std::memcpy(dest, data, channels * sizeof(int32_t));
+     }
+     else if (type == CL_SIGNED_INT8)
+     {
+         int8_t *out = (int8_t *)dest;
+
+         for (unsigned int c = 0; c < channels; ++c)
+             out[c] = data[c];
+     }
+     else if (type == CL_SIGNED_INT16)
+     {
+         int16_t *out = (int16_t *)dest;
+
+         for (unsigned int c = 0; c < channels; ++c)
+             out[c] = data[c];
+     }
+ }
+
+ /*
+  * Load `channels` signed integer components of an image pixel.
+  * 32-bit data is copied as is, 8 and 16-bit data is widened component by
+  * component; any other channel type leaves data untouched.
+  */
+ static void convert_from_format(int32_t *data, void *source,
+                                 cl_channel_type type, unsigned int channels)
+ {
+     if (type == CL_SIGNED_INT32)
+     {
+         std::memcpy(data, source, channels * sizeof(int32_t));
+     }
+     else if (type == CL_SIGNED_INT8)
+     {
+         const int8_t *in = (int8_t *)source;
+
+         for (unsigned int c = 0; c < channels; ++c)
+             data[c] = in[c];
+     }
+     else if (type == CL_SIGNED_INT16)
+     {
+         const int16_t *in = (int16_t *)source;
+
+         for (unsigned int c = 0; c < channels; ++c)
+             data[c] = in[c];
+     }
+ }
+
+ /*
+  * Store `channels` unsigned integer components into an image pixel.
+  *
+  * dest     pixel storage inside the image
+  * data     components to store, already in storage order
+  * type     channel data type of the image
+  * channels number of components to store
+  *
+  * 32-bit data is copied as is, 8 and 16-bit data is narrowed component by
+  * component. Other channel types are ignored.
+  */
+ static void convert_to_format(void *dest, uint32_t *data,
+                               cl_channel_type type, unsigned int channels)
+ {
+     if (type == CL_UNSIGNED_INT32)
+     {
+         std::memcpy(dest, data, channels * sizeof(uint32_t));
+         return;
+     }
+
+     // Exactly `channels` components: a fixed count would drop the fourth
+     // component of 4-channel pixels and write past the pixel of images
+     // with fewer than three channels
+     for (unsigned int i=0; i<channels; ++i)
+     {
+         switch (type)
+         {
+             case CL_UNSIGNED_INT8:
+                 ((uint8_t *)dest)[i] = data[i];
+                 break;
+             case CL_UNSIGNED_INT16:
+                 ((uint16_t *)dest)[i] = data[i];
+                 break;
+             default:
+                 break;
+         }
+     }
+ }
+
+ /*
+  * Load `channels` unsigned integer components of an image pixel.
+  * 32-bit data is copied as is, 8 and 16-bit data is widened component by
+  * component; any other channel type leaves data untouched.
+  */
+ static void convert_from_format(uint32_t *data, void *source,
+                                 cl_channel_type type, unsigned int channels)
+ {
+     if (type == CL_UNSIGNED_INT32)
+     {
+         std::memcpy(data, source, channels * sizeof(uint32_t));
+     }
+     else if (type == CL_UNSIGNED_INT8)
+     {
+         const uint8_t *in = (uint8_t *)source;
+
+         for (unsigned int c = 0; c < channels; ++c)
+             data[c] = in[c];
+     }
+     else if (type == CL_UNSIGNED_INT16)
+     {
+         const uint16_t *in = (uint16_t *)source;
+
+         for (unsigned int c = 0; c < channels; ++c)
+             data[c] = in[c];
+     }
+ }
+
+ /* Multiply, in place, each of the four components of vec by val. */
+ template<typename T>
+ static void vec4_scalar_mul(T *vec, float val)
+ {
+     T *const end = vec + 4;
+
+     for (T *component = vec; component != end; ++component)
+         *component *= val;
+ }
+
+ /* Add the four components of vec2 to those of vec1, in place. */
+ template<typename T>
+ static void vec4_add(T *vec1, T *vec2)
+ {
+     unsigned int lane = 0;
+
+     while (lane < 4)
+     {
+         vec1[lane] += vec2[lane];
+         ++lane;
+     }
+ }
+
+ /*
+  * Trilinear blend of the eight pixels surrounding a sample point.
+  *
+  * result      receives the blended pixel (four components)
+  * a, b, c     weights of the "1" side along x, y and z
+  * i0..k1      integer coordinates of the two opposite corners
+  * image       image to read from
+  *
+  * Pixels are read with a sampler value of 0.
+  */
+ template<typename T>
+ void CPUKernelWorkGroup::linear3D(T *result, float a, float b, float c,
+                                   int i0, int j0, int k0, int i1, int j1, int k1,
+                                   Image3D *image) const
+ {
+     T sample[4];
+
+     // Bit 0 of the corner number selects the x side, bit 1 the y side and
+     // bit 2 the z side; corner 0 is read straight into result
+     for (unsigned int corner = 0; corner < 8; ++corner)
+     {
+         const bool far_x = (corner & 1) != 0;
+         const bool far_y = (corner & 2) != 0;
+         const bool far_z = (corner & 4) != 0;
+
+         const float weight = (far_x ? a : (1.0f - a)) *
+                              (far_y ? b : (1.0f - b)) *
+                              (far_z ? c : (1.0f - c));
+
+         T *pixel = (corner == 0 ? result : sample);
+
+         readImageImplI<T>(pixel, image,
+                           far_x ? i1 : i0,
+                           far_y ? j1 : j0,
+                           far_z ? k1 : k0,
+                           0);
+         vec4_scalar_mul(pixel, weight);
+
+         if (corner != 0)
+             vec4_add(result, sample);
+     }
+ }
+
+ /*
+  * Bilinear blend of the four pixels surrounding a sample point.
+  *
+  * result      receives the blended pixel (four components)
+  * a, b        weights of the "1" side along x and y
+  * c           not used
+  * i0..j1      integer coordinates of the two opposite corners
+  * image       image to read from
+  *
+  * Pixels are read with z = 0 and a sampler value of 0.
+  */
+ template<typename T>
+ void CPUKernelWorkGroup::linear2D(T *result, float a, float b, float c, int i0, int j0,
+                                   int i1, int j1, Image2D *image) const
+ {
+     T sample[4];
+
+     // Bit 0 of the corner number selects the x side, bit 1 the y side;
+     // corner 0 is read straight into result
+     for (unsigned int corner = 0; corner < 4; ++corner)
+     {
+         const bool far_x = (corner & 1) != 0;
+         const bool far_y = (corner & 2) != 0;
+
+         const float weight = (far_x ? a : (1.0f - a)) *
+                              (far_y ? b : (1.0f - b));
+
+         T *pixel = (corner == 0 ? result : sample);
+
+         readImageImplI<T>(pixel, image,
+                           far_x ? i1 : i0,
+                           far_y ? j1 : j0,
+                           0, 0);
+         vec4_scalar_mul(pixel, weight);
+
+         if (corner != 0)
+             vec4_add(result, sample);
+     }
+ }
+
+// shuffle4(rs, a, b, x, y, z, w): build the four lanes of rs from the eight
+// lanes of a (selectors 0-3) and b (selectors 4-7). The compiler built-in is
+// used when the compiler reports it, slow_shuffle4() otherwise.
+// NOTE(review): the built-in variant relies on the __v4sf type, but the
+// <immintrin.h> include of this file is commented out - confirm that the
+// type is declared when this branch is selected.
+// NOTE(review): the built-in variant accesses rs, a and b as whole vectors;
+// the callers pass plain 4-element arrays, presumably without any
+// vector-alignment guarantee - verify.
+#if __has_builtin(__builtin_shufflevector)
+ #define shuffle4(rs, a, b, x, y, z, w) \
+ *(__v4sf *)rs = __builtin_shufflevector(*(__v4sf *)a, *(__v4sf *)b, \
+ x, y, z, w)
+#else
+ #define shuffle4(rs, a, b, x, y, z, w) \
+ slow_shuffle4(rs, a, b, x, y, z, w)
+#endif
+
+ /*
+  * Reorder the components of a pixel between storage order and the
+  * {x, y, z, w} order seen by kernels.
+  *
+  * target   receives four 32-bit components
+  * source   four 32-bit components to reorder
+  * order    channel order of the image
+  * reading  true: storage order -> kernel order (missing components are
+  *          filled with 0, or t_max for w where the read cases say so);
+  *          false: kernel order -> storage order (components that are not
+  *          stored are left with unspecified values)
+  * t_max    bit pattern placed in w when reading an order without alpha
+  *
+  * Components are handled as raw 32-bit words, so the same code serves
+  * float, int and unsigned int pixels. Channel orders without a dedicated
+  * case are copied unchanged in both directions, so target is always
+  * fully written.
+  */
+ static void swizzle(uint32_t *target, uint32_t *source,
+                     cl_channel_order order, bool reading, uint32_t t_max)
+ {
+     // Selector 4 picks special[0] (zero), selector 5 picks special[1] (t_max)
+     uint32_t special[4] = {0, t_max, 0, 0 };
+
+     if (reading)
+     {
+         switch (order)
+         {
+             case CL_R:
+             case CL_Rx:
+                 // target = {source->x, 0, 0, t_max}
+                 shuffle4(target, source, special, 0, 4, 4, 5);
+                 break;
+             case CL_A:
+                 // target = {0, 0, 0, source->x}
+                 shuffle4(target, source, special, 4, 4, 4, 0);
+                 break;
+             case CL_INTENSITY:
+                 // target = {source->x, source->x, source->x, source->x}
+                 shuffle4(target, source, source, 0, 0, 0, 0);
+                 break;
+             case CL_LUMINANCE:
+                 // target = {source->x, source->x, source->x, t_max}
+                 shuffle4(target, source, special, 0, 0, 0, 5);
+                 break;
+             case CL_RG:
+             case CL_RGx:
+                 // target = {source->x, source->y, 0, t_max}
+                 shuffle4(target, source, special, 0, 1, 4, 5);
+                 break;
+             case CL_RA:
+                 // target = {source->x, 0, 0, source->y}
+                 shuffle4(target, source, special, 0, 4, 4, 1);
+                 break;
+             case CL_ARGB:
+                 // target = {source->y, source->z, source->w, source->x}
+                 shuffle4(target, source, source, 1, 2, 3, 0);
+                 break;
+             case CL_BGRA:
+                 // target = {source->z, source->y, source->x, source->w}
+                 shuffle4(target, source, source, 2, 1, 0, 3);
+                 break;
+             case CL_RGB:
+             case CL_RGBx:
+             case CL_RGBA:
+             default:
+                 // Already in the right order; unknown orders are copied
+                 // too so that target never stays uninitialized
+                 std::memcpy(target, source, 4 * sizeof(uint32_t));
+                 break;
+         }
+     }
+     else
+     {
+         switch (order)
+         {
+             case CL_A:
+                 // target = {source->w, undef, undef, undef}
+                 shuffle4(target, source, source, 3, 3, 3, 3);
+                 break;
+             case CL_RA:
+                 // target = {source->x, source->w, undef, undef}
+                 shuffle4(target, source, source, 0, 3, 3, 3);
+                 break;
+             case CL_ARGB:
+                 // target = {source->w, source->x, source->y, source->z}
+                 shuffle4(target, source, source, 3, 0, 1, 2);
+                 break;
+             case CL_BGRA:
+                 // target = {source->z, source->y, source->x, source->w}
+                 shuffle4(target, source, source, 2, 1, 0, 3);
+                 break;
+             default:
+                 std::memcpy(target, source, 4 * sizeof(uint32_t));
+                 break;
+         }
+     }
+ }
+
+/*
+ * Actual implementation of the built-ins
+ */
+
+ /*
+  * Address of the pixel (x, y, z) inside the storage this device holds for
+  * the image. The coordinates are passed on to imageData() unchecked.
+  */
+ void *CPUKernelWorkGroup::getImageData(Image2D *image, int x, int y, int z) const
+ {
+     DeviceInterface *device = (DeviceInterface *)p_kernel->device();
+     CPUBuffer *storage = (CPUBuffer *)image->deviceBuffer(device);
+     unsigned char *base = (unsigned char *)storage->data();
+
+     return imageData(base, x, y, z,
+                      image->row_pitch(),
+                      image->slice_pitch(),
+                      image->pixel_size());
+ }
+
+ /*
+  * Write one pixel of an image.
+  *
+  * image    image to modify
+  * x, y, z  pixel coordinates
+  * color    four components in kernel {x, y, z, w} order
+  */
+ template<typename T>
+ void CPUKernelWorkGroup::writeImageImpl(Image2D *image, int x, int y, int z,
+                                         T *color) const
+ {
+     T storage_order[4];
+
+     // float, int and uint are all 32 bits wide, so the components can be
+     // reordered as raw 32-bit words whatever T is
+     swizzle((uint32_t *)storage_order, (uint32_t *)color,
+             image->format().image_channel_order, false, 0);
+
+     // Location of the pixel in the image
+     void *pixel = getImageData(image, x, y, z);
+
+     // Narrow / normalize the components and store them
+     convert_to_format(pixel, storage_order,
+                       image->format().image_channel_data_type,
+                       image->channels());
+ }
+
+ /* Write a pixel given as four floats. */
+ void CPUKernelWorkGroup::writeImage(Image2D *image, int x, int y, int z,
+                                     float *color) const
+ {
+     // T is deduced as float from the type of color
+     writeImageImpl(image, x, y, z, color);
+ }
+
+ /* Write a pixel given as four signed integers. */
+ void CPUKernelWorkGroup::writeImage(Image2D *image, int x, int y, int z,
+                                     int32_t *color) const
+ {
+     // T is deduced as int32_t from the type of color
+     writeImageImpl(image, x, y, z, color);
+ }
+
+ /* Write a pixel given as four unsigned integers. */
+ void CPUKernelWorkGroup::writeImage(Image2D *image, int x, int y, int z,
+                                     uint32_t *color) const
+ {
+     // T is deduced as uint32_t from the type of color
+     writeImageImpl(image, x, y, z, color);
+ }
+
+ /*
+  * 32-bit pattern used as the "full" alpha value of a pixel whose
+  * components have type T. Types without a specialization get 0.
+  */
+ template<typename T>
+ uint32_t type_max_value()
+ {
+     return 0u;
+ }
+
+ /* float: the bit pattern of 1.0f (1065353216 in decimal). */
+ template<>
+ uint32_t type_max_value<float>()
+ {
+     return 0x3f800000;
+ }
+
+ /* int32_t: largest positive value. */
+ template<>
+ uint32_t type_max_value<int32_t>()
+ {
+     return 2147483647;
+ }
+
+ /* uint32_t: all bits set. */
+ template<>
+ uint32_t type_max_value<uint32_t>()
+ {
+     return 4294967295u;
+ }
+
+ /*
+  * Read one pixel of an image at integer coordinates.
+  *
+  * result   receives four components in kernel {x, y, z, w} order
+  * image    image to read
+  * x, y, z  pixel coordinates, adjusted by the sampler's addressing mode
+  * sampler  sampler bit-field
+  */
+ template<typename T>
+ void CPUKernelWorkGroup::readImageImplI(T *result, Image2D *image, int x, int y,
+                                         int z, uint32_t sampler) const
+ {
+     const bool use_border = handle_address_mode(image, x, y, z, sampler);
+
+     if (use_border)
+     {
+         // Border color: black, opaque only for the orders listed below
+         cl_channel_order order = image->format().image_channel_order;
+         const bool opaque = (order == CL_R || order == CL_RG ||
+                              order == CL_RGB || order == CL_LUMINANCE);
+
+         result[0] = 0.0f;
+         result[1] = 0.0f;
+         result[2] = 0.0f;
+
+         if (opaque)
+             result[3] = 1.0f;
+         else
+             result[3] = 0.0f;
+
+         return;
+     }
+
+     // Fetch the stored pixel and widen its components to T
+     T storage_order[4];
+     void *pixel = getImageData(image, x, y, z);
+
+     convert_from_format(storage_order, pixel,
+                         image->format().image_channel_data_type,
+                         image->channels());
+
+     // Reorder the components into result
+     swizzle((uint32_t *)result, (uint32_t *)storage_order,
+             image->format().image_channel_order, true, type_max_value<T>());
+ }
+
+ /* Read a pixel as four floats, integer coordinates. */
+ void CPUKernelWorkGroup::readImage(float *result, Image2D *image, int x, int y,
+                                    int z, uint32_t sampler) const
+ {
+     // T is deduced as float from the type of result
+     readImageImplI(result, image, x, y, z, sampler);
+ }
+
+ /* Read a pixel as four signed integers, integer coordinates. */
+ void CPUKernelWorkGroup::readImage(int32_t *result, Image2D *image, int x, int y,
+                                    int z, uint32_t sampler) const
+ {
+     // T is deduced as int32_t from the type of result
+     readImageImplI(result, image, x, y, z, sampler);
+ }
+
+ /* Read a pixel as four unsigned integers, integer coordinates. */
+ void CPUKernelWorkGroup::readImage(uint32_t *result, Image2D *image, int x, int y,
+                                    int z, uint32_t sampler) const
+ {
+     // T is deduced as uint32_t from the type of result
+     readImageImplI(result, image, x, y, z, sampler);
+ }
+
+ /*
+  * Read one pixel of an image at floating-point coordinates.
+  *
+  * result   receives four components in kernel {x, y, z, w} order
+  * image    image to read (2D or 3D)
+  * x, y, z  sample coordinates; z is only meaningful for 3D images
+  * sampler  sampler bit-field: bits 0xf0 select the addressing mode,
+  *          bits 0xf00 the filter and bits 0xf the coordinate normalization
+  *
+  * Every filter case ends with a break so that exactly one filter runs per
+  * call. result is left untouched when the addressing mode or the filter is
+  * not one of the handled values.
+  *
+  * NOTE(review): linear2D()/linear3D() read their pixels with a sampler of
+  * 0, so the addressing mode is not applied to the individual taps of a
+  * linear filter in the NONE/CLAMP/CLAMP_TO_EDGE group - confirm whether
+  * this is intended.
+  */
+ template<typename T>
+ void CPUKernelWorkGroup::readImageImplF(T *result, Image2D *image, float x,
+                                         float y, float z, uint32_t sampler) const
+ {
+     bool is_3d = (image->type() == MemObject::Image3D);
+     Image3D *image3d = (Image3D *)image;
+
+     int w = image->width(),
+         h = image->height(),
+         d = (is_3d ? image3d->depth() : 1);
+
+     switch (sampler & 0xf0)
+     {
+         case CLK_ADDRESS_NONE:
+         case CLK_ADDRESS_CLAMP:
+         case CLK_ADDRESS_CLAMP_TO_EDGE:
+             /* De-normalize coordinates */
+             if ((sampler & 0xf) == CLK_NORMALIZED_COORDS_TRUE)
+             {
+                 x *= (float)w;
+                 y *= (float)h;
+                 if (is_3d) z *= (float)d;
+             }
+
+             switch (sampler & 0xf00)
+             {
+                 case CLK_FILTER_NEAREST:
+                 {
+                     readImageImplI<T>(result, image, std::floor(x),
+                                       std::floor(y), std::floor(z), sampler);
+                     break;
+                 }
+                 case CLK_FILTER_LINEAR:
+                 {
+                     float a, b, c;
+
+                     a = frac(x - 0.5f);
+                     b = frac(y - 0.5f);
+                     c = frac(z - 0.5f);
+
+                     if (is_3d)
+                     {
+                         linear3D<T>(result, a, b, c,
+                                     std::floor(x - 0.5f),
+                                     std::floor(y - 0.5f),
+                                     std::floor(z - 0.5f),
+                                     std::floor(x - 0.5f) + 1,
+                                     std::floor(y - 0.5f) + 1,
+                                     std::floor(z - 0.5f) + 1,
+                                     image3d);
+                     }
+                     else
+                     {
+                         linear2D<T>(result, a, b, c,
+                                     std::floor(x - 0.5f),
+                                     std::floor(y - 0.5f),
+                                     std::floor(x - 0.5f) + 1,
+                                     std::floor(y - 0.5f) + 1,
+                                     image);
+                     }
+                     break;
+                 }
+             }
+             break;
+         case CLK_ADDRESS_REPEAT:
+             switch (sampler & 0xf00)
+             {
+                 case CLK_FILTER_NEAREST:
+                 {
+                     // k stays 0 for 2D images, which have a single slice
+                     int i, j, k = 0;
+
+                     x = (x - std::floor(x)) * (float)w;
+                     i = std::floor(x);
+                     if (i > w - 1)
+                         i = i - w;
+
+                     y = (y - std::floor(y)) * (float)h;
+                     j = std::floor(y);
+                     if (j > h - 1)
+                         j = j - h;
+
+                     if (is_3d)
+                     {
+                         z = (z - std::floor(z)) * (float)d;
+                         k = std::floor(z);
+                         if (k > d - 1)
+                             k = k - d;
+                     }
+
+                     readImageImplI<T>(result, image, i, j, k, sampler);
+                     break;
+                 }
+                 case CLK_FILTER_LINEAR:
+                 {
+                     float a, b, c;
+                     // k0 and k1 are only used for 3D images
+                     int i0, i1, j0, j1, k0 = 0, k1 = 0;
+
+                     x = (x - std::floor(x)) * (float)w;
+                     i0 = std::floor(x - 0.5f);
+                     i1 = i0 + 1;
+                     if (i0 < 0)
+                         i0 = w + i0;
+                     if (i1 > w - 1)
+                         i1 = i1 - w;
+
+                     y = (y - std::floor(y)) * (float)h;
+                     j0 = std::floor(y - 0.5f);
+                     j1 = j0 + 1;
+                     if (j0 < 0)
+                         j0 = h + j0;
+                     if (j1 > h - 1)
+                         j1 = j1 - h;
+
+                     if (is_3d)
+                     {
+                         z = (z - std::floor(z)) * (float)d;
+                         k0 = std::floor(z - 0.5f);
+                         k1 = k0 + 1;
+                         if (k0 < 0)
+                             k0 = d + k0;
+                         if (k1 > d - 1)
+                             k1 = k1 - d;
+                     }
+
+                     a = frac(x - 0.5f);
+                     b = frac(y - 0.5f);
+                     c = frac(z - 0.5f);
+
+                     if (is_3d)
+                     {
+                         linear3D<T>(result, a, b, c, i0, j0, k0, i1, j1, k1,
+                                     image3d);
+                     }
+                     else
+                     {
+                         linear2D<T>(result, a, b, c, i0, j0, i1, j1, image);
+                     }
+                     break;
+                 }
+             }
+             break;
+         case CLK_ADDRESS_MIRRORED_REPEAT:
+             switch (sampler & 0xf00)
+             {
+                 case CLK_FILTER_NEAREST:
+                 {
+                     x = std::fabs(x - 2.0f * round(0.5f * x)) * (float)w;
+                     y = std::fabs(y - 2.0f * round(0.5f * y)) * (float)h;
+                     if (is_3d)
+                         z = std::fabs(z - 2.0f * round(0.5f * z)) * (float)d;
+
+                     readImageImplI<T>(result, image,
+                                       min(std::floor(x), w - 1),
+                                       min(std::floor(y), h - 1),
+                                       min(std::floor(z), d - 1),
+                                       sampler);
+                     break;
+                 }
+                 case CLK_FILTER_LINEAR:
+                 {
+                     float a, b, c;
+                     // k0 and k1 are only used for 3D images
+                     int i0, i1, j0, j1, k0 = 0, k1 = 0;
+
+                     x = std::fabs(x - 2.0f * round(0.5f * x)) * (float)w;
+                     i0 = std::floor(x - 0.5f);
+                     i1 = i0 + 1;
+                     i0 = max(i0, 0);
+                     i1 = min(i1, w - 1);
+
+                     y = std::fabs(y - 2.0f * round(0.5f * y)) * (float)h;
+                     j0 = std::floor(y - 0.5f);
+                     j1 = j0 + 1;
+                     j0 = max(j0, 0);
+                     j1 = min(j1, h - 1);
+
+                     if (is_3d)
+                     {
+                         z = std::fabs(z - 2.0f * round(0.5f * z)) * (float)d;
+                         k0 = std::floor(z - 0.5f);
+                         k1 = k0 + 1;
+                         k0 = max(k0, 0);
+                         k1 = min(k1, d - 1);
+                     }
+
+                     a = frac(x - 0.5f);
+                     b = frac(y - 0.5f);
+                     c = frac(z - 0.5f);
+
+                     if (is_3d)
+                     {
+                         linear3D<T>(result, a, b, c, i0, j0, k0, i1, j1, k1,
+                                     image3d);
+                     }
+                     else
+                     {
+                         linear2D<T>(result, a, b, c, i0, j0, i1, j1, image);
+                     }
+                     break;
+                 }
+             }
+             break;
+     }
+ }
+
+ /* Read a pixel as four floats, floating-point coordinates. */
+ void CPUKernelWorkGroup::readImage(float *result, Image2D *image, float x,
+                                    float y, float z, uint32_t sampler) const
+ {
+     // T is deduced as float from the type of result
+     readImageImplF(result, image, x, y, z, sampler);
+ }
+
+ /* Read a pixel as four signed integers, floating-point coordinates. */
+ void CPUKernelWorkGroup::readImage(int32_t *result, Image2D *image, float x,
+                                    float y, float z, uint32_t sampler) const
+ {
+     // T is deduced as int32_t from the type of result
+     readImageImplF(result, image, x, y, z, sampler);
+ }
+
+ /* Read a pixel as four unsigned integers, floating-point coordinates. */
+ void CPUKernelWorkGroup::readImage(uint32_t *result, Image2D *image, float x,
+                                    float y, float z, uint32_t sampler) const
+ {
+     // T is deduced as uint32_t from the type of result
+     readImageImplF(result, image, x, y, z, sampler);
+ }
diff --git a/src/core/cpu/worker.cpp b/src/core/cpu/worker.cpp
new file mode 100644
index 0000000..e5251f2
--- /dev/null
+++ b/src/core/cpu/worker.cpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file cpu/worker.cpp
+ * \brief Code running in the worker threads launched by \c Coal::CPUDevice
+ * \sa builtins.cpp
+ */
+
+#include "worker.h"
+#include "device.h"
+#include "buffer.h"
+#include "kernel.h"
+#include "builtins.h"
+
+#include "../commandqueue.h"
+#include "../events.h"
+#include "../memobject.h"
+#include "../kernel.h"
+
+#include <sys/mman.h>
+
+#include <cstring>
+#include <iostream>
+
+using namespace Coal;
+
+/**
+ * \brief Entry point of a CPU worker thread
+ *
+ * Repeatedly fetches an event with \c CPUDevice::getEvent(), executes it
+ * according to its type, then records its final status. The loop ends when
+ * \c getEvent() raises the \c stop flag; any work-items data still
+ * registered for this thread is then unmapped.
+ *
+ * \param data the \c Coal::CPUDevice served by this thread, passed as \c void*
+ * \return always 0
+ */
+void *worker(void *data)
+{
+ CPUDevice *device = (CPUDevice *)data;
+ bool stop = false;
+ cl_int errcode;
+ Event *event;
+
+ // Initialize TLS
+ setWorkItemsData(0, 0);
+
+ while (true)
+ {
+ event = device->getEvent(stop);
+
+ // Ensure we have a good event and we don't have to stop
+ if (stop) break;
+ if (!event) continue;
+
+ // Get info about the event and its command queue
+ Event::Type t = event->type();
+ CommandQueue *queue = 0;
+ cl_command_queue_properties queue_props = 0;
+
+ errcode = CL_SUCCESS;
+
+ event->info(CL_EVENT_COMMAND_QUEUE, sizeof(CommandQueue *), &queue, 0);
+
+ // queue_props stays 0 (profiling off) when the event has no queue
+ if (queue)
+ queue->info(CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties),
+ &queue_props, 0);
+
+ if (queue_props & CL_QUEUE_PROFILING_ENABLE)
+ event->updateTiming(Event::Start);
+
+ // Execute the action
+ switch (t)
+ {
+ case Event::ReadBuffer:
+ case Event::WriteBuffer:
+ {
+ ReadWriteBufferEvent *e = (ReadWriteBufferEvent *)event;
+ CPUBuffer *buf = (CPUBuffer *)e->buffer()->deviceBuffer(device);
+ char *data = (char *)buf->data(); // NOTE: shadows the 'data' parameter of worker()
+
+ data += e->offset();
+
+ // Read: buffer -> host pointer; Write: host pointer -> buffer
+ if (t == Event::ReadBuffer)
+ std::memcpy(e->ptr(), data, e->cb());
+ else std::memcpy(data, e->ptr(), e->cb());
+
+ break;
+ }
+ case Event::CopyBuffer:
+ {
+ CopyBufferEvent *e = (CopyBufferEvent *)event;
+ CPUBuffer *src = (CPUBuffer *)e->source()->deviceBuffer(device);
+ CPUBuffer *dst = (CPUBuffer *)e->destination()->deviceBuffer(device);
+
+ std::memcpy((char*)dst->data() + e->dst_offset(),
+ (char*)src->data() + e->src_offset(), e->cb());
+ break;
+ }
+ case Event::ReadBufferRect:
+ case Event::WriteBufferRect:
+ case Event::CopyBufferRect:
+ case Event::ReadImage:
+ case Event::WriteImage:
+ case Event::CopyImage:
+ case Event::CopyBufferToImage:
+ case Event::CopyImageToBuffer:
+ {
+ // src = device buffer and dst = host memory unless this is a copy
+ ReadWriteCopyBufferRectEvent *e = (ReadWriteCopyBufferRectEvent *)event;
+ CPUBuffer *src_buf = (CPUBuffer *)e->source()->deviceBuffer(device);
+
+ unsigned char *src = (unsigned char *)src_buf->data();
+ unsigned char *dst;
+
+ switch (t)
+ {
+ case Event::CopyBufferRect:
+ case Event::CopyImage:
+ case Event::CopyImageToBuffer:
+ case Event::CopyBufferToImage:
+ {
+ // dst = device buffer of the destination memory object
+ CopyBufferRectEvent *cbre = (CopyBufferRectEvent *)e;
+ CPUBuffer *dst_buf =
+ (CPUBuffer *)cbre->destination()->deviceBuffer(device);
+
+ dst = (unsigned char *)dst_buf->data();
+ break;
+ }
+ default:
+ {
+ // dst = host memory location
+ ReadWriteBufferRectEvent *rwbre = (ReadWriteBufferRectEvent *)e;
+
+ dst = (unsigned char *)rwbre->ptr();
+ }
+ }
+
+ // Iterate over the lines to copy and use memcpy
+ for (size_t z=0; z<e->region(2); ++z)
+ {
+ for (size_t y=0; y<e->region(1); ++y)
+ {
+ unsigned char *s;
+ unsigned char *d;
+
+ d = imageData(dst,
+ e->dst_origin(0),
+ y + e->dst_origin(1),
+ z + e->dst_origin(2),
+ e->dst_row_pitch(),
+ e->dst_slice_pitch(),
+ 1);
+
+ s = imageData(src,
+ e->src_origin(0),
+ y + e->src_origin(1),
+ z + e->src_origin(2),
+ e->src_row_pitch(),
+ e->src_slice_pitch(),
+ 1);
+
+ // Copying between an image and a buffer may need to add an
+ // offset to the buffer address (its rectangular origin is
+ // always (0, 0, 0)).
+ if (t == Event::CopyBufferToImage)
+ {
+ CopyBufferToImageEvent *cptie = (CopyBufferToImageEvent *)e;
+ s += cptie->offset();
+ }
+ else if (t == Event::CopyImageToBuffer)
+ {
+ CopyImageToBufferEvent *citbe = (CopyImageToBufferEvent *)e;
+ d += citbe->offset();
+ }
+
+ if (t == Event::WriteBufferRect || t == Event::WriteImage)
+ std::memcpy(s, d, e->region(0)); // Write dest (memory) in src
+ else
+ std::memcpy(d, s, e->region(0)); // Write src (buffer) in dest (memory), or copy the buffers
+ }
+ }
+
+ break;
+ }
+ case Event::MapBuffer:
+ case Event::MapImage:
+ // All was already done in CPUBuffer::initEventDeviceData()
+ break;
+
+ case Event::NativeKernel:
+ {
+ NativeKernelEvent *e = (NativeKernelEvent *)event;
+ void (*func)(void *) = (void (*)(void *))e->function();
+ void *args = e->args();
+
+ func(args);
+
+ break;
+ }
+ case Event::NDRangeKernel:
+ case Event::TaskKernel:
+ {
+ KernelEvent *e = (KernelEvent *)event;
+ CPUKernelEvent *ke = (CPUKernelEvent *)e->deviceData();
+
+ // Take an instance
+ CPUKernelWorkGroup *instance = ke->takeInstance();
+ ke = 0; // Unlocked, don't use anymore
+
+ // A failed run is reported as CL_INVALID_PROGRAM_EXECUTABLE
+ if (!instance->run())
+ errcode = CL_INVALID_PROGRAM_EXECUTABLE;
+
+ delete instance;
+
+ break;
+ }
+ default:
+ break;
+ }
+
+ // Cleanups
+ if (errcode == CL_SUCCESS)
+ {
+ bool finished = true;
+
+ // Kernel events are only marked Complete once
+ // CPUKernelEvent::finished() reports true
+ if (event->type() == Event::NDRangeKernel ||
+ event->type() == Event::TaskKernel)
+ {
+ CPUKernelEvent *ke = (CPUKernelEvent *)event->deviceData();
+ finished = ke->finished();
+ }
+
+ if (finished)
+ {
+ // an event may be released once it is Complete, so the timing
+ // must be updated before the status change
+ if (queue_props & CL_QUEUE_PROFILING_ENABLE)
+ event->updateTiming(Event::End);
+ event->setStatus(Event::Complete);
+ }
+ }
+ else
+ {
+ // an event may be released once it is Complete
+ if (queue_props & CL_QUEUE_PROFILING_ENABLE)
+ event->updateTiming(Event::End);
+ // The event failed
+ event->setStatus((Event::Status)errcode);
+ }
+ }
+
+ // Free mmapped() data if needed
+ size_t mapped_size;
+ void *mapped_data = getWorkItemsData(mapped_size);
+
+ if (mapped_data)
+ munmap(mapped_data, mapped_size);
+
+ return 0;
+}
diff --git a/src/core/cpu/worker.h b/src/core/cpu/worker.h
new file mode 100644
index 0000000..43ddd03
--- /dev/null
+++ b/src/core/cpu/worker.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file worker.h
+ * \brief Function run by the CPU worker threads
+ */
+
+#ifndef __CPU_WORKER_H__
+#define __CPU_WORKER_H__
+
+/**
+ * \brief Main loop of the CPU worker threads
+ *
+ * This function is run by as many threads as there are CPU cores on the host
+ * system. As explained by \ref events , this function waits until there
+ * are \c Coal::Event objects to process and handles them.
+ */
+void *worker(void *data);
+
+#endif
diff --git a/src/core/deviceinterface.h b/src/core/deviceinterface.h
new file mode 100644
index 0000000..a321a9e
--- /dev/null
+++ b/src/core/deviceinterface.h
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file deviceinterface.h
+ * \brief Abstraction layer between Clover core and the devices
+ */
+
+#ifndef __DEVICEINTERFACE_H__
+#define __DEVICEINTERFACE_H__
+
+#include <CL/cl.h>
+#include <string>
+#include "object.h"
+
+/* This pulls in legacy::PassManager when LLVM >= 3.4 */
+#include <llvm/PassManager.h>
+
+namespace Coal
+{
+
+class DeviceBuffer;
+class DeviceProgram;
+class DeviceKernel;
+
+class MemObject;
+class Event;
+class Program;
+class Kernel;
+
+/**
+ * \brief Abstraction layer between core Clover objects and the devices
+ *
+ * This interface is used by the core Clover classes to communicate with the
+ * devices, that must reimplement all the functions described here.
+ */
+class DeviceInterface : public Object
+{
+ public:
+ DeviceInterface() : Object(Object::T_Device, 0) {}
+ virtual ~DeviceInterface() {}
+
+ /**
+ * \brief Retrieve information about the device
+ *
+ * This function is used to retrieve information about an object.
+ * Sometimes, the size of the data retrieved is unknown (for example, a
+ * string). The application can call this function twice: the first time
+ * to get the size, then it allocates a buffer, and finally gets the data.
+ *
+ * \code
+ * char *string = 0;
+ * size_t len;
+ *
+ * object->info(FOO_PROPERTY_STRING, 0, 0, &len);
+ * string = (char *)std::malloc(len);
+ * object->info(FOO_PROPERTY_STRING, len, string, 0);
+ * \endcode
+ *
+ * \param param_name Name of the property to retrieve
+ * \param param_value_size Size of the application-allocated buffer
+ * in which to put the value.
+ * \param param_value Pointer to an application-allocated buffer
+ * where the property data will be stored. Ignored
+ * if NULL.
+ * \param param_value_size_ret Size of the value retrieved, ignored if
+ * NULL.
+ * \return CL_SUCCESS in case of success, otherwise a CL error code.
+ */
+ virtual cl_int info(cl_device_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const = 0;
+
+ /**
+ * \brief Create a \c Coal::DeviceBuffer object for this device
+ * \param buffer Memory object for which the buffer has to be created
+ * \param rs Error code (\c CL_SUCCESS if no error)
+ * \return a \c Coal::DeviceBuffer object, undefined if there is an error
+ */
+ virtual DeviceBuffer *createDeviceBuffer(MemObject *buffer, cl_int *rs) = 0;
+
+ /**
+ * \brief Create a \c Coal::DeviceProgram object for this device
+ * \param program \c Coal::Program containing the device-independent
+ * program data
+ * \return a \c Coal::DeviceProgram object
+ */
+ virtual DeviceProgram *createDeviceProgram(Program *program) = 0;
+
+ /**
+ * \brief Create a \c Coal::DeviceKernel object for this device
+ * \param kernel \c Coal::Kernel containing the device-independent kernel
+ * data
+ * \param function device-specific \c llvm::Function to be used
+ * \return a \c Coal::DeviceKernel object
+ */
+ virtual DeviceKernel *createDeviceKernel(Kernel *kernel,
+ llvm::Function *function) = 0;
+
+ /**
+ * \brief Push an event on the device
+ * \sa the end of \ref events
+ * \param event the event to be pushed
+ */
+ virtual void pushEvent(Event *event) = 0;
+
+ /**
+ * \brief Initialize device-specific event data
+ *
+ * This call allows a device to initialize device-specific event data,
+ * by using \c Coal::Event::setDeviceData(). For instance, a
+ * hardware-accelerated device can associate a device command to an
+ * event, and use it to manage the event when it gets pushed.
+ *
+ * @note This function has one obligation: it must call
+ * \c Coal::MapBufferEvent::setPtr() and
+ * \c Coal::MapImageEvent::setPtr() (and other functions described
+ * in its documentation)
+ *
+ * \param event the event for which data can be set
+ * \return CL_SUCCESS in case of success
+ */
+ virtual cl_int initEventDeviceData(Event *event) = 0;
+
+ /**
+ * \brief Free device-specific event data
+ *
+ * This function is called just before \p event gets deleted. It allows
+ * a device to free device-specific data of this event, if any.
+ *
+ * \param event the event that will be destroyed
+ */
+ virtual void freeEventDeviceData(Event *event) = 0;
+
+ /**
+ * \brief Builtins header of this device
+ *
+ * NOTE(review): presumably the text of the header declaring the OpenCL C
+ * built-ins used when compiling programs for this device; the contract
+ * is not visible here, confirm against the implementations.
+ *
+ * \return the header as a \c std::string
+ */
+ virtual std::string builtinsHeader(void) const = 0;
+
+ /**
+ * \brief Device initialization hook
+ *
+ * NOTE(review): when and by whom this is called is not visible here;
+ * confirm against the callers.
+ */
+ virtual void init() = 0;
+
+ /**
+ * \brief Ask device if it has enough work in its queue
+ * \return false in this default implementation; devices may override
+ */
+ virtual bool gotEnoughToWorkOn() { return false; }
+};
+
+/**
+ * \brief Device-specific memory buffer
+ *
+ * This class is the backing-store used on a device for a \c Coal::MemObject. It
+ * is created by \c Coal::DeviceInterface::createDeviceBuffer().
+ */
+class DeviceBuffer
+{
+ public:
+ DeviceBuffer() {}
+ // Virtual so that implementations can be destroyed through a DeviceBuffer*
+ virtual ~DeviceBuffer() {}
+
+ /**
+ * \brief Allocate the buffer on the device
+ * \return true on success, false otherwise
+ */
+ virtual bool allocate() = 0;
+
+ /**
+ * \brief \c Coal::DeviceInterface of this buffer
+ * \return parent \c Coal::DeviceInterface
+ */
+ virtual DeviceInterface *device() const = 0;
+
+ /**
+ * \brief Allocation status
+ * \return true if already allocated, false otherwise
+ */
+ virtual bool allocated() const = 0;
+
+ /**
+ * \brief Host-accessible memory pointer
+ *
+ * This function returns what is passed as arguments to native kernels
+ * (\c clEnqueueNativeKernel(), \c Coal::NativeKernelEvent) in place of
+ * \c Coal::MemObject pointers.
+ *
+ * For \c Coal::CPUDevice, it's simply a pointer in RAM, but
+ * hardware-accelerated devices may need to do some copying or mapping.
+ *
+ * \warning Beware that this data may get written to by the native kernel.
+ *
+ * \return A memory pointer usable by a host native kernel
+ */
+ virtual void *nativeGlobalPointer() const = 0;
+};
+
+/**
+ * \brief Device-specific program data
+ */
+class DeviceProgram
+{
+ public:
+ DeviceProgram() {}
+ virtual ~DeviceProgram() {}
+
+ /**
+ * \brief Whether to link \b stdlib with this program
+ *
+ * \b stdlib is a LLVM bitcode file containing some implementations of
+ * OpenCL C built-ins. This function allows a device to tell
+ * \c Coal::Program::build() if it wants \b stdlib to be linked or not.
+ *
+ * Linking the library may allow inlining of functions like \c ceil(),
+ * \c floor(), \c clamp(), etc. So, if these functions are not better
+ * handled by the device itself than by \b stdlib, it's a good thing
+ * to link it.
+ *
+ * But if the device provides instructions for these functions, then
+ * it could be better not to link \b stdlib and to replace the LLVM
+ * calls to these functions with device-specific instructions.
+ *
+ * \warning \b Stdlib currently only works for \c Coal::CPUDevice, as
+ * it contains host-specific code (LLVM IR is not meant to be
+ * portable, pointer size changes for example).
+ *
+ * \return true if \b stdlib must be linked with the program
+ */
+ virtual bool linkStdLib() const = 0;
+
+ /**
+ * \brief Create device-specific optimization passes
+ *
+ * This hook allows a device to add LLVM optimization passes to a
+ * \c llvm::PassManager . This way, devices needing function flattening
+ * or special analysis passes can have them run on the module.
+ *
+ * \param manager \c llvm::PassManager to which the passes are added
+ * \param optimize false if \c -cl-opt-disable was given at compilation
+ * time.
+ * \param hasBarrier defaults to false. NOTE(review): presumably tells
+ * the device that the program uses barriers so it can pick
+ * its passes accordingly; confirm against the callers.
+ */
+ virtual void createOptimizationPasses(llvm::PassManager *manager,
+ bool optimize, bool hasBarrier=false) = 0;
+
+ /**
+ * \brief Build a device-specific representation of the program
+ *
+ * This function is called by \c Coal::Program::build() when the module
+ * is compiled and linked. It can be used by the device to build a
+ * device-specific representation of the program.
+ *
+ * \param module \c llvm::Module containing the program's LLVM IR
+ * \param binary_str \c std::string containing dep.unlinked_binary
+ * \return true in case of success, false otherwise
+ */
+ virtual bool build(llvm::Module *module, std::string* binary_str) = 0;
+
+ /**
+ * \brief Extract binaries from MIXED binary
+ *
+ * This function is called to extract the LLVM bitcode and the native
+ * binary from a MIXED binary. The default implementation extracts
+ * nothing and returns false; devices supporting mixed binaries
+ * override it.
+ *
+ * \param binary_str \c std::string containing mixed binary
+ * \param bitcode \c std::string returns LLVM bitcode if not NULL
+ * \param native \c std::string returns native binary if not NULL
+ * \return true if the binary is indeed mixed
+ */
+ virtual bool ExtractMixedBinary(std::string *binary_str,
+ std::string *bitcode, std::string *native)
+ { return false; }
+};
+
+/**
+ * \brief Device-specific kernel data
+ */
+class DeviceKernel
+{
+ public:
+ DeviceKernel() {}
+ virtual ~DeviceKernel() {}
+
+ /**
+ * \brief Maximum work-group size of a kernel
+ * \return Maximum work-group size of the kernel based on device-specific
+ * data (such as memory usage, register pressure, etc.)
+ */
+ virtual size_t workGroupSize() = 0;
+
+ /**
+ * \brief Local memory used by the kernel
+ * \return Local memory used by the kernel, in bytes
+ */
+ virtual cl_ulong localMemSize() const = 0;
+
+ /**
+ * \brief Private memory used by the kernel
+ * \return Private memory used by the kernel, in bytes
+ */
+ virtual cl_ulong privateMemSize() const = 0;
+
+ /**
+ * \brief Preferred work-group size multiple
+ * \return The size multiple a work-group can have to work the best and
+ * the fastest on the device
+ */
+ virtual size_t preferredWorkGroupSizeMultiple() const = 0;
+
+ /**
+ * \brief Optimal work-group size
+ *
+ * This function allows a device to calculate the optimal work-group size
+ * for this kernel, using its memory usage, SIMD dimension, etc.
+ *
+ * \c Coal::CPUDevice tries to split the kernel into a number of
+ * work-groups as close as possible to the number of CPU cores.
+ *
+ * \param num_dims Number of working dimensions
+ * \param dim Dimension for which the size is being calculated
+ * \param global_work_size Total number of work-items to split into
+ * work-groups
+ * \return optimal size of a work-group, for the \p dim dimension.
+ */
+ virtual size_t guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
+ size_t global_work_size) const = 0;
+};
+
+}
+
+/**
+ * \brief Definition of the \c _cl_device_id tag as a \c Coal::DeviceInterface
+ *
+ * Adds no members of its own. NOTE(review): presumably this lets the opaque
+ * \c cl_device_id handle of the OpenCL API (a \c _cl_device_id pointer in
+ * the Khronos headers) be used directly as a device object; confirm against
+ * CL/cl.h.
+ */
+struct _cl_device_id : public Coal::DeviceInterface
+{};
+
+#endif
diff --git a/src/core/dsp/buffer.cpp b/src/core/dsp/buffer.cpp
new file mode 100644
index 0000000..72c5419
--- /dev/null
+++ b/src/core/dsp/buffer.cpp
@@ -0,0 +1,149 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "buffer.h"
+#include "device.h"
+#include "driver.h"
+
+#include "CL/cl_ext.h"
+#include "../memobject.h"
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+using namespace Coal;
+
+/**
+ * \brief Create the DSP-side backing object of a memory object
+ *
+ * No device memory is obtained here. For a non-sub-buffer created with
+ * \c CL_MEM_USE_HOST_PTR the host pointer itself is recorded as the data
+ * address; every other buffer starts with a null address until allocate()
+ * is called.
+ *
+ * \param device device owning this buffer
+ * \param buffer memory object being backed
+ * \param rs error code slot; this constructor never writes to it
+ */
+DSPBuffer::DSPBuffer(DSPDevice *device, MemObject *buffer, cl_int *rs)
+    : DeviceBuffer(), p_device(device), p_buffer(buffer), p_data(0),
+      p_data_malloced(false), p_buffer_idx(0)
+{
+    if (buffer->type() != MemObject::SubBuffer &&
+        (buffer->flags() & CL_MEM_USE_HOST_PTR))
+    {
+        /*---------------------------------------------------------------------
+         * We use the host ptr, we are already allocated.
+         *
+         * Convert through uintptr_t first: casting a pointer straight to a
+         * wider integer sign-extends it with GCC, which would corrupt the
+         * upper half of the 64-bit address for host pointers at or above
+         * 0x80000000 on a 32-bit host.
+         *--------------------------------------------------------------------*/
+        p_data = (DSPDevicePtr64)(uintptr_t)buffer->host_ptr();
+    }
+}
+
+/**
+ * \brief Release the device memory obtained by allocate(), if any
+ *
+ * Storage that was not allocated by this object (host pointer, or a range
+ * inside a parent buffer) is left untouched.
+ */
+DSPBuffer::~DSPBuffer()
+{
+    if (!p_data_malloced) return;
+
+    // Give the memory back to the pool it was taken from
+    const bool from_msmc = (p_buffer->flags() & CL_MEM_USE_MSMC_TI) != 0;
+
+    if (from_msmc)
+    {
+        p_device->free_msmc(p_data);
+    }
+    else
+    {
+        p_device->free_global(p_data);
+    }
+}
+
+/**
+ * \brief Device address of this buffer's data
+ *
+ * A sub-buffer without an address of its own is resolved from its parent
+ * (allocating the parent on demand) plus the sub-buffer offset; the result
+ * is not cached here.
+ *
+ * \return the data address, or 0 when no storage is available
+ */
+DSPDevicePtr64 DSPBuffer::data() const
+{
+    // An address recorded earlier always wins
+    if (p_data) return p_data;
+
+    // Only sub-buffers can be resolved lazily; anything else is simply not
+    // allocated yet and yields the null address held in p_data.
+    if (p_buffer->type() != MemObject::SubBuffer) return p_data;
+
+    /*-------------------------------------------------------------------------
+     * Data is based on the DSPBuffer of the parent buffer
+     *------------------------------------------------------------------------*/
+    SubBuffer *sub = (SubBuffer *)p_buffer;
+    DSPBuffer *parent_buf = (DSPBuffer *)sub->parent()->deviceBuffer(p_device);
+
+    if (!parent_buf->data()) parent_buf->allocate();
+
+    const DSPDevicePtr64 parent_base = parent_buf->data();
+    if (!parent_base) return 0;
+
+    return parent_base + sub->offset();
+}
+
+/**
+ * \brief Address of the buffer data expressed as a host pointer
+ *
+ * The 64-bit device address returned by data() is narrowed explicitly to the
+ * host pointer width (uintptr_t) before being converted to a pointer, instead
+ * of casting a 64-bit integer directly to a possibly 32-bit pointer.
+ *
+ * \return data() reinterpreted as \c void*
+ */
+void *DSPBuffer::nativeGlobalPointer() const
+{
+    return (void *)(uintptr_t)data();
+}
+
+/**
+ * \brief Give this buffer its device storage
+ *
+ * - A sub-buffer without an address takes its parent's address plus its
+ *   offset (the parent is allocated on demand) and returns immediately.
+ * - A buffer without an address gets memory from the MSMC pool when
+ *   \c CL_MEM_USE_MSMC_TI is set, from the global pool otherwise.
+ * - For non-sub-buffers flagged \c CL_MEM_COPY_HOST_PTR the host data is
+ *   then written to the device, and the memory object is notified.
+ *
+ * \return true on success, false if the size is 0 or no memory is available
+ */
+bool DSPBuffer::allocate()
+{
+    const size_t num_bytes = p_buffer->size();
+
+    // A zero size means something went wrong
+    if (num_bytes == 0)
+    {
+        return false;
+    }
+
+    if (!p_data && p_buffer->type() == MemObject::SubBuffer)
+    {
+        // Data is based on the DSPBuffer of the parent buffer
+        SubBuffer *sub = (SubBuffer *)p_buffer;
+        DSPBuffer *parent_buf =
+            (DSPBuffer *)sub->parent()->deviceBuffer(p_device);
+
+        if (!parent_buf->data())
+        {
+            parent_buf->allocate();
+
+            if (!parent_buf->data())
+            {
+                return false;
+            }
+        }
+
+        p_data = parent_buf->data() + sub->offset();
+        return true;
+    }
+
+    // We are not using a host ptr: allocate a buffer
+    if (!p_data)
+    {
+        const bool want_msmc = (p_buffer->flags() & CL_MEM_USE_MSMC_TI) != 0;
+
+        if (want_msmc)
+        {
+            p_data = (DSPDevicePtr64) p_device->malloc_msmc(num_bytes);
+        }
+        else
+        {
+            p_data = (DSPDevicePtr64) p_device->malloc_global(num_bytes, false);
+        }
+
+        if (!p_data)
+        {
+            return false;
+        }
+
+        p_data_malloced = true;
+    }
+
+    // Upload the initial contents supplied by the application, if requested
+    const bool copy_host_data =
+        p_buffer->type() != MemObject::SubBuffer &&
+        (p_buffer->flags() & CL_MEM_COPY_HOST_PTR);
+
+    if (copy_host_data)
+    {
+        Driver::instance()->write(p_device->dspID(), p_data,
+                                  (uint8_t*)p_buffer->host_ptr(), num_bytes);
+    }
+
+    // Say to the memobject that we are allocated
+    p_buffer->deviceAllocated(this);
+
+    return true;
+}
+
+/**
+ * \brief Device this buffer belongs to
+ * \return the \c DSPDevice given at construction, as a \c DeviceInterface
+ */
+DeviceInterface *DSPBuffer::device() const
+{
+    DeviceInterface *owner = p_device;
+    return owner;
+}
+
+/**
+ * \brief Allocation status
+ * \return true once a data address has been recorded in this object
+ */
+bool DSPBuffer::allocated() const
+{
+    const bool has_address = (p_data != 0);
+    return has_address;
+}
diff --git a/src/core/dsp/buffer.h b/src/core/dsp/buffer.h
new file mode 100644
index 0000000..b8cb860
--- /dev/null
+++ b/src/core/dsp/buffer.h
@@ -0,0 +1,61 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#ifndef __DSP_BUFFER_H__
+#define __DSP_BUFFER_H__
+
+#include "../deviceinterface.h"
+#include "device.h"
+
+namespace Coal
+{
+
+class DSPDevice;
+class MemObject;
+
+/**
+ * \brief \c Coal::DeviceBuffer implementation for the DSP device
+ *
+ * Holds the device address backing a \c Coal::MemObject and remembers
+ * whether that memory was obtained by allocate() and so must be freed by
+ * the destructor.
+ */
+class DSPBuffer : public DeviceBuffer
+{
+ public:
+ DSPBuffer(DSPDevice *device, MemObject *buffer, cl_int *rs);
+ ~DSPBuffer();
+
+ bool allocate();
+ DeviceInterface *device() const;
+ DSPDevicePtr64 data() const ; ///< device address of the data, 0 if none
+ void *nativeGlobalPointer() const ; ///< data() converted to a host pointer
+ bool allocated() const;
+
+ private:
+ DSPDevice * p_device; ///< device owning this buffer
+ MemObject * p_buffer; ///< memory object backed by this buffer
+ DSPDevicePtr64 p_data; ///< data address, 0 until known
+ bool p_data_malloced; ///< true when p_data was obtained by allocate()
+ unsigned int p_buffer_idx; ///< initialized to 0; not otherwise used in buffer.cpp
+};
+}
+#endif
diff --git a/src/core/dsp/cmem.cpp b/src/core/dsp/cmem.cpp
new file mode 100644
index 0000000..ee0f938
--- /dev/null
+++ b/src/core/dsp/cmem.cpp
@@ -0,0 +1,271 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "cmem.h"
+#include <deque>
+#include <iostream>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <sys/stat.h>
+#include <string>
+
+#define ERR(status, msg) if (status) { printf("ERROR: %s\n", msg); exit(-1); }
+
+Cmem* Cmem::pInstance = 0;
+
+/*=============================================================================
+* C M E M
+*============================================================================*/
+/* Integer division of x by y rounded up. Both arguments are parenthesized at
+ * every use so that expression arguments (e.g. a + b) expand correctly. */
+#define CEIL_DIVIDE(x,y) (((x) + (y) - 1) / (y))
+
+/******************************************************************************
+* Thread safe instance function for singleton behavior
+******************************************************************************/
+/* Returns the single Cmem object, constructing it on the first call.
+ * pInstance is first read without the lock; only when it is still null is
+ * Cmem_instance_mutex taken and the pointer re-checked, so at most one caller
+ * executes `new Cmem`. */
+Cmem* Cmem::instance()
+{
+ static Mutex Cmem_instance_mutex;
+ Cmem* tmp = pInstance;
+
+ // Full memory barrier after the unlocked read of pInstance
+ __sync_synchronize();
+
+ if (tmp == 0)
+ {
+ ScopedLock lck(Cmem_instance_mutex);
+
+ // Re-read under the lock: another caller may have created the object
+ // between the unlocked read and acquiring the mutex
+ tmp = pInstance;
+ if (tmp == 0)
+ {
+ tmp = new Cmem;
+ // Barrier between constructing the object and publishing its address
+ __sync_synchronize();
+ pInstance = tmp;
+ }
+ }
+ return tmp;
+}
+
+/******************************************************************************
+* Cmem::open()
+******************************************************************************/
+void Cmem::open()
+{
+    /*-------------------------------------------------------------------------
+    * Bring the CMEM driver up, then build the host buffer pool on top of it.
+    * Each step aborts the process through ERR() on a non-zero status.
+    *------------------------------------------------------------------------*/
+    const int open_rc = cmem_drv_open();
+    ERR(open_rc, "DMA Contiguous Memory Driver Open Error");
+
+    /*-------------------------------------------------------------------------
+    * A free request with a buffer count of 0 is issued before allocating
+    *------------------------------------------------------------------------*/
+    const int free_rc = cmem_drv_free(0, HOST_BUF_TYPE_DYNAMIC, buf_desc);
+    ERR(free_rc, "DMA Contiguous Memory Free Error");
+
+    const int alloc_rc = cmem_drv_alloc(MAX_NUM_HOST_DSP_BUFFERS,
+                                        HOST_CMEM_BUFFER_SIZE,
+                                        HOST_BUF_TYPE_DYNAMIC,
+                                        buf_desc);
+    ERR(alloc_rc, "DMA Contiguous Memory Alloc Error");
+
+    const int pool_rc = bufmgrCreate(&DmaBufPool,
+                                     MAX_NUM_HOST_DSP_BUFFERS,
+                                     buf_desc);
+    ERR(pool_rc, "DMA Buffer manager Create Error");
+}
+
+/******************************************************************************
+* Cmem::close()
+******************************************************************************/
+void Cmem::close()
+{
+    /*-------------------------------------------------------------------------
+    * Tear down in the reverse order of open(): buffer pool, CMEM buffers,
+    * then the driver itself.
+    *------------------------------------------------------------------------*/
+    bufmgrDelete(&DmaBufPool);
+
+    const int free_rc = cmem_drv_free(MAX_NUM_HOST_DSP_BUFFERS,
+                                      HOST_BUF_TYPE_DYNAMIC,
+                                      buf_desc);
+    ERR(free_rc, "DMA Contiguous Memory Driver Free Error");
+
+    const int close_rc = cmem_drv_close();
+    ERR(close_rc, "DMA Contiguous Memory Driver Close Error");
+}
+
+
+/******************************************************************************
+* The dma to the dsp memory system can only occur from contiguous memory, i.e.
+* cmem. CMEM buffers are currently limited to 4M, the algorithm is to
+* copy the general buffer in 4M chunks into CMEM 4M buffers. Then we are able
+* to chain 2 4M buffer writes per DMA initiate. As a result, we will have
+* ceil ( size / 8M ) dma transfers initiated by the routine. to make it
+* concrete at 48M buffer dma, will result in:
+* 12 memcpy calls of 4M each,
+* 12 CMEM buffers allocated of 4M each
+* 6 dma_initiates each with 2 - 4M buffers
+*
+* The algorithm is based on the MAX_CONTIGUOUS_XFER_BUFFERS and
+* HOST_CMEM_BUFFER_SIZE macros. Currently they are 2 and 4M.
+******************************************************************************/
+/******************************************************************************
+* Copy `size` bytes from host memory `buf` to DSP address `addr` on `dsp_id`.
+*
+* The data is staged through pool buffers of HOST_CMEM_BUFFER_SIZE bytes, one
+* buffer per DMA frame, with at most `simul_dmas` non-blocking transfers
+* outstanding. Staging buffers are reused round-robin; before a slot is
+* reused the oldest outstanding transfer is polled to completion.
+* Any driver or pool failure terminates the process through ERR().
+******************************************************************************/
+void Cmem::dma_write(int32_t dsp_id, uint32_t addr, uint8_t *buf, uint32_t size)
+{
+    /*-------------------------------------------------------------------------
+    * Nothing to transfer. Returning here also keeps (tot_buffers - 1) below
+    * from wrapping around when there are no chunks at all.
+    *------------------------------------------------------------------------*/
+    if (size == 0) return;
+
+    static uint32_t      trans_id = 0;
+    int32_t              ret_val;
+    std::deque<uint32_t> dma_ids;
+
+    const uint32_t simul_dmas       = 4;
+    const uint32_t cmem_buffer_size = HOST_CMEM_BUFFER_SIZE;
+
+    /*-------------------------------------------------------------------------
+    * Round-up division written so that it cannot overflow 32 bits even when
+    * size is within one chunk of UINT32_MAX.
+    *------------------------------------------------------------------------*/
+    const uint32_t tot_buffers      = size / cmem_buffer_size
+                                    + ((size % cmem_buffer_size) ? 1 : 0);
+    const uint32_t circ_buffers     = std::min(simul_dmas, tot_buffers);
+    const uint32_t last_buffer_size = size - ((tot_buffers-1) * cmem_buffer_size);
+
+    cmem_host_buf_desc_t *host_buf_desc =
+                                  new cmem_host_buf_desc_t[circ_buffers];
+
+    cmem_host_frame_desc_t *host_frame_desc =
+                                  new cmem_host_frame_desc_t[circ_buffers];
+
+    /*---------------------------------------------------------------------
+    * Allocate Host CMEM buffers, one single-buffer frame per slot
+    *--------------------------------------------------------------------*/
+    for (uint32_t i = 0; i < circ_buffers; i++)
+    {
+        ret_val = bufmgrAlloc(DmaBufPool, 1, &host_buf_desc[i]);
+        ERR(ret_val, "dma buffer allocation failed");
+        host_frame_desc[i].bufDescP         = &host_buf_desc[i];
+        host_frame_desc[i].numBuffers       = 1;
+        host_frame_desc[i].frameStartOffset = 0;
+        host_frame_desc[i].frameSize        = cmem_buffer_size;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Stage and initiate one chunk at a time. The offset is kept unsigned:
+    * a signed int would go negative for transfers of 2 GiB or more and then
+    * index backwards from buf.
+    *------------------------------------------------------------------------*/
+    for (uint32_t i = 0; i < tot_buffers; ++i)
+    {
+        const uint32_t circ_i = i % simul_dmas;
+        const uint32_t offset = i * cmem_buffer_size;
+
+        cmem_host_buf_desc_t &slot = host_buf_desc[circ_i];
+        uint32_t cpy_size = slot.length;
+
+        /* The final chunk may be shorter than a full staging buffer */
+        if (i == tot_buffers-1)
+            host_frame_desc[circ_i].frameSize = cpy_size = last_buffer_size;
+
+        memcpy(slot.userAddr, buf + offset, cpy_size);
+
+        /*---------------------------------------------------------------------
+        * Initiate DMA
+        *--------------------------------------------------------------------*/
+        ret_val = pciedrv_dma_write_initiate(dsp_id, addr + offset,
+                                             &host_frame_desc[circ_i],
+                                             PCIEDRV_DMA_XFER_NON_BLOCKING,
+                                             &trans_id);
+        ERR(ret_val, "DMA initiate failed");
+
+        dma_ids.push_back(trans_id);
+
+        /*---------------------------------------------------------------------
+        * All slots in flight: wait for the oldest transfer so that its
+        * staging buffer is free before the next iteration reuses it.
+        *--------------------------------------------------------------------*/
+        if (dma_ids.size() >= simul_dmas)
+        {
+            while (pciedrv_dma_check(dsp_id, dma_ids.front()));
+            dma_ids.pop_front();
+        }
+    }
+
+    /*---------------------------------------------------------------------
+    * Wait for all dmas to complete
+    *--------------------------------------------------------------------*/
+    for (size_t i = 0; i < dma_ids.size(); i++)
+        while (pciedrv_dma_check(dsp_id, dma_ids[i]));
+
+    /*---------------------------------------------------------------------
+    * Free host CMEM buffers
+    *--------------------------------------------------------------------*/
+    for (uint32_t i = 0; i < circ_buffers; i++)
+    {
+        ret_val = bufmgrFreeDesc(DmaBufPool, &host_buf_desc[i]);
+        ERR(ret_val, "dma buffer free failed");
+    }
+
+    delete [] host_buf_desc;
+    delete [] host_frame_desc;
+}
+
+/******************************************************************************
+* Cmem::dma_read
+******************************************************************************/
+void Cmem::dma_read(int32_t dsp_id, uint32_t addr, uint8_t *buf, uint32_t size)
+{
+    cmem_host_buf_desc_t   bounce_desc;
+    cmem_host_frame_desc_t frame;
+    uint32_t               trans_id;
+    int32_t                ret_val;
+
+    /*---------------------------------------------------------------------
+    * A single pool buffer is used as the landing area for every chunk
+    *--------------------------------------------------------------------*/
+    ret_val = bufmgrAlloc(DmaBufPool, 1, &bounce_desc);
+    ERR(ret_val, "dma buffer allocation failed");
+
+    /*---------------------------------------------------------------------
+    * Describe that buffer as a one-buffer frame of a full chunk
+    *--------------------------------------------------------------------*/
+    uint32_t chunk = HOST_CMEM_BUFFER_SIZE;
+
+    frame.bufDescP         = &bounce_desc;
+    frame.numBuffers       = 1;
+    frame.frameStartOffset = 0;
+    frame.frameSize        = chunk;
+
+    /*-------------------------------------------------------------------------
+    * One blocking DMA per chunk; each chunk is copied out to the caller's
+    * buffer before the next one is requested.
+    *------------------------------------------------------------------------*/
+    uint32_t bytes_left = size;
+    uint32_t done       = 0;
+
+    for (uint32_t chunks_left = CEIL_DIVIDE(size, HOST_CMEM_BUFFER_SIZE);
+         chunks_left != 0;
+         --chunks_left)
+    {
+        /* The last chunk carries whatever is left over */
+        if (chunks_left == 1)
+        {
+            chunk           = bytes_left;
+            frame.frameSize = chunk;
+        }
+
+        ret_val = pciedrv_dma_read_initiate(dsp_id, addr + done, &frame,
+                                            PCIEDRV_DMA_XFER_BLOCKING,
+                                            &trans_id);
+        ERR(ret_val, "DMA initiate failed");
+
+        memcpy (buf + done, bounce_desc.userAddr, chunk);
+
+        done       += chunk;
+        bytes_left -= chunk;
+    }
+
+    /*---------------------------------------------------------------------
+    * Hand the landing buffer back to the pool
+    *--------------------------------------------------------------------*/
+    ret_val = bufmgrFreeDesc(DmaBufPool, &bounce_desc);
+    ERR(ret_val, "dma buffer free failed");
+}
diff --git a/src/core/dsp/cmem.h b/src/core/dsp/cmem.h
new file mode 100644
index 0000000..24a6de0
--- /dev/null
+++ b/src/core/dsp/cmem.h
@@ -0,0 +1,64 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef _CMEM_H
+#define _CMEM_H
+#include "u_lockable.h"
+
+extern "C"
+{
+ #include "pciedrv.h"
+ #include "cmem_drv.h"
+ #include "bufmgr.h"
+}
+
+#define HOST_CMEM_BUFFER_SIZE 0x400000 // 4M
+#define MAX_NUM_HOST_DSP_BUFFERS 128
+
+/* Singleton that owns the host-side CMEM buffer pool and performs chunked DMA
+ * transfers to and from a DSP. The constructor is private and calls open();
+ * the destructor calls close(); the object is obtained through instance(). */
+class Cmem : public Lockable_off
+{
+ public:
+ ~Cmem() { close(); }
+ static Cmem* instance ();
+
+ void open();
+ void close();
+ // Copy `size` bytes between host memory `buf` and DSP address `addr` on
+ // device `dsp_id`.
+ void dma_write(int32_t dsp_id, uint32_t addr, uint8_t *buf, uint32_t size);
+ void dma_read (int32_t dsp_id, uint32_t addr, uint8_t *buf, uint32_t size);
+
+ private:
+ static Cmem* pInstance;
+
+ // Descriptors handed to cmem_drv_alloc/cmem_drv_free and bufmgrCreate
+ cmem_host_buf_desc_t buf_desc[MAX_NUM_HOST_DSP_BUFFERS];
+ // Handle filled in by bufmgrCreate and passed to bufmgrAlloc/bufmgrFreeDesc
+ void * DmaBufPool;
+
+ Cmem() : DmaBufPool(NULL) { open(); }
+ Cmem(const Cmem&); // copy ctor disallowed
+ Cmem& operator=(const Cmem&); // assignment disallowed
+};
+
+#endif // _CMEM_H
diff --git a/src/core/dsp/core_scheduler.h b/src/core/dsp/core_scheduler.h
new file mode 100644
index 0000000..58d0555
--- /dev/null
+++ b/src/core/dsp/core_scheduler.h
@@ -0,0 +1,62 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "u_lockable.h"
+#ifndef _CORE_SCHEDULER_H
+#define _CORE_SCHEDULER_H
+
+/******************************************************************************
+* Hands out core indices 0..7. p_avail holds one bit per core; a set bit means
+* the core is available. allocate() blocks while no bit is set.
+******************************************************************************/
+class CoreScheduler : public Lockable
+{
+  public:
+    /* All eight cores start out available */
+    CoreScheduler() : p_avail(0xff) {}
+
+    /*-------------------------------------------------------------------------
+    * Mark `core` as available again and wake one waiter in allocate().
+    *------------------------------------------------------------------------*/
+    void free(int core)
+    {
+        Lock lock(this);
+        p_avail |= (1 << core);
+        CV.notify_one();
+    }
+
+    /*-------------------------------------------------------------------------
+    * Claim the lowest-numbered available core and return its index.
+    *------------------------------------------------------------------------*/
+    int allocate()
+    {
+        Lock lock(this);
+
+        /*---------------------------------------------------------------------
+        * Wait in a loop in case the condvar is falsely signalled
+        *--------------------------------------------------------------------*/
+        while (!p_avail) CV.wait(lock.raw());
+
+        for (int i=0, mask = 1; i < 8; ++i, mask <<= 1)
+            if (p_avail & mask) { p_avail &= ~mask; return i; }
+
+        /*---------------------------------------------------------------------
+        * Not reached while p_avail is non-zero, but a non-void function must
+        * not flow off its end (undefined behaviour).
+        *--------------------------------------------------------------------*/
+        return -1;
+    }
+
+  private:
+    unsigned char p_avail;
+    CondVar       CV;
+};
+
+#endif //_CORE_SCHEDULER_H
diff --git a/src/core/dsp/database.h b/src/core/dsp/database.h
new file mode 100644
index 0000000..ca4d69e
--- /dev/null
+++ b/src/core/dsp/database.h
@@ -0,0 +1,112 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef __DATABASE_H__
+#define __DATABASE_H__
+
+#include <string>
+#include <vector>
+#include <iostream>
+#include <sqlite3.h>
+
+using namespace std;
+
+/******************************************************************************
+* Thin wrapper around one sqlite3 connection: opened in the constructor,
+* closed in the destructor, queried through query().
+******************************************************************************/
+class Database
+{
+  public:
+    /* Opens `filename`; the result of open() is not reported to the caller */
+    Database(const char* filename) : database(NULL) { open(filename); }
+    ~Database() { close(); }
+
+    /* Close the connection if one is held; safe to call more than once */
+    void close()
+    {
+        if (database) sqlite3_close(database);
+        database = NULL;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Run `query` and return every result row as a vector of column strings.
+    * SQL NULL columns are returned as empty strings. Preparing the statement
+    * is retried up to retry_limit times while sqlite reports BUSY/LOCKED.
+    * Any sqlite error message other than "not an error" is printed to stdout
+    * together with the query text; the (possibly empty) rows are returned.
+    *------------------------------------------------------------------------*/
+    vector<vector<string> > query(const char* query)
+    {
+        /* NULL so the finalize in the retry loop is a no-op even if prepare
+         * never wrote the handle */
+        sqlite3_stmt *statement = NULL;
+        vector<vector<string> > results;
+        const int retry_limit = 20;
+        int retries = 0;
+
+        int rc = sqlite3_prepare_v2(database, query, -1, &statement, 0);
+
+        while ((rc == SQLITE_BUSY || rc == SQLITE_LOCKED) &&
+               ++retries <= retry_limit)
+        {
+            sqlite3_finalize(statement);
+            usleep(100);
+            rc = sqlite3_prepare_v2(database, query, -1, &statement, 0);
+        }
+
+        if (rc == SQLITE_OK)
+        {
+            const int cols = sqlite3_column_count(statement);
+
+            /* Stop at the first step result that is not a row */
+            while (sqlite3_step(statement) == SQLITE_ROW)
+            {
+                vector<string> values;
+                for (int col = 0; col < cols; col++)
+                {
+                    /* sqlite3_column_text returns NULL for SQL NULL values;
+                     * constructing a std::string from NULL is undefined */
+                    const unsigned char *text =
+                                       sqlite3_column_text(statement, col);
+                    values.push_back(text ? (const char*)text : "");
+                }
+                results.push_back(values);
+            }
+
+            sqlite3_finalize(statement);
+        }
+
+        string error = sqlite3_errmsg(database);
+        if (error != "not an error")
+            std::cout << query << " " << error << std::endl;
+
+        return results;
+    }
+
+  private:
+    sqlite3 *database;
+
+  private:
+    /* Open the file and set a 1000 ms busy timeout; returns true on success */
+    bool open(const char* filename)
+    {
+        if (sqlite3_open(filename, &database) == SQLITE_OK)
+        {
+            sqlite3_busy_timeout(database, 1000);
+            return true;
+        }
+        return false;
+    }
+
+};
+
+#endif
diff --git a/src/core/dsp/device.cpp b/src/core/dsp/device.cpp
new file mode 100644
index 0000000..32cd9b0
--- /dev/null
+++ b/src/core/dsp/device.cpp
@@ -0,0 +1,1135 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "../platform.h"
+#include "device.h"
+#include "buffer.h"
+#include "kernel.h"
+#include "program.h"
+#include <cstdlib>
+#include <algorithm>
+#include <limits.h>
+#include "CL/cl_ext.h"
+
+#include <core/config.h>
+#include "../propertylist.h"
+#include "../commandqueue.h"
+#include "../events.h"
+#include "../memobject.h"
+#include "../kernel.h"
+#include "../program.h"
+#include "../util.h"
+
+#include "driver.h"
+#include "mailbox.h"
+
+extern "C"
+{
+ #include "dload_api.h"
+ #include <ti/runtime/mmap/include/mmap_resource.h>
+
+}
+
+#include <cstring>
+#include <cstdlib>
+#include <unistd.h>
+
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+using namespace Coal;
+
+Mailbox* Mailbox::pInstance = 0;
+
+/******************************************************************************
+* On DSPC868X the mailboxes are remote on the device DDR. On Hawking the
+* mailboxes are in shared DDR
+******************************************************************************/
+#ifdef DSPC868X
+#define MAILBOX_LOCATION MPM_MAILBOX_MEMORY_LOCATION_REMOTE
+#else
+#define MAILBOX_LOCATION MPM_MAILBOX_MEMORY_LOCATION_LOCAL
+
+#include "shmem.h"
+/******************************************************************************
+* Compute the DSP clock from three 32-bit words read out of two mapped pages
+* (0x02620000 and 0x02310000) as DSP_PLL * mult / prediv / output_div, and
+* return that value divided by 1000000 (presumably Hz -> MHz - confirm).
+******************************************************************************/
+unsigned dsp_speed()
+{
+    const unsigned DSP_PLL  = 122880000;
+    const unsigned pagesize = 0x1000;
+
+    shmem_persistent bootcfg_page;
+    shmem_persistent clock_page;
+
+    bootcfg_page.configure(0x02620000, pagesize);
+    clock_page.configure(0x02310000, pagesize);
+
+    char *BOOTCFG_BASE_ADDR = (char*)bootcfg_page.map(0x02620000, pagesize);
+    char *CLOCK_BASE_ADDR   = (char*)clock_page.map(0x02310000, pagesize);
+
+    int MAINPLLCTL0 = (*(int*)(BOOTCFG_BASE_ADDR + 0x350));
+    int MULT        = (*(int*)(CLOCK_BASE_ADDR + 0x110));
+    int OUTDIV      = (*(int*)(CLOCK_BASE_ADDR + 0x108));
+
+    /* The multiplier is split across the two words: low 6 bits from MULT,
+     * the remaining bits from MAINPLLCTL0[18:12] */
+    unsigned mult       = 1 + ((MULT & 0x3F) | ((MAINPLLCTL0 & 0x7F000) >> 6));
+    unsigned prediv     = 1 + (MAINPLLCTL0 & 0x3F);
+    unsigned output_div = 1 + ((OUTDIV >> 19) & 0xF);
+
+    /* mult can be as large as 8192, so DSP_PLL * mult does not fit in 32 bits
+     * once mult reaches 35; do the arithmetic in 64 bits */
+    unsigned long long speed =
+                  (unsigned long long)DSP_PLL * mult / prediv / output_div;
+
+    bootcfg_page.unmap(BOOTCFG_BASE_ADDR, pagesize);
+    clock_page.unmap(CLOCK_BASE_ADDR, pagesize);
+
+    return (unsigned)(speed / 1000000);
+}
+#endif
+
+/*-----------------------------------------------------------------------------
+* Declare our threaded dsp handler function
+*----------------------------------------------------------------------------*/
+void *dsp_worker(void* data);
+void HOSTwait (unsigned char dsp_id);
+
+/******************************************************************************
+* DSPDevice::DSPDevice(unsigned char dsp_id)
+******************************************************************************/
+/* Brings up one DSP device. In order, the constructor:
+ * - resets and loads the DSP through Driver and reads symbol values from
+ * the loaded image,
+ * - asks Driver::cmem_init for the global and MSMC memory ranges and splits
+ * the global range with split_ddr_memory,
+ * - registers every range with Driver::shmem_configure and configures the
+ * device heaps,
+ * - creates and opens the host-to-device and device-to-host mailboxes,
+ * - sets p_dsp_mhz (environment variable on DSPC868X, otherwise a value
+ * mailed back by the device). */
+DSPDevice::DSPDevice(unsigned char dsp_id)
+ : DeviceInterface (),
+ p_cores (8),
+ p_num_events (0),
+ p_dsp_mhz (1000), // 1.00 GHz
+ p_worker (0),
+ p_rx_mbox (0),
+ p_tx_mbox (0),
+ p_stop (false),
+ p_initialized (false),
+ p_dsp_id (dsp_id),
+ p_device_msmc_heap(),
+ p_device_ddr_heap1(),
+ p_device_ddr_heap2(),
+ p_device_ddr_heap3(),
+ p_device_l2_heap (),
+ p_dload_handle (0),
+ p_complete_pending(),
+ p_mpax_default_res(NULL)
+{
+ Driver *driver = Driver::instance();
+
+ void *hdl = driver->reset_and_load(dsp_id);
+
+ // Symbol values looked up in the image handle returned by reset_and_load
+ p_addr_kernel_config = driver->get_symbol(hdl, "kernel_config_l2");
+ p_addr_local_mem = driver->get_symbol(hdl, "ocl_local_mem_start");
+ p_addr_mbox_d2h_phys = driver->get_symbol(hdl, "mbox_d2h_phys");
+ p_addr_mbox_h2d_phys = driver->get_symbol(hdl, "mbox_h2d_phys");
+ p_size_local_mem = driver->get_symbol(hdl, "ocl_local_mem_size");
+ p_size_mbox_d2h = driver->get_symbol(hdl, "mbox_d2h_size");
+ p_size_mbox_h2d = driver->get_symbol(hdl, "mbox_h2d_size");
+
+ /*-------------------------------------------------------------------------
+ * These 4 variables were previously retrieved from the monitor out file.
+ * They are now determined by query of the CMEM system.
+ *------------------------------------------------------------------------*/
+ //p_addr_global_mem = driver->get_symbol(hdl, "ocl_global_mem_start");
+ //p_addr_msmc_mem = driver->get_symbol(hdl, "ocl_msmc_mem_start");
+ //p_size_global_mem = driver->get_symbol(hdl, "ocl_global_mem_size");
+ //p_size_msmc_mem = driver->get_symbol(hdl, "ocl_msmc_mem_size");
+
+#if 0
+ // Adjust p_size_global_mem for PG1.0 board, monitor takes 2MB
+ #define MONITOR_MEM 2
+ uint32_t mem_reserve = parse_file_line_value("/proc/cmdline",
+ "mem_reserve=", 0);
+ if (mem_reserve > 0 && mem_reserve*1024*1024 < p_size_global_mem)
+ p_size_global_mem = (mem_reserve - MONITOR_MEM) * 1024 * 1024;
+
+ char *dsp_global_mem_size = getenv("TI_OCL_DSP_GLOBAL_MEM_SIZE");
+ if (dsp_global_mem_size)
+ p_size_global_mem = atol(dsp_global_mem_size);
+
+ // Ordering is important: global in CMEM block 0, msmc in CMEM block 1
+ driver->cmem_init(p_addr_global_mem, p_size_global_mem,
+ p_addr_msmc_mem, p_size_msmc_mem);
+#endif
+ // Zero the outputs, then let cmem_init fill in the global, MSMC and third
+ // (global3/gsize3) ranges through the pointers passed to it
+ p_addr64_global_mem = 0;
+ p_size64_global_mem = 0;
+ p_addr_msmc_mem = 0;
+ p_size_msmc_mem = 0;
+ DSPDevicePtr64 global3 = 0;
+ uint64_t gsize3 = 0;
+ driver->cmem_init(&p_addr64_global_mem, &p_size64_global_mem,
+ &p_addr_msmc_mem, &p_size_msmc_mem,
+ &global3, &gsize3);
+
+ // global1/gsize1 start as the whole global range; split_ddr_memory is
+ // presumably allowed to shrink it and fill global2/gsize2 - TODO confirm
+ // against Driver::split_ddr_memory
+ DSPDevicePtr64 global1 = p_addr64_global_mem;
+ DSPDevicePtr64 global2 = 0;
+ uint64_t gsize1 = p_size64_global_mem;
+ uint64_t gsize2 = 0;
+ driver->split_ddr_memory(p_addr64_global_mem, p_size64_global_mem,
+ global1, gsize1, global2, gsize2, gsize3);
+
+ // Register every memory range with the driver; the second and third
+ // global ranges only when they are non-empty
+ driver->shmem_configure(global1, gsize1, 0);
+ if (gsize2 > 0) driver->shmem_configure(global2, gsize2, 0);
+ if (gsize3 > 0) driver->shmem_configure(global3, gsize3, 0);
+ driver->shmem_configure(p_addr_msmc_mem, p_size_msmc_mem, 1);
+ driver->shmem_configure(p_addr_mbox_d2h_phys, p_size_mbox_d2h);
+ driver->shmem_configure(p_addr_mbox_h2d_phys, p_size_mbox_h2d);
+ // One local-memory range per core, at ((0x10 + core) << 24) plus the
+ // local memory start address
+ for (int core=0; core < 8; core++)
+ driver->shmem_configure(((0x10 + core) << 24) + p_addr_local_mem,
+ p_size_local_mem);
+
+ driver->free_image_handle(hdl);
+
+ /*-------------------------------------------------------------------------
+ * Setup the DSP heaps for memory allocation
+ *------------------------------------------------------------------------*/
+ p_device_ddr_heap1.configure(global1, gsize1);
+ p_device_ddr_heap2.configure(global2, gsize2, true);
+ p_device_ddr_heap3.configure(global3, gsize3, true);
+ p_device_l2_heap.configure (p_addr_local_mem, p_size_local_mem);
+ p_device_msmc_heap.configure(p_addr_msmc_mem, p_size_msmc_mem);
+
+ /*-------------------------------------------------------------------------
+ * initialize the mailboxes on the cores, so they can receive an exit cmd
+ *------------------------------------------------------------------------*/
+ Mailbox* mb_instance = Mailbox::instance();
+
+ uint32_t mailboxallocsize = mpm_mailbox_get_alloc_size();
+
+ // NOTE(review): neither malloc result is checked before being handed to
+ // the mailbox API
+ p_tx_mbox = (void*)malloc(mailboxallocsize);
+ p_rx_mbox = (void*)malloc(mailboxallocsize);
+
+ // NOTE(review): the mapped host address is narrowed to uint32_t here -
+ // confirm this is only built for 32-bit hosts
+ mpm_mailbox_config_t mbConfig;
+ mbConfig.mem_start_addr =
+ (uint32_t)driver->map(p_addr_mbox_h2d_phys, p_size_mbox_h2d);
+
+ mbConfig.mem_size = p_size_mbox_h2d;
+ mbConfig.max_payload_size = mbox_payload;
+
+ int tx_status = mb_instance->create(p_tx_mbox,
+ NULL,
+ MAILBOX_LOCATION,
+ MPM_MAILBOX_DIRECTION_SEND, &mbConfig);
+
+ // mbConfig is reused for the receive mailbox; max_payload_size is kept
+ mbConfig.mem_start_addr =
+ (uint32_t)driver->map(p_addr_mbox_d2h_phys, p_size_mbox_d2h);
+ mbConfig.mem_size = p_size_mbox_d2h;
+
+ int rx_status = mb_instance->create(p_rx_mbox,
+ NULL,
+ MAILBOX_LOCATION,
+ MPM_MAILBOX_DIRECTION_RECEIVE, &mbConfig);
+
+ tx_status |= mb_instance->open(p_tx_mbox);
+ rx_status |= mb_instance->open(p_rx_mbox);
+
+ // A mailbox failure is only reported; construction continues.
+ // NOTE(review): if p_dsp_id is an unsigned char like the dsp_id parameter,
+ // operator<< prints it as a character, not a number - confirm in device.h
+ if (tx_status != 0 || rx_status != 0)
+ std::cout << "Could not create mailboxes for dsp "
+ << p_dsp_id << std::endl;
+
+
+#ifdef DSPC868X
+ char *ghz1 = getenv("TI_OCL_DSP_1_25GHZ");
+ if (ghz1) p_dsp_mhz = 1250; // 1.25 GHz
+#else
+ // Send frequencyMsg to the device and busy-wait for the reply; results of
+ // -1 from mail_from() are discarded and the wait repeats
+ mail_to(frequencyMsg);
+
+ int ret = 0;
+ do
+ {
+ while (!mail_query()) ;
+ ret = mail_from();
+ } while (ret == -1);
+
+ p_dsp_mhz = ret;
+#endif
+
+}
+
+
+/******************************************************************************
+* void DSPDevice::init()
+******************************************************************************/
+void DSPDevice::init()
+{
+    /*-------------------------------------------------------------------------
+    * Only the first call does any work: it sets up the event condition
+    * variable and mutex, starts the dsp_worker thread with this device as
+    * its argument, and records that initialization has happened.
+    *------------------------------------------------------------------------*/
+    if (!p_initialized)
+    {
+        pthread_cond_init(&p_events_cond, 0);
+        pthread_mutex_init(&p_events_mutex, 0);
+        pthread_create(&p_worker, 0, &dsp_worker, this);
+
+        p_initialized = true;
+    }
+}
+
+/******************************************************************************
+* DSPDevice::~DSPDevice()
+******************************************************************************/
+/* Tears the device down: sends exitMsg to the DSP, releases the mailbox
+ * storage, closes the driver (device 0 only) and, if init() ran, stops and
+ * joins the worker thread before destroying the mutex and condition variable.
+ * NOTE(review): the mailboxes are freed and the driver closed before the
+ * worker thread is joined; if dsp_worker still uses either, this ordering may
+ * be unsafe - confirm against dsp_worker. */
+DSPDevice::~DSPDevice()
+{
+ /*-------------------------------------------------------------------------
+ * Inform the cores on the device to stop listening for commands
+ *------------------------------------------------------------------------*/
+ mail_to(exitMsg);
+
+ free (p_tx_mbox);
+ free (p_rx_mbox);
+
+ /*-------------------------------------------------------------------------
+ * Only need to close the driver for one of the devices
+ *------------------------------------------------------------------------*/
+ if (p_dsp_id == 0) Driver::instance()->close();
+
+ // The thread, mutex and condition variable only exist once init() has run
+ if (!p_initialized) return;
+
+ /*-------------------------------------------------------------------------
+ * Terminate the workers and wait for them
+ *------------------------------------------------------------------------*/
+ pthread_mutex_lock(&p_events_mutex);
+
+ // Set under the mutex so a worker waiting on p_events_cond sees the flag
+ // when it is woken by the broadcast below
+ p_stop = true;
+
+ pthread_cond_broadcast(&p_events_cond);
+ pthread_mutex_unlock(&p_events_mutex);
+
+ pthread_join(p_worker, 0);
+
+ pthread_mutex_destroy(&p_events_mutex);
+ pthread_cond_destroy(&p_events_cond);
+}
+
+/******************************************************************************
+* DeviceBuffer *DSPDevice::createDeviceBuffer(MemObject *buffer, cl_int *rs)
+******************************************************************************/
+DeviceBuffer *DSPDevice::createDeviceBuffer(MemObject *buffer, cl_int *rs)
+{
+    /* Build the DSP-specific buffer for this device; `rs` is forwarded to
+     * the DSPBuffer constructor. The caller receives the new object. */
+    DSPBuffer *dsp_buffer = new DSPBuffer(this, buffer, rs);
+    return (DeviceBuffer *)dsp_buffer;
+}
+
+/******************************************************************************
+* DeviceProgram *DSPDevice::createDeviceProgram(Program *program)
+******************************************************************************/
+DeviceProgram *DSPDevice::createDeviceProgram(Program *program)
+{
+    /* Build the DSP-specific program object bound to this device */
+    DSPProgram *dsp_program = new DSPProgram(this, program);
+    return (DeviceProgram *)dsp_program;
+}
+
+/******************************************************************************
+* DeviceKernel *DSPDevice::createDeviceKernel(Kernel *, llvm::Function *)
+******************************************************************************/
+DeviceKernel *DSPDevice::createDeviceKernel(Kernel *kernel,
+                                            llvm::Function *function)
+{
+    // Note: `function` is not used here; the DSPKernel is built from the
+    // device and the kernel only.
+    DSPKernel *dsp_kernel = new DSPKernel(this, kernel);
+    return (DeviceKernel *)dsp_kernel;
+}
+
+/******************************************************************************
+* cl_int DSPDevice::initEventDeviceData(Event *event)
+******************************************************************************/
+// Prepares the device-specific state an event needs before it is queued.
+//  - MapBuffer:  computes the host pointer the mapping will expose and stores
+//                it on the event via setPtr().
+//  - NDRangeKernel / TaskKernel: loads the program on demand, pre-allocates
+//                the kernel's buffers and attaches a DSPKernelEvent as the
+//                event's device data.
+//  - every other event type: nothing to do.
+// Returns CL_SUCCESS, CL_MEM_OBJECT_ALLOCATION_FAILURE when the program
+// cannot be loaded, or whatever error preAllocBuffers() reports.
+cl_int DSPDevice::initEventDeviceData(Event *event)
+{
+ switch (event->type())
+ {
+ case Event::MapBuffer:
+ {
+ MapBufferEvent *e = (MapBufferEvent*) event;
+
+ // Buffers created with CL_MEM_USE_HOST_PTR are mapped straight onto the
+ // application's own memory: host pointer plus the requested offset.
+ if (e->buffer()->flags() & CL_MEM_USE_HOST_PTR)
+ {
+ e->setPtr((char*)e->buffer()->host_ptr() + e->offset());
+ break;
+ }
+
+ // Otherwise translate the device address of the mapped region into a
+ // host address through the driver.
+ DSPBuffer *buf = (DSPBuffer*) e->buffer()->deviceBuffer(this);
+ DSPDevicePtr64 data = buf->data() + e->offset();
+
+ // DO NOT INVALIDATE! Here only initializes host_addr, it cannot
+ // be used before MapBuffer event is scheduled and processed!
+ void* host_addr = Driver::instance()->map(data, e->cb(), false);
+ e->setPtr(host_addr);
+ break;
+ }
+
+ // Images are reported as unsupported by info(); nothing to set up.
+ case Event::MapImage: break;
+
+ case Event::NDRangeKernel:
+ case Event::TaskKernel:
+ {
+ KernelEvent *e = (KernelEvent *)event;
+ Program *p = (Program *)e->kernel()->parent();
+ DSPProgram *prog = (DSPProgram *)p->deviceDependentProgram(this);
+
+ /*-----------------------------------------------------------------
+ * Just in time loading
+ *----------------------------------------------------------------*/
+ // load() is only attempted when the program is not already loaded;
+ // a failed load is reported as an allocation failure.
+ if (!prog->is_loaded() && !prog->load())
+ return CL_MEM_OBJECT_ALLOCATION_FAILURE;
+
+ DSPKernel *dspkernel = (DSPKernel*)e->deviceKernel();
+
+ cl_int ret = dspkernel->preAllocBuffers();
+ if (ret != CL_SUCCESS) return ret;
+
+ // ASW TODO do something
+
+ // Set device-specific data
+ // Ownership passes to the event; freeEventDeviceData() deletes it.
+ DSPKernelEvent *dsp_e = new DSPKernelEvent(this, e);
+ e->setDeviceData((void *)dsp_e);
+ break;
+ }
+ default: break;
+ }
+
+ return CL_SUCCESS;
+}
+
+/******************************************************************************
+* void DSPDevice::freeEventDeviceData(Event *event)
+******************************************************************************/
+void DSPDevice::freeEventDeviceData(Event *event)
+{
+    switch (event->type())
+    {
+        // Only kernel events carry device data (a DSPKernelEvent attached by
+        // initEventDeviceData).  Deleting a null pointer is a no-op, so no
+        // explicit null check is required.
+        case Event::NDRangeKernel:
+        case Event::TaskKernel:
+            delete (DSPKernelEvent *)event->deviceData();
+            break;
+
+        default:
+            break;
+    }
+}
+
+/******************************************************************************
+* void DSPDevice::pushEvent(Event *event)
+******************************************************************************/
+void DSPDevice::pushEvent(Event *event)
+{
+    /*-------------------------------------------------------------------------
+    * Append the event to the pending list under the queue mutex, then wake
+    * every thread blocked on the condition variable.
+    *------------------------------------------------------------------------*/
+    pthread_mutex_lock(&p_events_mutex);
+
+    // The count is tracked separately so readers never need list::size().
+    ++p_num_events;
+    p_events.push_back(event);
+
+    pthread_cond_broadcast(&p_events_cond);
+
+    pthread_mutex_unlock(&p_events_mutex);
+}
+
+bool DSPDevice::stop()
+{
+    // Shutdown flag; read here without taking p_events_mutex.
+    return p_stop;
+}
+bool DSPDevice::availableEvent()
+{
+    // True when at least one event is waiting in the device queue.
+    return p_num_events > 0;
+}
+
+/******************************************************************************
+* Event *DSPDevice::getEvent(bool &stop)
+******************************************************************************/
+Event *DSPDevice::getEvent(bool &stop)
+{
+    /*-------------------------------------------------------------------------
+    * Block until an event is queued or the device is stopping.  On stop,
+    * `stop` is set to true and 0 is returned; otherwise the oldest event is
+    * removed from the list and returned (`stop` is left untouched).
+    *------------------------------------------------------------------------*/
+    Event *next = 0;
+
+    pthread_mutex_lock(&p_events_mutex);
+
+    while (!p_stop && p_num_events == 0)
+        pthread_cond_wait(&p_events_cond, &p_events_mutex);
+
+    const bool stopping = p_stop;
+
+    if (!stopping)
+    {
+        next = p_events.front();
+        p_num_events--;
+        p_events.pop_front();
+    }
+
+    pthread_mutex_unlock(&p_events_mutex);
+
+    if (stopping) stop = true;
+
+    return next;
+}
+
+void DSPDevice::push_complete_pending(uint32_t idx, Event* const data)
+{
+    // Record `data` under key `idx` in the completion-pending map.
+    p_complete_pending.push(idx, data);
+}
+
+bool DSPDevice::get_complete_pending(uint32_t idx, Event*& data)
+{
+    // Forwards to try_pop(); its boolean result is returned unchanged.
+    const bool found = p_complete_pending.try_pop(idx, data);
+    return found;
+}
+
+void DSPDevice::dump_complete_pending()
+{
+    // Diagnostic helper: delegates to the map's own dump().
+    p_complete_pending.dump();
+}
+
+bool DSPDevice::any_complete_pending()
+{
+    // True while the completion-pending map still holds entries.
+    if (p_complete_pending.empty()) return false;
+    return true;
+}
+
+/******************************************************************************
+* Device's decision about whether the CommandQueue should push more events
+* over. The threshold could be tuned (e.g. using the ooo example). Note that
+* p_num_events counts events that are in the device's queue but not yet
+* executed.
+******************************************************************************/
+bool DSPDevice::gotEnoughToWorkOn()
+{
+    // Current threshold: a single queued event is considered enough.
+    return p_num_events > 0;
+}
+
+/******************************************************************************
+* Getter functions
+******************************************************************************/
+unsigned int DSPDevice::numDSPs() const
+{
+    // Core count; reported by info() as CL_DEVICE_MAX_COMPUTE_UNITS.
+    return p_cores;
+}
+float DSPDevice::dspMhz() const
+{
+    // Clock value; reported by info() as CL_DEVICE_MAX_CLOCK_FREQUENCY.
+    return p_dsp_mhz;
+}
+unsigned char DSPDevice::dspID() const
+{
+    // Identifier this device object was given.
+    return p_dsp_id;
+}
+DLOAD_HANDLE DSPDevice::dload_handle() const
+{
+    // Loader handle; it is created lazily by load(), so it may still be unset.
+    return p_dload_handle;
+}
+
+
+int DSPDevice::load(const char *filename)
+{
+    // Create and initialise the dynamic loader on first use; `this` is the
+    // client handle the DLIF_* callbacks receive.
+    if (!p_dload_handle)
+    {
+        p_dload_handle = DLOAD_create((void*)this);
+        DLOAD_initialize(p_dload_handle);
+    }
+
+    FILE *program_file = fopen(filename, "rb");
+    if (program_file == NULL)
+    {
+        // An unreadable program file terminates the whole process.
+        printf("can't open OpenCL Program file\n");
+        exit(1);
+    }
+
+    const int handle = DLOAD_load(p_dload_handle, program_file);
+    fclose(program_file);
+
+    return handle;
+}
+
+bool DSPDevice::unload(int file_handle)
+{
+    // Without a loader instance nothing can have been loaded.
+    if (!p_dload_handle) return false;
+
+    return DLOAD_unload(p_dload_handle, file_handle);
+}
+
+DSPDevicePtr DSPDevice::get_local_scratch(uint32_t &size, uint32_t &block_size)
+{
+    // Query the L2 heap for its largest block.  The heap reports a 64-bit
+    // size and address; both are narrowed to 32 bits for the caller.
+    uint64_t largest;
+    DSPDevicePtr64 where = p_device_l2_heap.max_block_size(largest, block_size);
+
+    size = (uint32_t) largest;
+
+    return (DSPDevicePtr) where;
+}
+
+DSPDevicePtr DSPDevice::malloc_local(size_t size)
+{
+    // Allocate `size` bytes from the L2 heap.
+    return p_device_l2_heap.malloc(size, true);
+}
+
+void DSPDevice::free_local(DSPDevicePtr addr)
+{
+    // Return an address to the L2 heap.
+    p_device_l2_heap.free(addr);
+}
+
+DSPDevicePtr DSPDevice::malloc_msmc(size_t size)
+{
+    // Allocate `size` bytes from the MSMC heap.
+    return p_device_msmc_heap.malloc(size, true);
+}
+
+void DSPDevice::free_msmc(DSPDevicePtr addr)
+{
+    // Return an address to the MSMC heap.
+    p_device_msmc_heap.free(addr);
+}
+
+// TODO: examine the flag, the logic, etc
+#define FRACTION_PERSISTENT_FOR_BUFFER 8
+/*-----------------------------------------------------------------------------
+* Allocate `size` bytes of device global memory.
+*
+* prefer_32bit == true : allocate from DDR heap 1 only.
+* prefer_32bit == false: use heap 1 only if its largest free block is more
+*     than FRACTION_PERSISTENT_FOR_BUFFER times the request; otherwise (or if
+*     that fails) try heap 2, then heap 3, then heap 1 as a last resort.
+*
+* Returns the device address, or 0 when every heap refused the request.
+*----------------------------------------------------------------------------*/
+DSPDevicePtr64 DSPDevice::malloc_global(size_t size, bool prefer_32bit)
+{
+    if (prefer_32bit) return p_device_ddr_heap1.malloc(size, true);
+
+    DSPDevicePtr64 addr       = 0;
+    uint64_t       size64     = 0;
+    uint32_t       block_size;
+    p_device_ddr_heap1.max_block_size(size64, block_size);
+
+    // A zero-byte request must not reach the division below (integer divide
+    // by zero).  It is treated like the 32-bit path: heap 1 is tried first.
+    if (size == 0 || size64 / size > FRACTION_PERSISTENT_FOR_BUFFER)
+        addr = p_device_ddr_heap1.malloc(size, true);
+
+    if (!addr) addr = p_device_ddr_heap2.malloc(size, true);
+    if (!addr) addr = p_device_ddr_heap3.malloc(size, true);
+    if (!addr) addr = p_device_ddr_heap1.malloc(size, true); // another chance
+
+    return addr;
+}
+
+void DSPDevice::free_global(DSPDevicePtr64 addr)
+{
+    // Addresses below DSP_36BIT_ADDR are returned to heap 1.
+    if (addr < DSP_36BIT_ADDR)
+    {
+        p_device_ddr_heap1.free(addr);
+        return;
+    }
+
+    // Otherwise offer the address to heap 2 first; a result of -1 means it
+    // was not accepted there, so it is handed to heap 3.
+    if (p_device_ddr_heap2.free(addr) == -1)
+    {
+        p_device_ddr_heap3.free(addr);
+    }
+}
+
+void DSPDevice::mail_to(Msg_t &msg)
+{
+    // One transaction counter for the whole process (function-local static),
+    // starting at 0xC0DE0000 and incremented once per message sent.
+    static unsigned trans_id = 0xC0DE0000;
+
+    const unsigned this_id = trans_id++;
+
+    Mailbox::instance()->write(p_tx_mbox, (uint8_t*)&msg, sizeof(Msg_t),
+                               this_id);
+}
+
+bool DSPDevice::mail_query()
+{
+    // Forwards to Mailbox::query() on the receive mailbox.
+    const bool pending = Mailbox::instance()->query(p_rx_mbox);
+    return pending;
+}
+
+int DSPDevice::mail_from()
+{
+    // Read one message from the receive mailbox.  ERROR and PRINT messages
+    // are echoed to stdout and reported as -1; any other message yields its
+    // transaction id.
+    Msg_t    incoming;
+    uint32_t rx_size, rx_trans_id;
+
+    Mailbox::instance()->read(p_rx_mbox, (uint8_t*)&incoming, &rx_size,
+                              &rx_trans_id);
+
+    const bool is_error = (incoming.command == ERROR);
+    const bool is_print = !is_error && (incoming.command == PRINT);
+
+    if (is_error)
+    {
+        printf("%s", incoming.u.message);
+    }
+    else if (is_print)
+    {
+        // First byte of the text is printed as the core tag, the rest as
+        // the message body.
+        printf("[core %c] %s", incoming.u.message[0], incoming.u.message+1);
+    }
+    else
+    {
+        return rx_trans_id;
+    }
+
+    return -1;
+}
+
+/******************************************************************************
+* void* DSPDevice::get_mpax_default_res, only need to be computed once
+******************************************************************************/
+/*-----------------------------------------------------------------------------
+* Returns the default MPAX mapping resources, building them on the first call
+* and caching the result in p_mpax_default_res.
+*
+* Returns NULL if the resource block cannot be allocated; nothing is cached in
+* that case, so a later call will try again.
+*----------------------------------------------------------------------------*/
+void* DSPDevice::get_mpax_default_res()
+{
+    if (p_mpax_default_res != NULL) return p_mpax_default_res;
+
+    keystone_mmap_resources_t *res =
+        (keystone_mmap_resources_t *) malloc(sizeof(keystone_mmap_resources_t));
+
+    // Do not hand a failed allocation to memset().
+    if (res == NULL) return NULL;
+
+    memset(res, 0, sizeof(keystone_mmap_resources_t));
+
+#define NUM_VIRT_HEAPS 2
+    uint32_t xmc_regs[MAX_XMCSES_MPAXS];
+    uint32_t ses_regs[MAX_XMCSES_MPAXS];
+    uint32_t heap_base[NUM_VIRT_HEAPS] = {0x80000000, 0xC0000000};
+    uint32_t heap_size[NUM_VIRT_HEAPS] = {0x20000000, 0x40000000};
+
+    // Every register slot is filled here: consecutive MPAX registers
+    // starting at the first free XMC / SES register.
+    for (int i = 0; i < MAX_XMCSES_MPAXS; i++)
+    {
+        xmc_regs[i] = FIRST_FREE_XMC_MPAX + i;
+        ses_regs[i] = FIRST_FREE_SES_MPAX + i;
+    }
+
+    keystone_mmap_resource_init(MAX_XMCSES_MPAXS, xmc_regs, ses_regs,
+                                NUM_VIRT_HEAPS, heap_base, heap_size, res);
+
+    // Publish only once the block is fully initialised.
+    p_mpax_default_res = res;
+    return p_mpax_default_res;
+}
+
+/******************************************************************************
+* cl_int DSPDevice::info
+******************************************************************************/
+// Answers a device-info query for one `param_name`.
+//
+//  param_name            which property is requested
+//  param_value_size      size in bytes of the caller's buffer
+//  param_value           destination buffer, may be null (size query only)
+//  param_value_size_ret  if non-null, receives the size of the value
+//
+// Returns CL_INVALID_VALUE for an unknown param_name or when param_value is
+// non-null but too small for the value; CL_SUCCESS otherwise.
+//
+// NOTE(review): SIMPLE_ASSIGN and STRING_ASSIGN are macros defined outside
+// this view; presumably they store the result (in the anonymous union below
+// for SIMPLE_ASSIGN) and set `value` / `value_length` -- confirm against
+// their definitions.
+cl_int DSPDevice::info(cl_device_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const
+{
+ void *value = 0;
+ size_t value_length = 0;
+
+ // Scratch storage for the answer: one member per result type.
+ union
+ {
+ cl_device_type cl_device_type_var;
+ cl_uint cl_uint_var;
+ size_t size_t_var;
+ cl_ulong cl_ulong_var;
+ cl_bool cl_bool_var;
+ cl_device_fp_config cl_device_fp_config_var;
+ cl_device_mem_cache_type cl_device_mem_cache_type_var;
+ cl_device_local_mem_type cl_device_local_mem_type_var;
+ cl_device_exec_capabilities cl_device_exec_capabilities_var;
+ cl_command_queue_properties cl_command_queue_properties_var;
+ cl_platform_id cl_platform_id_var;
+ size_t work_dims[MAX_WORK_DIMS];
+ };
+
+ // Outputs of the heap max_block_size() queries; `dummy` is not used further.
+ uint64_t maxblock;
+ uint32_t dummy;
+
+ switch (param_name)
+ {
+ case CL_DEVICE_TYPE:
+ SIMPLE_ASSIGN(cl_device_type, CL_DEVICE_TYPE_ACCELERATOR);
+ break;
+
+ case CL_DEVICE_VENDOR_ID:
+ SIMPLE_ASSIGN(cl_uint, 0);
+ break;
+
+ case CL_DEVICE_MAX_COMPUTE_UNITS:
+ SIMPLE_ASSIGN(cl_uint, numDSPs());
+ break;
+
+ case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:
+ SIMPLE_ASSIGN(cl_uint, MAX_WORK_DIMS);
+ break;
+
+ /*-----------------------------------------------------------------
+ * Set to local mem size / 128 so that conf basic/local_kernel_def
+ * can allocate and pass. This allows a long16 for each wi to exist
+ * in local mem.
+ *----------------------------------------------------------------*/
+ // NOTE(review): the comment above no longer matches the code; the
+ // "local mem size / 128" expression is commented out and 0xffffffff
+ // is what is actually reported here and for the work-item sizes.
+ case CL_DEVICE_MAX_WORK_GROUP_SIZE:
+ SIMPLE_ASSIGN(size_t, 0xffffffff); //p_size_local_mem / 128);
+ break;
+
+ // Array-valued answer: filled in by hand instead of via SIMPLE_ASSIGN.
+ case CL_DEVICE_MAX_WORK_ITEM_SIZES:
+ for (int i=0; i<MAX_WORK_DIMS; ++i)
+ {
+ work_dims[i] = 0xffffffff; //p_size_local_mem / 128;
+ }
+ value_length = MAX_WORK_DIMS * sizeof(size_t);
+ value = &work_dims;
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR:
+ SIMPLE_ASSIGN(cl_uint, 8);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT:
+ SIMPLE_ASSIGN(cl_uint, 4);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE:
+ SIMPLE_ASSIGN(cl_uint, 1);
+ break;
+
+ case CL_DEVICE_MAX_CLOCK_FREQUENCY:
+ SIMPLE_ASSIGN(cl_uint, dspMhz());
+ break;
+
+ case CL_DEVICE_ADDRESS_BITS:
+ SIMPLE_ASSIGN(cl_uint, 32);
+ break;
+
+ case CL_DEVICE_MAX_READ_IMAGE_ARGS:
+ SIMPLE_ASSIGN(cl_uint, 0); //images not supported
+ break;
+
+ case CL_DEVICE_MAX_WRITE_IMAGE_ARGS:
+ SIMPLE_ASSIGN(cl_uint, 0); // images not supported
+ break;
+
+ // Capped at 1 GiB (1 << 30) or the size of DDR heap 1, whichever is less.
+ case CL_DEVICE_MAX_MEM_ALLOC_SIZE:
+ SIMPLE_ASSIGN(cl_ulong, std::min(p_device_ddr_heap1.size(), (cl_ulong)1ul << 30));
+ break;
+
+ case CL_DEVICE_IMAGE2D_MAX_WIDTH:
+ SIMPLE_ASSIGN(size_t, 0); // images not supported
+ break;
+
+ case CL_DEVICE_IMAGE2D_MAX_HEIGHT:
+ SIMPLE_ASSIGN(size_t, 0); //images not supported
+ break;
+
+ case CL_DEVICE_IMAGE3D_MAX_WIDTH:
+ SIMPLE_ASSIGN(size_t, 0); //images not supported
+ break;
+
+ case CL_DEVICE_IMAGE3D_MAX_HEIGHT:
+ SIMPLE_ASSIGN(size_t, 0); //images not supported
+ break;
+
+ case CL_DEVICE_IMAGE3D_MAX_DEPTH:
+ SIMPLE_ASSIGN(size_t, 0); //images not supported
+ break;
+
+ case CL_DEVICE_IMAGE_SUPPORT:
+ SIMPLE_ASSIGN(cl_bool, CL_FALSE); //images not supported
+ break;
+
+ case CL_DEVICE_MAX_PARAMETER_SIZE:
+ SIMPLE_ASSIGN(size_t, 116); // ASW TODO - needs to be 1024
+ break;
+
+ case CL_DEVICE_MAX_SAMPLERS:
+ SIMPLE_ASSIGN(cl_uint, 0); //images not supported
+ break;
+
+ // This property is expressed in bits: 1024 bits == 128 bytes.
+ case CL_DEVICE_MEM_BASE_ADDR_ALIGN:
+ SIMPLE_ASSIGN(cl_uint, 1024); // 128 byte aligned
+ break;
+
+ case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE:
+ SIMPLE_ASSIGN(cl_uint, 128);
+ break;
+
+ case CL_DEVICE_SINGLE_FP_CONFIG:
+ // Currently don't support CL_FP_DENORM
+ // ASW TODO: Investigate others
+ SIMPLE_ASSIGN(cl_device_fp_config,
+ CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST);
+ break;
+
+ case CL_DEVICE_DOUBLE_FP_CONFIG:
+ SIMPLE_ASSIGN(cl_device_fp_config,
+ CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO |
+ CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM);
+ break;
+
+ case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE:
+ SIMPLE_ASSIGN(cl_device_mem_cache_type, CL_READ_WRITE_CACHE);
+ break;
+
+ case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE:
+ SIMPLE_ASSIGN(cl_uint, 128);
+ break;
+
+ case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE:
+ SIMPLE_ASSIGN(cl_ulong, 128*1024);
+ break;
+
+ // Total sizes of the individual heaps (the *_TI queries are TI-specific).
+ case CL_DEVICE_GLOBAL_MEM_SIZE:
+ SIMPLE_ASSIGN(cl_ulong, p_device_ddr_heap1.size());
+ break;
+
+ case CL_DEVICE_GLOBAL_EXT1_MEM_SIZE_TI:
+ SIMPLE_ASSIGN(cl_ulong, p_device_ddr_heap2.size());
+ break;
+
+ case CL_DEVICE_GLOBAL_EXT2_MEM_SIZE_TI:
+ SIMPLE_ASSIGN(cl_ulong, p_device_ddr_heap3.size());
+ break;
+
+ case CL_DEVICE_MSMC_MEM_SIZE_TI:
+ SIMPLE_ASSIGN(cl_ulong, p_device_msmc_heap.size());
+ break;
+
+ // Largest block each heap reports through max_block_size().
+ case CL_DEVICE_GLOBAL_MEM_MAX_ALLOC_TI:
+ p_device_ddr_heap1.max_block_size(maxblock, dummy);
+ SIMPLE_ASSIGN(cl_ulong, maxblock);
+ break;
+
+ case CL_DEVICE_GLOBAL_EXT1_MEM_MAX_ALLOC_TI:
+ p_device_ddr_heap2.max_block_size(maxblock, dummy);
+ SIMPLE_ASSIGN(cl_ulong, maxblock);
+ break;
+
+ case CL_DEVICE_GLOBAL_EXT2_MEM_MAX_ALLOC_TI:
+ p_device_ddr_heap3.max_block_size(maxblock, dummy);
+ SIMPLE_ASSIGN(cl_ulong, maxblock);
+ break;
+
+ case CL_DEVICE_MSMC_MEM_MAX_ALLOC_TI:
+ p_device_msmc_heap.max_block_size(maxblock, dummy);
+ SIMPLE_ASSIGN(cl_ulong, maxblock);
+ break;
+
+ case CL_DEVICE_LOCAL_MEM_MAX_ALLOC_TI:
+ p_device_l2_heap.max_block_size(maxblock, dummy);
+ SIMPLE_ASSIGN(cl_ulong, maxblock);
+ break;
+
+ case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:
+ SIMPLE_ASSIGN(cl_ulong, 64<<10);
+ break;
+
+ case CL_DEVICE_MAX_CONSTANT_ARGS:
+ SIMPLE_ASSIGN(cl_uint, 8);
+ break;
+
+ case CL_DEVICE_LOCAL_MEM_TYPE:
+ SIMPLE_ASSIGN(cl_device_local_mem_type, CL_LOCAL);
+ break;
+
+ case CL_DEVICE_LOCAL_MEM_SIZE:
+ SIMPLE_ASSIGN(cl_ulong, p_device_l2_heap.size());
+ break;
+
+ case CL_DEVICE_ERROR_CORRECTION_SUPPORT:
+ // ASW TODO - check answer
+ SIMPLE_ASSIGN(cl_bool, CL_FALSE);
+ break;
+
+ case CL_DEVICE_HOST_UNIFIED_MEMORY:
+ SIMPLE_ASSIGN(cl_bool, CL_FALSE);
+ break;
+
+ case CL_DEVICE_PROFILING_TIMER_RESOLUTION:
+ SIMPLE_ASSIGN(size_t, 1000); // 1000 nanoseconds = 1 microsecond
+ break;
+
+ case CL_DEVICE_ENDIAN_LITTLE:
+ SIMPLE_ASSIGN(cl_bool, CL_TRUE);
+ break;
+
+ case CL_DEVICE_AVAILABLE:
+ SIMPLE_ASSIGN(cl_bool, CL_TRUE);
+ break;
+
+ case CL_DEVICE_COMPILER_AVAILABLE:
+ SIMPLE_ASSIGN(cl_bool, CL_TRUE);
+ break;
+
+ case CL_DEVICE_EXECUTION_CAPABILITIES:
+ SIMPLE_ASSIGN(cl_device_exec_capabilities, CL_EXEC_KERNEL);
+ break;
+
+ case CL_DEVICE_QUEUE_PROPERTIES:
+ SIMPLE_ASSIGN(cl_command_queue_properties,
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+ CL_QUEUE_PROFILING_ENABLE);
+ break;
+
+ // The reported name depends on the build-time platform selection.
+ case CL_DEVICE_NAME:
+ // ASW TODO add device number suffix
+#ifdef DSPC868X
+ STRING_ASSIGN("TI TMS320C6678 DSP");
+#else
+ STRING_ASSIGN("TI K2H DSP (8x C66)");
+#endif
+ break;
+
+ case CL_DEVICE_VENDOR:
+ STRING_ASSIGN("Texas Instruments, Inc.");
+ break;
+
+ case CL_DRIVER_VERSION:
+ STRING_ASSIGN("" COAL_VERSION);
+ break;
+
+ case CL_DEVICE_PROFILE:
+ STRING_ASSIGN("FULL_PROFILE");
+ break;
+
+ case CL_DEVICE_VERSION:
+ STRING_ASSIGN("OpenCL 1.1 TI " COAL_VERSION);
+ break;
+
+ // NOTE(review): unlike the other STRING_ASSIGN uses, this one has no
+ // trailing semicolon; whether that matters depends on the macro body.
+ case CL_DEVICE_EXTENSIONS:
+ STRING_ASSIGN("cl_khr_byte_addressable_store"
+ " cl_khr_global_int32_base_atomics"
+ " cl_khr_global_int32_extended_atomics"
+ " cl_khr_local_int32_base_atomics"
+ " cl_khr_local_int32_extended_atomics"
+ " cl_khr_fp64"
+ " cl_ti_msmc_buffers")
+ break;
+
+ case CL_DEVICE_PLATFORM:
+ SIMPLE_ASSIGN(cl_platform_id, &the_platform);
+ break;
+
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF:
+ SIMPLE_ASSIGN(cl_uint, 0);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR:
+ SIMPLE_ASSIGN(cl_uint, 8);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT:
+ SIMPLE_ASSIGN(cl_uint, 4);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT:
+ SIMPLE_ASSIGN(cl_uint, 2);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE:
+ SIMPLE_ASSIGN(cl_uint, 1);
+ break;
+
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF:
+ SIMPLE_ASSIGN(cl_uint, 0);
+ break;
+
+ case CL_DEVICE_OPENCL_C_VERSION:
+ STRING_ASSIGN("OpenCL C 1.1 LLVM " LLVM_VERSION);
+ break;
+
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ // A destination buffer was supplied but cannot hold the value.
+ if (param_value && param_value_size < value_length)
+ return CL_INVALID_VALUE;
+
+ // The required size is reported whenever the caller asks for it, even if
+ // no destination buffer was given.
+ if (param_value_size_ret)
+ *param_value_size_ret = value_length;
+
+ if (param_value)
+ std::memcpy(param_value, value, value_length);
+
+ return CL_SUCCESS;
+}
+
+/******************************************************************************
+* Call back functions from the target loader
+******************************************************************************/
+extern "C"
+{
+
+/*****************************************************************************/
+/* DLIF_ALLOCATE() - Return the load address of the segment/section */
+/* described in its parameters and record the run address in */
+/* run_address field of DLOAD_MEMORY_REQUEST. */
+/*****************************************************************************/
+BOOL DLIF_allocate(void* client_handle, struct DLOAD_MEMORY_REQUEST *targ_req)
+{
+    DSPDevice* dev = (DSPDevice*) client_handle;
+    struct DLOAD_MEMORY_SEGMENT* seg = targ_req->segment;
+
+    /*------------------------------------------------------------------------*/
+    /* Pick the heap from the segment's link-time address: 0x008xxxxx goes to */
+    /* the local heap, 0x0Cxxxxxx to the MSMC heap, anything else to global   */
+    /* memory.                                                                */
+    /*------------------------------------------------------------------------*/
+    uint32_t placed_at;
+
+    if (seg->target_address >> 20 == 0x008)
+    {
+        placed_at = (uint32_t)dev->malloc_local(seg->memsz_in_bytes);
+    }
+    else if (seg->target_address >> 24 == 0x0C)
+    {
+        placed_at = (uint32_t)dev->malloc_msmc(seg->memsz_in_bytes);
+    }
+    else
+    {
+        placed_at = (uint32_t)dev->malloc_global(seg->memsz_in_bytes);
+    }
+
+#if DEBUG
+    printf("DLIF_allocate: %d bytes starting at 0x%x (relocated from 0x%x)\n",
+           seg->memsz_in_bytes, (uint32_t)placed_at,
+           (uint32_t)seg->target_address);
+#endif
+
+    /*------------------------------------------------------------------------*/
+    /* Record the new address in the segment, even when allocation failed.    */
+    /*------------------------------------------------------------------------*/
+    seg->target_address = (TARGET_ADDRESS) placed_at;
+
+    /*------------------------------------------------------------------------*/
+    /* A zero address means the request could not be satisfied.               */
+    /*------------------------------------------------------------------------*/
+    return placed_at != 0 ? 1 : 0;
+}
+
+/*****************************************************************************/
+/* DLIF_RELEASE() - Unmap or free target memory that was previously */
+/* allocated by DLIF_allocate(). */
+/*****************************************************************************/
+BOOL DLIF_release(void* client_handle, struct DLOAD_MEMORY_SEGMENT* ptr)
+{
+    DSPDevice* dev = (DSPDevice*) client_handle;
+
+    /*------------------------------------------------------------------------*/
+    /* Same address-range dispatch as DLIF_allocate(): local, MSMC or global. */
+    /*------------------------------------------------------------------------*/
+    if (ptr->target_address >> 20 == 0x008)
+    {
+        dev->free_local((DSPDevicePtr)ptr->target_address);
+    }
+    else if (ptr->target_address >> 24 == 0x0C)
+    {
+        dev->free_msmc((DSPDevicePtr)ptr->target_address);
+    }
+    else
+    {
+        dev->free_global((DSPDevicePtr)ptr->target_address);
+    }
+
+#if DEBUG
+    printf("DLIF_free: %d bytes starting at 0x%x\n",
+           ptr->memsz_in_bytes, (uint32_t)ptr->target_address);
+#endif
+
+    /* Always reports success. */
+    return 1;
+}
+
+/*****************************************************************************/
+/* DLIF_WRITE() - Write updated (relocated) segment contents to target */
+/* memory. */
+/*****************************************************************************/
+BOOL DLIF_write(void* client_handle, struct DLOAD_MEMORY_REQUEST* req)
+{
+    DSPDevice* dev = (DSPDevice*) client_handle;
+    struct DLOAD_MEMORY_SEGMENT* seg = req->segment;
+    int core = dev->dspID();
+
+    /*------------------------------------------------------------------------*/
+    /* Copy the segment image from host memory to its target address.         */
+    /*------------------------------------------------------------------------*/
+    Driver::instance()->write(core,
+                              (uint32_t)seg->target_address,
+                              (uint8_t*)req->host_address,
+                              seg->memsz_in_bytes);
+
+#if DEBUG
+    printf("DLIF_write (dsp:%d): %d bytes starting at 0x%x\n",
+           core, seg->memsz_in_bytes,
+           (uint32_t)seg->target_address);
+#endif
+
+    /*------------------------------------------------------------------------*/
+    /* When a segment list is registered (global defined elsewhere), append a */
+    /* description of what was just written.                                  */
+    /*------------------------------------------------------------------------*/
+    extern DSPProgram::segment_list *segments;
+
+    if (segments)
+    {
+        segments->push_back(
+            DSPProgram::seg_desc((DSPDevicePtr)seg->target_address,
+                                 seg->memsz_in_bytes, req->flags));
+    }
+
+    return 1;
+}
+
+/******************************************************************************
+* DLIF_LOAD_DEPENDENT()
+******************************************************************************/
+int DLIF_load_dependent(void* client_handle, const char* so_name)
+{
+    DSPDevice* dev = (DSPDevice*) client_handle;
+
+    FILE* dep_file = fopen(so_name, "rb");
+    if (dep_file == NULL)
+    {
+        DLIF_error(DLET_FILE, "Can't open dependent file '%s'.\n", so_name);
+        return 0;
+    }
+
+    /* A zero handle from the loader is reported as a failed load. */
+    int handle = DLOAD_load(dev->dload_handle(), dep_file);
+    if (handle == 0)
+    {
+        DLIF_error(DLET_MISC, "Failed load of dependent file '%s'.\n", so_name);
+    }
+
+    fclose(dep_file);
+    return handle;
+}
+
+/******************************************************************************
+* DLIF_UNLOAD_DEPENDENT()
+******************************************************************************/
+void DLIF_unload_dependent(void* client_handle, uint32_t file_handle)
+{
+    /* Forward to the loader instance owned by the device; its result is     */
+    /* not reported back to the caller.                                      */
+    DSPDevice* dev = (DSPDevice*) client_handle;
+
+    DLOAD_unload(dev->dload_handle(), file_handle);
+}
+
+}
+
+/* Print `bytes` bytes starting at `addr` as two-digit hex values, 16 per    */
+/* line, preceded by one blank line.  Nothing but the leading newline is    */
+/* printed when `bytes` is zero or negative.                                */
+void dump_hex(char *addr, int bytes)
+{
+    printf("\n");
+
+    for (int i = 0; i < bytes; ++i)
+    {
+        printf("%02x ", addr[i] & 0xff);
+
+        /* End the row after every 16th byte and after the final byte. */
+        const bool row_full  = (i % 16) == 15;
+        const bool last_byte = (i == bytes - 1);
+        if (row_full || last_byte) printf("\n");
+    }
+}
+
diff --git a/src/core/dsp/device.h b/src/core/dsp/device.h
new file mode 100644
index 0000000..4a6f32a
--- /dev/null
+++ b/src/core/dsp/device.h
@@ -0,0 +1,151 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef __DSP_DEVICE_H__
+#define __DSP_DEVICE_H__
+
+extern "C" {
+#include "dload_api.h"
+}
+
+#include "../deviceinterface.h"
+#include "dspheap.h"
+#include "message.h"
+#include "u_concurrent_map.h"
+#include "kernel.h"
+#include <pthread.h>
+#include <string>
+#include <list>
+
+namespace Coal
+{
+
+class MemObject;
+class Event;
+class Program;
+class Kernel;
+
+// Device back-end for a TI DSP.  Besides the DeviceInterface entry points it
+// owns the event queue handed to its worker thread, the heaps that manage
+// device memory, the mailboxes used to talk to the DSP cores, and the handle
+// of the dynamic loader used to place programs on the device.
+class DSPDevice : public DeviceInterface
+{
+ public:
+ DSPDevice(unsigned char dsp_id);
+ ~DSPDevice();
+
+ void init();
+
+ // Device-info query; returns CL_INVALID_VALUE for unknown parameters or
+ // when the supplied buffer is too small.
+ cl_int info(cl_device_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ // Factories for the DSP-specific buffer / program / kernel objects.
+ DeviceBuffer *createDeviceBuffer(MemObject *buffer, cl_int *rs);
+ DeviceProgram *createDeviceProgram(Program *program);
+ DeviceKernel *createDeviceKernel(Kernel *kernel,
+ llvm::Function *function);
+
+ // Per-event device data set-up and tear-down.
+ cl_int initEventDeviceData(Event *event);
+ void freeEventDeviceData(Event *event);
+
+ // Event queue: pushEvent() enqueues and signals, getEvent() blocks until
+ // an event is available or the device is stopping (then sets `stop`).
+ void pushEvent(Event *event);
+ bool stop();
+ bool availableEvent();
+ Event *getEvent(bool &stop);
+
+ unsigned int numDSPs() const;
+ float dspMhz() const;
+ unsigned char dspID() const;
+ DLOAD_HANDLE dload_handle() const;
+
+ // load() returns the handle produced by the dynamic loader; unload()
+ // takes that handle back.
+ int load(const char *filename);
+ bool unload(int file_handle);
+
+ /*---------------------------------------------------------------------
+ * These malloc routines return a uint32_t instead of a pointer
+ * Because the target memory space is 32 bit and is independent of the
+ * size of a host pointer (ie. 32bit vs 64 bit)
+ * Device/Target global memory could be 36-bit.
+ * get_local_scratch returns max local free block for per kernel use.
+ *--------------------------------------------------------------------*/
+ DSPDevicePtr get_local_scratch(uint32_t &size, uint32_t &block_size);
+ DSPDevicePtr malloc_local (size_t size);
+ void free_local (DSPDevicePtr add);
+ DSPDevicePtr malloc_msmc (size_t size);
+ void free_msmc (DSPDevicePtr add);
+ DSPDevicePtr64 malloc_global(size_t size, bool prefer_32bit=true);
+ void free_global (DSPDevicePtr64 add);
+
+ // Mailbox access: mail_to() sends a message, mail_query() polls the
+ // receive mailbox, mail_from() reads one message and returns its
+ // transaction id (or -1 for ERROR / PRINT messages, which it prints).
+ void mail_to (Msg_t& msg);
+ bool mail_query();
+ int mail_from ();
+
+ // Bookkeeping of events keyed by a 32-bit index, stored in
+ // p_complete_pending.
+ void push_complete_pending(uint32_t idx, class Event* const data);
+ bool get_complete_pending(uint32_t idx, class Event* &data);
+ void dump_complete_pending();
+ bool any_complete_pending();
+ bool gotEnoughToWorkOn();
+
+ std::string builtinsHeader(void) const { return "dsp.h"; }
+
+ DSPDevicePtr get_addr_kernel_config() { return p_addr_kernel_config; }
+ // Lazily built, cached MPAX resource description.
+ void* get_mpax_default_res();
+
+ private:
+ unsigned int p_cores;
+ unsigned int p_num_events; // number of entries in p_events
+ float p_dsp_mhz;
+ pthread_t p_worker; // worker thread created in the constructor
+ void* p_rx_mbox; // int
+ void* p_tx_mbox;
+ std::list<Event *> p_events; // pending events, guarded by p_events_mutex
+ pthread_cond_t p_events_cond;
+ pthread_mutex_t p_events_mutex;
+ bool p_stop; // set by the destructor to end the worker
+ bool p_initialized; // true once the constructor has completed
+ unsigned char p_dsp_id;
+ dspheap p_device_ddr_heap1; // persistently mapped memory
+ dspheap p_device_ddr_heap2; // ondemand mapped memory
+ dspheap p_device_ddr_heap3; // addl ondemand mapped memory
+ dspheap p_device_l2_heap;
+ dspheap p_device_msmc_heap;
+ DLOAD_HANDLE p_dload_handle; // created on the first load()
+ concurrent_map<uint32_t, class Event*> p_complete_pending;
+
+ DSPDevicePtr p_addr_kernel_config;
+ DSPDevicePtr64 p_addr64_global_mem;
+ DSPDevicePtr p_addr_local_mem;
+ DSPDevicePtr p_addr_msmc_mem;
+ DSPDevicePtr p_addr_mbox_d2h_phys;
+ DSPDevicePtr p_addr_mbox_h2d_phys;
+ uint64_t p_size64_global_mem;
+ uint32_t p_size_local_mem;
+ uint32_t p_size_msmc_mem;
+ uint32_t p_size_mbox_d2h;
+ uint32_t p_size_mbox_h2d;
+ void* p_mpax_default_res; // cache for get_mpax_default_res()
+};
+}
+#endif
diff --git a/src/core/dsp/driver.cpp b/src/core/dsp/driver.cpp
new file mode 100644
index 0000000..08e97f7
--- /dev/null
+++ b/src/core/dsp/driver.cpp
@@ -0,0 +1,34 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifdef DSPC868X
+#include "driver_shannon.cpp"
+#include "cmem.cpp"
+#else
+#include "driver_hawking.cpp"
+#include "shmem.cpp"
+#endif
diff --git a/src/core/dsp/driver.h b/src/core/dsp/driver.h
new file mode 100644
index 0000000..1e41a28
--- /dev/null
+++ b/src/core/dsp/driver.h
@@ -0,0 +1,100 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef _DRIVER_H
+#define _DRIVER_H
+#include <vector>
+#include "u_lockable.h"
+#include "device.h"
+
+#ifdef DSPC868X
+extern "C"
+{
+ #include "pciedrv.h"
+ #include "dnldmgr.h"
+ #include "cmem_drv.h"
+ #include "bufmgr.h"
+}
+#else
+#include "shmem.h"
+#endif
+
+/******************************************************************************
+* Driver: host-side access point to the DSP device(s).
+*
+* The constructor is private and copying is disallowed; the single object is
+* obtained through instance().  The member functions are implemented per
+* platform (driver_shannon.cpp when DSPC868X is defined, driver_hawking.cpp
+* otherwise).
+******************************************************************************/
+class Driver : public Lockable_off
+{
+ public:
+ // Destruction closes the driver; close() is also callable directly.
+ ~Driver() { close(); }
+ // Number of DSP devices recorded by open().
+ int32_t num_dsps() const { return pNum_dsps; }
+ int32_t close();
+
+ // Copy sz bytes between the host buffer buf and DSP address addr.
+ int32_t write(int32_t dsp, DSPDevicePtr64 addr, uint8_t *buf, uint32_t sz);
+ int32_t read (int32_t dsp, DSPDevicePtr64 addr, uint8_t *buf, uint32_t sz);
+
+ // Reset the DSP, load the monitor image and return an opaque image handle
+ // that is later passed to get_symbol() and free_image_handle().
+ void* reset_and_load (int chip);
+ void free_image_handle(void *handle);
+ void cmem_init(DSPDevicePtr64 *addr1, uint64_t *size1,
+ DSPDevicePtr *addr2, uint32_t *size2,
+ DSPDevicePtr64 *addr3, uint64_t *size3);
+ void cmem_exit();
+ DSPDevicePtr64 cmem_ondemand_malloc(uint64_t size);
+ void cmem_ondemand_free (DSPDevicePtr64 addr);
+ // Partition a DDR range; addr1/size1, addr2/size2 are outputs, size3 is
+ // read and may be rewritten.
+ void split_ddr_memory (DSPDevicePtr64 addr, uint64_t size,
+ DSPDevicePtr64& addr1, uint64_t& size1,
+ DSPDevicePtr64& addr2, uint64_t& size2,
+ uint64_t& size3);
+ // Register a DSP address range as a shared-memory area.
+ void shmem_configure (DSPDevicePtr64 addr, uint64_t size,
+ int cmem_block = -1);
+ // Obtain / release a host pointer for a DSP address range.
+ void* map (DSPDevicePtr64 addr, uint32_t sz,
+ bool is_read = false);
+ int32_t unmap (void *host_addr, DSPDevicePtr64 buf_addr,
+ uint32_t sz, bool is_write = false);
+ // Look up the DSP address of a named symbol in a loaded image.
+ DSPDevicePtr get_symbol(void* image_handle, const char *name);
+
+ static Driver* instance ();
+
+ private:
+ static Driver* pInstance;
+ int32_t pNum_dsps;
+
+#ifdef DSPC868X
+ // PCIe driver configuration and per-device info filled in by open().
+ pciedrv_open_config_t config;
+ pciedrv_device_info_t *pDevices_info;
+#else
+ // Areas registered through shmem_configure(); owned by the Driver.
+ std::vector<shmem*> pShmem_areas;
+ shmem* get_memory_region(DSPDevicePtr64 addr);
+#endif
+
+ int32_t open ();
+ bool wait_for_ready(int chip);
+ int32_t write_core(int32_t dsp, DSPDevicePtr64 addr, uint8_t *buf,
+ uint32_t sz);
+
+ Driver() { open(); }
+ Driver(const Driver&); // copy ctor disallowed
+ Driver& operator=(const Driver&); // assignment disallowed
+};
+
+#endif // _DRIVER_H
diff --git a/src/core/dsp/driver_hawking.cpp b/src/core/dsp/driver_hawking.cpp
new file mode 100644
index 0000000..7cb2857
--- /dev/null
+++ b/src/core/dsp/driver_hawking.cpp
@@ -0,0 +1,451 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "driver.h"
+#include <deque>
+#include <iostream>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <sys/stat.h>
+#include <string>
+#include <bfd.h>
+
+extern "C"
+{
+ #include "mpmclient.h"
+};
+
+
+#define ERR(status, msg) if (status) { printf("ERROR: %s\n", msg); exit(-1); }
+#define BOOT_ENTRY_LOCATION_ADDR 0x87FFFC
+#define BOOT_MAGIC_ADDR(core) (0x10000000 | (core << 24) | 0x87FFFC)
+
+Driver* Driver::pInstance = 0;
+
+/******************************************************************************
+* Thread safe instance function for singleton behavior
+*
+* Double-checked creation: pInstance is read once without the mutex; only when
+* it is still null is the mutex taken, pInstance re-read, and the Driver
+* constructed.  __sync_synchronize() is issued after the unlocked read and
+* again before the new pointer is stored in pInstance.
+******************************************************************************/
+Driver* Driver::instance ()
+{
+ static Mutex Driver_instance_mutex;
+ Driver* tmp = pInstance;
+
+ __sync_synchronize();
+
+ if (tmp == 0)
+ {
+ ScopedLock lck(Driver_instance_mutex);
+
+ // Re-check under the lock: another thread may have created it meanwhile.
+ tmp = pInstance;
+ if (tmp == 0)
+ {
+ tmp = new Driver;
+ // Barrier before publishing the pointer to other threads.
+ __sync_synchronize();
+ pInstance = tmp;
+ }
+ }
+ return tmp;
+}
+
+/******************************************************************************
+* Translate a PCI switch device id into the matching board name.
+* Any id other than the two known ones is reported through ERR, which
+* terminates the process.
+******************************************************************************/
+const char *get_board(unsigned switch_device)
+{
+    if (switch_device == 0x8624) return "dspc8681";
+    if (switch_device == 0x8748) return "dspc8682";
+
+    ERR(1, "Unsupported device");
+    return "unknown";
+}
+
+#define TOTAL_NUM_CORES_PER_CHIP 8
+
+/******************************************************************************
+* wait_for_ready
+*   Stub in this build: no polling is performed, chip is ignored and the
+*   function always reports ready.
+******************************************************************************/
+bool Driver::wait_for_ready(int chip) { return true; }
+
+/******************************************************************************
+* report_core_state
+*   Debug helper that would query a core's state through mpm_state() and print
+*   it.  The whole body is compiled out with #if 0, so the function currently
+*   does nothing.
+******************************************************************************/
+static void report_core_state(const char *curr_core)
+{
+#if 0
+ char state[50];
+ int ret;
+ mpm_slave_state_e core_state;
+
+ ret = mpm_state(curr_core, &core_state);
+ if ( ret < 0)
+ printf("state query failed, %s\n", curr_core);
+
+ // NOTE(review): core_state is still examined when the query above failed.
+ switch (core_state)
+ {
+ case mpm_slave_state_idle: sprintf(state, "idle"); break;
+ case mpm_slave_state_loaded: sprintf(state, "loaded"); break;
+ case mpm_slave_state_running: sprintf(state, "running"); break;
+ case mpm_slave_state_crashed: sprintf(state, "crashed"); break;
+ case mpm_slave_state_error: sprintf(state, "in error"); break;
+
+ default: sprintf(state, "in undefined state"); break;
+ }
+
+ printf("DSP core state: %s is %s\n", curr_core, state);
+#endif
+}
+
+/******************************************************************************
+* Driver::reset_and_load
+*   Resets every DSP core, loads the monitor image (<get_ocl_dsp()>/dsp.out)
+*   on each core and starts it, then opens the same image with BFD so that
+*   symbols can be looked up later.  Any failure terminates the process.
+*
+*   chip   : unused in this implementation
+*   return : the bfd* of the monitor image as an opaque handle; release it
+*            with free_image_handle()
+******************************************************************************/
+void *Driver::reset_and_load(int chip)
+{
+    int  ret;
+    int  error_code = 0;
+    char curr_core[10];
+
+    std::string get_ocl_dsp();
+    std::string monitor = get_ocl_dsp() + "/dsp.out";
+
+    /*-------------------------------------------------------------------------
+    * Reset the devices
+    *------------------------------------------------------------------------*/
+    for (int core = 0; core < TOTAL_NUM_CORES_PER_CHIP; core++)
+    {
+        snprintf(curr_core, sizeof(curr_core), "dsp%d", core);
+
+        ret = mpm_reset(curr_core, &error_code);
+        if (ret < 0)
+            printf("reset failed, core %d (retval: %d, error: %d)\n",
+                   core, ret, error_code);
+        // TODO (JKN): extend ERR so it can report error_code as well
+        ERR(ret, "DSP out of reset failed");
+
+        report_core_state(curr_core);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Load monitor on the devices
+    *------------------------------------------------------------------------*/
+    for (int core = 0; core < TOTAL_NUM_CORES_PER_CHIP; core++)
+    {
+        snprintf(curr_core, sizeof(curr_core), "dsp%d", core);
+
+        ret = mpm_load(curr_core, const_cast<char*>(monitor.c_str()),
+                       &error_code);
+        if (ret < 0)
+            printf("load failed, core %d (retval: %d, error: %d)\n",
+                   core, ret, error_code);
+        ERR(ret, "Download image failed");
+
+        report_core_state(curr_core);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Run monitor on the devices
+    *------------------------------------------------------------------------*/
+    for (int core = 0; core < TOTAL_NUM_CORES_PER_CHIP; core++)
+    {
+        snprintf(curr_core, sizeof(curr_core), "dsp%d", core);
+
+        ret = mpm_run(curr_core, &error_code);
+        if (ret < 0)
+            printf("run failed, core %d (retval: %d, error: %d)\n",
+                   core, ret, error_code);
+        ERR(ret, "DSP run failed");
+
+        report_core_state(curr_core);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Open the monitor image on the host for later symbol lookup
+    *------------------------------------------------------------------------*/
+    bfd *dsp_bfd = bfd_openr(monitor.c_str(), NULL);
+    if (dsp_bfd == NULL)
+    {
+        printf("\nERROR:driver: %s Error Open image %s\n",
+               bfd_errmsg(bfd_get_error()), monitor.c_str());
+        exit(-1);
+    }
+
+    /* Check format with matching */
+    char **matching = NULL;
+    if (!bfd_check_format_matches(dsp_bfd, bfd_object, &matching))
+    {
+        // Capture the error first: later library calls could overwrite it.
+        bfd_error_type err = bfd_get_error();
+
+        fprintf(stderr, "\nERROR:driver %s: %s\n", monitor.c_str(),
+                bfd_errmsg(err));
+
+        if (err == bfd_error_file_ambiguously_recognized && matching != NULL)
+        {
+            // matching is a NULL-terminated array of candidate format names.
+            for (char **candidate = matching; *candidate != NULL; candidate++)
+                printf("%s: \n", *candidate);
+            free(matching);
+        }
+
+        // An image that is not a recognised object file cannot be used for
+        // symbol lookup, so stop here rather than hand back the handle.
+        exit(-1);
+    }
+
+    return (void *)dsp_bfd;
+}
+
+/******************************************************************************
+* Driver::open
+*   Invoked from the Driver constructor.  Nothing is probed in this build:
+*   the DSP count is fixed at 1.  Always returns 0.
+******************************************************************************/
+int32_t Driver::open()
+{
+ Lock lock(this);
+
+ pNum_dsps = 1;
+
+ return 0;
+}
+
+/******************************************************************************
+* Driver::close()
+*   Destroys every registered shared-memory area (last registered first),
+*   then shuts down cmem.  Always returns 0.
+******************************************************************************/
+int32_t Driver::close()
+{
+    Lock lock(this);
+
+    while (!pShmem_areas.empty())
+    {
+        shmem *last_area = pShmem_areas.back();
+        delete last_area;
+        pShmem_areas.pop_back();
+    }
+
+    cmem_exit();
+    return 0;
+}
+
+/******************************************************************************
+* Driver::cmem_init / cmem_exit / cmem_ondemand_malloc / cmem_ondemand_free
+*   Thin forwarders to the static functions of the shmem_cmem_persistent and
+*   shmem_cmem_ondemand classes; arguments and results are passed through
+*   unchanged.
+******************************************************************************/
+void Driver::cmem_init(DSPDevicePtr64 *addr1, uint64_t *size1,
+ DSPDevicePtr *addr2, uint32_t *size2,
+ DSPDevicePtr64 *addr3, uint64_t *size3)
+{
+ shmem_cmem_persistent::cmem_init(addr1, size1, addr2, size2, addr3, size3);
+}
+
+void Driver::cmem_exit()
+{
+ shmem_cmem_persistent::cmem_exit();
+}
+
+DSPDevicePtr64 Driver::cmem_ondemand_malloc(uint64_t size)
+{
+ return shmem_cmem_ondemand::cmem_malloc(size);
+}
+
+void Driver::cmem_ondemand_free(DSPDevicePtr64 addr)
+{
+ shmem_cmem_ondemand::cmem_free(addr);
+}
+
+/******************************************************************************
+* Driver::split_ddr_memory: partition DDR to persistent mapping part (heap1)
+* and on demand mapping part (heap2)
+*
+*   addr, size   : the DDR range to partition
+*   addr1, size1 : out - first chunk; starts as the whole range
+*   addr2, size2 : out - second chunk; stays 0/0 unless a split happens
+*   size3        : in/out - forced to 0 when TI_OCL_DSP_NOMAP is set in the
+*                  environment, otherwise only read
+******************************************************************************/
+void Driver::split_ddr_memory(DSPDevicePtr64 addr, uint64_t size,
+ DSPDevicePtr64& addr1, uint64_t& size1,
+ DSPDevicePtr64& addr2, uint64_t& size2,
+ uint64_t& size3)
+{
+ addr1 = addr;
+ size1 = size;
+ addr2 = 0;
+ size2 = 0;
+
+
+ // split ddr memory 1 into two chunks
+ if (getenv("TI_OCL_DSP_NOMAP") != NULL)
+ {
+ size3 = 0;
+ }
+ else if (addr + size > ALL_PERSISTENT_MAX_DSP_ADDR ||
+ (size3 > 0 && addr + size > MPAX_USER_MAPPED_DSP_ADDR))
+ {
+ // Everything from MPAX_USER_MAPPED_DSP_ADDR to the end of the range goes
+ // to the second chunk.
+ // NOTE(review): assumes addr <= MPAX_USER_MAPPED_DSP_ADDR - confirm.
+ size2 = addr + size - MPAX_USER_MAPPED_DSP_ADDR;
+ size1 = size - size2;
+ addr2 = addr + size1;
+ }
+
+ // translate first chunk to using 32-bit aliased physical addresses
+ if (addr > DSP_36BIT_ADDR)
+ {
+ addr1 = addr + 0xA0000000 - 0x820000000ULL;
+ /*---------------------------------------------------------------------
+ * if the ddr size is greater than we can currently support, limit it
+ *--------------------------------------------------------------------*/
+ //const int ddr_size_limit = (1.5 * 1024*1024*1024) - (48 *1024*1024);
+ const uint64_t ddr_size_limit = ALL_PERSISTENT_MAX_DSP_ADDR - addr;
+ if (size1 > ddr_size_limit)
+ size1 = ddr_size_limit;
+ }
+}
+
+/******************************************************************************
+* Driver::shmem_configure
+*   Creates a shared-memory area object for [addr, addr + size), configures
+*   it and appends it to pShmem_areas.  Empty ranges are ignored.
+*   Area type: on-demand cmem at or above MPAX_USER_MAPPED_DSP_ADDR; below
+*   that, persistent cmem when cmem_block >= 0, plain persistent otherwise.
+******************************************************************************/
+void Driver::shmem_configure(DSPDevicePtr64 addr, uint64_t size, int cmem_block)
+{
+    // size is unsigned, so "== 0" is exactly the original "<= 0" test
+    if (size == 0) return;
+
+    shmem *region = NULL;
+
+    if (addr >= MPAX_USER_MAPPED_DSP_ADDR)
+        region = new shmem_cmem_ondemand();
+    else if (cmem_block < 0)
+        region = new shmem_persistent();
+    else
+        region = new shmem_cmem_persistent(cmem_block);
+
+    region->configure(addr, size);
+    pShmem_areas.push_back(region);
+}
+
+/******************************************************************************
+* Driver::get_memory_region
+*   Returns the registered shared-memory area whose range
+*   [start, start + size) contains addr.  If no area matches, an error is
+*   printed and the process exits, so the function never returns NULL.
+******************************************************************************/
+shmem* Driver::get_memory_region(DSPDevicePtr64 addr)
+{
+    for (size_t i = 0; i < pShmem_areas.size(); ++i)
+    {
+        shmem   *area          = pShmem_areas[i];
+        uint64_t end_exclusive = (uint64_t)area->start() + area->size();
+
+        if (addr >= area->start() && addr < end_exclusive)
+            return area;
+    }
+
+    // %llx requires unsigned long long; cast so the argument always matches
+    // the conversion regardless of how DSPDevicePtr64 is typedef'd.
+    printf("Illegal memory region: addr = 0x%llx\n", (unsigned long long)addr);
+    exit(-1);
+}
+
+
+/******************************************************************************
+* Driver::write
+*   Copies size bytes from buf to DSP address addr.  An address in the local
+*   L2 window ((addr >> 20) == 0x008) is written once per core through each
+*   core's global alias ((0x10 + core) << 24) + addr.
+*   Returns 0 on success, otherwise the first non-zero write_core() status.
+******************************************************************************/
+int32_t Driver::write(int32_t dsp_id, DSPDevicePtr64 addr, uint8_t *buf,
+                      uint32_t size)
+{
+    /*-------------------------------------------------------------------------
+    * anything but L2 is a single write
+    *------------------------------------------------------------------------*/
+    if ((addr >> 20) != 0x008)
+        return write_core(dsp_id, addr, buf, size);
+
+    /*-------------------------------------------------------------------------
+    * if the write is to L2, then write for each core
+    *------------------------------------------------------------------------*/
+    int32_t status = 0;
+    for (int core = 0; core < TOTAL_NUM_CORES_PER_CHIP; core++)
+    {
+        int32_t rc = write_core(dsp_id, ((0x10 + core) << 24) + addr, buf, size);
+        if (rc != 0 && status == 0) status = rc;
+    }
+    return status;
+}
+
+/******************************************************************************
+* Driver::write_core
+*   Maps the destination range into host memory, copies size bytes from buf
+*   into it and unmaps it again.  A failed mapping terminates the process.
+*   Always returns 0.
+******************************************************************************/
+int32_t Driver::write_core(int32_t dsp_id, DSPDevicePtr64 addr, uint8_t *buf,
+                           uint32_t size)
+{
+    Lock lock(this);
+
+    shmem *target   = get_memory_region(addr);
+    void  *host_dst = target->map(addr, size, false);
+
+    if (host_dst == NULL) ERR(1, "Unable to map dsp addr for write");
+
+    memcpy((char*)host_dst, buf, size);
+    target->unmap(host_dst, size, true);
+
+    return 0;
+}
+
+/******************************************************************************
+* Driver::map
+*   Returns a host pointer for sz bytes at DSP address addr, obtained from
+*   the shared-memory area containing addr.  A failed mapping terminates the
+*   process, so the result is never NULL.
+******************************************************************************/
+void* Driver::map(DSPDevicePtr64 addr, uint32_t sz, bool is_read)
+{
+    Lock lock(this);
+
+    void *host_addr = get_memory_region(addr)->map(addr, sz, is_read);
+    if (!host_addr) ERR(1, "Unable to map a dsp address");
+
+    return host_addr;
+}
+
+/******************************************************************************
+* Driver::unmap
+*   Releases a host mapping previously returned by map().  buf_addr selects
+*   the shared-memory area that performs the unmap.  Always returns 0.
+******************************************************************************/
+int32_t Driver::unmap(void *host_addr, DSPDevicePtr64 buf_addr, uint32_t sz,
+                      bool is_write)
+{
+    Lock lock(this);
+
+    get_memory_region(buf_addr)->unmap(host_addr, sz, is_write);
+
+    return 0;
+}
+
+/******************************************************************************
+* Driver::read
+*   Maps the source range into host memory, copies size bytes from it into
+*   buf and unmaps it again.  A failed mapping terminates the process.
+*   Always returns 0.
+******************************************************************************/
+int32_t Driver::read(int32_t dsp_id, DSPDevicePtr64 addr, uint8_t *buf,
+                     uint32_t size)
+{
+    Lock lock(this);
+
+    shmem *source   = get_memory_region(addr);
+    void  *host_src = source->map(addr, size, true);
+
+    if (host_src == NULL) ERR(1, "Unable to map dsp addr for read");
+
+    memcpy(buf, (char*)host_src, size);
+    source->unmap(host_src, size, false);
+
+    return 0;
+}
+
+/******************************************************************************
+* Driver::free_image_handle
+*   Closes the BFD object behind an image handle returned by reset_and_load().
+*   NOTE(review): handle is passed to bfd_close() unchecked - presumably
+*   callers never pass NULL; confirm.
+******************************************************************************/
+void Driver::free_image_handle(void *handle)
+{
+ bfd_close((bfd*)handle);
+}
+
+/******************************************************************************
+* Driver::get_symbol
+*   Looks up name in the symbol table of the image behind image_handle (a
+*   bfd* returned by reset_and_load()) and returns the symbol's value.
+*   A null handle, an unreadable or empty symbol table, an allocation failure
+*   or an unknown name all print an error and terminate the process.
+******************************************************************************/
+DSPDevicePtr Driver::get_symbol(void* image_handle, const char *name)
+{
+    if (!image_handle)
+    {
+        std::cout << "ERROR: Failed to get image handle" << std::endl;
+        exit(-1);
+    }
+
+    bfd *dsp_bfd = (bfd *)image_handle;
+
+    /*-------------------------------------------------------------------------
+    * Read the symbol table.  Both BFD calls return a long that is negative
+    * on error, so the results are kept signed and checked before use.
+    *------------------------------------------------------------------------*/
+    long table_bytes = bfd_get_symtab_upper_bound(dsp_bfd);
+    if (table_bytes <= 0)
+    {
+        std::cout << "ERROR: Get symbol failed" << std::endl;
+        exit(-1);
+    }
+
+    asymbol **symtab = (asymbol**)malloc(table_bytes);
+    if (symtab == NULL)
+    {
+        std::cout << "ERROR: Failed to malloc memory in get_symbol" << std::endl;
+        exit(-1);
+    }
+
+    long nsyms = bfd_canonicalize_symtab(dsp_bfd, symtab);
+
+    // A negative nsyms (BFD error) simply skips the search loop.
+    for (long i = 0; i < nsyms; i++)
+    {
+        if (strcmp(symtab[i]->name, name) == 0)
+        {
+            symbol_info syminfo;
+            bfd_symbol_info(symtab[i], &syminfo);
+            DSPDevicePtr addr = syminfo.value;
+            free(symtab);
+
+            return addr;
+        }
+    }
+
+    free(symtab);
+    std::cout << "ERROR: Get symbol failed" << std::endl;
+    exit(-1);
+}
diff --git a/src/core/dsp/driver_shannon.cpp b/src/core/dsp/driver_shannon.cpp
new file mode 100644
index 0000000..b428dbb
--- /dev/null
+++ b/src/core/dsp/driver_shannon.cpp
@@ -0,0 +1,313 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "driver.h"
+#include "cmem.h"
+#include <deque>
+#include <iostream>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <sys/stat.h>
+#include <string>
+
+#define ERR(status, msg) if (status) { printf("ERROR: %s\n", msg); exit(-1); }
+#define BOOT_ENTRY_LOCATION_ADDR 0x87FFFC
+#define BOOT_MAGIC_ADDR(core) (0x10000000 | (core << 24) | 0x87FFFC)
+
+Driver* Driver::pInstance = 0;
+
+/******************************************************************************
+* Thread safe instance function for singleton behavior
+*
+* Double-checked creation: pInstance is read once without the mutex; only when
+* it is still null is the mutex taken, pInstance re-read, and the Driver
+* constructed.  __sync_synchronize() is issued after the unlocked read and
+* again before the new pointer is stored in pInstance.
+******************************************************************************/
+Driver* Driver::instance ()
+{
+ static Mutex Driver_instance_mutex;
+ Driver* tmp = pInstance;
+
+ __sync_synchronize();
+
+ if (tmp == 0)
+ {
+ ScopedLock lck(Driver_instance_mutex);
+
+ // Re-check under the lock: another thread may have created it meanwhile.
+ tmp = pInstance;
+ if (tmp == 0)
+ {
+ tmp = new Driver;
+ // Barrier before publishing the pointer to other threads.
+ __sync_synchronize();
+ pInstance = tmp;
+ }
+ }
+ return tmp;
+}
+
+/******************************************************************************
+* Translate a PCI switch device id into the matching board name.
+* Any id other than the two known ones is reported through ERR, which
+* terminates the process.
+******************************************************************************/
+const char *get_board(unsigned switch_device)
+{
+    if (switch_device == 0x8624) return "dspc8681";
+    if (switch_device == 0x8748) return "dspc8682";
+
+    ERR(1, "Unsupported device");
+    return "unknown";
+}
+
+#define TOTAL_NUM_CORES_PER_CHIP 8
+
+/******************************************************************************
+* wait_for_ready
+*   Polls the boot entry word of every core on the chip until all of them
+*   read zero.  Sleeps 1000 us between polls and gives up after the 1001st
+*   unsuccessful poll.  A failed PCIe read terminates the process.
+*   Returns true when all cores are ready, false on timeout.
+******************************************************************************/
+bool Driver::wait_for_ready(int chip)
+{
+    for (int attempt = 1; ; ++attempt)
+    {
+        bool all_cleared = true;
+
+        // Stop scanning at the first core whose boot entry is still set.
+        for (int core = 0;
+             core < TOTAL_NUM_CORES_PER_CHIP && all_cleared;
+             core++)
+        {
+            uint32_t boot_entry_value;
+            int ret = pciedrv_dsp_read(chip,
+                          ((0x10 + core) << 24) + BOOT_ENTRY_LOCATION_ADDR,
+                          (unsigned char *) &boot_entry_value, 4);
+            ERR(ret, "pciedrv_dsp_read failed");
+
+            all_cleared = (boot_entry_value == 0);
+        }
+
+        if (all_cleared)    return true;
+        if (attempt > 1000) return false;
+
+        usleep(1000);
+    }
+}
+
+/******************************************************************************
+* Driver::reset_and_load
+*   Puts the chip in reset, boots it with the board-specific init image
+*   (<install>/lib/init_<board>.out), waits for every core to report ready,
+*   then downloads the monitor image (<install>/lib/dsp.out).  Any failing
+*   step terminates the process through ERR.
+*   Returns the image handle of the monitor; release it with
+*   free_image_handle().
+******************************************************************************/
+char *get_ocl_install();
+void *Driver::reset_and_load(int chip)
+{
+ // NOTE(review): presumably never NULL - it is used to build std::string
+ // objects below; confirm against get_ocl_install().
+ char *installation = get_ocl_install();
+
+ /*------------------------------------------------------------------------
+ * Determine DSP speed. 1 Ghz by default. Set Env Var for 1.25Ghz Oper
+ *-----------------------------------------------------------------------*/
+ uint32_t pll_multiplier = 0x00000014; // 1.00 Ghz by default
+ if (getenv("TI_OCL_DSP_1_25GHZ")) pll_multiplier = 0x00000019;
+
+ /*-------------------------------------------------------------------------
+ * Configure boot config
+ *------------------------------------------------------------------------*/
+ uint32_t bootcfg_words[]= { 0xBABEFACE, pll_multiplier };
+ boot_cfg_t bootcfg = { 0x86FF00, sizeof(bootcfg_words), bootcfg_words};
+
+ /*-------------------------------------------------------------------------
+ * reset the devices
+ *------------------------------------------------------------------------*/
+ int ret = dnldmgr_reset_dsp(chip, 0, NULL, 0 , NULL);
+ ERR (ret, "DSP putting in reset failed");
+
+ // Build the path of the init image for this board type.
+ const char *board = get_board(pDevices_info[chip].switch_device);
+ std::string init(installation);
+ init += "/lib/init_";
+ init += board;
+ init += ".out";
+
+ void * image_handle;
+ uint32_t entry;
+
+ ret = dnldmgr_get_image(init.c_str(), &image_handle, &entry);
+ ERR(ret, "Get reset image failed");
+
+ ret = dnldmgr_reset_dsp(chip, 1, image_handle, entry, &bootcfg);
+ ERR (ret, "DSP out of reset failed");
+
+ // The init image is no longer needed; image_handle is reused below.
+ dnldmgr_free_image(image_handle);
+
+ /*---------------------------------------------------------------------
+ * wait for reset to complete
+ *--------------------------------------------------------------------*/
+ ERR(!wait_for_ready(chip), "Reset Failed due to timeout");
+
+ /*-------------------------------------------------------------------------
+ * Load monitor on the devices
+ *------------------------------------------------------------------------*/
+ std::string monitor(installation);
+ monitor += "/lib/dsp.out";
+
+ ret = dnldmgr_get_image(monitor.c_str(), &image_handle, &entry);
+ ERR(ret, "Get DSP image failed");
+
+ ret = dnldmgr_load_image(chip, 0xFFFF, image_handle, entry, NULL);
+ ERR(ret, "Download image failed");
+
+ return image_handle;
+}
+
+/******************************************************************************
+* Driver::open
+*   Opens the PCIe driver with a fixed configuration, records the number of
+*   DSP devices, fetches their PCI info into pDevices_info and triggers
+*   creation of the Cmem singleton.  Any failure terminates the process.
+*   Always returns 0.
+******************************************************************************/
+int32_t Driver::open()
+{
+    Lock lock(this);
+
+    memset(&config, 0, sizeof(config));
+    config.dsp_outbound_reserved_mem_size = 0;
+    config.dsp_outbound_block_size        = 0x400000;
+    config.start_dma_chan_num             = 0;
+    config.num_dma_channels               = 4;
+    config.start_param_set_num            = 0;
+    config.num_param_sets                 = 32;
+    config.max_dma_transactions           = 256;
+
+    int open_status = pciedrv_open(&config);
+    ERR(open_status, "PCIe Driver Open Error");
+
+    pNum_dsps = pciedrv_get_num_devices();
+
+    /*-------------------------------------------------------------------------
+    * One info record per device, filled in by the PCIe driver
+    *------------------------------------------------------------------------*/
+    pDevices_info = static_cast<pciedrv_device_info_t*>(
+                        malloc(pNum_dsps * sizeof(pciedrv_device_info_t)));
+    ERR(pDevices_info == NULL, "malloc failed pciedrv_devices_info_t");
+
+    int info_status = pciedrv_get_pci_info(pDevices_info);
+    ERR(info_status, "get pci info failed");
+
+    Cmem::instance(); // Prime the setup of cmem
+    return 0;
+}
+
+/******************************************************************************
+* Driver::close()
+*   Releases the device info array and closes the PCIe driver.  A failing
+*   pciedrv_close() terminates the process.  Always returns 0.
+******************************************************************************/
+int32_t Driver::close()
+{
+    Lock lock(this);
+
+    // ~Driver() calls close() as well; clearing the pointer keeps a second
+    // call from freeing the same block twice.
+    free(pDevices_info);
+    pDevices_info = NULL;
+
+    int status = pciedrv_close();
+    ERR(status, "PCIe Driver Close Error");
+    return 0;
+}
+
+
+/******************************************************************************
+* Driver::write
+*   Copies size bytes from buf to DSP address addr.  An address in the local
+*   L2 window ((addr >> 20) == 0x008) is written once per core through each
+*   core's global alias ((0x10 + core) << 24) + addr.
+*   Returns 0 on success, otherwise the first non-zero write_core() status.
+*   NOTE(review): class Driver declares addr as DSPDevicePtr64 - confirm the
+*   two typedefs agree in the DSPC868X build.
+******************************************************************************/
+int32_t Driver::write(int32_t dsp_id, DSPDevicePtr addr, uint8_t *buf,
+                      uint32_t size)
+{
+    /*-------------------------------------------------------------------------
+    * anything but L2 is a single write
+    *------------------------------------------------------------------------*/
+    if ((addr >> 20) != 0x008)
+        return write_core(dsp_id, addr, buf, size);
+
+    /*-------------------------------------------------------------------------
+    * if the write is to L2, then write for each core
+    *------------------------------------------------------------------------*/
+    int32_t status = 0;
+    for (int core = 0; core < TOTAL_NUM_CORES_PER_CHIP; core++)
+    {
+        int32_t rc = write_core(dsp_id, ((0x10 + core) << 24) + addr, buf, size);
+        if (rc != 0 && status == 0) status = rc;
+    }
+    return status;
+}
+
+
+/******************************************************************************
+* Driver::write_core
+*   Writes size bytes from buf to DSP address addr on device dsp_id.
+*   Transfers of 24k and above go through the Cmem DMA path under the driver
+*   lock; smaller ones use a direct PCIe write, whose failure terminates the
+*   process.  Always returns 0.
+******************************************************************************/
+int32_t Driver::write_core(int32_t dsp_id, DSPDevicePtr addr, uint8_t *buf,
+                           uint32_t size)
+{
+    if (size >= 24 * 1024)
+    {
+        Lock lock(this);
+        Cmem::instance()->dma_write(dsp_id, addr, buf, size);
+        return 0;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Regular writes under 24k are faster than DMA writes (may change)
+    *------------------------------------------------------------------------*/
+    int pcie_status = pciedrv_dsp_write(dsp_id, addr, buf, size);
+    ERR(pcie_status, "PCIe Driver Write Error");
+    return 0;
+}
+
+/******************************************************************************
+* Driver::map
+*   No host mapping is created in this build: the DSP address itself is
+*   returned, converted to a pointer.  sz and is_read are ignored.
+******************************************************************************/
+void* Driver::map(DSPDevicePtr addr, uint32_t sz, bool is_read)
+{
+ return (void*) (uint64_t) addr;
+}
+
+/******************************************************************************
+* Driver::unmap
+*   Nothing to release in this build, because map() creates no mapping.
+*   All arguments are ignored.  Always returns 0.
+******************************************************************************/
+int32_t Driver::unmap(void *host_addr, DSPDevicePtr buf_addr,
+                      uint32_t sz, bool is_write)
+{
+    // The function is declared to return int32_t, so it must return a value.
+    return 0;
+}
+
+/******************************************************************************
+* Driver::read
+*   Reads size bytes at DSP address addr on device dsp_id into buf through
+*   the Cmem DMA read path.  Always returns 0.
+******************************************************************************/
+int32_t Driver::read(int32_t dsp_id, DSPDevicePtr addr, uint8_t *buf,
+ uint32_t size)
+{
+ Cmem::instance()->dma_read(dsp_id, addr, buf, size);
+ return 0;
+}
+
+/******************************************************************************
+* Driver::get_symbol
+*   Asks the download manager for the address of symbol name in the image
+*   behind image_handle.  A failed lookup prints an error and terminates the
+*   process.
+******************************************************************************/
+DSPDevicePtr Driver::get_symbol(void* image_handle, const char *name)
+{
+    DSPDevicePtr symbol_addr;
+
+    if (dnldmgr_get_symbol_address(image_handle, name, &symbol_addr) != 0)
+    {
+        printf("ERROR: Get symbol failed\n");
+        exit(-1);
+    }
+
+    return symbol_addr;
+}
+
+/******************************************************************************
+* Driver::free_image_handle
+*   Hands an image handle returned by reset_and_load() back to the download
+*   manager.
+******************************************************************************/
+void Driver::free_image_handle(void *handle)
+{
+ dnldmgr_free_image(handle);
+}
+
+/******************************************************************************
+* Driver::cmem_init
+* Driver::cmem_exit
+* Driver::shmem_configure
+*   Intentionally empty in this build.  The parameter lists follow the
+*   declarations in class Driver (driver.h); an out-of-class definition with
+*   a different parameter list does not match any declared member.
+*   NOTE(review): cmem_ondemand_malloc, cmem_ondemand_free and
+*   split_ddr_memory are declared in the class but have no definition in
+*   this file - confirm they are never called in the DSPC868X build.
+******************************************************************************/
+void Driver::cmem_init(DSPDevicePtr64 *addr1, uint64_t *size1,
+                       DSPDevicePtr   *addr2, uint32_t *size2,
+                       DSPDevicePtr64 *addr3, uint64_t *size3)
+{
+}
+
+void Driver::cmem_exit()
+{
+}
+
+void Driver::shmem_configure(DSPDevicePtr64 addr, uint64_t size, int cmem_block)
+{
+}
+
+
diff --git a/src/core/dsp/dspheap.h b/src/core/dsp/dspheap.h
new file mode 100644
index 0000000..0668647
--- /dev/null
+++ b/src/core/dsp/dspheap.h
@@ -0,0 +1,200 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+/**************************************************************************//**
+* @file dspheap.h
+*
+* @brief Define a dsp device heap manager run on the host.
+*
+* @version 1.00.00
+*
+******************************************************************************/
+#ifndef _DSPHEAP_H
+#define _DSPHEAP_H
+#include <map>
+#include <assert.h>
+#include <cstdio>
+#include <cstdlib>
+#include "u_lockable.h"
+#include "dspmem.h"
+
+/* Round val up to the next multiple of pow2; pow2 must be a power of two.
+ * The mask ~(pow2 - 1) is computed in the type of pow2, so when val is wider
+ * than pow2 the upper bits of the result are cleared - cast pow2 to the wider
+ * type at the call site in that case. */
+#define ROUNDUP(val, pow2) (((val) + (pow2) - 1) & ~((pow2) - 1))
+/* Allocation granularity in bytes selected by dspheap::configure() for an
+ * ordinary heap. */
+#define MIN_BLOCK_SIZE 128
+/* Allocation granularity in bytes selected by dspheap::configure() when
+ * is_cmem_ondemand_heap is true. */
+#define MIN_CMEM_ONDEMAND_BLOCK_SIZE 4096
+
+/******************************************************************************
+* class dspheap
+*
+* Host-side first-fit allocator for one contiguous range of DSP device
+* addresses.  Two address-ordered maps (block address -> block size in bytes)
+* are kept: free_list holds the unallocated blocks, alloc_list the blocks
+* handed out by malloc().  Every member that touches the maps takes the
+* Lockable lock first.
+******************************************************************************/
+class dspheap : public Lockable
+{
+    typedef std::map<DSPDevicePtr64, uint64_t> block_list;
+    typedef block_list::iterator               block_iter;
+    typedef block_list::value_type             block_descriptor;
+
+  public:
+    /*-------------------------------------------------------------------------
+    * Build a heap that manages length bytes starting at start_addr.
+    *------------------------------------------------------------------------*/
+    dspheap(DSPDevicePtr64 start_addr, uint64_t length)
+        : p_start_addr(0), p_length(0), p_block_size(MIN_BLOCK_SIZE)
+    {
+        configure(start_addr, length);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Build an empty heap; configure() must be called before it can satisfy
+    * any request.  Members are zeroed so that size() and max_block_size()
+    * report an empty heap instead of reading indeterminate values.
+    *------------------------------------------------------------------------*/
+    dspheap() : p_start_addr(0), p_length(0), p_block_size(MIN_BLOCK_SIZE) { }
+
+    /*-------------------------------------------------------------------------
+    * Record the managed range and choose the allocation granularity
+    * (MIN_CMEM_ONDEMAND_BLOCK_SIZE when is_cmem_ondemand_heap is true,
+    * MIN_BLOCK_SIZE otherwise).  The free list is seeded with the whole
+    * range only if it is currently empty.  No alignment of start_addr or
+    * length is enforced.
+    *------------------------------------------------------------------------*/
+    void configure(DSPDevicePtr64 start_addr, uint64_t length,
+                   bool is_cmem_ondemand_heap = false)
+    {
+        Lock lock(this);
+
+        p_start_addr = start_addr;
+        p_length     = length;
+        p_block_size = is_cmem_ondemand_heap ? MIN_CMEM_ONDEMAND_BLOCK_SIZE
+                                             : MIN_BLOCK_SIZE;
+
+        if (free_list.empty())
+            free_list[start_addr] = length;
+    }
+
+    ~dspheap() { }
+
+    /*-------------------------------------------------------------------------
+    * Allocate size bytes, rounded up to a multiple of the heap granularity,
+    * from the lowest-addressed free block that is large enough.
+    *
+    * Returns the block address.  When nothing fits: returns 0 if allow_fail
+    * is true, otherwise prints a diagnostic and calls abort().
+    *------------------------------------------------------------------------*/
+    DSPDevicePtr64 malloc(uint32_t size, bool allow_fail=false)
+    {
+        /*---------------------------------------------------------------------
+        * Rounded in 64 bits: a request close to UINT32_MAX must not wrap
+        * around to a zero-byte allocation.
+        *--------------------------------------------------------------------*/
+        const uint64_t req_size = min_block_size(size);
+
+        Lock lock(this);
+        for (block_iter it = free_list.begin(); it != free_list.end(); ++it)
+        {
+            const DSPDevicePtr64 block_addr = it->first;
+            const uint64_t       block_size = it->second;
+
+            if (block_size < req_size) continue;
+
+            free_list.erase(it);
+            alloc_list[block_addr] = req_size;
+
+            /*-----------------------------------------------------------------
+            * Give the unused tail of the block back to the free list
+            *----------------------------------------------------------------*/
+            if (block_size > req_size)
+                free_list[block_addr + req_size] = block_size - req_size;
+
+            return block_addr;
+        }
+
+        if (!allow_fail)
+        {
+            printf("Malloc failed for size 0x%llx from range (0x%08llx, 0x%08llx)\n",
+                   (unsigned long long) req_size,
+                   (unsigned long long) p_start_addr,
+                   (unsigned long long) (p_start_addr + p_length - 1));
+            abort();
+        }
+
+        return 0;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Release the block that malloc() returned as addr and coalesce it with
+    * any free block directly before or after it.
+    *
+    * Returns 0 on success, -1 if addr is not a currently allocated block.
+    *------------------------------------------------------------------------*/
+    int free(DSPDevicePtr64 addr)
+    {
+        Lock lock(this);
+        block_iter it = alloc_list.find(addr);
+        if (it == alloc_list.end()) return -1;
+
+        /*---------------------------------------------------------------------
+        * 64-bit on purpose: after merging with neighbours the block can be
+        * 4GB or larger, which a 32-bit size would silently truncate.
+        *--------------------------------------------------------------------*/
+        uint64_t size = it->second;
+        alloc_list.erase(it);
+
+        /*---------------------------------------------------------------------
+        * Merge the block with neighboring free blocks
+        *--------------------------------------------------------------------*/
+        it = free_list.begin();
+        while (it != free_list.end())
+        {
+            const DSPDevicePtr64 block_addr = it->first;
+            const uint64_t       block_size = it->second;
+
+            const bool adjacent = (block_addr + block_size == addr)
+                               || (addr + size == block_addr);
+            if (!adjacent) { ++it; continue; }
+
+            if (block_addr < addr) addr = block_addr;
+            size += block_size;
+
+            /* advance before erasing so the loop iterator stays valid */
+            block_iter merged = it;
+            ++it;
+            free_list.erase(merged);
+        }
+
+        free_list[addr] = size;
+        return 0;
+    }
+
+    /* Length in bytes of the managed range, as given to configure(). */
+    DSPDevicePtr64 size() const { return p_length; }
+
+    /*-------------------------------------------------------------------------
+    * Report the largest free block.
+    *
+    * block_size (out) receives the heap granularity.  size (out) receives
+    * the largest free block size, and its address is returned.  If the heap
+    * is shorter than one granule, size is 0 and 0 is returned.  If no free
+    * block reaches one granule, size is reported as one granule and the
+    * returned address is 0.
+    *------------------------------------------------------------------------*/
+    DSPDevicePtr64 max_block_size(uint64_t &size, uint32_t &block_size)
+    {
+        block_size = p_block_size;
+
+        if (p_length < p_block_size)
+        {
+            size = 0;
+            return 0;
+        }
+
+        DSPDevicePtr64 best_addr = 0;
+        uint64_t       best_size = p_block_size;
+
+        Lock lock(this);
+        for (block_iter it = free_list.begin(); it != free_list.end(); ++it)
+        {
+            if (it->second >= best_size)
+            {
+                best_addr = it->first;
+                best_size = it->second;
+            }
+        }
+
+        size = best_size;
+        return best_addr;
+    }
+
+  private:
+    block_list     free_list;     // unallocated blocks: address -> bytes
+    block_list     alloc_list;    // blocks handed out:  address -> bytes
+    DSPDevicePtr64 p_start_addr;  // first address of the managed range
+    uint64_t       p_length;      // length of the managed range in bytes
+    uint32_t       p_block_size;  // allocation granularity in bytes
+
+    /*-------------------------------------------------------------------------
+    * Round size up to a multiple of the granularity.  Both operands are
+    * widened to 64 bits first so neither the sum nor the mask can wrap.
+    *------------------------------------------------------------------------*/
+    uint64_t min_block_size(uint32_t size)
+    {
+        const uint64_t granule = p_block_size;
+        return ((uint64_t) size + granule - 1) & ~(granule - 1);
+    }
+};
+
+#endif // _DSPHEAP_H
diff --git a/src/core/dsp/dspmem.h b/src/core/dsp/dspmem.h
new file mode 100644
index 0000000..f6c7c64
--- /dev/null
+++ b/src/core/dsp/dspmem.h
@@ -0,0 +1,59 @@
+/******************************************************************************
+ * Copyright (c) 2013, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdint.h>
+#ifndef _DSPMEM_H
+#define _DSPMEM_H
+
+
+/* 32-bit DSP device address. */
+typedef uint32_t DSPDevicePtr;
+/* 64-bit DSP device address; wide enough for the addresses above 4GB that
+ * appear in the memory map described below. */
+typedef uint64_t DSPDevicePtr64;
+/* 32-bit DSP virtual address. */
+typedef uint32_t DSPVirtPtr;
+// typedef uint64_t DSPVirtPtr64; // for future C7x?
+
+/*****************************************************************************
+ * DSP Device Memory Physical Address (8GB)
+ * 0x8:0000_0000 - 0x8:1FFF_FFFF: Linux reserved
+ * 0x8:2000_0000 - 0x8:21FF_FFFF: OCL runtime reserved
+ * using default MPAX translation, map to
+ * DSP virtual address 0xA000_0000 - 0xA1FF_FFFF
+ * 0x8:2200_0000 - 0x8:3FFF_FFFF: using default MPAX translation, map to
+ * DSP virtual address 0xA200_0000 - 0xBFFF_FFFF
+ * used for kernel code, user app small buffers
+ * 0x8:4000_0000 - 0x9:FFFF_FFFF: using custom MPAX translation settings, map
+ * to unused DSP virtual address spaces
+ * used for user app big buffers
+ *****************************************************************************/
+/* 0x8:0000_0000 - first address of the physical map described above. */
+#define DSP_36BIT_ADDR 0x800000000ULL
+/* 0x8:4000_0000 - first address of the region the map above lists as using
+ * custom MPAX translation settings. */
+#define MPAX_USER_MAPPED_DSP_ADDR 0x840000000ULL
+/* 0x8:8000_0000 - NOTE(review): meaning not shown in this header; presumably
+ * an upper bound used by the allocator - confirm at the point of use. */
+#define ALL_PERSISTENT_MAX_DSP_ADDR 0x880000000ULL
+
+/* NOTE(review): presumably the 32-bit address window of on-chip MSMC memory
+ * set aside for the OpenCL runtime - confirm against the platform memory
+ * map. */
+#define MSMC_OCL_START_ADDR 0x0C040000
+#define MSMC_OCL_END_ADDR 0x0C500000
+
+
+#endif // _DSPMEM_H
diff --git a/src/core/dsp/genfile_cache.cpp b/src/core/dsp/genfile_cache.cpp
new file mode 100644
index 0000000..c9b2472
--- /dev/null
+++ b/src/core/dsp/genfile_cache.cpp
@@ -0,0 +1,94 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "genfile_cache.h"
+
+/*-----------------------------------------------------------------------------
+* genfile_cache::lookup
+*   Look for a cached output file for (module, options).  The key is the CRC
+*   produced by convert_mod2crc().
+*
+*   Returns the cached file name when the database has an entry whose file
+*   can still be stat()ed.  Returns an empty string when there is no entry,
+*   or when the entry's file is gone - in that case the stale row is deleted
+*   first.
+*----------------------------------------------------------------------------*/
+std::string genfile_cache::lookup(llvm::Module *module, std::string options)
+{
+    std::vector<std::vector<std::string> > rows;
+    uint32_t crc = convert_mod2crc(module, options);
+
+    std::string select_stmt("select value from programs where hash = " +
+                            boost::lexical_cast<std::string>(crc));
+
+    rows = p_database.query(select_stmt.c_str());
+
+    /*-------------------------------------------------------------------------
+    * Nothing cached for this hash
+    *------------------------------------------------------------------------*/
+    if (rows.empty()) return std::string();
+
+    std::string &cached_file = rows[0][0];
+
+    struct stat file_info;
+    if (stat(cached_file.c_str(), &file_info) == 0)
+        return cached_file;
+
+    /*-------------------------------------------------------------------------
+    * The cached file no longer exists: remove its row from the DB
+    *------------------------------------------------------------------------*/
+    std::string delete_stmt("delete from programs where hash = " +
+                            boost::lexical_cast<std::string>(crc));
+
+    p_database.query(delete_stmt.c_str());
+    return std::string();
+}
+
+/*-----------------------------------------------------------------------------
+* genfile_cache::remember
+*   Insert a (hash, outfile) row into the programs table, where hash is the
+*   CRC of (module, options) from convert_mod2crc().
+*   NOTE(review): outfile is pasted into the statement text between double
+*   quotes without escaping - confirm callers never pass a name containing '"'.
+*----------------------------------------------------------------------------*/
+void genfile_cache::remember(const char *outfile, llvm::Module *module,
+                             std::string options)
+{
+    uint32_t crc = convert_mod2crc(module, options);
+
+    std::string insert_stmt("insert into programs(hash, value) values(");
+    insert_stmt += boost::lexical_cast<std::string>(crc);
+    insert_stmt += ", \"";
+    insert_stmt += outfile;
+    insert_stmt += "\");";
+
+    p_database.query(insert_stmt.c_str());
+}
+
+/*-----------------------------------------------------------------------------
+* genfile_cache::convert_mod2crc
+*   Serialise module as LLVM bitcode into a string, append the options text
+*   and return the CRC of the combined bytes as computed by get_crc().
+*----------------------------------------------------------------------------*/
+uint32_t genfile_cache::convert_mod2crc(llvm::Module *module,
+                                        std::string options)
+{
+    std::string payload;
+
+    llvm::raw_string_ostream bitcode_stream(payload);
+    llvm::WriteBitcodeToFile(module, bitcode_stream);
+    bitcode_stream.str();   // result unused; called so buffered bytes reach payload
+
+    payload += options;
+
+    return get_crc(payload);
+}
+
+/*-----------------------------------------------------------------------------
+* genfile_cache::get_crc
+*   Feed every byte of my_string through a boost::crc_32_type and return the
+*   resulting checksum.
+*----------------------------------------------------------------------------*/
+uint32_t genfile_cache::get_crc(std::string& my_string)
+{
+    boost::crc_32_type crc;
+    crc.process_bytes(my_string.data(), my_string.size());
+    return crc.checksum();
+}
diff --git a/src/core/dsp/genfile_cache.h b/src/core/dsp/genfile_cache.h
new file mode 100644
index 0000000..46b27f2
--- /dev/null
+++ b/src/core/dsp/genfile_cache.h
@@ -0,0 +1,101 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef _genfile_cache_
+#define _genfile_cache_
+
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+
+#include <boost/lexical_cast.hpp>
+#include <boost/crc.hpp>
+
+#include <sys/stat.h>
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <stdint.h>
+#include "u_locks_pthread.h"
+#include "database.h"
+
+/******************************************************************************
+* class genfile_cache
+*
+* Process-wide singleton that remembers, in a database file, which generated
+* output file belongs to a given (LLVM module, options) pair.  Entries are
+* keyed by the CRC computed in convert_mod2crc().
+******************************************************************************/
+class genfile_cache
+{
+  public:
+    /* Cached file name for (module, options), or "" when none is usable. */
+    std::string lookup  (llvm::Module *module, std::string options);
+
+    /* Record outfile as the cached result for (module, options). */
+    void        remember(const char *outfile, llvm::Module *module,
+                         std::string options);
+
+    /*-------------------------------------------------------------------------
+    * Thread safe instance function for singleton behavior.
+    *
+    * pInstance is read once without the mutex; only when it is still null
+    * is the mutex taken, pInstance re-read and the object created.  The
+    * __sync_synchronize() barriers order the construction before the
+    * pointer is published.
+    *
+    * The database lives at /tmp/opencl_ofdb_<USER>.  getenv() returns a null
+    * pointer when USER is not set, and constructing a std::string from a
+    * null pointer is undefined, so "unknown" is substituted in that case.
+    *------------------------------------------------------------------------*/
+    static genfile_cache* instance ()
+    {
+        static Mutex Cache_instance_mutex;
+        genfile_cache* tmp = pInstance;
+
+        __sync_synchronize();
+
+        if (tmp == 0)
+        {
+            ScopedLock lck(Cache_instance_mutex);
+
+            tmp = pInstance;
+            if (tmp == 0)
+            {
+                const char *user = getenv("USER");
+                if (user == 0) user = "unknown";
+
+                tmp = new genfile_cache(std::string("/tmp/opencl_ofdb_") + user);
+                __sync_synchronize();
+                pInstance = tmp;
+            }
+        }
+        return tmp;
+    }
+
+
+  private:
+    static genfile_cache* pInstance;   // the singleton; null until created
+    std::string           p_dbname;    // path of the database file
+    Database              p_database;  // handle opened on p_dbname
+
+  private:
+    /*-------------------------------------------------------------------------
+    * Open the database at db_name and make sure the programs table exists.
+    *------------------------------------------------------------------------*/
+    genfile_cache(std::string db_name)
+        : p_dbname(db_name), p_database(db_name.c_str())
+    {
+        p_database.query("create table if not exists "
+                         "programs(hash integer, value string);");
+    }
+
+    uint32_t convert_mod2crc (llvm::Module *module, std::string options);
+    uint32_t get_crc         (std::string& my_string);
+
+    genfile_cache(const genfile_cache&);            // copy ctor disallowed
+    genfile_cache& operator=(const genfile_cache&); // assignment disallowed
+};
+
+#endif // _genfile_cache_
diff --git a/src/core/dsp/kernel.cpp b/src/core/dsp/kernel.cpp
new file mode 100644
index 0000000..291673a
--- /dev/null
+++ b/src/core/dsp/kernel.cpp
@@ -0,0 +1,718 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "kernel.h"
+#include "device.h"
+#include "buffer.h"
+#include "program.h"
+#include "utils.h"
+#include "u_locks_pthread.h"
+#include "mailbox.h"
+
+#include "../kernel.h"
+#include "../memobject.h"
+#include "../events.h"
+#include "../program.h"
+
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Module.h>
+#include <llvm/ExecutionEngine/ExecutionEngine.h>
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <unistd.h>
+#include <sys/mman.h>
+
+extern "C"
+{
+ #include <ti/runtime/mmap/include/mmap_resource.h>
+}
+
+
+/* Round val up to the next multiple of pow2 (pow2 must be a power of two). */
+#define ROUNDUP(val, pow2) (((val) + (pow2) - 1) & ~((pow2) - 1))
+/* Print msg to std::cerr only when TI_OCL_VERBOSE_ERROR is set in the
+ * environment, then return retcode from the ENCLOSING function - usable only
+ * inside a function whose return type accepts retcode. */
+#define QERR(msg, retcode) do {if (getenv("TI_OCL_VERBOSE_ERROR")) std::cerr << msg << std::endl; return retcode; } while(0)
+/* Print x to std::cerr; execution continues afterwards. */
+#define ERR(x) std::cerr << x << std::endl
+/* Print a fixed "unknown error" line to std::cerr; execution continues. */
+#define ERROR() std::cerr << "Unknown error in dsp/kernel.cpp" << std::endl
+
+using namespace Coal;
+
+/*-----------------------------------------------------------------------------
+* Stores the device and kernel pointers.  The entry point starts at 0 and the
+* data page pointer at 0xffffffff; device_entry_pt() and data_page_ptr()
+* treat those values as "not looked up yet".
+*----------------------------------------------------------------------------*/
+DSPKernel::DSPKernel(DSPDevice *device, Kernel *kernel)
+: DeviceKernel(), p_device(device), p_kernel(kernel),
+ p_device_entry_pt((DSPDevicePtr)0),
+ p_data_page_ptr ((DSPDevicePtr)0xffffffff)
+{
+}
+
+/* Nothing to release: the body is empty. */
+DSPKernel::~DSPKernel()
+{
+}
+
+
+/*-----------------------------------------------------------------------------
+* k_exp - integer power by repeated multiplication.
+*   Returns base raised to the power e.  An exponent of 0 yields 1 (it used
+*   to yield base); results for e >= 1 are unchanged.  Overflow of T is not
+*   detected.
+*----------------------------------------------------------------------------*/
+template<typename T>
+T k_exp(T base, unsigned int e)
+{
+    T rs = 1;
+    for (unsigned int i = 0; i < e; ++i) rs *= base;
+    return rs;
+}
+
+/*-----------------------------------------------------------------------------
+* This and the next function are called from multiple worker threads. They
+* may all enter the section that sets the value, but they will all set the
+* same value, so even though there is a race, there is no race error. When
+* work group division is pushed down to the DSP, the race will go away.
+*----------------------------------------------------------------------------*/
+/*-----------------------------------------------------------------------------
+* Returns the device address of the kernel function, looking it up on first
+* use: the kernel's function name is fetched with
+* info(CL_KERNEL_FUNCTION_NAME), resolved through DSPProgram::query_symbol(),
+* and the result is kept in p_device_entry_pt.  A program that is not loaded
+* is only reported through ERROR(); the lookup is attempted regardless.
+*----------------------------------------------------------------------------*/
+DSPDevicePtr DSPKernel::device_entry_pt()
+{
+    if (p_device_entry_pt) return p_device_entry_pt;
+
+    /*-------------------------------------------------------------------------
+    * First call asks for the length of the name, second call fetches it
+    *------------------------------------------------------------------------*/
+    size_t name_bytes;
+    p_kernel->info(CL_KERNEL_FUNCTION_NAME, 0, 0, &name_bytes);
+
+    char *fn_name = (char*) malloc(name_bytes);
+    p_kernel->info(CL_KERNEL_FUNCTION_NAME, name_bytes, fn_name, 0);
+
+    Program    *parent_prog = (Program *)p_kernel->parent();
+    DSPProgram *dsp_prog    =
+        (DSPProgram *)(parent_prog->deviceDependentProgram(p_device));
+
+    if (!dsp_prog->is_loaded()) ERROR();
+    p_device_entry_pt = dsp_prog->query_symbol(fn_name);
+    free(fn_name);
+
+    return p_device_entry_pt;
+}
+
+/******************************************************************************
+* The data page pointer can frequently be 0, so we will initialize it to be
+* 0xffffffff as a start value instead of 0.
+******************************************************************************/
+/*-----------------------------------------------------------------------------
+* Returns the program's data page pointer, fetching it from the DSPProgram on
+* first use.  0xffffffff in p_data_page_ptr means "not fetched yet".  A
+* program that is not loaded is only reported through ERROR().
+*----------------------------------------------------------------------------*/
+DSPDevicePtr DSPKernel::data_page_ptr()
+{
+    const DSPDevicePtr not_fetched = (DSPDevicePtr)0xffffffff;
+    if (p_data_page_ptr != not_fetched) return p_data_page_ptr;
+
+    Program    *parent_prog = (Program *)p_kernel->parent();
+    DSPProgram *dsp_prog    =
+        (DSPProgram *)(parent_prog->deviceDependentProgram(p_device));
+
+    if (!dsp_prog->is_loaded()) ERROR();
+    p_data_page_ptr = dsp_prog->data_page_ptr();
+
+    return p_data_page_ptr;
+}
+
+/******************************************************************************
+* void DSPKernel::preAllocBuffers()
+******************************************************************************/
+/*-----------------------------------------------------------------------------
+* Calls allocate(device()) on the MemObject behind every non-local buffer
+* argument that has one.  Returns CL_MEM_OBJECT_ALLOCATION_FAILURE at the
+* first allocation that fails, CL_SUCCESS otherwise.
+*----------------------------------------------------------------------------*/
+cl_int DSPKernel::preAllocBuffers()
+{
+    for (unsigned int idx = 0; idx < kernel()->numArgs(); ++idx)
+    {
+        const Kernel::Arg &cur = kernel()->arg(idx);
+
+        if (cur.kind() != Kernel::Arg::Buffer) continue;
+        if (cur.file() == Kernel::Arg::Local)  continue;
+
+        MemObject *mem = *(MemObject **)cur.data();
+        if (!mem) continue;
+
+        if (!mem->allocate(device()))
+            return CL_MEM_OBJECT_ALLOCATION_FAILURE;
+    }
+    return CL_SUCCESS;
+}
+
+
+/******************************************************************************
+* Try to find the size a work group needs to be executed the fastest on the DSP.
+******************************************************************************/
+/*-----------------------------------------------------------------------------
+* Suggests a work group size for one dimension.  dim is not used.
+*
+* When global_work_size raised to num_dims exceeds 64, the whole
+* global_work_size is returned.  Otherwise the search starts at the DSP count
+* (1 if the device reports 0) and walks DOWNWARD to the first value that
+* divides global_work_size; the result is global_work_size / that divisor.
+* NOTE(review): the "> 64" test returns the undivided size for LARGE ranges,
+* which looks inverted with respect to "don't break in too small parts" -
+* confirm the intent.
+*----------------------------------------------------------------------------*/
+size_t DSPKernel::guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
+                                     size_t global_work_size) const
+{
+    const unsigned int dsp_count = p_device->numDSPs();
+
+    if (k_exp(global_work_size, num_dims) > 64)
+        return global_work_size;
+
+    unsigned int divisor = (dsp_count == 0) ? 1 : dsp_count;
+
+    while ((global_work_size % divisor) != 0)
+    {
+        /*---------------------------------------------------------------------
+        * Give up on splitting: one group, no CommandQueue overhead
+        *--------------------------------------------------------------------*/
+        if (divisor > global_work_size || divisor > dsp_count * 32)
+        {
+            divisor = 1;
+            break;
+        }
+
+        --divisor;
+    }
+
+    return global_work_size / divisor;
+}
+
+/******************************************************************************
+* localMemSize()
+******************************************************************************/
+/*-----------------------------------------------------------------------------
+* Sums allocAtKernelRuntime() over every kernel argument that is a buffer in
+* the Local address space and returns the total.
+*----------------------------------------------------------------------------*/
+cl_ulong DSPKernel::localMemSize() const
+{
+    cl_ulong total_bytes = 0;
+
+    for (int idx = 0; idx < kernel()->numArgs(); ++idx)
+    {
+        const Kernel::Arg &cur = kernel()->arg(idx);
+
+        const bool is_local_buffer = cur.kind() == Kernel::Arg::Buffer &&
+                                     cur.file() == Kernel::Arg::Local;
+        if (is_local_buffer)
+            total_bytes += cur.allocAtKernelRuntime();
+    }
+
+    return total_bytes;
+}
+
+/* Accessor for p_kernel, the Kernel pointer given to the constructor. */
+Kernel * DSPKernel::kernel() const
+{
+    return p_kernel;
+}
+/* Accessor for p_device, the DSPDevice pointer given to the constructor. */
+DSPDevice * DSPKernel::device() const
+{
+    return p_device;
+}
+
+// From Wikipedia : http://www.wikipedia.org/wiki/Power_of_two#Algorithm_to_round_up_to_power_of_two
+/*-----------------------------------------------------------------------------
+* Smallest power of two that is >= k; 0 maps to 1.  Works by smearing the
+* highest set bit of k-1 into every lower bit position and adding 1.
+*----------------------------------------------------------------------------*/
+template <class T>
+T next_power_of_two(T k)
+{
+    if (k == 0) return 1;
+
+    --k;
+    const size_t bit_width = sizeof(T) * 8;
+    for (size_t shift = 1; shift < bit_width; shift <<= 1)
+        k |= (k >> shift);
+
+    return k + 1;
+}
+
+/*-----------------------------------------------------------------------------
+* DSPKernel::typeOffset
+*   Computes where a value of type_len bytes is placed when appended at
+*   offset.
+*
+*   The alignment is type_len rounded up to a power of two and capped at 8.
+*   Returns offset rounded up to that alignment, and advances offset (in/out)
+*   to the returned position plus the alignment.
+*
+*   The previous loop condition "rs & mask != rs" parsed as
+*   "rs & (mask != rs)" because != binds tighter than &, so it only ever
+*   aligned to 2 bytes.  The rounding is now done directly.
+*----------------------------------------------------------------------------*/
+size_t DSPKernel::typeOffset(size_t &offset, size_t type_len)
+{
+    // Align offset to type_len
+    type_len = next_power_of_two(type_len);
+    if (type_len > 8) type_len = 8; // The c66 has no alignment need > 8 bytes
+
+    const size_t mask = ~(type_len - 1);
+    const size_t rs   = (offset + type_len - 1) & mask;
+
+    // Where to try to place the next value
+    offset = rs + type_len;
+
+    return rs;
+}
+
+/* Source of p_kernel_id: post-incremented once per DSPKernelEvent
+ * constructed.  NOTE(review): the increment is a plain, non-atomic ++ -
+ * confirm events are never constructed concurrently. */
+static int kernelID = 0;
+
+/*=============================================================================
+* DSPKernelEvent
+*============================================================================*/
+/*-----------------------------------------------------------------------------
+* Binds the event to its device and DSPKernel, takes the next kernelID as its
+* id, turns kernel debugging on when TI_OCL_DEBUG_KERNEL is present in the
+* environment, and marshals the kernel arguments right away via callArgs().
+*----------------------------------------------------------------------------*/
+DSPKernelEvent::DSPKernelEvent(DSPDevice *device, KernelEvent *event)
+: p_device(device), p_event(event),
+  p_kernel((DSPKernel*)event->deviceKernel()),
+  p_kernel_id(kernelID++), p_debug_kernel(false),
+  p_num_arg_words(0), p_WG_alloca_start(0)
+{
+    if (getenv("TI_OCL_DEBUG_KERNEL") != NULL) p_debug_kernel = true;
+
+    callArgs(MAX_ARG_BUF_SIZE);
+}
+
+/* Nothing to release: the body is empty. */
+DSPKernelEvent::~DSPKernelEvent() { }
+
+/*-----------------------------------------------------------------------------
+* Access-flag tests on a MemObject pointer.
+*----------------------------------------------------------------------------*/
+#define READ_ONLY_BUFFER(buffer)  ((buffer)->flags() & CL_MEM_READ_ONLY)
+#define WRITE_ONLY_BUFFER(buffer) ((buffer)->flags() & CL_MEM_WRITE_ONLY)
+
+/*-----------------------------------------------------------------------------
+* SETARG(val): append one word to args_in_mem.  Relies on the locals
+* arg_words, args_in_mem and args_in_mem_size of the calling function.  When
+* the area is full the word is dropped and a message is printed.
+* Wrapped in do/while(0) so that it is a single statement and cannot capture
+* an "else" that follows it at the call site.
+*----------------------------------------------------------------------------*/
+#define SETARG(val) do \
+    { \
+        if (arg_words < args_in_mem_size) args_in_mem[arg_words++] = (val); \
+        else std::cerr << "To many argument bytes are needed" << std::endl; \
+    } while(0)
+
+/*-----------------------------------------------------------------------------
+* SETMOREARG(sz, pval): copy sz bytes from pval into more_args_in_mem at
+* more_arg_offset, after aligning the offset to sz.  Relies on the locals
+* more_arg_offset and more_args_in_mem and on the member p_msg.
+* When the value does not fit into p_msg.u.k.flush.buffers a message is
+* printed and nothing is copied; previously the copy was done anyway and
+* wrote past the end of the buffer.
+*----------------------------------------------------------------------------*/
+#define SETMOREARG(sz, pval) do \
+    { \
+        more_arg_offset = ROUNDUP(more_arg_offset, (sz)); \
+        if (ROUNDUP(more_arg_offset + (sz), 8) > sizeof(p_msg.u.k.flush.buffers))\
+            std::cerr << "Too many arguments, does not fit" << std::endl; \
+        else \
+        { \
+            memcpy(more_args_in_mem+more_arg_offset, (pval), (sz)); \
+            more_arg_offset += (sz); \
+        } \
+    } while(0)
+
+//#define SETMOREARG(sz,psrc)
+
+/******************************************************************************
+* DSPKernelEvent::callArgs
+******************************************************************************/
+/*-----------------------------------------------------------------------------
+* Marshals the kernel's arguments into p_msg.
+*
+* Arguments 0..9 are written as 32-bit words into p_msg.u.k.kernel.argBuf;
+* each value is preceded by a word holding its size in bytes, and the area is
+* closed with a 0 word.  args_in_mem_size is the capacity of that area in
+* words (enforced by SETARG).  Arguments 10 and above are copied into
+* p_msg.u.k.flush.buffers, starting at byte offset 4, each aligned to its own
+* size (see SETMOREARG).
+*
+* For buffer arguments the address of the slot holding the pointer value is
+* also recorded in p_local_bufs, p_hostptr_tmpbufs, p_64bit_bufs and/or
+* p_flush_bufs, depending on the kind of buffer.
+*
+* On exit p_num_arg_words holds the number of words written to argBuf and
+* p_msg.u.k.flush.sizeMoreArgs the 8-byte-rounded size of the overflow area
+* (0 if it was not used).
+*----------------------------------------------------------------------------*/
+void DSPKernelEvent::callArgs(unsigned args_in_mem_size)
+{
+ int arg_words = 0;
+ unsigned *args_in_mem = (unsigned*)p_msg.u.k.kernel.argBuf;
+ char *more_args_in_mem = (char *)p_msg.u.k.flush.buffers;
+ // Overflow arguments start at byte 4 of flush.buffers; what occupies
+ // bytes 0-3 is not visible in this function.
+ int more_arg_offset = 4;
+ bool is_more_arg = false;
+
+ /*-------------------------------------------------------------------------
+ * Write Arguments
+ *------------------------------------------------------------------------*/
+ for (int i = 0; i < p_kernel->kernel()->numArgs(); ++i)
+ {
+ // The first ten arguments go to argBuf, all later ones to flush.buffers
+ is_more_arg = (i >= 10);
+
+ const Kernel::Arg & arg = p_kernel->kernel()->arg(i);
+ size_t size = arg.valueSize() * arg.vecDim();
+
+ // Both checks only print a message; the argument is still processed.
+ if (size == 0) ERR("Kernel Argument has size == 0");
+ if (size != 1 && size != 2 && size != 4 && size != 8)
+ ERR("Invalid Kernel Argument size");
+
+ /*---------------------------------------------------------------------
+ * We may have to perform some changes in the values (buffers, etc)
+ *--------------------------------------------------------------------*/
+ switch (arg.kind())
+ {
+ case Kernel::Arg::Buffer:
+ {
+ MemObject *buffer = 0;
+ DSPDevicePtr buf_ptr = 0;
+ if (arg.data()) buffer = *(MemObject **)arg.data();
+ // Size word first, so that args_in_mem[arg_words] below is the
+ // slot the pointer value itself will be written to.
+ if (!is_more_arg) SETARG(sizeof(DSPVirtPtr));
+
+ // Address of the slot that will hold this argument's pointer
+ // value; stored in the lists below so the slot can be found again.
+ DSPVirtPtr *buf_dspvirtptr = (!is_more_arg) ?
+ (&args_in_mem[arg_words]) :
+ (DSPVirtPtr *)(more_args_in_mem+ROUNDUP(more_arg_offset,4));
+
+ /*-------------------------------------------------------------
+ * Alloc a buffer and pass it to the kernel
+ *------------------------------------------------------------*/
+ if (arg.file() == Kernel::Arg::Local)
+ {
+ uint32_t lbufsz = arg.allocAtKernelRuntime();
+ p_local_bufs.push_back(LocalPair(buf_dspvirtptr, lbufsz));
+
+ /*-----------------------------------------------------
+ * Since the only reader and writer of local memory (L2)
+ * will be the core itself, I do not believe we need
+ * to flush local buffers for correctness.
+ *----------------------------------------------------*/
+ //p_flush_bufs->push_back(DSPMemRange(lbuf, lbufsz));
+ }
+ else if (buffer != NULL)
+ {
+ /*---------------------------------------------------------
+ * Get the DSP buffer, allocate it and get its pointer
+ *--------------------------------------------------------*/
+ if (buffer->flags() & CL_MEM_USE_HOST_PTR)
+ {
+ p_hostptr_tmpbufs.push_back(
+ HostptrPair(buffer, DSPPtrPair(0, buf_dspvirtptr)));
+ }
+ else
+ {
+ DSPBuffer *dspbuf = (DSPBuffer *)buffer->deviceBuffer(p_device);
+ // NOTE(review): the result of allocate() is not checked here.
+ buffer->allocate(p_device);
+ DSPDevicePtr64 addr64 = dspbuf->data();
+ // Addresses below 0xFFFFFFFF are passed directly; any other
+ // address leaves buf_ptr at 0 and is queued in p_64bit_bufs.
+ if (addr64 < 0xFFFFFFFF)
+ buf_ptr = addr64;
+ else
+ p_64bit_bufs.push_back(DSPMemRange(DSPPtrPair(
+ addr64, buf_dspvirtptr), buffer->size()));
+
+ // Every buffer not flagged CL_MEM_WRITE_ONLY is added to the
+ // flush list.
+ if (! WRITE_ONLY_BUFFER(buffer))
+ p_flush_bufs.push_back(DSPMemRange(DSPPtrPair(
+ addr64, buf_dspvirtptr), buffer->size()));
+ }
+ }
+
+ /*---------------------------------------------------------
+ * Use 0 for local buffer address here, it will be overwritten
+ * with allocated local buffer address at kernel dispatch time.
+ * Same for allocating temporary buffer for use_host_ptr.
+ *--------------------------------------------------------*/
+ if (!is_more_arg) SETARG(buf_ptr);
+ else { SETMOREARG(4, &buf_ptr); }
+
+ break;
+ }
+
+ case Kernel::Arg::Image2D:
+ case Kernel::Arg::Image3D: ERR("Images not yet supported"); break;
+
+ /*-----------------------------------------------------------------
+ * Non-Buffers
+ *----------------------------------------------------------------*/
+ default:
+ if (!is_more_arg)
+ {
+ // Size word: values narrower than 4 bytes occupy a full word.
+ SETARG((size < 4 ? 4 : size));
+ // Cast to (int) to avoid a codegen bug
+ // ZEXT will happen in LLVM and ICODE, so don't worry
+ if (size == 1) SETARG(((int) *((signed char*)arg.data())));
+ else if (size == 2) SETARG(((int) *((short*)arg.data())));
+ else SETARG(*((unsigned*) arg.data()));
+ // 8-byte values take a second word, read from the next 4 bytes.
+ if (size == 8) { SETARG(*(((unsigned*)arg.data()) + 1)); }
+ }
+ else { SETMOREARG(size, arg.data()); }
+ break;
+ }
+ }
+ SETARG(0); // 0 terminator for args area
+
+ p_num_arg_words = arg_words;
+ // more_arg_offset still at its initial 4 means no overflow argument was
+ // written.
+ p_msg.u.k.flush.sizeMoreArgs = (more_arg_offset > 4) ?
+ ROUNDUP(more_arg_offset, 8) : 0;
+}
+
+/******************************************************************************
+* debug_pause
+******************************************************************************/
+/*-----------------------------------------------------------------------------
+* Prints what is needed to attach a debugger to the kernel about to be
+* launched (kernel name, DSP id, entry address, symbol file) and then blocks
+* until one non-whitespace character can be read from std::cin.
+*----------------------------------------------------------------------------*/
+static void debug_pause(uint32_t entry, uint32_t dsp_id,
+                        const char* outfile, char *name)
+{
+    printf("[OCL] Launching kernel %s on DSP %d\n", name, dsp_id);
+    printf("[OCL] Connect debugger and set breakpoint at 0x%08x\n", entry);
+    printf("[OCL] Load symbols from file %s\n", outfile);
+    printf("[OCL] Press any key, then enter to continue\n");
+
+    char key;
+    std::cin >> key;
+}
+
+
+
+/******************************************************************************
+* bool DSPKernelEvent::run()
+******************************************************************************/
+cl_int DSPKernelEvent::run(Event::Type evtype)
+{
+    /*-------------------------------------------------------------------------
+    * Build the kernel launch message in p_msg and mail it to the device.
+    *
+    * evtype selects between a TASK message (Event::TaskKernel) and an
+    * NDRKERNEL message (any other value).  Before sending, this assigns
+    * local scratch addresses to local-buffer arguments, allocates global
+    * memory for work-group allocas and for CL_MEM_USE_HOST_PTR staging
+    * buffers, and computes MPAX mappings for the ranges in p_64bit_bufs.
+    *
+    * Returns CL_SUCCESS once the message has been mailed; completion is not
+    * awaited here.  Failures are reported through QERR, a macro defined
+    * outside this function (presumably it returns the given error code --
+    * TODO confirm).
+    *------------------------------------------------------------------------*/
+    Program    *p    = (Program *)p_kernel->kernel()->parent();
+    DSPProgram *prog = (DSPProgram *)(p->deviceDependentProgram(p_device));
+
+    // TODO perhaps ensure that prog is loaded.
+
+    int dim = p_event->work_dim();
+
+    /*-------------------------------------------------------------------------
+    * Create a message for the DSP
+    *------------------------------------------------------------------------*/
+    Msg_t           &msg = p_msg;
+    kernel_config_t *cfg = &msg.u.k.kernel.config;
+
+    if (evtype == Event::TaskKernel)
+    {
+        msg.command    = TASK;
+        cfg->Kernel_id = p_kernel_id;
+
+        // global_sz_0 carries the in-order / out-of-order flag for tasks
+        CommandQueue *q = (CommandQueue *) p_event->parent();
+        cl_command_queue_properties q_prop = 0;
+        q->info(CL_QUEUE_PROPERTIES, sizeof(q_prop), &q_prop, NULL);
+        cfg->global_sz_0 = (q_prop & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) ?
+                               OUT_OF_ORDER_TASK_SIZE : IN_ORDER_TASK_SIZE;
+        cfg->local_sz_0 = 1;
+        cfg->local_sz_1 = 1;
+        cfg->local_sz_2 = 1;
+    }
+    else
+    {
+        msg.command = NDRKERNEL;
+
+        // Unused dimensions get a size of 1
+        cfg->num_dims       = dim;
+        cfg->global_sz_0    = p_event->global_work_size(0);
+        cfg->global_sz_1    = dim > 1 ? p_event->global_work_size(1) : 1;
+        cfg->global_sz_2    = dim > 2 ? p_event->global_work_size(2) : 1;
+        cfg->local_sz_0     = p_event->local_work_size(0);
+        cfg->local_sz_1     = dim > 1 ? p_event->local_work_size(1) : 1;
+        cfg->local_sz_2     = dim > 2 ? p_event->local_work_size(2) : 1;
+        cfg->global_off_0   = p_event->global_work_offset(0);
+        cfg->global_off_1   = p_event->global_work_offset(1);
+        cfg->global_off_2   = p_event->global_work_offset(2);
+        cfg->WG_gid_start_0 = 0;
+        cfg->WG_gid_start_1 = 0;
+        cfg->WG_gid_start_2 = 0;
+        cfg->Kernel_id      = p_kernel_id;
+        cfg->WG_id          = 0;
+        cfg->stats          = 0;
+    }
+
+    msg.u.k.kernel.entry_point   = (unsigned)p_kernel->device_entry_pt();
+    msg.u.k.kernel.data_page_ptr = (unsigned)p_kernel->data_page_ptr();
+
+    /*-------------------------------------------------------------------------
+    * Allocating local buffer in L2 per kernel run instance
+    *------------------------------------------------------------------------*/
+    uint32_t     total_sz, block_sz;
+    DSPDevicePtr local_scratch = p_device->get_local_scratch(total_sz, block_sz);
+    for (size_t i = 0; i < p_local_bufs.size(); ++i)
+    {
+        DSPVirtPtr *p_arg_word     = p_local_bufs[i].first;
+        unsigned    local_buf_size = p_local_bufs[i].second;
+
+        uint32_t rounded_sz = ROUNDUP(local_buf_size, block_sz);
+        if (rounded_sz > total_sz)
+        {
+            QERR("Total local buffer size exceeds available local size",
+                 CL_MEM_OBJECT_ALLOCATION_FAILURE);
+        }
+        // Carve this buffer off the front of the remaining scratch area
+        *p_arg_word    = local_scratch;
+        local_scratch += rounded_sz;
+        total_sz      -= rounded_sz;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Allocating temporary space in global memory for kernel alloca'ed data
+    *------------------------------------------------------------------------*/
+#define NUM_CORES_PER_CHIP 8
+    cfg->WG_alloca_size = p_kernel->kernel()->get_wi_alloca_size() *
+                          cfg->local_sz_0 * cfg->local_sz_1 * cfg->local_sz_2;
+    if (cfg->WG_alloca_size > 0)
+    {
+        cfg->WG_alloca_size += 4096; // 4K bytes padding between WGs' allocas
+        uint32_t chip_alloca_size = cfg->WG_alloca_size * NUM_CORES_PER_CHIP;
+        p_WG_alloca_start = p_device->malloc_global( // malloc abort if fail
+                                                    chip_alloca_size, true);
+        if (!p_WG_alloca_start)
+        {
+            QERR("Alloca size exceeds available global memory",
+                 CL_OUT_OF_RESOURCES);
+        }
+
+        // Addresses that do not fit in 32 bits are deferred to MPAX mapping
+        if (p_WG_alloca_start < 0xFFFFFFFF)
+            cfg->WG_alloca_start = (DSPVirtPtr) p_WG_alloca_start;
+        else
+            p_64bit_bufs.push_back(DSPMemRange(DSPPtrPair(
+                p_WG_alloca_start, &cfg->WG_alloca_start), chip_alloca_size));
+    }
+
+    /*-------------------------------------------------------------------------
+    * Allocating temporary global buffer for use_host_ptr
+    *------------------------------------------------------------------------*/
+    for (size_t i = 0; i < p_hostptr_tmpbufs.size(); ++i)
+    {
+        MemObject      *buffer     = p_hostptr_tmpbufs[i].first;
+        DSPDevicePtr64 *p_addr64   = &p_hostptr_tmpbufs[i].second.first;
+        DSPVirtPtr     *p_arg_word = p_hostptr_tmpbufs[i].second.second;
+
+        *p_addr64 = p_device->malloc_global(buffer->size(), false);
+
+        // Test the allocated address, not the (never null) pointer to the
+        // vector slot that stores it.
+        if (!*p_addr64)
+        {
+            QERR("Temporary memory for CL_MEM_USE_HOST_PTR buffer exceeds available global memory",
+                 CL_MEM_OBJECT_ALLOCATION_FAILURE);
+        }
+
+        if (*p_addr64 < 0xFFFFFFFF)
+            *p_arg_word = *p_addr64;
+        else
+            p_64bit_bufs.push_back(DSPMemRange(DSPPtrPair(
+                *p_addr64, p_arg_word), buffer->size()));
+
+        // Buffers the kernel may read are seeded from the host pointer
+        if (! WRITE_ONLY_BUFFER(buffer))
+        {
+            void *mapped_tmpbuf = Driver::instance()->map(*p_addr64,
+                                                    buffer->size(), false);
+            memcpy(mapped_tmpbuf, buffer->host_ptr(), buffer->size());
+            p_flush_bufs.push_back(DSPMemRange(DSPPtrPair(
+                *p_addr64, p_arg_word), buffer->size()));
+            Driver::instance()->unmap(mapped_tmpbuf, *p_addr64,
+                                      buffer->size(), true);
+        }
+    }
+
+    /*-------------------------------------------------------------------------
+    * Compute MPAX mappings from DSPDevicePtr64 to DSPVirtPtr in p_64bit_bufs
+    *------------------------------------------------------------------------*/
+    msg.u.k.flush.num_mpaxs = 0;
+    uint32_t num_64bit_bufs = p_64bit_bufs.size();
+    if (num_64bit_bufs > 0)
+    {
+        uint64_t *phys_addrs = new uint64_t[num_64bit_bufs];
+        uint32_t *lengths    = new uint32_t[num_64bit_bufs];
+        uint32_t *prots      = new uint32_t[num_64bit_bufs];
+        uint32_t *virt_addrs = new uint32_t[num_64bit_bufs];
+        for (size_t i = 0; i < p_64bit_bufs.size(); ++i)
+        {
+            phys_addrs[i] = p_64bit_bufs[i].first.first;
+            lengths[i]    = p_64bit_bufs[i].second;
+            prots[i]      = 0;  // don't care yet
+        }
+
+        // Work on a private copy of the device's default MPAX resources
+        keystone_mmap_resources_t mpax_res;
+        memcpy(&mpax_res, p_device->get_mpax_default_res(),
+               sizeof(keystone_mmap_resources_t));
+        // NOTE(review): if QERR returns, the four arrays above are leaked
+        // on this path -- confirm against the QERR definition.
+        if (keystone_mmap_resource_alloc(num_64bit_bufs, phys_addrs, lengths,
+                  prots, virt_addrs, &mpax_res) != KEYSTONE_MMAP_RESOURCE_NOERR)
+        {
+            QERR("MPAX allocation failed!",
+                 CL_OUT_OF_RESOURCES);
+        }
+
+        // set the MPAX settings in the message
+        // NOTE(review): mpax_used is not bounded by MAX_XMCSES_MPAXS here,
+        // while mpax_settings holds 2*MAX_XMCSES_MPAXS words -- confirm the
+        // allocator can never report more mappings than that.
+        uint32_t mpax_used = 0;
+        for (; mpax_res.mapping[mpax_used].segsize_power2 > 0; mpax_used += 1)
+        {
+            msg.u.k.flush.mpax_settings[2*mpax_used  ] = (uint32_t)
+                (mpax_res.mapping[mpax_used].raddr >> 12);  // e.g. 0x822004
+            msg.u.k.flush.mpax_settings[2*mpax_used+1] =    // e.g. 0xC000000D
+                mpax_res.mapping[mpax_used].baddr
+                | (mpax_res.mapping[mpax_used].segsize_power2-1);
+        }
+        msg.u.k.flush.num_mpaxs = mpax_used;
+
+        // set the virtual address in arguments
+        for (size_t i = 0; i < p_64bit_bufs.size(); ++i)
+        {
+            *(p_64bit_bufs[i].first.second) = virt_addrs[i];
+            if (p_debug_kernel)
+                printf("Virtual = 0x%x, physical = 0x%llx\n",
+                       virt_addrs[i],
+                       (unsigned long long) p_64bit_bufs[i].first.first);
+        }
+        delete [] phys_addrs;
+        delete [] lengths;
+        delete [] prots;
+        delete [] virt_addrs;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Helpful information for debugging a kernel
+    *------------------------------------------------------------------------*/
+    if (p_debug_kernel)
+    {
+        for (int i = 0; i < msg.u.k.flush.num_mpaxs; i++)
+            printf("mpax %d: l=0x%x, h=0x%x\n", i,
+                   msg.u.k.flush.mpax_settings[2*i],
+                   msg.u.k.flush.mpax_settings[2*i+1]);
+
+        // The argument area is a sequence of (size word, data words...)
+        uint32_t *args = msg.u.k.kernel.argBuf;
+        int arg_num = 1;
+        // TODO: print more args properly
+        for (int i=0; i < p_num_arg_words; i++)
+        {
+            if (args[i] == 4)
+            {
+                i++;
+                printf("[OCL] Kernel argument %d = 0x%08x\n", arg_num, args[i]);
+            }
+            else if (args[i] == 8)
+            {
+                printf("[OCL] Kernel argument %d = 0x%08x 0x%08x\n",
+                       arg_num, args[i+1], args[i+2]);
+                i+=2;
+            }
+            arg_num++;
+        }
+    }
+
+    /*-------------------------------------------------------------------------
+    * Make sure we do not overflow the number of commands a mailbox can handle
+    *------------------------------------------------------------------------*/
+    if (p_flush_bufs.size() > MAX_KERNEL_ARGUMENTS)
+    {
+        QERR("To many buffers to flush", CL_OUT_OF_RESOURCES);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Populate Flush commands for any buffers that are read by the DSP
+    *------------------------------------------------------------------------*/
+    msg.u.k.flush.numBuffers = p_flush_bufs.size();
+
+#if 0  // YUAN: flush buffers used for more arguments (for now)
+    for (int i=0; i < p_flush_bufs.size(); ++i)
+    {
+        msg.u.k.flush.buffers[2*i]   = p_flush_bufs[i].first;
+        msg.u.k.flush.buffers[2*i+1] = p_flush_bufs[i].second;
+    }
+#endif
+
+    /*-------------------------------------------------------------------------
+    * Feedback to user for debug
+    *------------------------------------------------------------------------*/
+    if (p_debug_kernel)
+    {
+        // First query only the name length, then fetch the name itself
+        size_t name_length;
+        p_kernel->kernel()->info(CL_KERNEL_FUNCTION_NAME, 0, 0, &name_length);
+        char *name = (char*)malloc(name_length);
+        if (!name) return CL_OUT_OF_HOST_MEMORY;
+        p_kernel->kernel()->info(CL_KERNEL_FUNCTION_NAME, name_length, name, 0);
+
+        debug_pause(p_kernel->device_entry_pt(), p_device->dspID(),
+                    prog->outfile_name(), name);
+        free (name);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Dispatch the commands through the mailbox
+    *------------------------------------------------------------------------*/
+    p_device->mail_to(msg);
+
+    /*-------------------------------------------------------------------------
+    * Do not wait for completion
+    *------------------------------------------------------------------------*/
+    return CL_SUCCESS;
+}
+
+/******************************************************************************
+* free_tmp_bufs allocated for kernel allocas, and for use_host_ptr
+******************************************************************************/
+void DSPKernelEvent::free_tmp_bufs()
+{
+    /*-------------------------------------------------------------------------
+    * Release the global allocation used for work-group allocas, if any
+    *------------------------------------------------------------------------*/
+    if (p_WG_alloca_start > 0)
+        p_device->free_global(p_WG_alloca_start);
+
+    /*-------------------------------------------------------------------------
+    * For every use_host_ptr staging buffer: copy the device contents back
+    * to the host pointer unless the buffer is read-only, then free it
+    *------------------------------------------------------------------------*/
+    int idx = 0;
+    while (idx < p_hostptr_tmpbufs.size())
+    {
+        MemObject      *host_buf = p_hostptr_tmpbufs[idx].first;
+        DSPDevicePtr64  tmp_addr = p_hostptr_tmpbufs[idx].second.first;
+
+        if (! READ_ONLY_BUFFER(host_buf))
+        {
+            void *mapped = Driver::instance()->map(tmp_addr,
+                                                   host_buf->size(), true);
+            memcpy(host_buf->host_ptr(), mapped, host_buf->size());
+            Driver::instance()->unmap(mapped, tmp_addr,
+                                      host_buf->size(), false);
+        }
+
+        p_device->free_global(tmp_addr);
+        ++idx;
+    }
+}
+
diff --git a/src/core/dsp/kernel.h b/src/core/dsp/kernel.h
new file mode 100644
index 0000000..850941d
--- /dev/null
+++ b/src/core/dsp/kernel.h
@@ -0,0 +1,119 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef __DSP_KERNEL_H__
+#define __DSP_KERNEL_H__
+
+#include "../events.h"
+#include "../memobject.h"
+#include "../deviceinterface.h"
+#include "message.h"
+#include "device.h"
+#include <core/config.h>
+
+#include <vector>
+#include <string>
+#include <pthread.h>
+#include <stdint.h>
+
+namespace llvm
+{
+ class Function;
+}
+
+// (64-bit device address, pointer to the 32-bit word that receives the
+// address the DSP will use for it)
+typedef std::pair<DSPDevicePtr64, DSPVirtPtr *> DSPPtrPair;
+// (address pair, length of the range)
+typedef std::pair<DSPPtrPair, uint32_t> DSPMemRange;
+// (pointer to the argument word for a local buffer, size of that buffer)
+typedef std::pair<DSPVirtPtr *, uint32_t> LocalPair;
+// (memory object, address pair of the temporary device buffer backing it)
+typedef std::pair<Coal::MemObject *, DSPPtrPair> HostptrPair;
+
+
+namespace Coal
+{
+class DSPDevice;
+class Kernel;
+class KernelEvent;
+
+// DeviceKernel implementation that ties a Kernel to a DSPDevice.
+class DSPKernel : public DeviceKernel
+{
+ public:
+ DSPKernel(DSPDevice *device, Kernel *kernel);
+ ~DSPKernel();
+
+ // Fixed values reported for this device kernel
+ size_t workGroupSize() const { return 128; }
+ cl_ulong localMemSize() const ;
+ cl_ulong privateMemSize() const { return 0; }
+ size_t preferredWorkGroupSizeMultiple() const { return 0; }
+
+ size_t guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
+ size_t global_work_size) const;
+ // Values placed in the kernel message's entry_point and data_page_ptr
+ // fields by DSPKernelEvent::run()
+ DSPDevicePtr device_entry_pt();
+ DSPDevicePtr data_page_ptr();
+ cl_int preAllocBuffers();
+
+ Kernel * kernel() const;
+ DSPDevice * device() const;
+
+ llvm::Function *function() const;
+ static size_t typeOffset(size_t &offset, size_t type_len);
+
+ private:
+ DSPDevice * p_device;
+ Kernel * p_kernel;
+ DSPDevicePtr p_device_entry_pt;
+ DSPDevicePtr p_data_page_ptr;
+};
+
+// One launch of a kernel on a DSP device: holds the message sent to the
+// device and the temporary buffers that belong to this launch.
+class DSPKernelEvent
+{
+ public:
+ DSPKernelEvent (DSPDevice *device, KernelEvent *event);
+ ~DSPKernelEvent ();
+
+ // Builds the launch message and mails it to the device; does not wait
+ // for completion.
+ cl_int run (Event::Type evtype);
+ void callArgs (unsigned rs_size);
+
+ DSPDevice* device() { return p_device; }
+ uint32_t kernel_id() { return p_kernel_id; }
+
+ // Frees the alloca area and the use_host_ptr staging buffers, copying
+ // non-read-only staging buffers back to their host pointers first.
+ void free_tmp_bufs();
+
+ private:
+ DSPDevice * p_device;
+ KernelEvent * p_event;
+ DSPKernel * p_kernel;
+ uint32_t p_kernel_id;
+ bool p_debug_kernel; // enables the debug printing and pause in run()
+ int p_num_arg_words; // number of words used in p_msg's argBuf
+ Msg_t p_msg; // message filled in and mailed by run()
+ DSPDevicePtr64 p_WG_alloca_start; // global memory for work-group allocas
+ std::vector<DSPMemRange> p_flush_bufs; // ranges counted in flush.numBuffers
+ std::vector<LocalPair> p_local_bufs; // local buffers given scratch in run()
+ std::vector<HostptrPair> p_hostptr_tmpbufs; // use_host_ptr staging buffers
+ std::vector<DSPMemRange> p_64bit_bufs; // ranges needing an MPAX mapping
+};
+}
+#endif
diff --git a/src/core/dsp/mailbox.h b/src/core/dsp/mailbox.h
new file mode 100644
index 0000000..f87c08c
--- /dev/null
+++ b/src/core/dsp/mailbox.h
@@ -0,0 +1,114 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef _MAILBOX_H_
+#define _MAILBOX_H_
+#include "u_locks_pthread.h"
+#include "driver.h"
+
+extern "C"
+{
+ #include "mpm_mailbox.h"
+}
+
+class Mailbox
+{
+  public:
+
+    // Forwards to mpm_mailbox_create() and returns its result.
+    int32_t create(void* mbox_handle, char *slave_node_name,
+                   uint32_t mem_location, uint32_t direction,
+                   mpm_mailbox_config_t *mbox_config)
+    {
+        return mpm_mailbox_create(mbox_handle, slave_node_name,
+                                  mem_location, direction, mbox_config);
+    }
+
+    // Forwards to mpm_mailbox_open() and returns its result.
+    int32_t open(void* mbox_handle)
+    {
+        return mpm_mailbox_open(mbox_handle);
+    }
+
+    // Retries mpm_mailbox_write() for as long as it reports a full mailbox.
+    // NOTE(review): always returns true, so any other write error is not
+    // reported to the caller -- confirm this is intended.
+    int32_t write (void* mbox_handle, uint8_t *buf, uint32_t size,
+                   uint32_t trans_id)
+    {
+        while (mpm_mailbox_write (mbox_handle, buf, size, trans_id)
+                   == MPM_MAILBOX_ERR_MAIL_BOX_FULL)
+        {
+            // mailbox full: try again
+        }
+
+        return true;
+    }
+
+    // Forwards to mpm_mailbox_read() and returns its result.
+    int32_t read (void* mbox_handle, uint8_t *buf, uint32_t *size,
+                  uint32_t *trans_id)
+    {
+        return mpm_mailbox_read (mbox_handle, buf, size, trans_id);
+    }
+
+    // Forwards to mpm_mailbox_query() and returns its result.
+    int32_t query (void* mbox_handle)
+    {
+        return mpm_mailbox_query (mbox_handle);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Singleton accessor.  pInstance is read once without the lock; only when
+    * it is still null is the mutex taken, pInstance re-read, and the object
+    * created.  A full barrier follows the unlocked read and precedes the
+    * store that publishes the new object.
+    *------------------------------------------------------------------------*/
+    static Mailbox* instance ()
+    {
+        static Mutex Mailbox_instance_mutex;
+        Mailbox* inst = pInstance;
+
+        __sync_synchronize();
+
+        if (!inst)
+        {
+            ScopedLock lck(Mailbox_instance_mutex);
+
+            inst = pInstance;
+            if (!inst)
+            {
+                inst = new Mailbox;
+                __sync_synchronize();
+                pInstance = inst;
+            }
+        }
+        return inst;
+    }
+
+  private:
+    static Mailbox* pInstance;
+
+    Mailbox() { }                         // ctor private
+    Mailbox(const Mailbox&);              // copy ctor disallowed
+    Mailbox& operator=(const Mailbox&);   // assignment disallowed
+};
+
+#endif // _MAILBOX_H_
diff --git a/src/core/dsp/memmap.h b/src/core/dsp/memmap.h
new file mode 100644
index 0000000..503540e
--- /dev/null
+++ b/src/core/dsp/memmap.h
@@ -0,0 +1,120 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#define ERR(status, msg) if (status) { printf("ERROR: %s\n", msg); exit(-1); }
+
+/*-----------------------------------------------------------------------------
+* A DSP memory range backed by a set of host CMEM buffers.  The constructor
+* allocates the buffers and the range and maps one onto the other; the
+* destructor releases them.  copy_in / copy_out move data between a caller
+* buffer and the host buffers, chunk by chunk.  Every failure goes through
+* ERR, which prints the message and exits.
+*----------------------------------------------------------------------------*/
+class DSP_MappedMem
+{
+  public:
+    DSP_MappedMem(uint32_t dsp_id, uint32_t size)
+        : p_size(size), p_dsp_id(dsp_id), p_dsp_addr(0),
+          p_num_buffers(CEIL_DIVIDE(size, HOST_CMEM_BUFFER_SIZE))
+    {
+        p_buffers = new cmem_host_buf_desc_t[p_num_buffers];
+        ERR(!p_buffers, "Cannot allocate host memory for a DSP Mapped Region");
+
+        int status;
+        for (uint32_t i = 0; i < p_num_buffers; i++)
+        {
+            status = bufmgrAlloc(DmaBufPool, 1, &p_buffers[i]);
+            ERR(status, "Cannot allocate CMEM pool for a DSP Mapped Region");
+        }
+
+        /*---------------------------------------------------------------------
+        * Allocate DSP range
+        * NOTE(review): p_dsp_addr is passed by value as in the original; if
+        * the driver call is meant to return the address through this
+        * argument it needs a pointer -- confirm against the pciedrv API.
+        *--------------------------------------------------------------------*/
+        status = pciedrv_dsp_memrange_alloc(dsp_id, size, p_dsp_addr);
+        ERR(status, "PCIe driver dsp memrange alloc failed");
+
+        /*---------------------------------------------------------------------
+        * Map Input buffers to dsp range
+        *--------------------------------------------------------------------*/
+        status = pciedrv_map_bufs_to_dsp_memrange(dsp_id, p_num_buffers,
+                                         p_buffers, (uint32_t) p_dsp_addr);
+        ERR(status, "PCIe driver dsp map bufs to memrange failed");
+    }
+
+    ~DSP_MappedMem()
+    {
+        /*---------------------------------------------------------------------
+        * Free DSP range
+        *--------------------------------------------------------------------*/
+        int status = pciedrv_dsp_memrange_free(p_dsp_id, p_size, p_dsp_addr);
+        ERR(status, "PCIe driver dsp memrange free failed");
+
+        for (uint32_t i = 0; i < p_num_buffers; i++)
+        {
+            status = bufmgrFreeDesc(DmaBufPool, &p_buffers[i]);
+            ERR(status, "Cannot free CMEM pool for a DSP Mapped Region");
+        }
+
+        delete [] p_buffers;
+    }
+
+    // Copy size bytes from p into the host buffers, filling them in order.
+    void copy_in(void* p, uint32_t size)
+    {
+        ERR(size > p_size, "DSP Mapped region input overflow");
+
+        // Byte pointer: arithmetic on void* is not valid C++
+        const char *src            = static_cast<const char*>(p);
+        uint32_t    remaining_size = size;
+        uint32_t    offset         = 0;
+
+        // Also stop at the last buffer so the index never leaves p_buffers
+        for (uint32_t i = 0; remaining_size && i < p_num_buffers; i++)
+        {
+            uint32_t chunk_size = std::min<uint32_t>(remaining_size,
+                                                     p_buffers[i].length);
+
+            memcpy(p_buffers[i].user_addr, src + offset, chunk_size);
+
+            remaining_size -= chunk_size;
+            offset         += chunk_size;
+        }
+    }
+
+    // Copy size bytes from the host buffers, read in order, out to p.
+    void copy_out(void* p, uint32_t size)
+    {
+        ERR(size > p_size, "DSP Mapped region output underrflow");
+
+        char     *dst            = static_cast<char*>(p);
+        uint32_t  remaining_size = size;
+        uint32_t  offset         = 0;
+
+        // Also stop at the last buffer so the index never leaves p_buffers
+        for (uint32_t i = 0; remaining_size && i < p_num_buffers; i++)
+        {
+            uint32_t chunk_size = std::min<uint32_t>(remaining_size,
+                                                     p_buffers[i].length);
+
+            memcpy(dst + offset, p_buffers[i].user_addr, chunk_size);
+
+            remaining_size -= chunk_size;
+            offset         += chunk_size;
+        }
+    }
+
+  private:
+    uint32_t p_size;          // size in bytes of the mapped region
+    uint32_t p_dsp_id;        // DSP the region belongs to
+    uint32_t p_dsp_addr;      // address of the DSP range
+    uint32_t p_num_buffers;   // number of entries in p_buffers
+    cmem_host_buf_desc_t *p_buffers;
+};
diff --git a/src/core/dsp/message.h b/src/core/dsp/message.h
new file mode 100644
index 0000000..d93fe1e
--- /dev/null
+++ b/src/core/dsp/message.h
@@ -0,0 +1,115 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef __MESSAGE_H_
+#define __MESSAGE_H_
+
+#include <stdint.h>
+
+typedef enum { READY, EXIT, TASK, NDRKERNEL, WORKGROUP, CACHEINV, FREQUENCY, SUCCESS, ERROR, PRINT } command_codes;
+
+#define MAX_KERNEL_ARGUMENTS 10
+// Fully parenthesized so the value stays intact inside a larger expression
+#define MAX_ARG_BUF_SIZE ((MAX_KERNEL_ARGUMENTS*3)+1)
+#define MAX_FLUSH_BUF_SIZE (MAX_KERNEL_ARGUMENTS*2)
+
+#define MAX_XMCSES_MPAXS 7
+#define FIRST_FREE_XMC_MPAX 3 // XMC MPAXs available: 3 - F
+#define FIRST_FREE_SES_MPAX 1 // SES MPAXs available: 1 - 7
+
+/******************************************************************************
+* Need to ensure that the alignments and therefore the offsets of all fields
+* are consistent between the host and the device.
+******************************************************************************/
+// Launch geometry and per-launch settings for one kernel; every field is a
+// 32-bit word.
+typedef struct
+{
+ uint32_t num_dims;
+
+ // Global size, local (work-group) size and global offset per dimension
+ uint32_t global_sz_0;
+ uint32_t global_sz_1;
+ uint32_t global_sz_2;
+ uint32_t local_sz_0;
+ uint32_t local_sz_1;
+ uint32_t local_sz_2;
+ uint32_t global_off_0;
+ uint32_t global_off_1;
+ uint32_t global_off_2;
+ uint32_t WG_gid_start_0;
+ uint32_t WG_gid_start_1;
+ uint32_t WG_gid_start_2;
+ uint32_t Kernel_id;
+ uint32_t WG_id;
+ uint32_t stats;
+ // Start address and per-work-group size of the area for alloca'ed data
+ uint32_t WG_alloca_start;
+ uint32_t WG_alloca_size;
+} kernel_config_t;
+
+// Buffer-flush and MPAX information that accompanies a kernel message.
+typedef struct
+{
+ uint8_t numBuffers; // number of buffers to flush
+ uint8_t num_mpaxs; // TODO: XMC only mpax for kernel alloca memory
+ uint16_t sizeMoreArgs;
+ uint32_t buffers[MAX_FLUSH_BUF_SIZE]; // two words per buffer
+ uint32_t mpax_settings[2*MAX_XMCSES_MPAXS]; // (MPAXL, MPAXH) pair
+} flush_msg_t;
+
+// Everything needed to start a kernel: its configuration, entry point,
+// data page pointer and the packed argument words.
+typedef struct
+{
+ kernel_config_t config;
+ uint32_t entry_point;
+ uint32_t data_page_ptr;
+ uint32_t argBuf[MAX_ARG_BUF_SIZE]; // NULL size terminated
+} kernel_msg_t;
+
+// A command code followed by its payload.  The payload is either the kernel
+// and flush messages (u.k) or the same bytes viewed as a raw char array
+// (u.message).
+typedef struct
+{
+ command_codes command;
+ union
+ {
+ struct
+ {
+ kernel_msg_t kernel;
+ flush_msg_t flush;
+ } k;
+ char message[sizeof(kernel_msg_t) + sizeof(flush_msg_t)];
+ } u;
+} Msg_t;
+
+static Msg_t exitMsg = {EXIT};
+static Msg_t successMsg = {SUCCESS};
+static Msg_t readyMsg = {READY};
+static Msg_t errorMsg = {ERROR};
+static Msg_t frequencyMsg = {FREQUENCY};
+// static far Msg_t printMsg = {PRINT}; // moved to L2 in monitor
+
+static const uint32_t mbox_payload = sizeof(Msg_t);
+
+#define MBOX_SIZE 0x2000
+
+#define IN_ORDER_TASK_SIZE 1
+#define OUT_OF_ORDER_TASK_SIZE (IN_ORDER_TASK_SIZE+1)
+
+#endif
diff --git a/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.c b/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.c
new file mode 100644
index 0000000..545ba92
--- /dev/null
+++ b/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.c
@@ -0,0 +1,200 @@
+/*
+* c60_dynamic.c
+*
+* C6x-specific dynamic loader functionality
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifdef C60_TARGET
+#include "c60_elf32.h"
+#include <inttypes.h>
+#include "dload.h"
+
+/*****************************************************************************/
+/* c60_process_dynamic_tag() */
+/* */
+/* Process C6x specific dynamic tags. */
+/*****************************************************************************/
+BOOL DLDYN_c60_process_dynamic_tag(DLIMP_Dynamic_Module* dyn_module, int i)
+{
+   /*------------------------------------------------------------------------*/
+   /* Record the value (or, for DSBT_BASE, the index) of dynamic tag i in    */
+   /* dyn_module.  The result is TRUE when the tag is one of the C6x tags    */
+   /* handled below and FALSE for anything else.                             */
+   /*------------------------------------------------------------------------*/
+   BOOL recognized = TRUE;
+
+   switch (dyn_module->dyntab[i].d_tag)
+   {
+      /*------------------------------------------------------------------*/
+      /* Offset into the dynamic symbol table at which the global symbols */
+      /* start (the table is partitioned into local and global symbols).  */
+      /*------------------------------------------------------------------*/
+      case DT_C6000_GSYM_OFFSET:
+         dyn_module->gsymtab_offset = dyn_module->dyntab[i].d_un.d_val;
+#if LOADER_DEBUG
+         if (debugging_on)
+            DLIF_trace("Found global symbol table: %d\n",
+                       dyn_module->gsymtab_offset);
+#endif
+         break;
+
+      /*------------------------------------------------------------------*/
+      /* Offset into the dynamic string table at which the global symbol  */
+      /* names start.                                                     */
+      /*------------------------------------------------------------------*/
+      case DT_C6000_GSTR_OFFSET:
+         dyn_module->gstrtab_offset = dyn_module->dyntab[i].d_un.d_val;
+#if LOADER_DEBUG
+         if (debugging_on)
+            DLIF_trace("Found global string table: %d\n",
+                       dyn_module->gstrtab_offset);
+#endif
+         break;
+
+      /*------------------------------------------------------------------*/
+      /* Address of the DSBT.  Only the tag's index is remembered, so the */
+      /* value can be updated once sections have been allocated (the tag  */
+      /* value is relocated).                                             */
+      /*------------------------------------------------------------------*/
+      case DT_C6000_DSBT_BASE:
+         dyn_module->dsbt_base_tagidx = i;
+         break;
+
+      /*------------------------------------------------------------------*/
+      /* Specific DSBT index requested by this module.  The load fails if */
+      /* the module does not get it, since references to the index carry  */
+      /* no relocation entries.                                           */
+      /*------------------------------------------------------------------*/
+      case DT_C6000_DSBT_INDEX:
+         dyn_module->dsbt_index = dyn_module->dyntab[i].d_un.d_val;
+         break;
+
+      /*------------------------------------------------------------------*/
+      /* Size of the DSBT allocated for this module; it must be able to   */
+      /* hold the content of the master DSBT.                             */
+      /*------------------------------------------------------------------*/
+      case DT_C6000_DSBT_SIZE:
+         dyn_module->dsbt_size = dyn_module->dyntab[i].d_un.d_val;
+         break;
+
+      /*------------------------------------------------------------------*/
+      /* Not a tag handled here.                                          */
+      /*------------------------------------------------------------------*/
+      default:
+         recognized = FALSE;
+         break;
+   }
+
+   return recognized;
+}
+
+/*****************************************************************************/
+/* DLDYN_c60_relocate_dynamic_tag_info() */
+/* */
+/* Update any target specific dynamic tag values that are associated with */
+/* a section address. Return TRUE if the tag value is successfully */
+/* updated or if the tag is not associated with a section address, and */
+/* FALSE if we can't find the section associated with the tag or if the */
+/* tag type is not recognized. */
+/* */
+/*****************************************************************************/
+BOOL DLDYN_c60_relocate_dynamic_tag_info(DLIMP_Dynamic_Module *dyn_module,
+                                         int32_t i)
+{
+   switch (dyn_module->dyntab[i].d_tag)
+   {
+      /*---------------------------------------------------------------------*/
+      /* DT_C6000_DSBT_BASE holds the virtual address of the .dsbt section,  */
+      /* so it is updated with that section's run address.                   */
+      /*---------------------------------------------------------------------*/
+      case DT_C6000_DSBT_BASE:
+         return DLIMP_update_dyntag_section_address(dyn_module, i);
+
+      /*---------------------------------------------------------------------*/
+      /* These tags do not point to sections, so there is nothing to update. */
+      /*---------------------------------------------------------------------*/
+      case DT_C6000_GSYM_OFFSET:
+      case DT_C6000_GSTR_OFFSET:
+      case DT_C6000_DSBT_INDEX:
+      case DT_C6000_DSBT_SIZE:
+         return TRUE;
+
+      /*---------------------------------------------------------------------*/
+      /* Any other tag is reported as an error below.                        */
+      /*---------------------------------------------------------------------*/
+      default:
+         break;
+   }
+
+   DLIF_error(DLET_MISC, "Invalid dynamic tag encountered, %d\n",
+              (int)dyn_module->dyntab[i].d_tag);
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* c60_process_eiosabi() */
+/* */
+/* Process the EI_OSABI value. Verify that the OSABI is supported and set */
+/* any variables which depend on the OSABI. */
+/*****************************************************************************/
+BOOL DLDYN_c60_process_eiosabi(DLIMP_Dynamic_Module* dyn_module)
+{
+   const uint8_t osabi = dyn_module->fhdr.e_ident[EI_OSABI];
+
+   /*----------------------------------------------------------------------*/
+   /* Static executables should have an OSABI of NONE.                     */
+   /*----------------------------------------------------------------------*/
+   if (!dyn_module->relocatable)
+      return (osabi == ELFOSABI_NONE) ? TRUE : FALSE;
+
+   /*----------------------------------------------------------------------*/
+   /* Relocatable modules: only ELFOSABI_C6000_ELFABI (C6x Baremetal ABI)  */
+   /* is accepted.  ELFOSABI_C6000_LINUX (C6x Linux ABI) is presently      */
+   /* unsupported.                                                         */
+   /*----------------------------------------------------------------------*/
+   return (osabi == ELFOSABI_C6000_ELFABI) ? TRUE : FALSE;
+}
+
+#endif
diff --git a/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.h b/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.h
new file mode 100644
index 0000000..da99604
--- /dev/null
+++ b/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_dynamic.h
@@ -0,0 +1,53 @@
+/*
+* c60_dynamic.h
+*
+* Interface into C6x-specific dynamic loader functionality
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+/* Include guard for the C6x-specific dynamic loader interface.              */
+#ifndef DLOAD_C60_H
+#define DLOAD_C60_H
+
+#include "dload.h"
+
+/*---------------------------------------------------------------------------*/
+/* DLDYN_c60_process_dynamic_tag() - handle the C6x-specific dynamic tag     */
+/*    found at index i of dyn_module->dyntab.  Reports an error and returns  */
+/*    FALSE for a tag it does not recognize.                                 */
+/*---------------------------------------------------------------------------*/
+BOOL DLDYN_c60_process_dynamic_tag(DLIMP_Dynamic_Module* dyn_module, int i);
+
+/*---------------------------------------------------------------------------*/
+/* DLDYN_c60_process_eiosabi() - check the module's EI_OSABI value; returns  */
+/*    TRUE when the OSABI is one the loader accepts, FALSE otherwise.        */
+/*---------------------------------------------------------------------------*/
+BOOL DLDYN_c60_process_eiosabi(DLIMP_Dynamic_Module* dyn_module);
+
+/*---------------------------------------------------------------------------*/
+/* DLDYN_c60_relocate_dynamic_tag_info() - NOTE(review): presumably updates  */
+/*    the dynamic tag at index i after segments have been placed; confirm    */
+/*    against the definition in c60_dynamic.c.  Note that the index is an    */
+/*    int32_t here but a plain int in DLDYN_c60_process_dynamic_tag().       */
+/*---------------------------------------------------------------------------*/
+BOOL DLDYN_c60_relocate_dynamic_tag_info(DLIMP_Dynamic_Module *dyn_module, int32_t i);
+
+/*---------------------------------------------------------------------------*/
+/* Target data-size constants.  NOTE(review): presumably sizes in bits of    */
+/* the target's int, char, minimum addressable unit and pointer - confirm    */
+/* against the code that uses them.                                          */
+/*---------------------------------------------------------------------------*/
+#define T_INTSZ 32
+#define T_CHARSZ 8
+#define MEM_INC 8
+#define PTR_SZ 32
+
+#endif /* DLOAD_C60_H */
diff --git a/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_elf32.h b/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_elf32.h
new file mode 100644
index 0000000..418db17
--- /dev/null
+++ b/src/core/dsp/ocl_load/C60_DLOAD_DYN/c60_elf32.h
@@ -0,0 +1,160 @@
+/*
+* c60_elf32.h
+*
+* C6x-specific data structures for 32-bit ELF object format files.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef C60_ELF32_H
+#define C60_ELF32_H
+
+#include "elf32.h"
+
+/*---------------------------------------------------------------------------*/
+/* C6x specific EI_OSABI values                                              */
+/*                                                                           */
+/* These are compared against e_ident[EI_OSABI] of the ELF file header.      */
+/* In the ELF gABI the EI_OSABI values 64 through 255 are reserved for       */
+/* architecture-specific use.                                                */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ ELFOSABI_C6000_ELFABI = 64, /* C6X Baremetal OSABI */
+ ELFOSABI_C6000_LINUX = 65 /* C6X Linux OSABI */
+};
+
+/*---------------------------------------------------------------------------*/
+/* File Header Flags (value of "e_flags")                                    */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ EF_C6000_REL = 0x01 /* Contains static relocations. An ET_EXEC or */
+ /* ET_DYN file w/ this flag set can be */
+ /* treated as ET_REL during static linking. */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Segment Types (value of "p_type")                                         */
+/*                                                                           */
+/* 0x70000000 is the start of the processor-specific segment type range      */
+/* (PT_LOPROC) in the ELF gABI.                                              */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ PT_C6000_PHATTRS = 0x70000000 /* Extended Program Header Attributes*/
+};
+
+/*---------------------------------------------------------------------------*/
+/* C6x specific section types (value of "sh_type")                           */
+/*---------------------------------------------------------------------------*/
+enum
+{
+
+ /*------------------------------------------------------------------------*/
+ /* Section types defined by the C6x ELFABI. */
+ /* Note: ABI defined section types should be named SHT_C6000_xxx */
+ /*------------------------------------------------------------------------*/
+ SHT_C6000_UNWIND = 0x70000001, /* Exception Index Table */
+ SHT_C6000_PREEMPTMAP = 0x70000002, /* Pre-emption Map */
+
+ SHT_C6000_ATTRIBUTES = 0x70000003, /* Obj File Compatibility Attributes */
+
+ /*------------------------------------------------------------------------*/
+ /* The following section types are not part of the C6x ABI. As per the */
+ /* ABI, processor specific values not defined in the ABI are reserved for */
+ /* future use. Here we reserve the range 0x7F000000 through 0x7FFFFFFF */
+ /* for the TI specific processor section types. */
+ /* Note: TI specific section types should be named SHT_TI_xxx */
+ /*------------------------------------------------------------------------*/
+ SHT_TI_ICODE = 0x7F000000, /* ICODE representation */
+ SHT_TI_XREF = 0x7F000001, /* Symbol cross reference */
+ SHT_TI_HANDLER = 0x7F000002, /* Handler function table */
+ SHT_TI_INITINFO = 0x7F000003, /* Info for C auto-init of variables */
+ SHT_TI_PHATTRS = 0x7F000004 /* Extended Program Header Attributes*/
+};
+
+/*****************************************************************************/
+/* C6x-Specific Dynamic Array Tags (C6x ELF ABI Section ??? - AEGUPD) */
+/* NOTE: */
+/* As per the GABI, a tag whose value is an even number indicates a */
+/* dynamic tag that uses d_ptr. An odd number indicates a tag that uses */
+/* d_val, or that uses neither d_val nor d_ptr. */
+/*****************************************************************************/
+enum
+{
+ /*------------------------------------------------------------------------*/
+ /* OSABI specific tags: */
+ /* From 0x6000000D thru 0x6FFFF000 */
+ /*------------------------------------------------------------------------*/
+ DT_C6000_GSYM_OFFSET = 0x6000000D, /* d_val -- OSABI Specific -- */
+ DT_C6000_GSTR_OFFSET = 0x6000000F, /* d_val -- OSABI Specific -- */
+
+ /*------------------------------------------------------------------------*/
+ /* Processor specific tags: */
+ /* From 0x70000000 thru 0x7FFFFFFF */
+ /*------------------------------------------------------------------------*/
+ DT_C6000_DSBT_BASE = 0x70000000, /* d_ptr -- Platform Specific -- */
+ DT_C6000_DSBT_SIZE = 0x70000001, /* d_val -- Platform Specific -- */
+ DT_C6000_PREEMPTMAP = 0x70000002, /* d_ptr -- Platform Specific -- */
+ DT_C6000_DSBT_INDEX = 0x70000003 /* d_val -- Platform Specific -- */
+};
+
+/*---------------------------------------------------------------------------*/
+/* C6x Dynamic Relocation Types                                              */
+/*                                                                           */
+/* The trailing comments give an assembly construct that produces each       */
+/* relocation, as listed in the "C6x Relocations Supported" table in         */
+/* c60_reloc.c.  See the C6000 ELF ABI Specification for the definitions.    */
+/*---------------------------------------------------------------------------*/
+typedef enum
+{
+ R_C6000_NONE = 0,
+ R_C6000_ABS32 = 1, /* .field X,32 */
+ R_C6000_ABS16 = 2, /* .field X,16 */
+ R_C6000_ABS8 = 3, /* .field X,8 */
+ R_C6000_PCR_S21 = 4, /* B foo ; CALLP foo, B3 */
+ R_C6000_PCR_S12 = 5, /* BNOP foo */
+ R_C6000_PCR_S10 = 6, /* BPOS foo, A10 ; BDEC foo, A1 */
+ R_C6000_PCR_S7 = 7, /* ADDKPC foo, B3, 4 */
+ R_C6000_ABS_S16 = 8, /* MVK sym, A0 */
+ R_C6000_ABS_L16 = 9, /* MVKL sym, A0 ; MVKLH sym, A0 */
+ R_C6000_ABS_H16 = 10, /* MVKH sym, A0 */
+ R_C6000_SBR_U15_B = 11, /* LDB *+B14(sym), A1 ; ADDAB B14, sym, A1 */
+ R_C6000_SBR_U15_H = 12, /* LDH *+B14(sym), A1 ; ADDAH B14, sym, A1 */
+ R_C6000_SBR_U15_W = 13, /* LDW *+B14(sym), A1 ; ADDAW B14, sym, A1 */
+ R_C6000_SBR_S16 = 14, /* MVK sym-$bss, A0 */
+ R_C6000_SBR_L16_B = 15, /* MVKL (sym-$bss), A0 */
+ R_C6000_SBR_L16_H = 16, /* MVKL (sym-$bss)/2,A0 */
+ R_C6000_SBR_L16_W = 17, /* MVKL (sym-$bss)/4,A0 */
+ R_C6000_SBR_H16_B = 18, /* MVKH (sym-$bss), A0 */
+ R_C6000_SBR_H16_H = 19, /* MVKH (sym-$bss)/2,A0 */
+ R_C6000_SBR_H16_W = 20, /* MVKH (sym-$bss)/4,A0 */
+ R_C6000_SBR_GOT_U15_W = 21, /* LDW *+B14[GOT(sym)],A0 */
+ R_C6000_SBR_GOT_L16_W = 22, /* MVKL $DPR_GOT(sym), A0 */
+ R_C6000_SBR_GOT_H16_W = 23, /* MVKH $DPR_GOT(sym), A0 */
+ R_C6000_DSBT_INDEX = 24, /* LDW *+B14[$DSBT_index()], DP */
+ R_C6000_PREL31 = 25, /* "place"-relative 31-bit data reference */
+ R_C6000_COPY = 26 /* Linux "import-as-own" copy relocation */
+}C60_RELOC_TYPE;
+
+#endif /* C60_ELF32_H */
+
+#endif /* C60_ELF32_H */
diff --git a/src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.c b/src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.c
new file mode 100644
index 0000000..3c79e35
--- /dev/null
+++ b/src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.c
@@ -0,0 +1,1101 @@
+/*
+* c60_reloc.c
+*
+* Process C6x-specific dynamic relocations for core dynamic loader.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#include <limits.h>
+#include "relocate.h"
+#include "symtab.h"
+#include "c60_elf32.h"
+#include "dload_api.h"
+#include "util.h"
+#include "dload_endian.h"
+#include "c60_reloc.h"
+
+/*---------------------------------------------------------------------------*/
+/* MASK(n,s) - mask of n consecutive one bits whose least significant bit is */
+/* at bit position s.  The arguments are parenthesized so that expressions   */
+/* can be passed safely, and the arithmetic is unsigned so that no signed    */
+/* shift can overflow.  Requires 0 <= n < 32 and n + s <= 32.                */
+/*---------------------------------------------------------------------------*/
+#define MASK(n,s) (((1u << (n)) - 1u) << (s))
+
+/*---------------------------------------------------------------------------*/
+/* C6x Relocations Supported */
+/* */
+/* See the C6000 ELF ABI Specification for more details. */
+/* */
+/* R_C6000_ABS32 | .field X,32 */
+/* R_C6000_ABS16 | .field X,16 */
+/* R_C6000_ABS8 | .field X,8 */
+/* R_C6000_PCR_S21 | B foo */
+/* CALLP foo, B3 */
+/* R_C6000_PCR_S12 | BNOP foo */
+/* R_C6000_PCR_S10 | BPOS foo, A10 */
+/* BDEC foo, A1 */
+/* R_C6000_PCR_S7 | ADDKPC foo, B3, 4 */
+/* R_C6000_ABS_S16 | MVK sym, A0 */
+/* R_C6000_ABS_L16 | MVKL sym, A0 */
+/* MVKLH sym, A0 */
+/* R_C6000_ABS_H16 | MVKH sym, A0 */
+/* R_C6000_SBR_U15_B | LDB *+B14(sym), A1 */
+/* ADDAB B14, sym, A1 */
+/* R_C6000_SBR_U15_H | LDH *+B14(sym), A1 */
+/* ADDAH B14, sym, A1 */
+/* R_C6000_SBR_U15_W | LDW *+B14(sym), A1 */
+/* ADDAW B14, sym, A1 */
+/* R_C6000_SBR_S16 | MVK sym-$bss, A0 */
+/* R_C6000_SBR_L16_B | MVKL (sym-$bss), A0 */
+/* R_C6000_SBR_L16_H | MVKL (sym-$bss)/2,A0 */
+/* R_C6000_SBR_L16_W | MVKL (sym-$bss)/4,A0 */
+/* R_C6000_SBR_H16_B | MVKH (sym-$bss), A0 */
+/* R_C6000_SBR_H16_H | MVKH (sym-$bss)/2,A0 */
+/* R_C6000_SBR_H16_W | MVKH (sym-$bss)/4,A0 */
+/* R_C6000_SBR_GOT_U15_W | LDW *+B14[GOT(sym)],A0 */
+/* R_C6000_SBR_GOT_L16_W | MVKL $DPR_GOT(sym), A0 */
+/* R_C6000_SBR_GOT_H16_W | MVKH $DPR_GOT(sym), A0 */
+/* R_C6000_DSBT_INDEX | LDW *+B14[$DSBT_index()], DP */
+/* */
+/*---------------------------------------------------------------------------*/
+
+/*****************************************************************************/
+/* WRITE_RELOC_R() - Store a relocation result into its relocation field     */
+/*    inside a buffered segment.                                             */
+/*                                                                           */
+/*    buffered_segment - host buffer holding the segment being relocated     */
+/*    segment_offset   - byte offset of the relocation field in that buffer  */
+/*    r_type           - relocation type (a C60_RELOC_TYPE value)            */
+/*    r                - field value; reloc_do() has already scaled it with  */
+/*                       pack_result() and limited it to the field width     */
+/*                       with mask_result()                                  */
+/*                                                                           */
+/*    For instruction fields only the bits of the relocation field are       */
+/*    replaced; all other bits of the 32-bit word are preserved.  An         */
+/*    unsupported type (including R_C6000_COPY) is reported through          */
+/*    DLIF_error() and nothing is written.                                   */
+/*****************************************************************************/
+static void write_reloc_r(uint8_t* buffered_segment,
+                          uint32_t segment_offset,
+                          int r_type, uint32_t r)
+{
+   uint32_t* rel_field_ptr = (uint32_t*)(buffered_segment + segment_offset);
+
+#if LOADER_DEBUG
+   /*------------------------------------------------------------------------*/
+   /* Print some details about the relocation we are about to process.       */
+   /*------------------------------------------------------------------------*/
+   if(debugging_on)
+   {
+      DLIF_trace("RWRT: segment_offset: %d\n", segment_offset);
+      DLIF_trace("RWRT: buffered_segment: 0x%x\n",
+                 (uint32_t)buffered_segment);
+      DLIF_trace("RWRT: rel_field_ptr: 0x%x\n", (uint32_t)rel_field_ptr);
+      DLIF_trace("RWRT: result: 0x%x\n", r);
+   }
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* Given the relocation type, carry out relocation into a 4 byte packet   */
+   /* within the buffered segment.                                           */
+   /*------------------------------------------------------------------------*/
+   switch(r_type)
+   {
+      /*---------------------------------------------------------------------*/
+      /* Data fields that occupy their whole 32-, 16- or 8-bit container.    */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_ABS32:
+         *rel_field_ptr = r;
+         break;
+      case R_C6000_ABS16:
+         *((uint16_t*)(buffered_segment + segment_offset)) = r;
+         break;
+      case R_C6000_ABS8:
+         *((uint8_t*)(buffered_segment + segment_offset)) = r;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* 31-bit data field in bits 0-30; only bit 31 is preserved.  The      */
+      /* whole 31-bit field (the width mask_result() produces) must be       */
+      /* cleared first, otherwise a stale bit 30 would be OR-ed into the     */
+      /* new value.                                                          */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_PREL31:
+         *rel_field_ptr = (*rel_field_ptr & ~(uint32_t)0x7FFFFFFF) | r;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* PC-relative branch displacement fields.                             */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_PCR_S21:
+         *rel_field_ptr = (*rel_field_ptr & ~MASK(21,7)) | (r << 7);
+         break;
+      case R_C6000_PCR_S12:
+         *rel_field_ptr = (*rel_field_ptr & ~MASK(12,16)) | (r << 16);
+         break;
+      case R_C6000_PCR_S10:
+         *rel_field_ptr = (*rel_field_ptr & ~MASK(10,13)) | (r << 13);
+         break;
+      case R_C6000_PCR_S7:
+         *rel_field_ptr = (*rel_field_ptr & ~MASK(7,16)) | (r << 16);
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* 16-bit constant field at bits 7-22 (absolute and static-base        */
+      /* relative forms share the same field).                               */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_ABS_S16:
+      case R_C6000_ABS_L16:
+      case R_C6000_ABS_H16:
+      case R_C6000_SBR_S16:
+      case R_C6000_SBR_L16_B:
+      case R_C6000_SBR_L16_H:
+      case R_C6000_SBR_L16_W:
+      case R_C6000_SBR_H16_B:
+      case R_C6000_SBR_H16_H:
+      case R_C6000_SBR_H16_W:
+         *rel_field_ptr = (*rel_field_ptr & ~MASK(16,7)) | (r << 7);
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* 15-bit unsigned offset field at bits 8-22.                          */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_SBR_U15_B:
+      case R_C6000_SBR_U15_H:
+      case R_C6000_SBR_U15_W:
+      case R_C6000_DSBT_INDEX:
+         *rel_field_ptr = (*rel_field_ptr & ~MASK(15,8)) | (r << 8);
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* Linux "import-as-own" copy relocations are not yet supported, so    */
+      /* R_C6000_COPY is reported like any unrecognized type.                */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_COPY:
+      default:
+         DLIF_error(DLET_RELOC,
+                    "write_reloc_r called with invalid relocation type!\n");
+   }
+
+#if LOADER_DEBUG
+   if (debugging_on)
+      DLIF_trace("reloc_field 0x%x\n", *rel_field_ptr);
+#endif
+}
+
+/*****************************************************************************/
+/* PACK_RESULT() - Scale a computed relocation value down to the units in    */
+/*    which its relocation field stores it.                                  */
+/*                                                                           */
+/*    Returns unpacked_result shifted right by the amount that belongs to    */
+/*    r_type.  For an unsupported type (including R_C6000_COPY) an error is  */
+/*    reported through DLIF_error() and 0 is returned.                       */
+/*****************************************************************************/
+static int32_t pack_result(int32_t unpacked_result, int r_type)
+{
+   int scale_shift;
+
+   switch(r_type)
+   {
+      /*---------------------------------------------------------------------*/
+      /* Stored as computed.                                                 */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_ABS32:
+      case R_C6000_ABS16:
+      case R_C6000_ABS8:
+      case R_C6000_ABS_S16:
+      case R_C6000_ABS_L16:
+      case R_C6000_SBR_U15_B:
+      case R_C6000_SBR_S16:
+      case R_C6000_SBR_L16_B:
+         scale_shift = 0;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* Stored divided by 2.                                                */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_SBR_U15_H:
+      case R_C6000_SBR_L16_H:
+      case R_C6000_PREL31:
+         scale_shift = 1;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* Stored divided by 4.                                                */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_PCR_S21:
+      case R_C6000_PCR_S12:
+      case R_C6000_PCR_S10:
+      case R_C6000_PCR_S7:
+      case R_C6000_SBR_U15_W:
+      case R_C6000_SBR_L16_W:
+      case R_C6000_DSBT_INDEX:
+         scale_shift = 2;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* Upper halfword of the byte, halfword or word scaled value.          */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_ABS_H16:
+      case R_C6000_SBR_H16_B:
+         scale_shift = 16;
+         break;
+
+      case R_C6000_SBR_H16_H:
+         scale_shift = 17;
+         break;
+
+      case R_C6000_SBR_H16_W:
+         scale_shift = 18;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* Linux "import-as-own" copy relocations are not yet supported, so    */
+      /* R_C6000_COPY is reported like any unrecognized type.                */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_COPY:
+      default:
+         DLIF_error(DLET_RELOC,
+                    "pack_result called with invalid relocation type!\n");
+         return 0;
+   }
+
+   return unpacked_result >> scale_shift;
+}
+
+/*****************************************************************************/
+/* MASK_RESULT() - Limit a packed relocation value to the number of bits     */
+/*    available in the relocation field for r_type.                          */
+/*                                                                           */
+/*    Returns the low-order field-width bits of unmasked_result.  For an     */
+/*    unsupported type (including R_C6000_COPY) an error is reported         */
+/*    through DLIF_error() and 0 is returned.                                */
+/*****************************************************************************/
+static int32_t mask_result(int32_t unmasked_result, int r_type)
+{
+   int32_t field_bits;
+
+   switch(r_type)
+   {
+      /*---------------------------------------------------------------------*/
+      /* A full 32-bit field needs no masking at all.                        */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_ABS32:
+         return unmasked_result;
+
+      case R_C6000_PREL31:
+         field_bits = 0x7FFFFFFF;
+         break;
+
+      case R_C6000_PCR_S21:
+         field_bits = 0x1FFFFF;
+         break;
+
+      case R_C6000_ABS16:
+      case R_C6000_ABS_S16:
+      case R_C6000_ABS_L16:
+      case R_C6000_ABS_H16:
+      case R_C6000_SBR_S16:
+      case R_C6000_SBR_L16_B:
+      case R_C6000_SBR_L16_H:
+      case R_C6000_SBR_L16_W:
+      case R_C6000_SBR_H16_B:
+      case R_C6000_SBR_H16_H:
+      case R_C6000_SBR_H16_W:
+         field_bits = 0xFFFF;
+         break;
+
+      case R_C6000_SBR_U15_B:
+      case R_C6000_SBR_U15_H:
+      case R_C6000_SBR_U15_W:
+      case R_C6000_DSBT_INDEX:
+         field_bits = 0x7FFF;
+         break;
+
+      case R_C6000_PCR_S12:
+         field_bits = 0xFFF;
+         break;
+
+      case R_C6000_PCR_S10:
+         field_bits = 0x3FF;
+         break;
+
+      case R_C6000_ABS8:
+         field_bits = 0xFF;
+         break;
+
+      case R_C6000_PCR_S7:
+         field_bits = 0x7F;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* Linux "import-as-own" copy relocations are not yet supported, so    */
+      /* R_C6000_COPY is reported like any unrecognized type.                */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_COPY:
+      default:
+         DLIF_error(DLET_RELOC,
+                    "mask_result called with invalid relocation type!\n");
+         return 0;
+   }
+
+   return unmasked_result & field_bits;
+}
+
+/*****************************************************************************/
+/* REL_OVERFLOW()                                                            */
+/*                                                                           */
+/*    Decide whether an unscaled relocation value lies outside the range     */
+/*    that the relocation field for r_type can represent.                    */
+/*                                                                           */
+/*    Returns non-zero when the value is out of range and 0 when it fits     */
+/*    or when the type is not range checked.  An unexpected relocation type  */
+/*    is reported through DLIF_error() and treated as an overflow.           */
+/*****************************************************************************/
+static BOOL rel_overflow(C60_RELOC_TYPE r_type, int32_t reloc_value)
+{
+   int32_t lowest;     /* smallest value that still fits */
+   int32_t highest;    /* largest value that still fits  */
+
+   /*------------------------------------------------------------------------*/
+   /* Look up the inclusive range that belongs to the relocation type.       */
+   /*------------------------------------------------------------------------*/
+   switch(r_type)
+   {
+      /*---------------------------------------------------------------------*/
+      /* Data fields accept both the signed and the unsigned interpretation. */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_ABS16:      lowest = -32768;    highest = 65535;    break;
+      case R_C6000_ABS8:       lowest = -128;      highest = 255;      break;
+
+      /*---------------------------------------------------------------------*/
+      /* Signed displacements, checked before they are scaled by 4.          */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_PCR_S21:    lowest = -0x400000; highest = 0x3FFFFF; break;
+      case R_C6000_PCR_S12:    lowest = -0x2000;   highest = 0x1FFF;   break;
+      case R_C6000_PCR_S10:    lowest = -0x800;    highest = 0x7FF;    break;
+      case R_C6000_PCR_S7:     lowest = -0x100;    highest = 0xFF;     break;
+
+      /*---------------------------------------------------------------------*/
+      /* Signed 16-bit constants.                                            */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_SBR_S16:
+      case R_C6000_ABS_S16:    lowest = -0x8000;   highest = 0x7FFF;   break;
+
+      /*---------------------------------------------------------------------*/
+      /* Unsigned 15-bit offsets, checked before scaling by 1, 2 or 4.       */
+      /* Negative values are out of range.                                   */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_SBR_U15_B:  lowest = 0;         highest = 0x7FFF;   break;
+      case R_C6000_SBR_U15_H:  lowest = 0;         highest = 0xFFFE;   break;
+      case R_C6000_DSBT_INDEX:
+      case R_C6000_SBR_U15_W:  lowest = 0;         highest = 0x1FFFC;  break;
+
+      /*---------------------------------------------------------------------*/
+      /* Some relocation types suppress overflow checking at link-time, and  */
+      /* 32-bit relocation field values are not checked for overflow.        */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_ABS_L16:
+      case R_C6000_ABS_H16:
+      case R_C6000_SBR_L16_B:
+      case R_C6000_SBR_L16_H:
+      case R_C6000_SBR_L16_W:
+      case R_C6000_SBR_H16_B:
+      case R_C6000_SBR_H16_H:
+      case R_C6000_SBR_H16_W:
+      case R_C6000_ABS32:
+      case R_C6000_PREL31:
+         return 0;
+
+      /*---------------------------------------------------------------------*/
+      /* Any other relocation type was not expected here.                    */
+      /*---------------------------------------------------------------------*/
+      default:
+         DLIF_error(DLET_RELOC,
+                    "rel_overflow called with invalid relocation type!\n");
+         return 1;
+   }
+
+   return ((reloc_value < lowest) || (reloc_value > highest));
+}
+
+#if LOADER_DEBUG || LOADER_PROFILE
+extern int DLREL_relocations;
+extern time_t DLREL_total_reloc_time;
+#endif
+
+/*****************************************************************************/
+/* RELOC_DO() - Process a single relocation entry.                           */
+/*                                                                           */
+/*    r_type         - relocation type                                       */
+/*    segment_vaddr  - address of the segment, added to spc to form the      */
+/*                     address of the relocated place                        */
+/*    segment_buffer - host buffer holding the segment's contents            */
+/*    addend         - relocation addend                                     */
+/*    symval         - value of the referenced symbol                        */
+/*    spc            - offset of the relocation field within the segment     */
+/*    wrong_endian   - non-zero when the field must be byte swapped before   */
+/*                     it is patched and swapped back afterwards             */
+/*    base_pointer   - static base used by static-base relative types        */
+/*    dsbt_index     - DSBT index of the module (R_C6000_DSBT_INDEX only)    */
+/*                                                                           */
+/*    The relocation value is computed, range checked, scaled, masked and    */
+/*    then written into the segment buffer.  An overflow is reported but     */
+/*    the (truncated) value is still written.  An unsupported relocation     */
+/*    type is reported once and leaves the segment buffer untouched.         */
+/*****************************************************************************/
+static void reloc_do(C60_RELOC_TYPE r_type,
+                     uint32_t segment_vaddr,
+                     uint8_t *segment_buffer,
+                     uint32_t addend,
+                     uint32_t symval,
+                     uint32_t spc,
+                     int wrong_endian,
+                     uint32_t base_pointer,
+                     int32_t dsbt_index)
+{
+   int32_t reloc_value = 0;
+
+#if LOADER_DEBUG || LOADER_PROFILE
+   /*------------------------------------------------------------------------*/
+   /* In debug mode, keep a count of the number of relocations processed.    */
+   /* In profile mode, start the clock on a given relocation.  The start     */
+   /* time is kept as a clock_t so that the value returned by clock() is     */
+   /* not narrowed.                                                          */
+   /*------------------------------------------------------------------------*/
+   clock_t start_time = 0;
+   if (debugging_on || profiling_on)
+   {
+      DLREL_relocations++;
+      if (profiling_on) start_time = clock();
+   }
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* Calculate the relocation value according to the rules associated with  */
+   /* the given relocation type.                                             */
+   /*------------------------------------------------------------------------*/
+   switch(r_type)
+   {
+      /*---------------------------------------------------------------------*/
+      /* Straight-Up Address relocations (address references).               */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_ABS32:
+      case R_C6000_ABS16:
+      case R_C6000_ABS8:
+      case R_C6000_ABS_S16:
+      case R_C6000_ABS_L16:
+      case R_C6000_ABS_H16:
+         reloc_value = symval + addend;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* PC-Relative relocations (calls and branches).                       */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_PCR_S21:
+      case R_C6000_PCR_S12:
+      case R_C6000_PCR_S10:
+      case R_C6000_PCR_S7:
+      {
+         /*------------------------------------------------------------------*/
+         /* Add SPC to segment address to get the PC.  Mask for exec-packet  */
+         /* boundary.                                                        */
+         /*------------------------------------------------------------------*/
+         int32_t opnd_p = (spc + segment_vaddr) & 0xffffffe0;
+         reloc_value = symval + addend - opnd_p;
+         break;
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* "Place"-relative relocations (TDEH).                                */
+      /*---------------------------------------------------------------------*/
+      /* These relocations occur in data and refer to a label that occurs    */
+      /* at some signed 32-bit offset from the place where the relocation    */
+      /* occurs.                                                             */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_PREL31:
+      {
+         /*------------------------------------------------------------------*/
+         /* Compute location of relocation entry and subtract it from the    */
+         /* address of the location being referenced (it is computed very    */
+         /* much like a PC-relative relocation, but it occurs in data and    */
+         /* is called a "place"-relative relocation).                        */
+         /*------------------------------------------------------------------*/
+         /* If this is an Elf32_Rel type relocation, then addend is assumed  */
+         /* to have been scaled when it was unpacked (field << 1).           */
+         /*------------------------------------------------------------------*/
+         /* For Elf32_Rela type relocations the addend is assumed to be a    */
+         /* signed 32-bit integer value.                                     */
+         /*------------------------------------------------------------------*/
+         /* Offset is not fetch-packet relative; doesn't need to be masked.  */
+         /*------------------------------------------------------------------*/
+         int32_t opnd_p = (spc + segment_vaddr);
+         reloc_value = symval + addend - opnd_p;
+         break;
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* Static-Base Relative relocations (near-DP).                         */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_SBR_U15_B:
+      case R_C6000_SBR_U15_H:
+      case R_C6000_SBR_U15_W:
+      case R_C6000_SBR_S16:
+      case R_C6000_SBR_L16_B:
+      case R_C6000_SBR_L16_H:
+      case R_C6000_SBR_L16_W:
+      case R_C6000_SBR_H16_B:
+      case R_C6000_SBR_H16_H:
+      case R_C6000_SBR_H16_W:
+         reloc_value = symval + addend - base_pointer;
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* R_C6000_DSBT_INDEX - uses value assigned by the dynamic loader to   */
+      /* be the DSBT index for this module as a scaled offset when           */
+      /* referencing the DSBT.  The DSBT base address is in symval and the   */
+      /* static base is in base_pointer.  DP-relative offset to slot in      */
+      /* DSBT is the offset of the DSBT relative to the DP plus the          */
+      /* scaled DSBT index into the DSBT.                                    */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_DSBT_INDEX:
+         reloc_value = ((symval + addend) - base_pointer) + (dsbt_index << 2);
+         break;
+
+      /*---------------------------------------------------------------------*/
+      /* Linux "import-as-own" copy relocation: after DSO initialization,    */
+      /* copy the named object from the DSO into the executable's BSS.       */
+      /* These are not yet supported, so R_C6000_COPY is handled like an     */
+      /* unrecognized relocation type.                                       */
+      /*---------------------------------------------------------------------*/
+      case R_C6000_COPY:
+      default:
+         DLIF_error(DLET_RELOC,
+                    "reloc_do called with invalid relocation type!\n");
+         /*------------------------------------------------------------------*/
+         /* Nothing can be written for this type.  Stop here so that the     */
+         /* helpers below do not add a misleading "relocation overflow!"     */
+         /* and further invalid-type errors for the same entry.              */
+         /*------------------------------------------------------------------*/
+         return;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Overflow checking.  Is relocation value out of range for the size and  */
+   /* type of the current relocation?                                        */
+   /*------------------------------------------------------------------------*/
+   if (rel_overflow(r_type, reloc_value))
+      DLIF_error(DLET_RELOC, "relocation overflow!\n");
+
+   /*------------------------------------------------------------------------*/
+   /* Move relocation value to appropriate offset for relocation field's     */
+   /* location.                                                              */
+   /*------------------------------------------------------------------------*/
+   reloc_value = pack_result(reloc_value, r_type);
+
+   /*------------------------------------------------------------------------*/
+   /* Mask packed result to the size of the relocation field.                */
+   /*------------------------------------------------------------------------*/
+   reloc_value = mask_result(reloc_value, r_type);
+
+   /*------------------------------------------------------------------------*/
+   /* If necessary, swap endianness of data at relocation address.           */
+   /*------------------------------------------------------------------------*/
+   if (wrong_endian)
+      DLIMP_change_endian32((int32_t*)(segment_buffer + spc));
+
+   /*------------------------------------------------------------------------*/
+   /* Write the relocated 4-byte packet back to the segment buffer.          */
+   /*------------------------------------------------------------------------*/
+   write_reloc_r(segment_buffer, spc, r_type, reloc_value);
+
+   /*------------------------------------------------------------------------*/
+   /* Change endianness of segment address back to original.                 */
+   /*------------------------------------------------------------------------*/
+   if (wrong_endian)
+      DLIMP_change_endian32((int32_t*)(segment_buffer + spc));
+
+#if LOADER_DEBUG || LOADER_PROFILE
+   /*------------------------------------------------------------------------*/
+   /* In profile mode, add elapsed time for this relocation to total time    */
+   /* spent doing relocations.                                               */
+   /*------------------------------------------------------------------------*/
+   if (profiling_on)
+      DLREL_total_reloc_time += (clock() - start_time);
+   if (debugging_on)
+      DLIF_trace("reloc_value = 0x%x\n", reloc_value);
+#endif
+}
+
+/*****************************************************************************/
+/* REL_UNPACK_ADDEND()                                                       */
+/*                                                                           */
+/*    Placeholder for extracting an addend from a relocation field           */
+/*    (Elf32_Rel type relocations).  The C6000 part of the dynamic loader    */
+/*    does not implement this: *addend is set to 0, an internal error is     */
+/*    reported and DLIF_exit(1) is called.  r_type and address are not       */
+/*    examined.                                                              */
+/*****************************************************************************/
+static void rel_unpack_addend(C60_RELOC_TYPE r_type,
+                              uint8_t *address,
+                              uint32_t *addend)
+{
+   (void)r_type;
+   (void)address;
+
+   /*------------------------------------------------------------------------*/
+   /* Hand back a defined addend before giving up.                           */
+   /*------------------------------------------------------------------------*/
+   *addend = 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Unpacking is necessarily target specific and has not been written for  */
+   /* C6000 yet, so report the condition and abort.                          */
+   /*------------------------------------------------------------------------*/
+   DLIF_error(DLET_RELOC,
+              "Internal Error: unpacking addend values from the relocation "
+              "field is not supported in the C6000 dynamic loader at this "
+              "time; aborting\n");
+   DLIF_exit(1);
+}
+
+/*****************************************************************************/
+/* REL_SWAP_ENDIAN()                                                         */
+/*                                                                           */
+/*    Tell the caller whether relocation fields of this module have to be    */
+/*    byte swapped.  The answer is TRUE exactly when the module's            */
+/*    wrong_endian flag is set; the relocation type plays no part.           */
+/*****************************************************************************/
+static BOOL rel_swap_endian(DLIMP_Dynamic_Module *dyn_module,
+                            C60_RELOC_TYPE r_type)
+{
+   (void)r_type;
+
+   return dyn_module->wrong_endian ? TRUE : FALSE;
+}
+
+/*****************************************************************************/
+/* REL_CHANGE_ENDIAN()                                                       */
+/*                                                                           */
+/*    Byte swap, in place, the 32-bit word that holds the relocation field   */
+/*    at the given address in the segment's data.  The same 32-bit swap is   */
+/*    applied whatever the relocation type is.                               */
+/*****************************************************************************/
+static void rel_change_endian(C60_RELOC_TYPE r_type, uint8_t *address)
+{
+   /*------------------------------------------------------------------------*/
+   /* On C6000, all instructions are 32-bits wide, so the field is always    */
+   /* handled as one 32-bit word.                                            */
+   /*------------------------------------------------------------------------*/
+   int32_t *field_word = (int32_t *)address;
+
+   (void)r_type;
+   DLIMP_change_endian32(field_word);
+}
+
+/*****************************************************************************/
+/* READ_REL_TABLE()                                                          */
+/*                                                                           */
+/*    Read in an Elf32_Rel type relocation table.  This function allocates   */
+/*    host memory for the table with DLIF_malloc().                          */
+/*                                                                           */
+/*    rel_table    - receives the address of the table; set to NULL when     */
+/*                   relnum is 0 or when the allocation fails                */
+/*    table_offset - file offset at which the table starts                   */
+/*    relnum       - number of entries in the table                          */
+/*    relent       - size in bytes of one entry                              */
+/*    fd           - file to read the table from                             */
+/*    wrong_endian - when TRUE every entry is byte swapped after reading     */
+/*                                                                           */
+/*    An allocation failure is reported through DLIF_error() instead of      */
+/*    reading the file into a NULL buffer.                                   */
+/*****************************************************************************/
+static void read_rel_table(struct Elf32_Rel **rel_table,
+                           int32_t table_offset,
+                           uint32_t relnum, uint32_t relent,
+                           LOADER_FILE_DESC *fd, BOOL wrong_endian)
+{
+   uint32_t i;   /* unsigned, to match relnum in the loop comparison */
+
+   /*------------------------------------------------------------------------*/
+   /* Give the caller a defined result on every path.                        */
+   /*------------------------------------------------------------------------*/
+   *rel_table = NULL;
+   if (relnum == 0) return;
+
+   *rel_table = (struct Elf32_Rel *)DLIF_malloc(relnum * relent);
+   if (*rel_table == NULL)
+   {
+      DLIF_error(DLET_RELOC,
+                 "Could not allocate memory for relocation table\n");
+      return;
+   }
+
+   DLIF_fseek(fd, table_offset, LOADER_SEEK_SET);
+   DLIF_fread(*rel_table, relnum, relent, fd);
+
+   /*------------------------------------------------------------------------*/
+   /* Entries read from a file of the opposite endianness are swapped one    */
+   /* by one.                                                                */
+   /*------------------------------------------------------------------------*/
+   if (wrong_endian)
+   {
+      for (i = 0; i < relnum; i++)
+         DLIMP_change_rel_endian(*rel_table + i);
+   }
+}
+
+/*****************************************************************************/
+/* PROCESS_REL_TABLE() */
+/* */
+/* Apply every Elf32_Rel relocation whose r_offset lies inside 'seg'. */
+/* Scanning stops at the first out-of-segment entry seen after at least */
+/* one in-segment entry has been found. *start_relidx is only read; it */
+/* is not written back. The addend is unpacked from the relocation field */
+/* through rel_unpack_addend(). */
+/* */
+/*****************************************************************************/
+static void process_rel_table(DLOAD_HANDLE handle,
+                              DLIMP_Loaded_Segment* seg,
+                              struct Elf32_Rel *rel_table,
+                              uint32_t relnum,
+                              int32_t *start_relidx,
+                              uint32_t ti_static_base,
+                              DLIMP_Dynamic_Module* dyn_module)
+{
+   const Elf32_Addr seg_lo = seg->input_vaddr;
+   const Elf32_Addr seg_hi = seg_lo + seg->phdr.p_memsz;
+   BOOL             in_run = FALSE;
+   int32_t          idx    = *start_relidx;
+
+   /*------------------------------------------------------------------------*/
+   /* An out-of-range starting index restarts the scan at the first entry. */
+   /*------------------------------------------------------------------------*/
+   if (idx >= relnum) idx = 0;
+
+   while (idx < relnum)
+   {
+      struct Elf32_Rel *entry         = &rel_table[idx++];
+      Elf32_Addr        r_symval      = 0;
+      C60_RELOC_TYPE    r_type;
+      int32_t           r_symid;
+      uint8_t          *reloc_address = NULL;
+      uint32_t          pc            = 0;
+      uint32_t          addend        = 0;
+
+      /*---------------------------------------------------------------------*/
+      /* Entry is outside this segment: skip it, or stop if the segment's */
+      /* relocations have already been seen. */
+      /*---------------------------------------------------------------------*/
+      if (entry->r_offset < seg_lo || entry->r_offset >= seg_hi)
+      {
+         if (in_run) break;
+         continue;
+      }
+
+      in_run  = TRUE;
+      r_type  = (C60_RELOC_TYPE)ELF32_R_TYPE(entry->r_info);
+      r_symid = ELF32_R_SYM(entry->r_info);
+
+      /*---------------------------------------------------------------------*/
+      /* Without a symbol definition the relocation is skipped; the lookup */
+      /* function generates the error. */
+      /*---------------------------------------------------------------------*/
+      if (!DLSYM_canonical_lookup(handle, r_symid, dyn_module, &r_symval))
+         continue;
+
+      /*---------------------------------------------------------------------*/
+      /* Locate the relocation field in the host copy of the segment, fix */
+      /* its byte order if required, then unpack the addend stored there. */
+      /*---------------------------------------------------------------------*/
+      pc            = entry->r_offset - seg->input_vaddr;
+      reloc_address = (uint8_t *)seg->host_address + pc;
+
+      if (rel_swap_endian(dyn_module, r_type))
+         rel_change_endian(r_type, reloc_address);
+
+      rel_unpack_addend(r_type, reloc_address, &addend);
+
+      /*---------------------------------------------------------------------*/
+      /* Perform the relocation itself. */
+      /*---------------------------------------------------------------------*/
+      reloc_do(r_type,
+               seg->phdr.p_vaddr,
+               seg->host_address,
+               addend,
+               r_symval,
+               pc,
+               dyn_module->wrong_endian,
+               ti_static_base,
+               dyn_module->dsbt_index);
+   }
+}
+
+/*****************************************************************************/
+/* READ_RELA_TABLE() */
+/* */
+/* Read an Elf32_Rela type relocation table (relanum entries of relaent */
+/* bytes each) from offset table_offset of fd into newly allocated host */
+/* memory. *rela_table is set to NULL when relanum is zero or when the */
+/* allocation fails; otherwise it points at the new buffer. Entries are */
+/* byte-swapped in place when wrong_endian is set. */
+/* */
+/*****************************************************************************/
+static void read_rela_table(struct Elf32_Rela **rela_table,
+                            int32_t table_offset,
+                            uint32_t relanum, uint32_t relaent,
+                            LOADER_FILE_DESC *fd, BOOL wrong_endian)
+{
+   uint32_t i;
+
+   *rela_table = NULL;
+   if (relanum == 0) return;
+
+   /*------------------------------------------------------------------------*/
+   /* Never hand a failed allocation to DLIF_fread(). Leaving the table */
+   /* NULL lets the caller's existing NULL checks skip it. */
+   /* NOTE(review): assumes DLIF_malloc() reports failure by returning NULL. */
+   /*------------------------------------------------------------------------*/
+   *rela_table = (struct Elf32_Rela *)DLIF_malloc(relanum * relaent);
+   if (*rela_table == NULL)
+   {
+      DLIF_error(DLET_RELOC,
+                 "Could not allocate host memory for Elf32_Rela "
+                 "relocation table\n");
+      return;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* NOTE(review): the results of DLIF_fseek()/DLIF_fread() are not */
+   /* checked here; a short read leaves part of the table uninitialized. */
+   /*------------------------------------------------------------------------*/
+   DLIF_fseek(fd, table_offset, LOADER_SEEK_SET);
+   DLIF_fread(*rela_table, relanum, relaent, fd);
+
+   if (wrong_endian)
+   {
+      for (i = 0; i < relanum; i++)
+         DLIMP_change_rela_endian(*rela_table + i);
+   }
+}
+
+/*****************************************************************************/
+/* PROCESS_RELA_TABLE() */
+/* */
+/* Apply every Elf32_Rela relocation whose r_offset lies inside 'seg'. */
+/* Scanning stops at the first out-of-segment entry seen after at least */
+/* one in-segment entry has been found. *start_relidx is only read; it */
+/* is not written back. The addend comes from the relocation entry. */
+/* */
+/*****************************************************************************/
+static void process_rela_table(DLOAD_HANDLE handle,
+                               DLIMP_Loaded_Segment *seg,
+                               struct Elf32_Rela *rela_table,
+                               uint32_t relanum,
+                               int32_t *start_relidx,
+                               uint32_t ti_static_base,
+                               DLIMP_Dynamic_Module *dyn_module)
+{
+   Elf32_Addr seg_start_addr = seg->input_vaddr;
+   Elf32_Addr seg_end_addr   = seg_start_addr + seg->phdr.p_memsz;
+   BOOL       found          = FALSE;
+   int32_t    relidx         = *start_relidx;
+
+   /*------------------------------------------------------------------------*/
+   /* If the given start reloc index is out of range, then start from the */
+   /* beginning of the given table. An index equal to relanum is already */
+   /* out of range, so the test is ">=" exactly as in process_rel_table(). */
+   /*------------------------------------------------------------------------*/
+   if (relidx >= relanum) relidx = 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Spin through RELA relocation table. */
+   /*------------------------------------------------------------------------*/
+   for ( ; relidx < relanum; relidx++)
+   {
+      /*---------------------------------------------------------------------*/
+      /* If the relocation offset falls within the segment, process it. */
+      /*---------------------------------------------------------------------*/
+      if (rela_table[relidx].r_offset >= seg_start_addr &&
+          rela_table[relidx].r_offset <  seg_end_addr)
+      {
+         Elf32_Addr     r_symval = 0;
+         C60_RELOC_TYPE r_type   =
+                      (C60_RELOC_TYPE)ELF32_R_TYPE(rela_table[relidx].r_info);
+         int32_t        r_symid  = ELF32_R_SYM(rela_table[relidx].r_info);
+
+         found = TRUE;
+
+         /*------------------------------------------------------------------*/
+         /* If symbol definition is not found, don't do the relocation. */
+         /* An error is generated by the lookup function. */
+         /*------------------------------------------------------------------*/
+         if (!DLSYM_canonical_lookup(handle, r_symid, dyn_module, &r_symval))
+            continue;
+
+         /*------------------------------------------------------------------*/
+         /* Perform actual relocation. This is a really wide function */
+         /* interface and could do with some encapsulation. */
+         /*------------------------------------------------------------------*/
+         reloc_do(r_type,
+                  seg->phdr.p_vaddr,
+                  seg->host_address,
+                  rela_table[relidx].r_addend,
+                  r_symval,
+                  rela_table[relidx].r_offset - seg->input_vaddr,
+                  dyn_module->wrong_endian,
+                  ti_static_base,
+                  dyn_module->dsbt_index);
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* Past the run of relocations that belong to this segment: stop. */
+      /*---------------------------------------------------------------------*/
+      else if (found)
+         break;
+   }
+}
+
+/*****************************************************************************/
+/* PROCESS_GOT_RELOCS() */
+/* */
+/* Process all GOT relocations. A file may carry both Elf32_Rel and */
+/* Elf32_Rela type relocations, so both tables are handled, one loaded */
+/* segment at a time. */
+/* */
+/*****************************************************************************/
+static void process_got_relocs(DLOAD_HANDLE handle,
+                               struct Elf32_Rel* rel_table, uint32_t relnum,
+                               struct Elf32_Rela* rela_table, uint32_t relanum,
+                               DLIMP_Dynamic_Module* dyn_module)
+{
+   DLIMP_Loaded_Segment *segments =
+       (DLIMP_Loaded_Segment*)(dyn_module->loaded_module->loaded_segments.buf);
+   const uint32_t seg_count = dyn_module->loaded_module->loaded_segments.size;
+   int32_t  rel_cursor  = 0;
+   int32_t  rela_cursor = 0;
+   uint32_t static_base = 0;
+   uint32_t s;
+
+   /*------------------------------------------------------------------------*/
+   /* Fetch __TI_STATIC_BASE for the table processing functions. A failed */
+   /* lookup is reported, and processing continues with the value left in */
+   /* static_base. */
+   /*------------------------------------------------------------------------*/
+   if (!DLSYM_lookup_local_symtab("__TI_STATIC_BASE", dyn_module->symtab,
+                                  dyn_module->symnum, &static_base))
+      DLIF_error(DLET_RELOC, "Could not resolve value of __TI_STATIC_BASE\n");
+
+   /*------------------------------------------------------------------------*/
+   /* Walk the loaded segments, applying RELA entries before REL entries. */
+   /*------------------------------------------------------------------------*/
+   for (s = 0; s < seg_count; s++)
+   {
+      DLIMP_Loaded_Segment *cur = &segments[s];
+
+      /*---------------------------------------------------------------------*/
+      /* Segments with no file content (uninitialized) are skipped. */
+      /*---------------------------------------------------------------------*/
+      if (cur->phdr.p_filesz == 0) continue;
+
+      if (rela_table != NULL)
+         process_rela_table(handle, cur, rela_table, relanum, &rela_cursor,
+                            static_base, dyn_module);
+
+      if (rel_table != NULL)
+         process_rel_table(handle, cur, rel_table, relnum, &rel_cursor,
+                           static_base, dyn_module);
+   }
+}
+
+/*****************************************************************************/
+/* PROCESS_PLTGOT_RELOCS() */
+/* */
+/* Process all PLTGOT relocation entries. The PLTGOT relocation table is */
+/* either Elf32_Rel or Elf32_Rela type, selected by reltype. All PLTGOT */
+/* relocations are guaranteed to belong to the same segment, so the first */
+/* entry's r_offset is used to pick that segment. */
+/* */
+/*****************************************************************************/
+static void process_pltgot_relocs(DLOAD_HANDLE handle,
+                                  void* plt_reloc_table,
+                                  int reltype,
+                                  uint32_t pltnum,
+                                  DLIMP_Dynamic_Module* dyn_module)
+{
+   DLIMP_Loaded_Segment *segments =
+       (DLIMP_Loaded_Segment*)(dyn_module->loaded_module->loaded_segments.buf);
+   const uint32_t seg_count = dyn_module->loaded_module->loaded_segments.size;
+   Elf32_Addr first_offset;
+   int32_t    cursor      = 0;
+   uint32_t   static_base = 0;
+   uint32_t   s;
+
+   /*------------------------------------------------------------------------*/
+   /* Offset of the first table entry, read through the matching type. */
+   /*------------------------------------------------------------------------*/
+   if (reltype == DT_REL)
+      first_offset = ((struct Elf32_Rel *)plt_reloc_table)->r_offset;
+   else
+      first_offset = ((struct Elf32_Rela *)plt_reloc_table)->r_offset;
+
+   /*------------------------------------------------------------------------*/
+   /* Fetch __TI_STATIC_BASE for the table processing functions. A failed */
+   /* lookup is reported, and processing continues with the value left in */
+   /* static_base. */
+   /*------------------------------------------------------------------------*/
+   if (!DLSYM_lookup_local_symtab("__TI_STATIC_BASE", dyn_module->symtab,
+                                  dyn_module->symnum, &static_base))
+      DLIF_error(DLET_RELOC, "Could not resolve value of __TI_STATIC_BASE\n");
+
+   /*------------------------------------------------------------------------*/
+   /* Find the first initialized segment containing the first relocation, */
+   /* process the whole table against it, and finish. */
+   /*------------------------------------------------------------------------*/
+   for (s = 0; s < seg_count; s++)
+   {
+      DLIMP_Loaded_Segment *cur = &segments[s];
+      const Elf32_Addr      lo  = cur->input_vaddr;
+      const Elf32_Addr      hi  = lo + cur->phdr.p_memsz;
+
+      /*---------------------------------------------------------------------*/
+      /* Relocations should not occur in uninitialized segments. */
+      /*---------------------------------------------------------------------*/
+      if (cur->phdr.p_filesz == 0) continue;
+
+      if (first_offset < lo || first_offset >= hi) continue;
+
+      if (reltype == DT_REL)
+         process_rel_table(handle, cur,
+                           (struct Elf32_Rel *)plt_reloc_table,
+                           pltnum, &cursor, static_base, dyn_module);
+      else
+         process_rela_table(handle, cur,
+                            (struct Elf32_Rela *)plt_reloc_table,
+                            pltnum, &cursor, static_base, dyn_module);
+      return;
+   }
+}
+
+/*****************************************************************************/
+/* RELOC_ENTSIZE_VALID() - TRUE when a DT_RELENT/DT_RELAENT value is usable */
+/* as a divisor: non-zero and not INT_MAX (the value the size checks */
+/* in DLREL_c60_relocate() treat as "tag not present"). */
+/*****************************************************************************/
+static BOOL reloc_entsize_valid(uint32_t entsize)
+{
+   return (entsize != 0 && entsize != INT_MAX) ? TRUE : FALSE;
+}
+
+/*****************************************************************************/
+/* RELOCATE() - Perform RELA and REL type relocations for given ELF object */
+/* file that we are in the process of loading and relocating. */
+/* */
+/* Reads the PLTGOT and GOT relocation tables described by the module's */
+/* dynamic table from fd, applies them, and frees the host copies. */
+/*****************************************************************************/
+void DLREL_c60_relocate(DLOAD_HANDLE handle,
+                        LOADER_FILE_DESC *fd, DLIMP_Dynamic_Module *dyn_module)
+{
+   struct Elf32_Dyn  *dyn_nugget     = dyn_module->dyntab;
+   struct Elf32_Rela *rela_table     = NULL;
+   struct Elf32_Rel  *rel_table      = NULL;
+   struct Elf32_Rela *rela_plt_table = NULL;
+   struct Elf32_Rel  *rel_plt_table  = NULL;
+
+   /*------------------------------------------------------------------------*/
+   /* Read the size of the relocation table (DT_RELASZ) and the size per */
+   /* relocation (DT_RELAENT) from the dynamic segment. */
+   /*------------------------------------------------------------------------*/
+   uint32_t relasz  = DLIMP_get_first_dyntag(DT_RELASZ, dyn_nugget);
+   uint32_t relaent = DLIMP_get_first_dyntag(DT_RELAENT, dyn_nugget);
+   uint32_t relanum = 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Read the size of the relocation table (DT_RELSZ) and the size per */
+   /* relocation (DT_RELENT) from the dynamic segment. */
+   /*------------------------------------------------------------------------*/
+   uint32_t relsz  = DLIMP_get_first_dyntag(DT_RELSZ, dyn_nugget);
+   uint32_t relent = DLIMP_get_first_dyntag(DT_RELENT, dyn_nugget);
+   uint32_t relnum = 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Read the size of the relocation table (DT_PLTRELSZ) and the type of */
+   /* the PLTGOT relocation table (DT_PLTREL): one of DT_REL or DT_RELA. */
+   /*------------------------------------------------------------------------*/
+   uint32_t pltrelsz  = DLIMP_get_first_dyntag(DT_PLTRELSZ, dyn_nugget);
+   int      pltreltyp = DLIMP_get_first_dyntag(DT_PLTREL, dyn_nugget);
+   uint32_t pltnum    = 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Find/record DSBT index associated with this module. */
+   /*------------------------------------------------------------------------*/
+   if (is_dsbt_module(dyn_module) &&
+       (dyn_module->dsbt_index == DSBT_INDEX_INVALID))
+      dyn_module->dsbt_index =
+                  DLIF_get_dsbt_index(dyn_module->loaded_module->file_handle);
+
+   /*------------------------------------------------------------------------*/
+   /* Read the PLTGOT relocation table from the file. */
+   /* The PLTGOT table is a subsection at the end of either the DT_REL or */
+   /* DT_RELA table. The size of the table it belongs to, DT_REL(A)SZ, */
+   /* includes the size of the PLTGOT table, so it must be adjusted so that */
+   /* the GOT relocation tables only contain actual GOT relocations. */
+   /* */
+   /* The adjustment is applied only when the enclosing size is present and */
+   /* at least as large as the PLTGOT size; otherwise the subtraction would */
+   /* wrap around (or disturb the INT_MAX "absent" value) and a bogus, huge */
+   /* GOT table would be read below. */
+   /*------------------------------------------------------------------------*/
+   if (pltrelsz != INT_MAX && pltrelsz != 0)
+   {
+      if (pltreltyp == DT_REL)
+      {
+         if (!reloc_entsize_valid(relent))
+            DLIF_error(DLET_RELOC,
+                       "DT_RELENT is missing or zero; cannot read PLTGOT "
+                       "relocations\n");
+         else
+         {
+            pltnum = pltrelsz/relent;
+            if (relsz != INT_MAX && relsz >= pltrelsz) relsz -= pltrelsz;
+            read_rel_table((&rel_plt_table),
+                           DLIMP_get_first_dyntag(DT_JMPREL, dyn_nugget),
+                           pltnum, relent, fd, dyn_module->wrong_endian);
+         }
+      }
+
+      else if (pltreltyp == DT_RELA)
+      {
+         if (!reloc_entsize_valid(relaent))
+            DLIF_error(DLET_RELOC,
+                       "DT_RELAENT is missing or zero; cannot read PLTGOT "
+                       "relocations\n");
+         else
+         {
+            pltnum = pltrelsz/relaent;
+            if (relasz != INT_MAX && relasz >= pltrelsz) relasz -= pltrelsz;
+            read_rela_table((&rela_plt_table),
+                            DLIMP_get_first_dyntag(DT_JMPREL, dyn_nugget),
+                            pltnum, relaent, fd, dyn_module->wrong_endian);
+         }
+      }
+
+      else
+      {
+         DLIF_error(DLET_RELOC,
+                    "DT_PLTREL is invalid: must be either %d or %d\n",
+                    DT_REL, DT_RELA);
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Read the DT_RELA GOT relocation table from the file. */
+   /*------------------------------------------------------------------------*/
+   if (relasz != INT_MAX && relasz != 0)
+   {
+      if (!reloc_entsize_valid(relaent))
+         DLIF_error(DLET_RELOC,
+                    "DT_RELAENT is missing or zero; cannot read DT_RELA "
+                    "relocations\n");
+      else
+      {
+         relanum = relasz/relaent;
+         read_rela_table(&rela_table,
+                         DLIMP_get_first_dyntag(DT_RELA, dyn_nugget),
+                         relanum, relaent, fd, dyn_module->wrong_endian);
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Read the DT_REL GOT relocation table from the file. */
+   /*------------------------------------------------------------------------*/
+   if (relsz != INT_MAX && relsz != 0)
+   {
+      if (!reloc_entsize_valid(relent))
+         DLIF_error(DLET_RELOC,
+                    "DT_RELENT is missing or zero; cannot read DT_REL "
+                    "relocations\n");
+      else
+      {
+         relnum = relsz/relent;
+         read_rel_table(&rel_table,
+                        DLIMP_get_first_dyntag(DT_REL, dyn_nugget),
+                        relnum, relent, fd, dyn_module->wrong_endian);
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Process the PLTGOT relocations. */
+   /*------------------------------------------------------------------------*/
+   if (rela_plt_table)
+      process_pltgot_relocs(handle, rela_plt_table, pltreltyp, pltnum,
+                            dyn_module);
+
+   if (rel_plt_table)
+      process_pltgot_relocs(handle, rel_plt_table, pltreltyp, pltnum,
+                            dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Process the GOT relocations. */
+   /*------------------------------------------------------------------------*/
+   if (rel_table || rela_table)
+      process_got_relocs(handle, rel_table, relnum, rela_table, relanum,
+                         dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Free memory used for ELF relocation table copies. */
+   /*------------------------------------------------------------------------*/
+   if (rela_table)     DLIF_free(rela_table);
+   if (rel_table)      DLIF_free(rel_table);
+   if (rela_plt_table) DLIF_free(rela_plt_table);
+   if (rel_plt_table)  DLIF_free(rel_plt_table);
+}
+
+/*****************************************************************************/
+/* UNIT TESTING INTERFACE */
+/*****************************************************************************/
+#ifdef UNIT_TEST
+/*---------------------------------------------------------------------------*/
+/* Unit-test entry to reloc_do(): the buffer at address_space serves both as */
+/* the segment's host address and, converted to uint32_t, as its address. */
+/* NOTE(review): wrong_endian is accepted but FALSE is always passed on to */
+/* reloc_do() - confirm this is intended. */
+/*---------------------------------------------------------------------------*/
+void unit_c60_reloc_do(C60_RELOC_TYPE r_type,
+                       uint8_t *address_space,
+                       uint32_t addend, uint32_t symval, uint32_t pc,
+                       uint32_t static_base, int wrong_endian,
+                       int32_t dsbt_index)
+{
+   uint32_t segment_vaddr = (uint32_t)address_space;
+
+   reloc_do(r_type,
+            segment_vaddr,
+            address_space,
+            addend,
+            symval,
+            pc,
+            FALSE,
+            static_base,
+            dsbt_index);
+}
+
+#if 0 /* Disabled: RELA type relocations have the addend in the relocation entry */
+void unit_c60_rel_unpack_addend(C60_RELOC_TYPE r_type,
+ uint8_t* address,
+ uint32_t* addend)
+{
+ rel_unpack_addend(r_type, address, addend);
+}
+#endif
+
+/*---------------------------------------------------------------------------*/
+/* Unit-test entry to rel_overflow(): returns its result unchanged. */
+/*---------------------------------------------------------------------------*/
+BOOL unit_c60_rel_overflow(C60_RELOC_TYPE r_type, int32_t reloc_value)
+{
+   BOOL overflowed = rel_overflow(r_type, reloc_value);
+
+   return overflowed;
+}
+#endif
+
diff --git a/src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.h b/src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.h
new file mode 100644
index 0000000..8ccd60e
--- /dev/null
+++ b/src/core/dsp/ocl_load/C60_DLOAD_REL/c60_reloc.h
@@ -0,0 +1,30 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* DLREL_c60_relocate() - Perform RELA and REL type relocations for the ELF */
+/* object file that is in the process of being loaded and relocated. */
+/*---------------------------------------------------------------------------*/
+void DLREL_c60_relocate(DLOAD_HANDLE handle, LOADER_FILE_DESC *fd,
+ DLIMP_Dynamic_Module *dyn_module);
diff --git a/src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.cpp b/src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.cpp
new file mode 100644
index 0000000..acde023
--- /dev/null
+++ b/src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.cpp
@@ -0,0 +1,825 @@
+/*
+* test_c60_reloc.cpp
+*
+* C6x Relocation Unit Tests.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#include "test_c60_reloc.h"
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/*****************************************************************************/
+/* C60_TestRelocDo */
+/* */
+/* Tests the C60 version of reloc_do. In cases where multiple relocation */
+/* types are implemented in the same way, only one type is tested. For */
+/* instance, R_C6000_xxx, R_C6000_yyy, and R_C6000_zzz are implemented in */
+/* the exact same way and, therefore, only R_C6000_xxx is tested. */
+/* */
+/* Each test follows the same flow: */
+/* 1. A valid instruction is constructed for the relocation type being */
+/* tested. */
+/* 2. Addend, symbol value, and pc are then created. */
+/* (NOTE: static base is not needed, and so 0 is passed. Also, same */
+/* endianness is assumed.) */
+/* 3. reloc_do() is called */
+/* 4. The result is checked. */
+/* 5. Repeat if variations should be considered. */
+/* */
+/*****************************************************************************/
+//void C60_TestRelocDo::test_R_C6000_NONE() { }
+
+void C60_TestRelocDo::test_R_C6000_ABS32()
+{
+    /* The whole 32-bit field is expected to end up as symval + addend. */
+    const uint32_t addend = 0x4;
+    const uint32_t symval = 0x2001000;
+    const uint32_t pc     = 0x0;
+    uint32_t address_space = 0x0;
+
+    unit_c60_reloc_do(R_C6000_ABS32, (uint8_t*) &address_space,
+                      addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(address_space, 0x2001004);
+}
+
+void C60_TestRelocDo::test_R_C6000_ABS16()
+{
+    /* The 16-bit field is expected to end up as symval + addend (0x1002). */
+    const uint32_t addend = 0x4;
+    const uint32_t symval = 0xFFE;
+    const uint32_t pc     = 0x0;
+    uint16_t address_space = 0x0;
+
+    unit_c60_reloc_do(R_C6000_ABS16, (uint8_t*) &address_space,
+                      addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(address_space, 0x1002);
+}
+
+void C60_TestRelocDo::test_R_C6000_ABS8()
+{
+    /* The 8-bit field is expected to end up as symval + addend (0x12). */
+    const uint32_t addend = 0x4;
+    const uint32_t symval = 0xE;
+    const uint32_t pc     = 0x0;
+    uint8_t address_space = 0x0;
+
+    unit_c60_reloc_do(R_C6000_ABS8, &address_space,
+                      addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(address_space, 0x12);
+}
+
+/*---------------------------------------------------------------------------*/
+/* PC-Relative Relocation Tests */
+/* */
+/* Our relocation handler assumes that the address of 'opcode' is where the */
+/* relocation is. Therefore, when creating a PCR test case, we will compute */
+/* a value for symval and pc in terms of &opcode. */
+/* */
+/*---------------------------------------------------------------------------*/
+void C60_TestRelocDo::test_R_C6000_PCR_S21()
+{
+    uint32_t opcode = 0x00000010;
+    uint32_t addend = 0x4;
+    /* Low 32 bits of &opcode with the low five bits cleared. The cast */
+    /* goes through uintptr_t so that it is also well-formed where pointers */
+    /* are wider than 32 bits. */
+    const uint32_t opcode_base = (uint32_t)(uintptr_t)&opcode & 0xffffffe0;
+    uint32_t symval = opcode_base + 0x50000;
+    uint32_t pc = 0x0;
+
+    /* Test #1 -- destination is forward from PC */
+    /* PCR21 offset = 0x14001 */
+    unit_c60_reloc_do(R_C6000_PCR_S21,
+                      (uint8_t*) &opcode,
+                      addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x00a00090);
+
+    /* Test #2 -- symval definition implies offset is negative */
+    /* PCR21 offset = 0x1d4001 (signed - negative) */
+    opcode = 0x00000010;
+    symval = opcode_base - 0xb0000;
+    unit_c60_reloc_do(R_C6000_PCR_S21,
+                      (uint8_t*) &opcode,
+                      addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x0ea00090);
+}
+
+void C60_TestRelocDo::test_R_C6000_PCR_S12()
+{
+    uint32_t opcode = 0x00002120; /* BNOP */
+    uint32_t addend = 0x4;
+    /* Low 32 bits of &opcode with the low five bits cleared. The cast */
+    /* goes through uintptr_t so that it is also well-formed where pointers */
+    /* are wider than 32 bits. */
+    const uint32_t opcode_base = (uint32_t)(uintptr_t)&opcode & 0xffffffe0;
+    uint32_t symval = opcode_base + 0x500;
+    uint32_t pc = 0x0;
+
+    /* Test #1 -- destination is forward from PC */
+    /* PCR12 offset = 0x141 */
+    unit_c60_reloc_do(R_C6000_PCR_S12,
+                      (uint8_t*) &opcode,
+                      addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x01412120);
+
+    /* Test #2 -- symval definition implies offset is negative */
+    /* PCR12 offset = 0xd41 (signed - negative) */
+    opcode = 0x00002120;
+    symval = opcode_base - 0xb00;
+    unit_c60_reloc_do(R_C6000_PCR_S12,
+                      (uint8_t*) &opcode,
+                      addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x0d412120);
+}
+
+void C60_TestRelocDo::test_R_C6000_PCR_S10()
+{
+    uint32_t opcode = 0x01001020; /* BDEC */
+    uint32_t addend = 0x4;
+    /* Low 32 bits of &opcode with the low five bits cleared. The cast */
+    /* goes through uintptr_t so that it is also well-formed where pointers */
+    /* are wider than 32 bits. */
+    const uint32_t opcode_base = (uint32_t)(uintptr_t)&opcode & 0xffffffe0;
+    uint32_t symval = opcode_base + 0x50;
+    uint32_t pc = 0x0;
+
+    /* Test #1 -- destination is forward from PC */
+    /* PCR10 offset = 0x15 */
+    unit_c60_reloc_do(R_C6000_PCR_S10,
+                      (uint8_t*) &opcode,
+                      addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x0102b020);
+
+    /* Test #2 -- symval definition implies offset is negative */
+    /* PCR10 offset = 0x3d5 (signed - negative): */
+    /* (0x3d5 << 13) | 0x01001020 == 0x017ab020 */
+    opcode = 0x01001020;
+    symval = opcode_base - 0xb0;
+    unit_c60_reloc_do(R_C6000_PCR_S10,
+                      (uint8_t*) &opcode,
+                      addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x017ab020);
+}
+
+void C60_TestRelocDo::test_R_C6000_PCR_S7()
+{
+    uint32_t opcode = 0x03006160; /* ADDKPC */
+    uint32_t addend = 0x4;
+    /* Low 32 bits of &opcode with the low five bits cleared. The cast */
+    /* goes through uintptr_t so that it is also well-formed where pointers */
+    /* are wider than 32 bits. */
+    const uint32_t opcode_base = (uint32_t)(uintptr_t)&opcode & 0xffffffe0;
+    uint32_t symval = opcode_base + 0x50;
+    uint32_t pc = 0x0;
+
+    /* Test #1 -- destination is forward from PC */
+    /* PCR7 offset = 0x15 */
+    unit_c60_reloc_do(R_C6000_PCR_S7,
+                      (uint8_t*) &opcode,
+                      addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x03156160);
+
+    /* Test #2 -- symval definition implies offset is negative */
+    /* PCR7 offset = 0x75 (signed - negative) */
+    opcode = 0x03006160;
+    symval = opcode_base - 0x30;
+    unit_c60_reloc_do(R_C6000_PCR_S7,
+                      (uint8_t*) &opcode,
+                      addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x03756160);
+}
+
+void C60_TestRelocDo::test_R_C6000_ABS_S16()
+{
+    /* symval + addend = 0x1002; the expected opcode carries that value */
+    /* shifted left by 7. */
+    const uint32_t addend = 0x4;
+    const uint32_t symval = 0xFFE;
+    const uint32_t pc     = 0x0;
+    uint32_t opcode = 0x03000028; /* MVK */
+
+    unit_c60_reloc_do(R_C6000_ABS_S16, (uint8_t*) &opcode,
+                      addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x03080128);
+}
+
+void C60_TestRelocDo::test_R_C6000_ABS_L16()
+{
+    /* symval + addend = 0x04561002; the expected opcode carries the low */
+    /* 16 bits (0x1002) shifted left by 7. */
+    const uint32_t addend = 0x4;
+    const uint32_t symval = 0x04560FFE;
+    const uint32_t pc     = 0x0;
+    uint32_t opcode = 0x03000028; /* MVKL */
+
+    unit_c60_reloc_do(R_C6000_ABS_L16, (uint8_t*) &opcode,
+                      addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x03080128);
+}
+
+void C60_TestRelocDo::test_R_C6000_ABS_H16()
+{
+    /* symval + addend = 0x04561002; the expected opcode carries the high */
+    /* 16 bits (0x0456) shifted left by 7. */
+    const uint32_t addend = 0x4;
+    const uint32_t symval = 0x04560FFE;
+    const uint32_t pc     = 0x0;
+    uint32_t opcode = 0x03000068; /* MVKH */
+
+    unit_c60_reloc_do(R_C6000_ABS_H16, (uint8_t*) &opcode,
+                      addend, symval, pc, 0, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x03022b68);
+}
+
+void C60_TestRelocDo::test_R_C6000_SBR_U15_B()
+{
+    const uint32_t static_base = 0x04000000;
+    const uint32_t sbr_offset  = 0x1357;
+    const uint32_t addend      = 0x0;
+    const uint32_t symval      = (static_base + sbr_offset);
+    const uint32_t pc          = 0x0;
+    uint32_t opcode = 0x0300002c; /* LDB */
+
+    /* unsigned 15-bit SBR offset = 0x1357 */
+    /* encoded in bits 22 - 8 */
+    unit_c60_reloc_do(R_C6000_SBR_U15_B, (uint8_t*) &opcode,
+                      addend, symval, pc, static_base, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x0313572c);
+}
+
+void C60_TestRelocDo::test_R_C6000_SBR_U15_H()
+{
+    const uint32_t static_base = 0x04000000;
+    const uint32_t sbr_offset  = 0x2246;
+    const uint32_t addend      = 0x0;
+    const uint32_t symval      = (static_base + sbr_offset);
+    const uint32_t pc          = 0x0;
+    uint32_t opcode = 0x0300004c; /* LDH */
+
+    /* unsigned 16-bit SBR offset = 0x2246 */
+    /* scaled 15-bit SBR offset = 0x1123 */
+    /* encoded in bits 22 - 8 */
+    unit_c60_reloc_do(R_C6000_SBR_U15_H, (uint8_t*) &opcode,
+                      addend, symval, pc, static_base, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x0311234c);
+}
+
+void C60_TestRelocDo::test_R_C6000_SBR_U15_W()
+{
+    const uint32_t static_base = 0x04000000;
+    const uint32_t sbr_offset  = 0x448c;
+    const uint32_t addend      = 0x0;
+    const uint32_t symval      = (static_base + sbr_offset);
+    const uint32_t pc          = 0x0;
+    uint32_t opcode = 0x0300006c; /* LDW */
+
+    /* unsigned 17-bit SBR offset = 0x448c */
+    /* scaled 15-bit SBR offset = 0x1123 */
+    /* encoded in bits 22 - 8 */
+    unit_c60_reloc_do(R_C6000_SBR_U15_W, (uint8_t*) &opcode,
+                      addend, symval, pc, static_base, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x0311236c);
+}
+
+void C60_TestRelocDo::test_R_C6000_SBR_S16()
+{
+    uint32_t opcode = 0x03000028; /* MVK */
+    uint32_t addend = 0x0;
+    uint32_t static_base = 0x04000000;
+    uint32_t symval = (static_base + 0x1357);
+    uint32_t pc = 0x0;
+
+    /* Test #1 positive signed 16-bit offset */
+    /* 16-bit SBR offset = 0x1357 */
+    /* encoded in bits 22-7 of opcode */
+    unit_c60_reloc_do(R_C6000_SBR_S16,
+                      (uint8_t*) &opcode,
+                      addend, symval, pc, static_base, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x0309aba8);
+
+    /* Test #2 negative signed 16-bit offset */
+    /* 16-bit SBR offset = 0xeca9 (-0x1357) */
+    /* encoded in bits 22-7 of opcode */
+    /* Start again from the pristine instruction, as the other two-part */
+    /* tests do, so the result does not depend on bits left by test #1. */
+    opcode = 0x03000028;
+    symval = (static_base - 0x1357);
+    unit_c60_reloc_do(R_C6000_SBR_S16,
+                      (uint8_t*) &opcode,
+                      addend, symval, pc, static_base, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x037654a8);
+}
+
+void C60_TestRelocDo::test_R_C6000_SBR_L16_B()
+{
+    const uint32_t static_base = 0x04000000;
+    const uint32_t sbr_offset  = 0x11123;
+    const uint32_t addend      = 0x0;
+    const uint32_t symval      = (static_base + sbr_offset);
+    const uint32_t pc          = 0x0;
+    uint32_t opcode = 0x03000028; /* MVKL */
+
+    /* 16-bit SBR offset = 0x1123 */
+    /* encoded in bits 22-7 of opcode */
+    unit_c60_reloc_do(R_C6000_SBR_L16_B, (uint8_t*) &opcode,
+                      addend, symval, pc, static_base, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x030891a8);
+}
+
+void C60_TestRelocDo::test_R_C6000_SBR_L16_H()
+{
+    const uint32_t static_base = 0x04000000;
+    const uint32_t sbr_offset  = 0x12246;
+    const uint32_t addend      = 0x0;
+    const uint32_t symval      = (static_base + sbr_offset);
+    const uint32_t pc          = 0x0;
+    uint32_t opcode = 0x03000028; /* MVKL */
+
+    /* 17-bit SBR offset = 0x12246 */
+    /* scaled SBR offset = 0x9123 */
+    /* encoded in bits 22-7 of opcode */
+    unit_c60_reloc_do(R_C6000_SBR_L16_H, (uint8_t*) &opcode,
+                      addend, symval, pc, static_base, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x034891a8);
+}
+
+void C60_TestRelocDo::test_R_C6000_SBR_L16_W()
+{
+    const uint32_t static_base = 0x04000000;
+    const uint32_t sbr_offset  = 0x1448c;
+    const uint32_t addend      = 0x0;
+    const uint32_t symval      = (static_base + sbr_offset);
+    const uint32_t pc          = 0x0;
+    uint32_t opcode = 0x03000028; /* MVKL */
+
+    /* 18-bit SBR offset = 0x1448c */
+    /* scaled SBR offset = 0x5123 */
+    /* encoded in bits 22-7 of opcode */
+    unit_c60_reloc_do(R_C6000_SBR_L16_W, (uint8_t*) &opcode,
+                      addend, symval, pc, static_base, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x032891a8);
+}
+
+/* R_C6000_SBR_H16_B: upper 16 bits of the byte offset from the static      */
+/* base, encoded in bits 22-7 of the opcode.                                */
+void C60_TestRelocDo::test_R_C6000_SBR_H16_B()
+{
+    const uint32_t sb_addr  = 0x04000000;
+    const uint32_t sym_addr = sb_addr + 0x357448c;
+    uint32_t       opcode   = 0x03000068;    /* MVKH */
+
+    /* offset 0x357448c >> 16 = 0x357 -> 0x357 << 7 = 0x0001ab80 */
+    unit_c60_reloc_do(R_C6000_SBR_H16_B, (uint8_t*) &opcode,
+                      0x0, sym_addr, 0x0, sb_addr, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x0301abe8);
+}
+
+/* R_C6000_SBR_H16_H: upper 16 bits of the halfword-scaled offset from the  */
+/* static base, encoded in bits 22-7 of the opcode.                         */
+void C60_TestRelocDo::test_R_C6000_SBR_H16_H()
+{
+    const uint32_t sb_addr  = 0x04000000;
+    const uint32_t sym_addr = sb_addr + 0x357448c;
+    uint32_t       opcode   = 0x03000068;    /* MVKH */
+
+    /* offset 0x357448c >> 1 = 0x1aba246; upper 16 bits = 0x1ab */
+    /* 0x1ab << 7 = 0x0000d580                                   */
+    unit_c60_reloc_do(R_C6000_SBR_H16_H, (uint8_t*) &opcode,
+                      0x0, sym_addr, 0x0, sb_addr, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x0300d5e8);
+}
+
+/* R_C6000_SBR_H16_W: upper 16 bits of the word-scaled offset from the      */
+/* static base, encoded in bits 22-7 of the opcode.                         */
+void C60_TestRelocDo::test_R_C6000_SBR_H16_W()
+{
+    const uint32_t sb_addr  = 0x04000000;
+    const uint32_t sym_addr = sb_addr + 0x357448c;
+    uint32_t       opcode   = 0x03000068;    /* MVKH */
+
+    /* offset 0x357448c >> 2 = 0x0d5d123; upper 16 bits = 0x0d5 */
+    /* 0x0d5 << 7 = 0x00006a80                                   */
+    unit_c60_reloc_do(R_C6000_SBR_H16_W, (uint8_t*) &opcode,
+                      0x0, sym_addr, 0x0, sb_addr, 0, 0);
+
+    TS_ASSERT_EQUALS(opcode, 0x03006ae8);
+}
+
+/* The DSBT table is accessed via DP-relative addressing with an LDW        */
+/* instruction, but DSBT_INDEX is an index into the DSBT table rather than  */
+/* a byte offset.  The test passes index 3 with the symbol placed exactly   */
+/* at the static base and expects the raw index in the opcode's offset      */
+/* field: 3 << 8 = 0x300.                                                   */
+/* NOTE(review): presumably the LDW word scaling turns the index into a     */
+/* 4-byte-per-entry offset at run time - confirm against the C6000 EABI.    */
+void C60_TestRelocDo::test_R_C6000_DSBT_INDEX()
+{
+    const uint32_t sb_addr    = 0x04000000;
+    const int32_t  table_slot = 3;           /* dsbt_index argument */
+    uint32_t       opcode     = 0x0300006c;  /* LDW */
+
+    unit_c60_reloc_do(R_C6000_DSBT_INDEX, (uint8_t*) &opcode,
+                      0x0, sb_addr, 0x0, sb_addr, 0, table_slot);
+
+    TS_ASSERT_EQUALS(opcode, 0x0300036c);
+}
+
+/*****************************************************************************/
+/* C60_TestRelUnpackAddend */
+/* */
+/* Tests the C60 rel_unpack_addend function. */
+/* */
+/* In cases where the addends are unpacked in the same way, only one is */
+/* tested. */
+/* */
+/* All tests follow the same flow: */
+/* */
+/* 1. Create a valid instruction for the relocation type, where the addend */
+/* is packed in the instruction. */
+/* 2. Call rel_unpack_addend(). */
+/* 3. Check that the addend is correct. */
+/* */
+/* Relocations may be tested multiple times to handle variations, such as */
+/* positive/negative addends, extra bits depending on the encoding, etc. */
+/* */
+/* NOTE!! C60 ONLY SUPPORTS RELA TYPE RELOCATIONS, SO ADDEND FIELD IS STORED */
+/* IN RELOCATION ENTRY ITSELF. */
+/*****************************************************************************/
+#if 0
+/* Disabled (enclosed in #if 0).  Unpacks the addend from a 32-bit word and  */
+/* expects it to come back identical to the stored value.                   */
+void C60_TestRelUnpackAddend::test_R_C6000_ABS32()
+{
+ uint32_t address_space=0xFEDCBA9;
+ uint32_t addend;
+
+ unit_c60_rel_unpack_addend(R_C6000_ABS32,
+ (uint8_t*)&address_space,
+ &addend);
+
+ TS_ASSERT_EQUALS(addend, address_space);
+}
+
+/* Disabled (enclosed in #if 0).  Unpacks the addend from a 16-bit field:   */
+/* 0x7FFF is expected back unchanged, 0x8000 is expected sign-extended to   */
+/* 0xFFFF8000.                                                              */
+void C60_TestRelUnpackAddend::test_R_C6000_ABS16()
+{
+ uint16_t address_space=0x7FFF;
+ uint32_t addend;
+
+ unit_c60_rel_unpack_addend(R_C6000_ABS16,
+ (uint8_t*)&address_space,
+ &addend);
+
+ TS_ASSERT_EQUALS(addend, 0x7FFF);
+
+ /* Top bit of the 16-bit field set: expect sign extension. */
+ address_space = 0x8000;
+
+ unit_c60_rel_unpack_addend(R_C6000_ABS16,
+ (uint8_t*)&address_space,
+ &addend);
+
+ TS_ASSERT_EQUALS(addend, 0xFFFF8000);
+}
+#endif
+
+
+/*****************************************************************************/
+/* C60_TestRelOverflow */
+/* */
+/* Test the C60 rel_overflow function. */
+/* */
+/* In each case, we test the upper and lower bounds of each relocation type. */
+/* Only relocation types where the overflow is checked in rel_overflow are */
+/* considered. In most cases four tests are performed to test the upper and */
+/* lower bounds (1 pass and 1 fail for each). */
+/* */
+/* NOTE!! HAVEN'T REFACTORED OVERFLOW CHECK OUT OF RELOCATION HANDLERS FOR */
+/* C60, SO OVERFLOW SHOULD BE TESTED AS PART OF THE RELOC DO(???) */
+/* */
+/*****************************************************************************/
+/* R_C6000_ABS16 bounds: 0xFFFF and -0x8000 must be accepted (0), the next  */
+/* value beyond each bound must be reported as overflow (1).                */
+void C60_TestRelOverflow::test_R_C6000_ABS16()
+{
+    int rval = unit_c60_rel_overflow(R_C6000_ABS16, 0xFFFF);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_ABS16, 0x10000);
+    TS_ASSERT_EQUALS(rval, 1);
+
+    rval = unit_c60_rel_overflow(R_C6000_ABS16, -0x8000);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_ABS16, -0x8001);
+    TS_ASSERT_EQUALS(rval, 1);
+}
+
+/* R_C6000_ABS8 bounds: 0xFF and -0x80 must be accepted (0), 0x100 and      */
+/* -0x81 must be reported as overflow (1).                                  */
+void C60_TestRelOverflow::test_R_C6000_ABS8()
+{
+    int rval = unit_c60_rel_overflow(R_C6000_ABS8, 0xFF);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_ABS8, 0x100);
+    TS_ASSERT_EQUALS(rval, 1);
+
+    rval = unit_c60_rel_overflow(R_C6000_ABS8, -0x80);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_ABS8, -0x81);
+    TS_ASSERT_EQUALS(rval, 1);
+}
+
+/* R_C6000_PCR_S21 bounds: 0x3FFFFC and -0x400000 must be accepted (0),     */
+/* 0x400000 and -0x400001 must be reported as overflow (1).                 */
+void C60_TestRelOverflow::test_R_C6000_PCR_S21()
+{
+    int rval = unit_c60_rel_overflow(R_C6000_PCR_S21, 0x3FFFFC);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_PCR_S21, 0x400000);
+    TS_ASSERT_EQUALS(rval, 1);
+
+    rval = unit_c60_rel_overflow(R_C6000_PCR_S21, -0x400000);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_PCR_S21, -0x400001);
+    TS_ASSERT_EQUALS(rval, 1);
+}
+
+/* R_C6000_PCR_S12 bounds: 0x1FFC and -0x2000 must be accepted (0),         */
+/* 0x2000 and -0x2001 must be reported as overflow (1).                     */
+void C60_TestRelOverflow::test_R_C6000_PCR_S12()
+{
+    int rval = unit_c60_rel_overflow(R_C6000_PCR_S12, 0x1FFC);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_PCR_S12, 0x2000);
+    TS_ASSERT_EQUALS(rval, 1);
+
+    rval = unit_c60_rel_overflow(R_C6000_PCR_S12, -0x2000);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_PCR_S12, -0x2001);
+    TS_ASSERT_EQUALS(rval, 1);
+}
+
+/* R_C6000_PCR_S10 bounds: 0x7FC and -0x800 must be accepted (0),           */
+/* 0x800 and -0x801 must be reported as overflow (1).                       */
+void C60_TestRelOverflow::test_R_C6000_PCR_S10()
+{
+    int rval = unit_c60_rel_overflow(R_C6000_PCR_S10, 0x7FC);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_PCR_S10, 0x800);
+    TS_ASSERT_EQUALS(rval, 1);
+
+    rval = unit_c60_rel_overflow(R_C6000_PCR_S10, -0x800);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_PCR_S10, -0x801);
+    TS_ASSERT_EQUALS(rval, 1);
+}
+
+/* R_C6000_PCR_S7 bounds: 0xFC and -0x100 must be accepted (0),             */
+/* 0x100 and -0x101 must be reported as overflow (1).                       */
+void C60_TestRelOverflow::test_R_C6000_PCR_S7()
+{
+    int rval = unit_c60_rel_overflow(R_C6000_PCR_S7, 0xFC);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_PCR_S7, 0x100);
+    TS_ASSERT_EQUALS(rval, 1);
+
+    rval = unit_c60_rel_overflow(R_C6000_PCR_S7, -0x100);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_PCR_S7, -0x101);
+    TS_ASSERT_EQUALS(rval, 1);
+}
+
+/* R_C6000_SBR_S16 bounds: 0x7FFF and -0x8000 must be accepted (0),         */
+/* 0x8000 and -0x8001 must be reported as overflow (1).                     */
+void C60_TestRelOverflow::test_R_C6000_SBR_S16()
+{
+    int rval = unit_c60_rel_overflow(R_C6000_SBR_S16, 0x7FFF);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_SBR_S16, 0x8000);
+    TS_ASSERT_EQUALS(rval, 1);
+
+    rval = unit_c60_rel_overflow(R_C6000_SBR_S16, -0x8000);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_SBR_S16, -0x8001);
+    TS_ASSERT_EQUALS(rval, 1);
+}
+
+/* R_C6000_ABS_S16 bounds: 0x7FFF and -0x8000 must be accepted (0),         */
+/* 0x8000 and -0x8001 must be reported as overflow (1).                     */
+void C60_TestRelOverflow::test_R_C6000_ABS_S16()
+{
+    int rval = unit_c60_rel_overflow(R_C6000_ABS_S16, 0x7FFF);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_ABS_S16, 0x8000);
+    TS_ASSERT_EQUALS(rval, 1);
+
+    rval = unit_c60_rel_overflow(R_C6000_ABS_S16, -0x8000);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_ABS_S16, -0x8001);
+    TS_ASSERT_EQUALS(rval, 1);
+}
+
+/* R_C6000_SBR_U15_B upper bound: 0x7FFF must be accepted (0), 0x8000 must  */
+/* be reported as overflow (1).                                             */
+void C60_TestRelOverflow::test_R_C6000_SBR_U15_B()
+{
+    int rval = unit_c60_rel_overflow(R_C6000_SBR_U15_B, 0x7FFF);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_SBR_U15_B, 0x8000);
+    TS_ASSERT_EQUALS(rval, 1);
+}
+
+/* R_C6000_SBR_U15_H: 0xFFFE must be accepted (0), 0xFFFF must be rejected  */
+/* (1).                                                                     */
+/* NOTE(review): 0xFFFF is odd, so the rejection may stem from halfword     */
+/* alignment rather than range; 0x10000 would isolate the range check.      */
+void C60_TestRelOverflow::test_R_C6000_SBR_U15_H()
+{
+    int rval = unit_c60_rel_overflow(R_C6000_SBR_U15_H, 0xFFFE);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_SBR_U15_H, 0xFFFF);
+    TS_ASSERT_EQUALS(rval, 1);
+}
+
+/* R_C6000_SBR_U15_W: 0x1FFFC must be accepted (0), 0x1FFFD must be         */
+/* rejected (1).                                                            */
+/* NOTE(review): 0x1FFFD is not word aligned, so the rejection may stem     */
+/* from alignment rather than range; 0x20000 would isolate the range check. */
+void C60_TestRelOverflow::test_R_C6000_SBR_U15_W()
+{
+    int rval = unit_c60_rel_overflow(R_C6000_SBR_U15_W, 0x1FFFC);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_SBR_U15_W, 0x1FFFD);
+    TS_ASSERT_EQUALS(rval, 1);
+}
+
+/* R_C6000_DSBT_INDEX: 0x1FFFC must be accepted (0), 0x1FFFD must be        */
+/* rejected (1) - the same boundary values as the SBR_U15_W test.           */
+void C60_TestRelOverflow::test_R_C6000_DSBT_INDEX()
+{
+    int rval = unit_c60_rel_overflow(R_C6000_DSBT_INDEX, 0x1FFFC);
+    TS_ASSERT_EQUALS(rval, 0);
+
+    rval = unit_c60_rel_overflow(R_C6000_DSBT_INDEX, 0x1FFFD);
+    TS_ASSERT_EQUALS(rval, 1);
+}
+
diff --git a/src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.h b/src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.h
new file mode 100644
index 0000000..67a437d
--- /dev/null
+++ b/src/core/dsp/ocl_load/C60_DLOAD_REL/test_c60_reloc.h
@@ -0,0 +1,101 @@
+/*
+* test_c60_reloc.h
+*
+* Specification of C6x-specific relocation handler unit tests.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef _TEST_C60_RELOC_H_
+#define _TEST_C60_RELOC_H_
+#include "c60_elf32.h"
+#include <cxxtest/TestSuite.h>
+
+/* C-linkage entry points exercised by the test suites below.  They are     */
+/* only declared here; their definitions live outside this header.          */
+extern "C" {
+
+/* Apply relocation r_type to the instruction/data stored at address. */
+void unit_c60_reloc_do(C60_RELOC_TYPE r_type, uint8_t* address,
+                       uint32_t addend, uint32_t symval, uint32_t pc,
+                       uint32_t base_pointer, int wrong_endian,
+                       int32_t dsbt_index);
+
+/* Extract the addend for r_type from the bytes at address into *addend. */
+void unit_c60_rel_unpack_addend(C60_RELOC_TYPE r_type,
+                                uint8_t* address,
+                                uint32_t* addend);
+
+/* The tests expect 0 when reloc_value fits r_type and 1 on overflow. */
+int unit_c60_rel_overflow(C60_RELOC_TYPE r_type, int32_t reloc_value);
+
+}
+
+/* CxxTest suite for unit_c60_reloc_do(): one test method per C6000         */
+/* relocation type.  Each method patches a sample opcode and compares the   */
+/* result against the expected encoding.                                    */
+class C60_TestRelocDo : public CxxTest::TestSuite
+{
+ public:
+ /* Absolute relocations */
+ void test_R_C6000_ABS32();
+ void test_R_C6000_ABS16();
+ void test_R_C6000_ABS8();
+ /* PC-relative relocations */
+ void test_R_C6000_PCR_S21();
+ void test_R_C6000_PCR_S12();
+ void test_R_C6000_PCR_S10();
+ void test_R_C6000_PCR_S7();
+ /* Absolute 16-bit constant relocations */
+ void test_R_C6000_ABS_S16();
+ void test_R_C6000_ABS_L16();
+ void test_R_C6000_ABS_H16();
+ /* Static-base-relative relocations */
+ void test_R_C6000_SBR_U15_B();
+ void test_R_C6000_SBR_U15_H();
+ void test_R_C6000_SBR_U15_W();
+ void test_R_C6000_SBR_S16();
+ void test_R_C6000_SBR_L16_B();
+ void test_R_C6000_SBR_L16_H();
+ void test_R_C6000_SBR_L16_W();
+ void test_R_C6000_SBR_H16_B();
+ void test_R_C6000_SBR_H16_H();
+ void test_R_C6000_SBR_H16_W();
+ /* DSBT table index relocation */
+ void test_R_C6000_DSBT_INDEX();
+};
+
+/* CxxTest suite for unit_c60_rel_overflow(): one test method per           */
+/* relocation type, probing values at and just beyond the accepted range.   */
+class C60_TestRelOverflow : public CxxTest::TestSuite
+{
+ public:
+ void test_R_C6000_ABS16();
+ void test_R_C6000_ABS8();
+ void test_R_C6000_PCR_S21();
+ void test_R_C6000_PCR_S12();
+ void test_R_C6000_PCR_S10();
+ void test_R_C6000_PCR_S7();
+ void test_R_C6000_SBR_S16();
+ void test_R_C6000_ABS_S16();
+ /* The following tests exercise only the upper bound. */
+ void test_R_C6000_SBR_U15_B();
+ void test_R_C6000_SBR_U15_H();
+ void test_R_C6000_SBR_U15_W();
+ void test_R_C6000_DSBT_INDEX();
+};
+
+#endif /* _TEST_C60_RELOC_H_ */
diff --git a/src/core/dsp/ocl_load/CMakeLists.txt b/src/core/dsp/ocl_load/CMakeLists.txt
new file mode 100644
index 0000000..a459542
--- /dev/null
+++ b/src/core/dsp/ocl_load/CMakeLists.txt
@@ -0,0 +1,26 @@
+# Build the C60 dynamic loader sources into the static library "oclload".
+
+# Header search path: this directory plus each loader component directory.
+include_directories (.
+ C60_DLOAD_REL
+ C60_DLOAD_DYN
+ DLOAD_SYM
+ DLOAD
+ DLOAD_API
+ DLWRAPPER
+ )
+
+# Appended to the C flags: position-independent code, the C60_TARGET and
+# LOADER_DEBUG defines, debug info, and suppression of the pointer<->int
+# cast warnings.
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -DC60_TARGET -DLOADER_DEBUG -g -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast")
+
+# Sources compiled into the library.
+set(OCL_LOAD_SRC_FILES
+ ocl_load.c
+ C60_DLOAD_REL/c60_reloc.c
+ C60_DLOAD_DYN/c60_dynamic.c
+ DLOAD_SYM/symtab.c
+ DLOAD/ArrayList.c
+ DLOAD/dload.c
+ DLOAD/elf32.c
+ DLOAD/dload_endian.c
+)
+
+add_library(oclload STATIC ${OCL_LOAD_SRC_FILES})
+
+# NOTE(review): LIBRARY_OUTPUT_PATH is set after add_library(); confirm that
+# the archive really ends up in ${CMAKE_BINARY_DIR}/lib.
+SET(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}/lib)
+
+
diff --git a/src/core/dsp/ocl_load/DLOAD/ArrayList.c b/src/core/dsp/ocl_load/DLOAD/ArrayList.c
new file mode 100644
index 0000000..4452bfc
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/ArrayList.c
@@ -0,0 +1,122 @@
+/*
+* ArrayList.c
+*
+* Array_List is a C implementation of a C++ vector class.
+*
+* This class emulates a resizable array along the lines of a C++
+* vector or Java ArrayList class in C, and uses the convention
+* of passing a pointer to the current "object" as the first
+* argument.
+*
+* Usage is defined as follows:
+*
+* Array_List obj;
+* AL_initialize(&obj, sizeof(type_name), initial_num_elem);
+*
+* ...
+*
+* type_name *ptr = (type_name*)(obj.buf);
+* for(i = 0; i < AL_size(&obj); i++)
+* do_something_to(ptr[i]);
+* type_name to_append = ...;
+* AL_append(&obj, &to_append);
+*
+* ...
+*
+* AL_destroy(&obj);
+*
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#include <inttypes.h>
+#include <string.h>
+#include "ArrayList.h"
+#include "dload_api.h"
+
+/*****************************************************************************/
+/* AL_INITIALIZE() - Initialize a newly created Array_List object.           */
+/*                                                                           */
+/*    obj       - list to initialize; any previous contents are ignored.     */
+/*    type_size - size in bytes of one element.                              */
+/*    num_elem  - initial capacity in elements; values below 1 are treated   */
+/*                as 1 so the list always starts with room for one element   */
+/*                and AL_append()'s capacity doubling has a non-zero base.   */
+/*                                                                           */
+/*    The DLIF_malloc() result is stored unchecked, so obj->buf is NULL if   */
+/*    the allocation failed.                                                 */
+/*****************************************************************************/
+void AL_initialize(Array_List* obj, int32_t type_size, int32_t num_elem)
+{
+    /* A zero or negative count would give a bogus allocation size and a */
+    /* capacity that never grows when doubled.                           */
+    if (num_elem <= 0) num_elem = 1;
+
+    obj->buf         = DLIF_malloc(type_size * num_elem);
+    obj->type_size   = type_size;
+    obj->size        = 0;
+    obj->buffer_size = num_elem;
+}
+
+/*****************************************************************************/
+/* AL_APPEND() - Append an element to the end of an Array_List.              */
+/*                                                                           */
+/*    obj       - list to append to.                                         */
+/*    to_append - address of the element; obj->type_size bytes are copied.   */
+/*                                                                           */
+/*    When the buffer is full (or was never allocated) a larger buffer is    */
+/*    obtained first.  If that allocation fails, or the new capacity cannot  */
+/*    be represented, the list is left untouched and the element is NOT      */
+/*    appended; callers can detect this by comparing AL_size() before and    */
+/*    after the call.                                                        */
+/*****************************************************************************/
+void AL_append(Array_List* obj, void* to_append)
+{
+    /*------------------------------------------------------------------------*/
+    /* Grow the buffer if we need more space to add the new data to it.       */
+    /*------------------------------------------------------------------------*/
+    if (obj->buf == NULL || obj->size >= obj->buffer_size)
+    {
+        int32_t new_capacity = obj->buffer_size;
+        void*   new_buffer   = NULL;
+
+        if (new_capacity <= 0)
+            new_capacity = 1;
+        else if (obj->size >= new_capacity)
+        {
+            /* Doubling must not overflow the signed capacity. */
+            if (new_capacity > INT32_MAX / 2) return;
+            new_capacity *= 2;
+        }
+
+        /* The byte count passed to the allocator must not overflow either. */
+        if (obj->type_size > 0 && new_capacity > INT32_MAX / obj->type_size)
+            return;
+
+        /* Allocate into a temporary so a failure leaves obj intact. */
+        new_buffer = DLIF_malloc(new_capacity * obj->type_size);
+        if (new_buffer == NULL) return;
+
+        if (obj->buf != NULL)
+        {
+            memcpy(new_buffer, obj->buf, obj->size * obj->type_size);
+            DLIF_free(obj->buf);
+        }
+
+        obj->buf         = new_buffer;
+        obj->buffer_size = new_capacity;
+    }
+
+    /*------------------------------------------------------------------------*/
+    /* There is now space in the buffer: copy the new element in right after  */
+    /* the data that is already there.                                        */
+    /*------------------------------------------------------------------------*/
+    memcpy(((uint8_t*)obj->buf) + obj->type_size * obj->size, to_append,
+           obj->type_size);
+    obj->size++;
+}
+
+/*****************************************************************************/
+/* AL_SIZE() - Report how many elements are currently stored in an           */
+/*             Array_List (not its allocated capacity).                      */
+/*****************************************************************************/
+int32_t AL_size(Array_List* obj)
+{
+    const int32_t element_count = obj->size;
+    return element_count;
+}
+
+/*****************************************************************************/
+/* AL_DESTROY() - Free up memory associated with an Array_List that is no    */
+/*                longer in use.                                             */
+/*                                                                           */
+/*    The element buffer is released and the list is marked empty: buf is    */
+/*    cleared so no dangling pointer survives, and size drops to 0 so        */
+/*    AL_size() no longer reports elements that live in freed memory.        */
+/*****************************************************************************/
+void AL_destroy(Array_List* obj)
+{
+    DLIF_free(obj->buf);
+    obj->buf  = NULL;
+    obj->size = 0;
+}
diff --git a/src/core/dsp/ocl_load/DLOAD/ArrayList.h b/src/core/dsp/ocl_load/DLOAD/ArrayList.h
new file mode 100644
index 0000000..2c03788
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/ArrayList.h
@@ -0,0 +1,92 @@
+/*
+* ArrayList.h
+*
+* This implementation of ArrayList is a replacement for the C++
+* vector class in C.
+*
+* This class emulates a resizable array along the lines of a C++
+* vector or Java ArrayList class in C, and uses the convention
+* of passing a pointer to the current "object" as the first
+* argument.
+*
+* Usage is defined as follows:
+*
+* Array_List obj;
+* AL_initialize(&obj, sizeof(type_name), initial_num_elem);
+*
+* ...
+*
+* type_name *ptr = (type_name*)(obj.buf);
+* for(i = 0; i < AL_size(&obj); i++)
+* do_something_to(ptr[i]);
+* type_name to_append = ...;
+* AL_append(&obj, &to_append);
+*
+* ...
+*
+* AL_destroy(&obj);
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef ARRAYLIST_H
+#define ARRAYLIST_H
+
+#include <inttypes.h>
+
+/**********************************************************************/
+/* Array_List - structure type specification. */
+/* */
+/* A growable array of fixed-size elements stored back to back in */
+/* one heap buffer. */
+/**********************************************************************/
+typedef struct
+{
+ void *buf; /* element storage, buffer_size * type_size bytes */
+ int32_t type_size; /* size in bytes of one element */
+ int32_t size; /* number of elements currently stored */
+ int32_t buffer_size; /* capacity of buf, in elements */
+} Array_List;
+
+/*--------------------------------------------------------------------*/
+/* Array_List Member Functions: */
+/* */
+/* AL_initialize() - Initialize a newly created Array_List object. */
+/* AL_append() - Append an element to the end of an Array_List. */
+/* AL_size() - Get number of elements in an Array_List. */
+/* AL_destroy() - Free memory associated with an Array_List that is */
+/* no longer in use. */
+/*--------------------------------------------------------------------*/
+void AL_initialize(Array_List* obj, int32_t type_size, int32_t num_elem);
+void AL_append(Array_List* obj, void* to_append);
+int32_t AL_size(Array_List* obj);
+void AL_destroy(Array_List* obj);
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/Queue.h b/src/core/dsp/ocl_load/DLOAD/Queue.h
new file mode 100644
index 0000000..3f85c16
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/Queue.h
@@ -0,0 +1,194 @@
+/*
+* Queue.h
+*
+* Interface to Linked List
+* ------------------------
+*
+* This is an implementation of a type-independent linked list class for C.
+* It's basically a template class, but uses macros instead so that it can
+* be compiled with a C-only compiler.
+*
+* To define a linked list class:
+* #include "Queue.h"
+* TYPE_QUEUE_DEFINITION(object_type,Class_Identifier)
+*
+* In a separate C file:
+* #include "Queue.h"
+* TYPE_QUEUE_DEFINITION(object_type,Class_Identifier)
+* TYPE_QUEUE_IMPLEMENTATION(object_type,Class_Identifier)
+*
+* Now, to create a list:
+* Class_Identifier_Queue name;
+* Get it initialized to zero everywhere somehow, maybe like this:
+* Class_Identifier_initialize_queue(&name);
+*
+* To add to the list:
+* Class_Identifier_enqueue(&name, object);
+*
+* To iterate over the list:
+* Class_Identifier_Queue_Node *it = name.front_ptr;
+* while(it) { do_something_to_(it->value); it = it->next_ptr; }
+*
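+* To delete from the list:
+* If it's the first node:
+* Class_Identifier_dequeue(&name);
+* If it's not, either call
+* Class_Identifier_remove(&name, object);
+* or unlink it by hand (the caller must then also free the node and
+* update name.back_ptr if the deleted node was the last one):
+* predecessor_node->next_ptr = deleted_node->next_ptr;
+* name.size--;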
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef QUEUE_H
+#define QUEUE_H
+
+#include <inttypes.h>
+#include "dload_api.h"
+
+/*****************************************************************************/
+/* TYPE_QUEUE_DEFINITION() - Define structure specifications for a linked */
+/* list of t_name objects. */
+/* */
+/* t - element type stored in each node */
+/* t_name - identifier prefix used for all generated names */
+/* */
+/* Expands to: */
+/* <t_name>_Queue_Node - list node: a value of type t plus next_ptr */
+/* <t_name>_Queue - front_ptr, back_ptr and element count (size) */
+/* extern prototypes for <t_name>_initialize_queue(), _enqueue(), */
+/* _dequeue() and _remove() */
+/*****************************************************************************/
+#define TYPE_QUEUE_DEFINITION(t, t_name) \
+struct t_name##_Queue_Node_ \
+{ \
+ t value; \
+ struct t_name##_Queue_Node_* next_ptr; \
+}; \
+typedef struct t_name##_Queue_Node_ t_name##_Queue_Node; \
+ \
+typedef struct \
+{ \
+ t_name##_Queue_Node* front_ptr; \
+ t_name##_Queue_Node* back_ptr; \
+ int32_t size; \
+} t_name##_Queue; \
+ \
+extern void t_name##_initialize_queue(t_name##_Queue* queue); \
+extern void t_name##_enqueue(t_name##_Queue* queue, t to_enqueue); \
+extern t t_name##_dequeue(t_name##_Queue* queue); \
+extern void t_name##_remove(t_name##_Queue* queue, t to_remove);
+
+/*****************************************************************************/
+/* TYPE_QUEUE_INITIALIZER - Static initializer for a <t_name>_Queue object. */
+/* The values follow the member order of the queue structure: front_ptr */
+/* (NULL), back_ptr (NULL), size (0). */
+/*****************************************************************************/
+#define TYPE_QUEUE_INITIALIZER {NULL, NULL, 0}
+
+
+/*****************************************************************************/
+/* TYPE_QUEUE_IMPLEMENTATION() - Define member functions of new linked list */
+/* "class" of t_name objects. */
+/* */
+/* <type>_initialize_queue() - clears the queue */
+/* <type>_enqueue() - adds a <t> type object to the end of the queue; if the */
+/*                    node cannot be allocated the queue is left unchanged */
+/*                    and the object is not added */
+/* <type>_dequeue() - remove a <t> type object from the front of the queue */
+/*                    and provide access to it to the caller; returns */
+/*                    (t) NULL when the queue is empty */
+/* <type>_remove() - find and remove the first node whose value compares */
+/*                   equal (==) to the given <t> type object */
+/*****************************************************************************/
+#define TYPE_QUEUE_IMPLEMENTATION(t, t_name) \
+void t_name##_initialize_queue (t_name##_Queue* queue) \
+{ \
+    queue->front_ptr = queue->back_ptr = NULL; \
+    queue->size = 0; \
+} \
+ \
+void t_name##_enqueue(t_name##_Queue* queue, t to_enqueue) \
+{ \
+    /* Build the node completely before touching the queue, so that a */ \
+    /* failed allocation can neither corrupt back_ptr nor bump size.  */ \
+    t_name##_Queue_Node* new_node = \
+        (t_name##_Queue_Node*)(DLIF_malloc(sizeof(t_name##_Queue_Node))); \
+ \
+    if (!new_node) return; \
+ \
+    new_node->value = to_enqueue; \
+    new_node->next_ptr = NULL; \
+ \
+    if (!queue->back_ptr) \
+        queue->front_ptr = new_node; \
+    else \
+        queue->back_ptr->next_ptr = new_node; \
+ \
+    queue->back_ptr = new_node; \
+    queue->size++; \
+} \
+ \
+t t_name##_dequeue(t_name##_Queue* queue) \
+{ \
+    t to_ret; \
+    t_name##_Queue_Node* next_ptr = NULL; \
+ \
+    if (!queue->size) return (t) NULL; \
+ \
+    /* Save the successor and the value before the node is freed. */ \
+    next_ptr = queue->front_ptr->next_ptr; \
+    queue->size--; \
+    to_ret = queue->front_ptr->value; \
+    DLIF_free((void*)(queue->front_ptr)); \
+ \
+    if(!queue->size) \
+        queue->front_ptr = queue->back_ptr = NULL; \
+    else \
+        queue->front_ptr = next_ptr; \
+ \
+    return to_ret; \
+} \
+ \
+void t_name##_remove(t_name##_Queue* queue, t to_remove) \
+{ \
+    t_name##_Queue_Node* prev_ptr = NULL; \
+    t_name##_Queue_Node* curr_ptr = queue->front_ptr; \
+    t_name##_Queue_Node* next_ptr = NULL; \
+ \
+    /* Find the first matching node, tracking its neighbours. */ \
+    for (; curr_ptr; curr_ptr = next_ptr) \
+    { \
+        next_ptr = curr_ptr->next_ptr; \
+        if (curr_ptr->value == to_remove) break; \
+        prev_ptr = curr_ptr; \
+    } \
+ \
+    /* Unlink and free the match, if there was one. */ \
+    if (curr_ptr) \
+    { \
+        if (prev_ptr) prev_ptr->next_ptr = next_ptr; \
+        queue->size--; \
+        DLIF_free((void*)(curr_ptr)); \
+    } \
+ \
+    /* Repair the end pointers: no predecessor means the front moved, */ \
+    /* no successor means the back moved.                             */ \
+    if (!queue->size) \
+        queue->front_ptr = queue->back_ptr = NULL; \
+    else \
+    { \
+        if (!prev_ptr) queue->front_ptr = next_ptr; \
+        if (!next_ptr) queue->back_ptr = prev_ptr; \
+    } \
+}
+
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/Stack.h b/src/core/dsp/ocl_load/DLOAD/Stack.h
new file mode 100644
index 0000000..d36f5e0
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/Stack.h
@@ -0,0 +1,155 @@
+/*
+* Stack.h
+*
+* Interface to Stack
+* ------------------
+*
+* This is an implementation of a type-independent stack implemented as
+* a singly linked list class for C. It's basically a template class, but
+* uses macros instead, so that it can be compiled with a C-only compiler.
+*
+* To define a Stack class:
+* #include "Stack.h"
+* TYPE_STACK_DEFINITION(object_type,Class_Identifier)
+*
+* In a separate C file:
+* #include "Stack.h"
+* TYPE_STACK_DEFINITION(object_type,Class_Identifier)
+* TYPE_STACK_IMPLEMENTATION(object_type,Class_Identifier)
+*
+* Now, to create a stack:
+* Class_Identifier_Stack name;
+* Get it initialized to zero everywhere somehow, maybe like this:
+* Class_Identifier_initialize_stack(&name);
+*
+* To add to the stack:
+* Class_Identifier_push(&name, object);
+*
+* To access the top of the stack:
+* Class_Identifier_Stack_Node *tos = name.top_ptr;
+* do_something_to_(tos->value);
+*
+* To delete from the stack:
+* if (name.size > 0) Class_Identifier_pop(&name);
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef STACK_H
+#define STACK_H
+
+#include <inttypes.h>
+#include "dload_api.h"
+
+/*****************************************************************************/
+/* TYPE_STACK_DEFINITION() - Define structure specifications for a last-in, */
+/* first-out linked list of t_name objects. */
+/* */
+/* Expands to: */
+/* - struct t_name_Stack_Node_ / t_name_Stack_Node: one value of type t */
+/* plus a pointer to the next node. */
+/* - t_name_Stack: top and bottom node pointers plus an element count. */
+/* - extern prototypes for t_name_initialize_stack(), t_name_push() and */
+/* t_name_pop(), whose bodies come from TYPE_STACK_IMPLEMENTATION(). */
+/*****************************************************************************/
+#define TYPE_STACK_DEFINITION(t, t_name) \
+struct t_name##_Stack_Node_ \
+{ \
+ t value; /* element stored in this node */ \
+ struct t_name##_Stack_Node_* next_ptr; /* node pushed just before this one */ \
+}; \
+typedef struct t_name##_Stack_Node_ t_name##_Stack_Node; \
+ \
+typedef struct \
+{ \
+ t_name##_Stack_Node* top_ptr; /* most recently pushed node */ \
+ t_name##_Stack_Node* bottom_ptr; /* first node pushed */ \
+ int size; /* number of nodes on the stack */ \
+} t_name##_Stack; \
+ \
+extern void t_name##_initialize_stack(t_name##_Stack* stack); \
+extern void t_name##_push(t_name##_Stack* stack, t to_push); \
+extern t t_name##_pop(t_name##_Stack* stack);
+
+/*****************************************************************************/
+/* TYPE_STACK_INITIALIZER - Static initializer for an empty Stack object: */
+/* NULL top pointer, NULL bottom pointer and a size of zero, in the member */
+/* order declared by TYPE_STACK_DEFINITION(). */
+/*****************************************************************************/
+#define TYPE_STACK_INITIALIZER {NULL, NULL, 0 }
+
+/*****************************************************************************/
+/* TYPE_STACK_IMPLEMENTATION() - Define member functions of new LIFO linked */
+/* list "class" of t_name objects. */
+/* */
+/* <type>_initialize_stack() - clears the stack */
+/* <type>_push() - pushes a <t> type object to the top of the stack */
+/* <type>_pop() - pop a <t> type object from the top of the stack */
+/* and provide access to it to the caller. Popping an */
+/* empty stack leaves it untouched and returns (t) NULL, */
+/* the same convention <type>_dequeue() uses in Queue.h. */
+/*****************************************************************************/
+#define TYPE_STACK_IMPLEMENTATION(t, t_name) \
+void t_name##_initialize_stack (t_name##_Stack* stack) \
+{ \
+   stack->top_ptr = stack->bottom_ptr = NULL; \
+   stack->size = 0; \
+} \
+void t_name##_push(t_name##_Stack* stack, t to_push) \
+{ \
+   stack->size++; \
+ \
+   if(!stack->top_ptr) \
+   { \
+      /* First element: it is both the top and the bottom of the stack. */ \
+      stack->bottom_ptr = stack->top_ptr = \
+         (t_name##_Stack_Node*)(DLIF_malloc(sizeof(t_name##_Stack_Node))); \
+      stack->top_ptr->next_ptr = NULL; \
+   } \
+   else \
+   { \
+      /* Link the new node in front of the previous top. */ \
+      t_name##_Stack_Node* next_ptr = stack->top_ptr; \
+      stack->top_ptr = \
+         (t_name##_Stack_Node*)(DLIF_malloc(sizeof(t_name##_Stack_Node))); \
+      stack->top_ptr->next_ptr = next_ptr; \
+   } \
+ \
+   stack->top_ptr->value = to_push; \
+} \
+ \
+t t_name##_pop(t_name##_Stack* stack) \
+{ \
+   t to_ret; \
+   t_name##_Stack_Node* next_ptr = NULL; \
+ \
+   /* An empty stack has a NULL top_ptr; do not dereference it. */ \
+   if (!stack->size) return (t) NULL; \
+ \
+   next_ptr = stack->top_ptr->next_ptr; \
+   stack->size--; \
+   to_ret = stack->top_ptr->value; \
+   DLIF_free((void*)(stack->top_ptr)); \
+ \
+   if(!stack->size) \
+      stack->top_ptr = stack->bottom_ptr = NULL; \
+   else \
+      stack->top_ptr = next_ptr; \
+ \
+   return to_ret; \
+}
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/dload.c b/src/core/dsp/ocl_load/DLOAD/dload.c
new file mode 100644
index 0000000..e5924d8
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/dload.c
@@ -0,0 +1,3534 @@
+/*
+* dload.c
+*
+* Core Dynamic Loader Reference Implementation
+*
+* This implementation of the core dynamic loader is platform independent,
+* but it is object file format dependent. In particular, this
+* implementation supports ELF object file format.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#include <limits.h>
+#include <inttypes.h>
+#include <string.h>
+#include <time.h>
+
+#include "ArrayList.h"
+#include "Queue.h"
+#include "Stack.h"
+
+#include "symtab.h"
+#include "dload_endian.h"
+#include "elf32.h"
+#include "dload.h"
+#include "relocate.h"
+#include "dload_api.h"
+
+#ifdef ARM_TARGET
+#include "arm_dynamic.h"
+#endif
+
+#ifdef C60_TARGET
+#include "c60_dynamic.h"
+#endif
+
+#include "virtual_targets.h"
+
+/*---------------------------------------------------------------------------*/
+/* These globals are used only to test the reference client implementation. */
+/*---------------------------------------------------------------------------*/
+int global_argc;
+char **global_argv;
+
+/*---------------------------------------------------------------------------*/
+/* Contains filenames (type const char*) the system is in the process of */
+/* loading. Used to detect cycles in incorrectly compiled ELF binaries. */
+/*---------------------------------------------------------------------------*/
+Array_List DLIMP_module_dependency_list;
+
+/*---------------------------------------------------------------------------*/
+/* Contains objects (type DLIMP_Loaded_Module) that the system has loaded */
+/* into target memory. */
+/*---------------------------------------------------------------------------*/
+TYPE_QUEUE_IMPLEMENTATION(DLIMP_Loaded_Module*, loaded_module_ptr)
+loaded_module_ptr_Queue DLIMP_loaded_objects = TYPE_QUEUE_INITIALIZER;
+
+/*---------------------------------------------------------------------------*/
+/* Dependency Graph Stack - LIFO stack of dynamic modules that are loaded */
+/* when client asks to load a dynamic executable or library. Note that */
+/* dependents that have already been loaded with another module will not */
+/* appear on this stack. */
+/*---------------------------------------------------------------------------*/
+TYPE_STACK_IMPLEMENTATION(DLIMP_Dynamic_Module*, dynamic_module_ptr)
+dynamic_module_ptr_Stack DLIMP_dependency_stack = TYPE_STACK_INITIALIZER;
+
+/*---------------------------------------------------------------------------*/
+/* Current virtual target set after reading the file headers. This is used */
+/* to access target specific functions. */
+/*---------------------------------------------------------------------------*/
+VIRTUAL_TARGET *cur_target = NULL;
+
+/*---------------------------------------------------------------------------*/
+/* Support for profiling performance of dynamic loader core. */
+/* */
+/* The helpers are used inside "#if LOADER_DEBUG || LOADER_PROFILE" regions */
+/* of this file, so they must be defined under the same condition; guarding */
+/* them with LOADER_DEBUG alone leaves them undefined in a profile-only */
+/* build. */
+/*---------------------------------------------------------------------------*/
+#if LOADER_DEBUG || LOADER_PROFILE
+static clock_t cycle0 = 0;
+static clock_t cycle_end = 0;
+#define profile_start_clock() (cycle0 = clock())
+#define profile_stop_clock() (cycle_end = clock())
+#define profile_cycle_count() (cycle_end - cycle0)
+#endif
+
+/*---------------------------------------------------------------------------*/
+/* The dynamic loader will now create a table TI_init_table to store */
+/* pre-init and init data. This is done because pre-init and */
+/* init functions could reference as-yet unrelocated symbols from other */
+/* modules. As such it is safer to store relevant function addresses and */
+/* execute them only after all modules are relocated. */
+/*---------------------------------------------------------------------------*/
+TYPE_QUEUE_IMPLEMENTATION(IF_single_record*, IF_table)
+IF_table_Queue TI_init_table = TYPE_QUEUE_INITIALIZER;
+
+/*---------------------------------------------------------------------------*/
+/* Prototypes for file-local helpers that are referenced before they are */
+/* defined. */
+/*---------------------------------------------------------------------------*/
+static VIRTUAL_TARGET *get_vt_obj(int given_id);
+static void read_args_from_section(DLIMP_Loaded_Module* ep_module);
+static BOOL seg_has_space_for_write(DLIMP_Loaded_Module* lmodule, int sz);
+static BOOL write_arguments_to_args_section(DLOAD_HANDLE handle,
+                                            int argc, char** argv,
+                                            DLIMP_Loaded_Module *ep_module);
+
+/*****************************************************************************/
+/* DLOAD_create() */
+/* */
+/* Create an instance of the dynamic loader core. */
+/* */
+/* client_handle: Private client token to be returned during select DLIF */
+/* function calls. */
+/* */
+/* returns: an opaque DLOAD core loader handle, identifying this instance, */
+/* or a NULL handle if the loader object could not be allocated. */
+/* */
+/*****************************************************************************/
+DLOAD_HANDLE DLOAD_create(void *client_handle)
+{
+   LOADER_OBJECT *pLoaderObject = DLIF_malloc(sizeof(LOADER_OBJECT));
+
+   /*-----------------------------------------------------------------------*/
+   /* Allocation failed: there is no object to fill out. */
+   /*-----------------------------------------------------------------------*/
+   if (pLoaderObject == NULL) return((DLOAD_HANDLE)NULL);
+
+   /*-----------------------------------------------------------------------*/
+   /* Fill out the Loader Object: */
+   /*-----------------------------------------------------------------------*/
+   /* Set up initial objects_loading queue. */
+   /*-----------------------------------------------------------------------*/
+   AL_initialize(&(pLoaderObject->DLIMP_module_dependency_list),
+                 sizeof (const char*), 1);
+
+   /*-----------------------------------------------------------------------*/
+   /* Initialize Loaded Module Ptr Queue */
+   /*-----------------------------------------------------------------------*/
+   loaded_module_ptr_initialize_queue(&pLoaderObject->DLIMP_loaded_objects);
+
+   /*-----------------------------------------------------------------------*/
+   /* Initialize Dynamic Module Ptr Stack */
+   /*-----------------------------------------------------------------------*/
+   dynamic_module_ptr_initialize_stack(&pLoaderObject->DLIMP_dependency_stack);
+
+   /*-----------------------------------------------------------------------*/
+   /* File handles are handed out starting at 1; dependency loading in this */
+   /* file treats a handle of 0 as a failed load. */
+   /*-----------------------------------------------------------------------*/
+   pLoaderObject->file_handle = 1;
+
+   /*-----------------------------------------------------------------------*/
+   /* Store client token, so it can be handed back during DLIF calls */
+   /*-----------------------------------------------------------------------*/
+   pLoaderObject->client_handle = client_handle;
+
+   return((DLOAD_HANDLE)pLoaderObject);
+}
+
+/*****************************************************************************/
+/* DLOAD_destroy() */
+/* */
+/* Remove an instance of the dynamic loader core: destroy its module */
+/* dependency list and free the loader object itself. */
+/* */
+/* handle: opaque loader handle identifying the instance to remove. A NULL */
+/* handle is ignored. */
+/* Preconditions: 1) handle must be valid. */
+/* 2) Loader instance must be in "UNLOADED" state. */
+/* */
+/*****************************************************************************/
+void DLOAD_destroy(DLOAD_HANDLE handle)
+{
+   LOADER_OBJECT *pLoaderObject = (LOADER_OBJECT *)handle;
+
+   /*-----------------------------------------------*/
+   /* Nothing to release for a NULL handle. */
+   /*-----------------------------------------------*/
+   if (pLoaderObject == NULL) return;
+
+   AL_destroy(&(pLoaderObject->DLIMP_module_dependency_list));
+
+   /*--------------------------*/
+   /* Free the instance object */
+   /*--------------------------*/
+   DLIF_free (pLoaderObject);
+}
+
+/*****************************************************************************/
+/* DLIMP_get_first_dyntag() */
+/* */
+/* Scan the given dynamic table, which must end with a DT_NULL entry, and */
+/* return the d_un.d_val of the first entry whose d_tag equals the given */
+/* tag. If no entry matches, INT_MAX is returned as a bogus value. */
+/* */
+/*****************************************************************************/
+uint32_t DLIMP_get_first_dyntag(int tag, struct Elf32_Dyn* dyn_table)
+{
+   struct Elf32_Dyn *entry;
+
+   /*------------------------------------------------------------------------*/
+   /* Visit entries in table order until the DT_NULL terminator is reached. */
+   /*------------------------------------------------------------------------*/
+   for (entry = dyn_table; entry->d_tag != DT_NULL; entry++)
+   {
+      if (entry->d_tag == tag)
+         return entry->d_un.d_val;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* No entry carried the requested tag. */
+   /*------------------------------------------------------------------------*/
+   return INT_MAX;
+}
+
+/*****************************************************************************/
+/* dload_and_allocate_dependencies() */
+/* */
+/* Walk the module's dynamic table and, for every DT_NEEDED entry, make */
+/* sure the named file is loaded. A file already present in the loader's */
+/* loaded-objects queue only gets its use count bumped; otherwise the */
+/* client is asked to load it. Either way the dependent's file handle is */
+/* appended to the module's dependency list, in table order. */
+/* */
+/* Returns FALSE as soon as the client reports a handle of 0 for a */
+/* dependent, TRUE once the DT_NULL terminator is reached. */
+/* */
+/*****************************************************************************/
+static BOOL dload_and_allocate_dependencies( DLOAD_HANDLE handle,
+                                             DLIMP_Dynamic_Module *dyn_module)
+{
+   struct Elf32_Dyn* dyn_nugget;
+   LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+
+#if LOADER_DEBUG
+   if (debugging_on)
+      DLIF_trace("Starting dload_and_allocate_dependencies() for %s ...\n",
+                 dyn_module->name);
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* Visit each dynamic tag entry up to the DT_NULL terminator. */
+   /*------------------------------------------------------------------------*/
+   for (dyn_nugget = dyn_module->dyntab;
+        dyn_nugget->d_tag != DT_NULL;
+        dyn_nugget++)
+   {
+      loaded_module_ptr_Queue_Node* ptr;
+
+      /*---------------------------------------------------------------------*/
+      /* Only DT_NEEDED entries name a dependent file. */
+      /*---------------------------------------------------------------------*/
+      if (dyn_nugget->d_tag != DT_NEEDED) continue;
+
+#if LOADER_DEBUG
+      if (debugging_on)
+         DLIF_trace("Found DT_NEEDED: %s\n",
+                    dyn_module->strtab+dyn_nugget->d_un.d_val);
+#endif
+
+      /*---------------------------------------------------------------------*/
+      /* Search the loaded objects for a module with the needed name. The */
+      /* walk stops on the first match, or with ptr == NULL if none exists. */
+      /*---------------------------------------------------------------------*/
+      ptr = pHandle->DLIMP_loaded_objects.front_ptr;
+      while (ptr != NULL &&
+             strcmp(ptr->value->name,
+                    dyn_module->strtab + dyn_nugget->d_un.d_val) != 0)
+         ptr = ptr->next_ptr;
+
+      if (ptr != NULL)
+      {
+         /*------------------------------------------------------------------*/
+         /* Already loaded: bump the use count and record its handle. */
+         /*------------------------------------------------------------------*/
+         ptr->value->use_count++;
+         AL_append(&(dyn_module->loaded_module->dependencies),
+                   &(ptr->value->file_handle));
+      }
+      else
+      {
+         /*------------------------------------------------------------------*/
+         /* Not loaded yet: have the client load it on our behalf. The */
+         /* handle is recorded before it is checked for failure. */
+         /*------------------------------------------------------------------*/
+         int32_t dependent_handle = DLIF_load_dependent(
+                                                     pHandle->client_handle,
+                                                     dyn_module->strtab +
+                                                     dyn_nugget->d_un.d_val);
+         AL_append(&(dyn_module->loaded_module->dependencies),
+                   &dependent_handle);
+         if (dependent_handle == 0) return FALSE;
+      }
+   }
+
+#if LOADER_DEBUG
+   if (debugging_on)
+      DLIF_trace("Finished dload_and_allocate_dependencies() for %s\n",
+                 dyn_module->name);
+#endif
+
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* load_object() */
+/* */
+/* Finish the process of loading an object file. Neither argument is used */
+/* and the function always returns 1. */
+/* */
+/*****************************************************************************/
+static int load_object(LOADER_FILE_DESC *fd, DLIMP_Dynamic_Module *dyn_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* The loader already runs on the target and relocates directly into */
+   /* target memory, so no further work is needed here (at least in the */
+   /* bare-metal dynamic linking ABI model). */
+   /*------------------------------------------------------------------------*/
+   (void)fd;
+   (void)dyn_module;
+
+   return 1;
+}
+
+/*****************************************************************************/
+/* write_arguments_to_args_section() */
+/* */
+/* Write argc and argv to the .args section of the given module. */
+/* */
+/* handle: loader instance; its client_handle is passed to DLIF_memcpy(). */
+/* argc/argv: host-side argument count and strings to be written. */
+/* ep_module: loaded module whose c_args member gives the .args address. */
+/* */
+/* Returns TRUE when argc is 0 (nothing to do) or when everything was */
+/* written. Returns FALSE, after a DLIF_warning(), when c_args is invalid */
+/* or misaligned, when the segment lacks space, or when the temporary */
+/* pointer table cannot be allocated. */
+/* */
+/* The temporary pointer table is allocated before anything is written to */
+/* the target and is freed before returning. */
+/* */
+/*****************************************************************************/
+static BOOL write_arguments_to_args_section(DLOAD_HANDLE handle,
+                                            int argc, char** argv,
+                                            DLIMP_Loaded_Module *ep_module)
+{
+   int mem_inc = MEM_INC;
+   int ptr_sz = PTR_SZ;
+   int p_size = ptr_sz / mem_inc;
+   int i_size = T_INTSZ / mem_inc;
+   int c_size = T_CHARSZ /mem_inc;
+   int argv_offset = 0;
+   int str_offset = 0;
+   int size = 0;
+   int arg;
+   int *targ_argv_pointers = NULL;
+   LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+
+   uint8_t *c_args = NULL;
+
+#if LOADER_DEBUG
+   if (debugging_on)
+      DLIF_trace("Write_arguments_to_args_section:\n");
+#endif
+
+   /*-----------------------------------------------------------------------*/
+   /* IF NO ARGUMENTS, ABORT QUIETLY, WITH a SUCCESSFUL CODE. */
+   /*-----------------------------------------------------------------------*/
+   if (argc == 0) return TRUE;
+
+   /*-----------------------------------------------------------------------*/
+   /* __c_args__ points to the beginning of the .args section, if there */
+   /* is one. This is stored in the Loaded Module, and must have a */
+   /* legitimate address. If not, abort with Warning. */
+   /*-----------------------------------------------------------------------*/
+   c_args = ep_module->c_args;
+   if (!c_args || c_args == (uint8_t *)0xFFFFFFFF)
+   {
+      DLIF_warning(DLWT_MISC, "__c_args__ does not have valid value.\n");
+      return FALSE;
+   }
+
+   /*-----------------------------------------------------------------------*/
+   /* WE OUGHT TO WORRY ABOUT ALIGNMENT: IF SECTION ISN'T PROPERLY ALIGNED, */
+   /* ABORT THE PROCESSING OF ARGUMENTS WITH A NICE ERROR MESSAGE. */
+   /* (c_args IS KNOWN TO BE NON-NULL HERE.) */
+   /*-----------------------------------------------------------------------*/
+   if ((Elf32_Addr)c_args & (MAX(p_size, i_size) - 1))
+   {
+      DLIF_warning(DLWT_MISC, ".args section not properly aligned\n");
+      return FALSE;
+   }
+
+   /*-----------------------------------------------------------------------*/
+   /* CALCULATE OFFSET IN TABLE WHERE ARGV AND THE STRINGS WILL BE STORED. */
+   /* NOTE THAT argv MAY NEED MORE ALIGNMENT THAN AN INTEGER, SO ITS OFFSET */
+   /* IS REALLY THE MAXIMUM OF A POINTER SIZE AND INTEGER SIZE. ALSO NOTE */
+   /* WE NEED TO ALLOCATE AN EXTRA POINTER FOR argv[argc]. */
+   /*-----------------------------------------------------------------------*/
+   argv_offset = MAX(p_size, i_size);
+   str_offset = argv_offset + (argc * p_size) + p_size ;
+
+   /*-----------------------------------------------------------------------*/
+   /* CALCULATE SPACE REQUIRED FOR WRITING OUT .args SECTION. CHECK IF THE */
+   /* SEGMENT HAS ENOUGH SPACE AVAILABLE. IF NOT, RETURN WITH ERROR CODE. */
+   /*-----------------------------------------------------------------------*/
+   size = str_offset;
+
+   for (arg = 0; arg < argc; arg++)
+      size += (c_size * (strlen(argv[arg]) + 1));
+
+   if (!seg_has_space_for_write(ep_module, size))
+   {
+      DLIF_warning(DLWT_MISC,
+                   "Segment has insufficient space for .args contents\n");
+      return FALSE;
+   }
+
+   /*-----------------------------------------------------------------------*/
+   /* ALLOCATE THE INTERNAL ARRAY OF ARGV POINTER VALUES BEFORE TOUCHING */
+   /* TARGET MEMORY, SO A FAILED ALLOCATION LEAVES .args UNMODIFIED. */
+   /*-----------------------------------------------------------------------*/
+   targ_argv_pointers = (int *)DLIF_malloc((argc + 1) * sizeof(int));
+   if (targ_argv_pointers == NULL)
+   {
+      DLIF_warning(DLWT_MISC,
+                   "Unable to allocate memory for .args pointer table\n");
+      return FALSE;
+   }
+
+   /*-----------------------------------------------------------------------*/
+   /* OVERALL, WE NEED TO CREATE A TARGET IMAGE THAT CORRESPONDS TO: */
+   /* int argc; */
+   /* char *argv[argc]; */
+   /* <strings pointed to by argv> */
+   /* So say, for C6x, for "-v -d", we would need 22 bytes: */
+   /* 4 bytes // argc */
+   /* 4 bytes // argv[0] pointer value */
+   /* 4 bytes // argv[1] pointer value */
+   /* 4 bytes // argv[argc] end of pointer value array, normally 0 */
+   /* 3 bytes // "-v" */
+   /* 3 bytes // "-d" */
+   /*-----------------------------------------------------------------------*/
+
+   /*-----------------------------------------------------------------------*/
+   /* FIRST WRITE OUT ARGC. */
+   /*-----------------------------------------------------------------------*/
+#if LOADER_DEBUG
+   if (debugging_on)
+      DLIF_trace ("-- Copy %d bytes from 0x%x to 0x%x\n",
+                  i_size, (uint32_t) &argc, (uint32_t) c_args);
+#endif
+
+   DLIF_memcpy(pHandle->client_handle, c_args, &argc, i_size);
+
+   /*-----------------------------------------------------------------------*/
+   /* FILL THE INTERNAL ARRAY OF ARGV POINTER VALUES: EACH ENTRY IS THE */
+   /* ADDRESS WITHIN .args WHERE THE CORRESPONDING STRING WILL BE PLACED. */
+   /*-----------------------------------------------------------------------*/
+   for (arg = 0; arg < argc ; arg++)
+   {
+      targ_argv_pointers[arg] = (int)(str_offset + c_args);
+      str_offset += (strlen(argv[arg]) + 1) * c_size;
+
+#if LOADER_DEBUG
+      if (debugging_on)
+         DLIF_trace ("\t\ttarg_argv_pointers[%d] : 0x%x\n",
+                     arg, targ_argv_pointers[arg]);
+#endif
+   }
+
+   targ_argv_pointers[argc] = 0;
+
+   /*-----------------------------------------------------------------------*/
+   /* WRITE OUT THIS INTERNAL ARRAY OF ARGV POINTER VALUES */
+   /*-----------------------------------------------------------------------*/
+   for (arg = 0; arg <= argc; arg++)
+   {
+#if LOADER_DEBUG
+      if (debugging_on)
+         DLIF_trace ("-- Copy %d bytes from 0x%x to 0x%x\n",
+                     p_size, (uint32_t) &targ_argv_pointers[arg],
+                     (uint32_t) (c_args + argv_offset));
+#endif
+      DLIF_memcpy(pHandle->client_handle,
+                  (void *)(c_args + argv_offset),
+                  &targ_argv_pointers[arg],
+                  p_size);
+      argv_offset += p_size;
+   }
+
+#if LOADER_DEBUG
+   if (debugging_on)
+   {
+      DLIF_trace ("\t\targv being copied : 0x%x\n",(uint32_t)argv);
+      for (arg = 0; arg < argc; arg++)
+      {
+         DLIF_trace ("\t\t---\n\t\t&argv[%d] being copied : 0x%x\n", arg,
+                     (uint32_t)&argv[arg]);
+         DLIF_trace ("\t\targv[%d] being copied : 0x%x\n",arg,
+                     (uint32_t)argv[arg]);
+         DLIF_trace ("\t\targv[%d] being copied : %s\n",arg, (char *)argv[arg]);
+      }
+   }
+#endif
+
+   /*-----------------------------------------------------------------------*/
+   /* LASTLY WRITE OUT ALL THE STRINGS. */
+   /*-----------------------------------------------------------------------*/
+   for (arg = 0; arg < argc; arg++)
+   {
+#if LOADER_DEBUG
+      if (debugging_on)
+         DLIF_trace ("-- Copy %d bytes from 0x%x to 0x%x\n",
+                     (uint32_t)strlen(argv[arg]) + 1,
+                     (uint32_t)&argv[arg],
+                     (uint32_t)(targ_argv_pointers[arg]));
+#endif
+      DLIF_memcpy(pHandle->client_handle,
+                  (void *)(targ_argv_pointers[arg]),
+                  argv[arg],
+                  strlen(argv[arg]) + 1);
+   }
+
+   /*-----------------------------------------------------------------------*/
+   /* THE POINTER TABLE WAS ONLY A HOST-SIDE STAGING BUFFER; RELEASE IT. */
+   /*-----------------------------------------------------------------------*/
+   DLIF_free(targ_argv_pointers);
+
+   return TRUE;
+}
+
+
+/*****************************************************************************/
+/* initialize_loaded_module() */
+/* */
+/* Allocate the DLIMP_Loaded_Module object for a dynamic module, attach it */
+/* to dyn_module->loaded_module, and fill it in: */
+/* - name (a copy of the dynamic module's name, or "<unknown>"), */
+/* - file handle taken from the loader's running counter, */
+/* - use count of 1 and the direct_dependent_only flag, */
+/* - empty global symbol info and an empty dependency list, */
+/* - one DLIMP_Loaded_Segment per PT_LOAD program header, */
+/* - cleared DSO termination (fini) information. */
+/* */
+/*---------------------------------------------------------------------------*/
+/* */
+/* In applications that use the DSBT model, this function will also need to */
+/* negotiate the module's DSBT index with the client. */
+/* */
+/*****************************************************************************/
+static void initialize_loaded_module(DLOAD_HANDLE handle,
+                                     DLIMP_Dynamic_Module *dyn_module)
+{
+   int i;
+   LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+   DLIMP_Loaded_Module *loaded_module;
+
+   /*------------------------------------------------------------------------*/
+   /* Allocate the loaded module object and hang it off the dynamic module. */
+   /*------------------------------------------------------------------------*/
+   dyn_module->loaded_module = DLIF_malloc(sizeof(DLIMP_Loaded_Module));
+   loaded_module = dyn_module->loaded_module;
+
+#if LOADER_DEBUG || LOADER_PROFILE
+   /*------------------------------------------------------------------------*/
+   /* Start clock on initialization of loaded module object. */
+   /*------------------------------------------------------------------------*/
+   if (debugging_on || profiling_on)
+   {
+      DLIF_trace("Starting initialize_loaded_module() ...\n");
+      if (profiling_on) profile_start_clock();
+   }
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* Give the loaded module its own copy of the name when one is known. */
+   /*------------------------------------------------------------------------*/
+   if (!dyn_module->name)
+      loaded_module->name = "<unknown>";
+   else
+   {
+      size_t name_bytes = strlen(dyn_module->name) + 1;
+      loaded_module->name = DLIF_malloc(name_bytes);
+      memcpy(loaded_module->name, dyn_module->name, name_bytes);
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Take the next file handle and bump the loader's counter. */
+   /*------------------------------------------------------------------------*/
+   loaded_module->use_count = 1;
+   loaded_module->direct_dependent_only = dyn_module->direct_dependent_only;
+   loaded_module->file_handle = pHandle->file_handle++;
+
+   /*------------------------------------------------------------------------*/
+   /* In case we wrapped around the file handle, report an error. */
+   /*------------------------------------------------------------------------*/
+   if (pHandle->file_handle == 0)
+      DLIF_error(DLET_MISC, "DLOAD File handle overflowed.\n");
+
+   /*------------------------------------------------------------------------*/
+   /* The loaded module starts without global symbol information; it is */
+   /* copied from the dynamic module by DLSYM_copy_globals() below when a */
+   /* symbol table is present. */
+   /*------------------------------------------------------------------------*/
+   loaded_module->gstrsz = 0;
+   loaded_module->gsymnum = 0;
+   loaded_module->gstrtab = NULL;
+   loaded_module->gsymtab = NULL;
+
+   /*------------------------------------------------------------------------*/
+   /* Start with an empty Array_List of dependencies. */
+   /*------------------------------------------------------------------------*/
+   AL_initialize(&(loaded_module->dependencies), sizeof(int), 1);
+
+   if (dyn_module->symtab)
+      DLSYM_copy_globals(dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Create the loaded segments Array_List, sized from the program header */
+   /* count. */
+   /*------------------------------------------------------------------------*/
+   AL_initialize(&(loaded_module->loaded_segments),
+                 sizeof(DLIMP_Loaded_Segment), dyn_module->phnum);
+
+   /*------------------------------------------------------------------------*/
+   /* Build one loaded segment per PT_LOAD program header. These records */
+   /* are parallel to, and do not supplant, the ELF phdr tables. */
+   /*------------------------------------------------------------------------*/
+   for (i = 0; i < dyn_module->phnum; i++)
+   {
+      DLIMP_Loaded_Segment seg;
+
+      if (dyn_module->phdr[i].p_type != PT_LOAD) continue;
+
+      seg.modified = 0;
+      seg.obj_desc = DLIF_malloc(sizeof(struct DLOAD_MEMORY_SEGMENT));
+      seg.obj_desc->target_page = 0; /*not used*/
+
+      /*---------------------------------------------------------------------*/
+      /* File and memory sizes go both to the object descriptor and to the */
+      /* segment's own copy of the program header. */
+      /*---------------------------------------------------------------------*/
+      seg.phdr.p_filesz = seg.obj_desc->objsz_in_bytes
+                        = dyn_module->phdr[i].p_filesz;
+      seg.phdr.p_memsz = seg.obj_desc->memsz_in_bytes
+                       = dyn_module->phdr[i].p_memsz;
+
+      seg.phdr.p_vaddr = dyn_module->phdr[i].p_vaddr;
+      seg.phdr.p_offset = dyn_module->phdr[i].p_offset;
+      seg.phdr.p_align = dyn_module->phdr[i].p_align;
+      seg.phdr.p_flags = dyn_module->phdr[i].p_flags;
+
+      AL_append(&(loaded_module->loaded_segments), &seg);
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Clear the DSO termination information for this module. */
+   /* It will be copied over from the enclosing dyn_module object when */
+   /* placement is completed and dyn_module's local copy of the dynamic */
+   /* table is updated. */
+   /*------------------------------------------------------------------------*/
+   loaded_module->fini = (Elf32_Addr) NULL;
+   loaded_module->fini_arraysz = 0;
+   loaded_module->fini_array = (Elf32_Addr) NULL;
+
+#if LOADER_DEBUG || LOADER_PROFILE
+   if (debugging_on || profiling_on)
+   {
+      DLIF_trace("Finished initialize_loaded_module()\n");
+      if (profiling_on)
+      {
+         profile_stop_clock();
+         DLIF_trace("Took %lu cycles.\n",
+                    (unsigned long)profile_cycle_count());
+      }
+   }
+#endif
+
+}
+
+/*****************************************************************************/
+/* load_static_segment() */
+/* */
+/* The core dynamic loader requires that a statically linked executable */
+/* be placed in target memory at the location that was determined during */
+/* the static link that created the executable. Failure to get the */
+/* required target memory where the static executable is to be loaded */
+/* will cause the dynamic loader to emit an error and abort the load. */
+/* */
+/*****************************************************************************/
+static BOOL load_static_segment(DLOAD_HANDLE handle, LOADER_FILE_DESC *fd,
+                                DLIMP_Dynamic_Module *dyn_module)
+{
+   LOADER_OBJECT        *loader   = (LOADER_OBJECT *)handle;
+   DLIMP_Loaded_Segment *segments = (DLIMP_Loaded_Segment *)
+                          (dyn_module->loaded_module->loaded_segments.buf);
+   int idx;
+
+   /*------------------------------------------------------------------------*/
+   /* Visit each segment of the loaded module in turn: describe it in a      */
+   /* memory request, have the client allocate target memory for it, and     */
+   /* then have the client copy/write any file-backed contents.              */
+   /*------------------------------------------------------------------------*/
+   for (idx = 0; idx < dyn_module->loaded_module->loaded_segments.size; idx++)
+   {
+      DLIMP_Loaded_Segment *cur = &segments[idx];
+      struct DLOAD_MEMORY_REQUEST request;
+
+      /*---------------------------------------------------------------------*/
+      /* A static executable must land at its link-time address, so the      */
+      /* requested target address is simply the segment's p_vaddr.           */
+      /*---------------------------------------------------------------------*/
+      cur->obj_desc->target_page    = 0;
+      cur->obj_desc->target_address = (TARGET_ADDRESS)cur->phdr.p_vaddr;
+
+      /*---------------------------------------------------------------------*/
+      /* Translate the ELF permission bits into loader segment flags.  The   */
+      /* relocatable flag is explicitly cleared for static segments.         */
+      /*---------------------------------------------------------------------*/
+      request.flags = 0;
+      if (cur->phdr.p_flags & PF_X) request.flags |= DLOAD_SF_executable;
+      if (cur->phdr.p_flags & PF_W) request.flags |= DLOAD_SF_writable;
+      request.flags &= ~DLOAD_SF_relocatable;
+
+      request.align       = cur->phdr.p_align;
+      request.fp          = fd;
+      request.segment     = cur->obj_desc;
+      request.offset      = cur->phdr.p_offset;
+      request.flip_endian = dyn_module->wrong_endian;
+
+      /*---------------------------------------------------------------------*/
+      /* Hand the request to the client; a refusal aborts the load.          */
+      /*---------------------------------------------------------------------*/
+      if (!DLIF_allocate(loader->client_handle, &request))
+         return FALSE;
+
+      /*---------------------------------------------------------------------*/
+      /* Segments with no file-backed bytes need no copy/write step.         */
+      /*---------------------------------------------------------------------*/
+      if (!cur->phdr.p_filesz)
+         continue;
+
+      DLIF_copy(loader->client_handle, &request);
+      DLIF_write(loader->client_handle, &request);
+   }
+
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* relocate_target_dynamic_tag_info() */
+/* */
+/* Update a target specific dynamic tag value that happens to be a */
+/* virtual address of a section. Returns TRUE if the tag was updated or */
+/* is not a virtual address and FALSE if it was not successfully updated */
+/* or was not recognized. */
+/*****************************************************************************/
+static BOOL relocate_target_dynamic_tag_info(DLIMP_Dynamic_Module *dyn_module,
+                                             int i)
+{
+   /*------------------------------------------------------------------------*/
+   /* Forward the request to the handler supplied by the current target and  */
+   /* pass its verdict straight back to the caller.                          */
+   /*------------------------------------------------------------------------*/
+   BOOL tag_handled = cur_target->relocate_dynamic_tag_info(dyn_module, i);
+
+   return tag_handled;
+}
+
+/*****************************************************************************/
+/* DLIMP_update_dyntag_section_address() */
+/* */
+/* Given the index of a dynamic tag which we happen to know points to a */
+/* section address, find the program header table entry associated with */
+/* the specified address and update the tag value with the real address */
+/* of the section. */
+/* */
+/*****************************************************************************/
+BOOL DLIMP_update_dyntag_section_address(DLIMP_Dynamic_Module *dyn_module,
+                                         int32_t i)
+{
+   DLIMP_Loaded_Segment *segments = (DLIMP_Loaded_Segment *)
+                          (dyn_module->loaded_module->loaded_segments.buf);
+   int s;
+
+   /*------------------------------------------------------------------------*/
+   /* A zero tag value refers to no section at all; nothing to relocate.     */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module->dyntab[i].d_un.d_ptr == (Elf32_Addr)0)
+      return TRUE;
+
+   /*------------------------------------------------------------------------*/
+   /* Look for the segment whose original (input) address range contains     */
+   /* the tag value, then shift the tag by the same distance the segment     */
+   /* itself was moved (p_vaddr - input_vaddr).                              */
+   /*------------------------------------------------------------------------*/
+   for (s = 0; s < dyn_module->loaded_module->loaded_segments.size; s++)
+   {
+      DLIMP_Loaded_Segment *cur = &segments[s];
+
+      if (dyn_module->dyntab[i].d_un.d_ptr < cur->input_vaddr)
+         continue;
+
+      if (!(dyn_module->dyntab[i].d_un.d_ptr <
+            (cur->input_vaddr + cur->phdr.p_memsz)))
+         continue;
+
+      dyn_module->dyntab[i].d_un.d_ptr +=
+                                  (cur->phdr.p_vaddr - cur->input_vaddr);
+      return TRUE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* No segment covers the address held by this tag.                        */
+   /*------------------------------------------------------------------------*/
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* relocate_dynamic_tag_info() */
+/* */
+/* Once segment allocation has been completed, we'll need to go through */
+/* the dynamic table and update any tag values that happen to be virtual */
+/* addresses of segments (DT_C6000_DSBT_BASE, for example). */
+/* */
+/*****************************************************************************/
+static BOOL relocate_dynamic_tag_info(LOADER_FILE_DESC *fd,
+                                      DLIMP_Dynamic_Module *dyn_module)
+{
+   int tag_idx;
+
+   /*------------------------------------------------------------------------*/
+   /* Walk the dynamic table up to its DT_NULL terminator.  Tags whose value */
+   /* is the virtual address of a section are rewritten with the address the */
+   /* section received during allocation.  (The fd parameter is not used     */
+   /* here.)                                                                 */
+   /*------------------------------------------------------------------------*/
+   for (tag_idx = 0; dyn_module->dyntab[tag_idx].d_tag != DT_NULL; tag_idx++)
+   {
+      switch (dyn_module->dyntab[tag_idx].d_tag)
+      {
+         /*------------------------------------------------------------------*/
+         /* Generic tags that this pass leaves exactly as they are.          */
+         /*------------------------------------------------------------------*/
+         case DT_NEEDED:
+         case DT_SONAME:
+         case DT_RPATH:
+         case DT_RUNPATH:
+         case DT_HASH:
+         case DT_STRTAB:
+         case DT_STRSZ:
+         case DT_SYMTAB:
+         case DT_SYMENT:
+         case DT_SYMBOLIC:
+         case DT_RELA:
+         case DT_RELASZ:
+         case DT_RELAENT:
+         case DT_REL:
+         case DT_RELSZ:
+         case DT_RELENT:
+         case DT_PLTREL:
+         case DT_PLTRELSZ:
+         case DT_DEBUG:
+         case DT_TEXTREL:
+         case DT_BIND_NOW:
+         case DT_FLAGS:
+         case DT_INIT_ARRAYSZ:
+         case DT_PREINIT_ARRAYSZ:
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* Open question carried over from the original author: is the      */
+         /* value of DT_JMPREL a virtual address?  Until that is settled     */
+         /* the tag is left alone.                                           */
+         /*------------------------------------------------------------------*/
+         case DT_JMPREL:
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* Initialization tags hold section addresses and must follow the   */
+         /* section to its final location.                                   */
+         /* NOTE: DT_ENCODING shares its "id" with DT_PREINIT_ARRAY, so it   */
+         /* cannot appear as a separate case label.                          */
+         /*------------------------------------------------------------------*/
+         case DT_INIT:
+         case DT_INIT_ARRAY:
+         case DT_PREINIT_ARRAY:
+            if (!DLIMP_update_dyntag_section_address(dyn_module, tag_idx))
+               return FALSE;
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* Termination tags are relocated the same way, and the resolved    */
+         /* address is also saved in the loaded module object, because       */
+         /* dyn_module will be deleted before the module is unloaded.        */
+         /*------------------------------------------------------------------*/
+         case DT_FINI:
+            if (!DLIMP_update_dyntag_section_address(dyn_module, tag_idx))
+               return FALSE;
+            dyn_module->loaded_module->fini =
+                                   dyn_module->dyntab[tag_idx].d_un.d_ptr;
+            break;
+
+         case DT_FINI_ARRAY:
+            if (!DLIMP_update_dyntag_section_address(dyn_module, tag_idx))
+               return FALSE;
+            dyn_module->loaded_module->fini_array =
+                                   dyn_module->dyntab[tag_idx].d_un.d_ptr;
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* The size of the termination array is a plain value; just record  */
+         /* it in the loaded module.                                         */
+         /*------------------------------------------------------------------*/
+         case DT_FINI_ARRAYSZ:
+            dyn_module->loaded_module->fini_arraysz =
+                                   dyn_module->dyntab[tag_idx].d_un.d_val;
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* Everything else is expected to be target specific.  The target   */
+         /* handler decides; if it reports failure the whole update fails.   */
+         /*------------------------------------------------------------------*/
+         default:
+            if (!relocate_target_dynamic_tag_info(dyn_module, tag_idx))
+               return FALSE;
+            break;
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* The whole table was processed without a failure.                       */
+   /*------------------------------------------------------------------------*/
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* allocate_dynamic_segments_and_relocate_symbols() */
+/* */
+/* Allocate target memory for each segment in this module, getting a */
+/* host-accessible space to copy the content of each segment into. Then */
+/* update the symbol table and program header table to reflect the new */
+/* target address for each segment. Processing of the dynamic relocation */
+/* entries will wait until all dependent files have been loaded and */
+/* allocated into target memory. */
+/* */
+/*---------------------------------------------------------------------------*/
+/* */
+/* The relocation entries in the ELF file do not handle the necessary */
+/* adjustments to the memory addresses in the program header or symbol */
+/* tables. These must be done manually. */
+/* */
+/* This is harder for us than for most dynamic loaders, because we have to */
+/* work in environments without virtual memory and thus where the offsets */
+/* between segments in memory may be different than they were in the file. */
+/* So, even though a dynamic loader usually only has to adjust all the */
+/* segments by a single fixed offset, we need to offset the symbols and */
+/* program header addresses segment by segment. This job is done by the */
+/* function below. */
+/* */
+/*****************************************************************************/
+static BOOL allocate_dynamic_segments_and_relocate_symbols
+                                             (DLOAD_HANDLE handle,
+                                              LOADER_FILE_DESC *fd,
+                                              DLIMP_Dynamic_Module *dyn_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* handle     - loader instance; its client_handle is passed to the       */
+   /*              client allocation call.                                   */
+   /* fd         - object file, recorded in each memory request.             */
+   /* dyn_module - module whose segments, program headers, entry point,      */
+   /*              symbols and dynamic tags are adjusted in place.           */
+   /*                                                                        */
+   /* Returns TRUE when every segment was allocated and the dynamic table    */
+   /* was updated; FALSE (after reporting an error) otherwise.               */
+   /*------------------------------------------------------------------------*/
+   int i,j;
+   DLIMP_Loaded_Segment* seg = (DLIMP_Loaded_Segment*)
+                              (dyn_module->loaded_module->loaded_segments.buf);
+   struct Elf32_Ehdr *fhdr = &(dyn_module->fhdr);
+   LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+
+#if LOADER_DEBUG || LOADER_PROFILE
+   if (debugging_on || profiling_on)
+   {
+      DLIF_trace("Dynamic executable found.\n"
+                 "Starting allocate_dynamic_segments_and_relocate_symbols()"
+                 "...\n");
+      if (profiling_on) profile_start_clock();
+   }
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* Spin through the list of loaded segments from the current module.      */
+   /*------------------------------------------------------------------------*/
+   for (i = 0; i < dyn_module->loaded_module->loaded_segments.size; i++)
+   {
+      /*--------------------------------------------------------------------*/
+      /* Allocate target memory for segment via client-provided target      */
+      /* memory API.  The link-time address is offered as the starting      */
+      /* target address and the request is flagged as relocatable.          */
+      /*--------------------------------------------------------------------*/
+      int32_t addr_offset;
+      struct DLOAD_MEMORY_REQUEST targ_req;
+      seg[i].obj_desc->target_page = 0;
+      targ_req.flags = 0;
+      if (seg[i].phdr.p_flags & PF_X) targ_req.flags |= DLOAD_SF_executable;
+      if (seg[i].phdr.p_flags & PF_W) targ_req.flags |= DLOAD_SF_writable;
+
+      targ_req.align = 0x20;
+      seg[i].obj_desc->target_address = (TARGET_ADDRESS)seg[i].phdr.p_vaddr;
+      targ_req.flags |= DLOAD_SF_relocatable;
+      targ_req.fp = fd;
+      targ_req.segment = seg[i].obj_desc;
+      targ_req.offset = seg[i].phdr.p_offset;
+      targ_req.flip_endian = dyn_module->wrong_endian;
+
+      if (!DLIF_allocate(pHandle->client_handle, &targ_req))
+      {
+         DLIF_error(DLET_MEMORY, "DLIF allocation failure.\n");
+         return FALSE;
+      }
+
+      /*--------------------------------------------------------------------*/
+      /* Calculate the offset we need to adjust segment header and symbol   */
+      /* table addresses: where the segment ended up minus where the file   */
+      /* said it would be.                                                  */
+      /*--------------------------------------------------------------------*/
+      addr_offset = (int32_t)(seg[i].obj_desc->target_address) -
+                    (int32_t)(seg[i].phdr.p_vaddr);
+
+#if LOADER_DEBUG
+      if (debugging_on)
+      {
+         DLIF_trace("Segment %d (at 0x%x, 0x%x bytes) relocated to 0x%x\n", i,
+                    (int32_t)(seg[i].phdr.p_vaddr),
+                    (int32_t)(seg[i].phdr.p_memsz),
+                    (int32_t)(seg[i].obj_desc->target_address));
+         DLIF_trace("Addr Offset is 0x%x\n", addr_offset);
+      }
+#endif
+
+      /*--------------------------------------------------------------------*/
+      /* Update program entry point if needed.  Need to replace to deal     */
+      /* with full ELF initialization routine.                              */
+      /*--------------------------------------------------------------------*/
+      if (dyn_module->relocate_entry_point &&
+          fhdr->e_entry >= (Elf32_Addr)(seg[i].phdr.p_vaddr) &&
+          fhdr->e_entry <
+                    (Elf32_Addr)((uint8_t*)(seg[i].phdr.p_vaddr) +
+                                 (uint32_t)(seg[i].phdr.p_memsz)))
+      {
+#if LOADER_DEBUG
+         if (debugging_on)
+         {
+            DLIF_trace("Entry point 0x%x relocated to 0x%x\n",
+                       fhdr->e_entry, fhdr->e_entry + addr_offset);
+         }
+#endif
+         fhdr->e_entry += addr_offset;
+
+         /*------------------------------------------------------------------*/
+         /* Mark the entry point as being relocated so we will not do it     */
+         /* again.                                                           */
+         /*------------------------------------------------------------------*/
+         dyn_module->relocate_entry_point = FALSE;
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* Fix program header entries in segment and Elf32_Phdr structs.       */
+      /* The matching program header is found by virtual address, so it is   */
+      /* entry j -- not the segment index i -- whose p_vaddr AND p_paddr     */
+      /* must be shifted.  (The segment list and the program header table    */
+      /* are indexed independently.)                                         */
+      /*---------------------------------------------------------------------*/
+      for (j = 0; j < fhdr->e_phnum; j++)
+         if (dyn_module->phdr[j].p_vaddr == (Elf32_Addr)seg[i].phdr.p_vaddr)
+         {
+            dyn_module->phdr[j].p_vaddr += addr_offset;
+            dyn_module->phdr[j].p_paddr += addr_offset;
+            break;
+         }
+
+      /*---------------------------------------------------------------------*/
+      /* Remember the pre-relocation address (used later to relocate         */
+      /* dynamic tags) before moving the segment's own p_vaddr.              */
+      /*---------------------------------------------------------------------*/
+      seg[i].input_vaddr = (Elf32_Addr)(seg[i].phdr.p_vaddr);
+      seg[i].phdr.p_vaddr += addr_offset;
+
+      /*---------------------------------------------------------------------*/
+      /* Great, now the hard part: fix offsets in symbols.  It would be nice */
+      /* if there were an easier way to deal with this.                      */
+      /*---------------------------------------------------------------------*/
+      {
+         struct Elf32_Sym *gsymtab =
+                  ((struct Elf32_Sym*)(dyn_module->loaded_module->gsymtab));
+         Elf32_Addr segment_start = (Elf32_Addr)seg[i].phdr.p_vaddr;
+         Elf32_Addr segment_end   = (Elf32_Addr)seg[i].phdr.p_vaddr +
+                                    seg[i].phdr.p_memsz;
+         Elf32_Word global_index  = dyn_module->symnum -
+                                    dyn_module->loaded_module->gsymnum;
+
+         for (j = 0; j < dyn_module->symnum; j++)
+         {
+            /*---------------------------------------------------------------*/
+            /* Get the relocated symbol value.                               */
+            /*---------------------------------------------------------------*/
+            Elf32_Addr symval_adj = dyn_module->symtab[j].st_value +
+                                    addr_offset;
+
+            /*---------------------------------------------------------------*/
+            /* If the symbol is defined in this segment, update the symbol   */
+            /* value and mark the symbol so that we don't relocate it again. */
+            /* st_shndx == INT16_MAX is the "already relocated" marker.      */
+            /*---------------------------------------------------------------*/
+            if (symval_adj >= segment_start && symval_adj < segment_end &&
+                dyn_module->symtab[j].st_shndx != INT16_MAX)
+            {
+               dyn_module->symtab[j].st_value = symval_adj;
+
+               /*------------------------------------------------------------*/
+               /* The module symbol table only has the global symbols.       */
+               /*------------------------------------------------------------*/
+               if (j >= global_index)
+                  gsymtab[j-global_index].st_value = symval_adj;
+
+               /*------------------------------------------------------------*/
+               /* Mark the symbol as relocated.                              */
+               /*------------------------------------------------------------*/
+               dyn_module->symtab[j].st_shndx = INT16_MAX;
+            }
+         }
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Update dynamic tag information.  Some dynamic tags have values which   */
+   /* are virtual addresses of sections.  These values need to be updated    */
+   /* once segment allocation is completed and the new segment addresses are */
+   /* known.                                                                 */
+   /*------------------------------------------------------------------------*/
+   /* We should only traverse through the dynamic table once because we want */
+   /* to avoid the possibility of updating the same tag multiple times (an   */
+   /* error, if it happens).                                                 */
+   /*------------------------------------------------------------------------*/
+   if (!relocate_dynamic_tag_info(fd, dyn_module))
+   {
+      DLIF_error(DLET_MISC, "Failed dynamic table update.\n");
+      return FALSE;
+   }
+
+#if LOADER_DEBUG || LOADER_PROFILE
+   if (debugging_on || profiling_on)
+   {
+      DLIF_trace("Finished allocate_dynamic_segments_and_relocate_symbols()\n");
+      if (profiling_on)
+      {
+         profile_stop_clock();
+         DLIF_trace("Took %lu cycles.\n", (unsigned long) profile_cycle_count());
+      }
+   }
+#endif
+
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* delete_DLIMP_Loaded_Module() */
+/* */
+/* Free host memory associated with a DLIMP_Loaded_Module data structure */
+/* and all of the DLIMP_Loaded_Segment objects that are associated with */
+/* it. */
+/* */
+/*****************************************************************************/
+static void delete_DLIMP_Loaded_Module(DLOAD_HANDLE handle,
+                                       DLIMP_Loaded_Module **pplm)
+{
+   LOADER_OBJECT        *loader = (LOADER_OBJECT *)handle;
+   DLIMP_Loaded_Module  *module = *pplm;
+   DLIMP_Loaded_Segment *seg_list = (DLIMP_Loaded_Segment *)
+                                    (module->loaded_segments.buf);
+   int idx;
+
+   /*-----------------------------------------------------------------------*/
+   /* For every segment: ask the client to release it (reporting, but not   */
+   /* aborting on, a failed release) and then free its host-side object     */
+   /* descriptor.                                                           */
+   /*-----------------------------------------------------------------------*/
+   for (idx = 0; idx < module->loaded_segments.size; idx++)
+   {
+      if (!DLIF_release(loader->client_handle, seg_list[idx].obj_desc))
+      {
+         DLIF_error(DLET_MISC, "Failed call to DLIF_release!\n");
+      }
+
+      DLIF_free(seg_list[idx].obj_desc);
+   }
+
+   /*-----------------------------------------------------------------------*/
+   /* If this module is the one recorded as the application (base image),   */
+   /* clear the global handle so it no longer refers to it.                 */
+   /*-----------------------------------------------------------------------*/
+   if (module->file_handle == DLIMP_application_handle)
+   {
+      DLIMP_application_handle = 0;
+   }
+
+   /*-----------------------------------------------------------------------*/
+   /* Give back the host heap memory owned by the module's members and      */
+   /* reset the associated counts.                                          */
+   /*-----------------------------------------------------------------------*/
+   if (module->name)
+   {
+      DLIF_free(module->name);
+   }
+
+   if (module->gsymtab)
+   {
+      DLIF_free(module->gsymtab);
+   }
+   module->gsymnum = 0;
+
+   if (module->gstrtab)
+   {
+      DLIF_free(module->gstrtab);
+   }
+   module->gstrsz = 0;
+
+   AL_destroy(&(module->loaded_segments));
+   AL_destroy(&(module->dependencies));
+
+   /*-----------------------------------------------------------------------*/
+   /* Last of all, free the module object itself and NULL the caller's      */
+   /* pointer to it.                                                        */
+   /*-----------------------------------------------------------------------*/
+   DLIF_free(module);
+   *pplm = NULL;
+}
+
+/*****************************************************************************/
+/* new_DLIMP_Dynamic_Module() */
+/* */
+/* Allocate a dynamic module data structure from host memory and */
+/* initialize its members to their default values. */
+/* */
+/*****************************************************************************/
+static DLIMP_Dynamic_Module *new_DLIMP_Dynamic_Module(LOADER_FILE_DESC *fd)
+{
+   /*-----------------------------------------------------------------------*/
+   /* fd - file descriptor of the object file; stored in the new record.    */
+   /*                                                                       */
+   /* Returns the newly allocated, default-initialized record, or NULL if   */
+   /* the host allocation did not produce a usable pointer.                 */
+   /*-----------------------------------------------------------------------*/
+
+   /*-----------------------------------------------------------------------*/
+   /* Allocate space for dynamic module data structure from host memory.    */
+   /*-----------------------------------------------------------------------*/
+   DLIMP_Dynamic_Module *dyn_module =
+        (DLIMP_Dynamic_Module *)DLIF_malloc(sizeof(DLIMP_Dynamic_Module));
+
+   /*-----------------------------------------------------------------------*/
+   /* Do not write through a NULL pointer.  NOTE(review): this presumes     */
+   /* DLIF_malloc signals failure by returning NULL, like malloc() -- to be */
+   /* confirmed against the client API.                                     */
+   /*-----------------------------------------------------------------------*/
+   if (dyn_module == NULL)
+      return NULL;
+
+   /*-----------------------------------------------------------------------*/
+   /* Initialize data members of the new dynamic module data structure.     */
+   /*-----------------------------------------------------------------------*/
+   dyn_module->name = NULL;
+   dyn_module->fd = fd;
+   dyn_module->phdr = NULL;
+   dyn_module->phnum = 0;
+   dyn_module->strtab = NULL;
+   dyn_module->strsz = 0;
+   dyn_module->dyntab = NULL;
+   dyn_module->symtab = NULL;
+   dyn_module->symnum = 0;
+   dyn_module->gsymtab_offset = 0;
+   dyn_module->gstrtab_offset = 0;
+   dyn_module->c_args = NULL;
+   dyn_module->argc = 0;
+   dyn_module->argv = NULL;
+   dyn_module->loaded_module = NULL;
+   dyn_module->wrong_endian = 0;
+   dyn_module->direct_dependent_only = TRUE;
+   dyn_module->relocatable = FALSE;
+   dyn_module->relocate_entry_point = TRUE;
+
+   /*-----------------------------------------------------------------------*/
+   /* DSBT bookkeeping starts out empty / invalid.                          */
+   /*-----------------------------------------------------------------------*/
+   dyn_module->dsbt_size = 0;
+   dyn_module->dsbt_index = DSBT_INDEX_INVALID;
+   dyn_module->dsbt_base_tagidx = -1;
+
+   /*-----------------------------------------------------------------------*/
+   /* Initialization-related dynamic tag indices: -1 means "not set".       */
+   /*-----------------------------------------------------------------------*/
+   dyn_module->preinit_array_idx = -1;
+   dyn_module->preinit_arraysz = 0;
+   dyn_module->init_idx = -1;
+   dyn_module->init_array_idx = -1;
+   dyn_module->init_arraysz = 0;
+
+   return dyn_module;
+}
+
+/*****************************************************************************/
+/* detach_loaded_module() */
+/* */
+/* Detach loaded module data structure from given dynamic module. When */
+/* an object file has been successfully loaded, the loader core will */
+/* detach the loaded module data structure from the dynamic module data */
+/* structure because the loaded module must continue to persist until it */
+/* is actually unloaded from target memory. If there is a problem with */
+/* the load, then the host memory associated with the loaded module will */
+/* be released as part of the destruction of the dynamic module. */
+/* */
+/*****************************************************************************/
+static
+DLIMP_Loaded_Module *detach_loaded_module(DLIMP_Dynamic_Module *dyn_module)
+{
+   DLIMP_Loaded_Module *detached;
+
+   /*------------------------------------------------------------------------*/
+   /* Nothing to hand over when there is no dynamic module, or when it has   */
+   /* no loaded module attached.                                             */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module == NULL)                return NULL;
+   if (dyn_module->loaded_module == NULL) return NULL;
+
+   /*------------------------------------------------------------------------*/
+   /* Take the loaded module out of the dynamic module and return it; the    */
+   /* dynamic module no longer refers to it afterwards.                      */
+   /*------------------------------------------------------------------------*/
+   detached = dyn_module->loaded_module;
+   dyn_module->loaded_module = NULL;
+
+   return detached;
+}
+/*****************************************************************************/
+/* delete_DLIMP_Dynamic_Module() */
+/* */
+/* Remove local copies of the string table, symbol table, program header */
+/* table, and dynamic table. */
+/* */
+/*****************************************************************************/
+static void delete_DLIMP_Dynamic_Module(DLOAD_HANDLE handle,
+                                        DLIMP_Dynamic_Module **ppdm)
+{
+   DLIMP_Dynamic_Module *doomed = NULL;
+
+   /*------------------------------------------------------------------------*/
+   /* Being handed no pointer, or a pointer to NULL, is treated as an        */
+   /* internal error: report it and ask the client to exit.                  */
+   /*------------------------------------------------------------------------*/
+   if (ppdm == NULL || *ppdm == NULL)
+   {
+      DLIF_error(DLET_MISC,
+                 "Internal Error: invalid argument to dynamic module "
+                 "destructor function; aborting loader\n");
+      DLIF_exit(1);
+   }
+
+   doomed = *ppdm;
+
+   /*------------------------------------------------------------------------*/
+   /* Release the local copies of the name, string table, symbol table,      */
+   /* program header table and dynamic table, where present.                 */
+   /*------------------------------------------------------------------------*/
+   if (doomed->name)
+   {
+      DLIF_free(doomed->name);
+   }
+   if (doomed->strtab)
+   {
+      DLIF_free(doomed->strtab);
+   }
+   if (doomed->symtab)
+   {
+      DLIF_free(doomed->symtab);
+   }
+   if (doomed->phdr)
+   {
+      DLIF_free(doomed->phdr);
+   }
+   if (doomed->dyntab)
+   {
+      DLIF_free(doomed->dyntab);
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* A loaded module that is still attached at this point means the load    */
+   /* did not complete; destroy it together with its segments.               */
+   /*------------------------------------------------------------------------*/
+   if (doomed->loaded_module != NULL)
+   {
+      delete_DLIMP_Loaded_Module(handle, &(doomed->loaded_module));
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Free the dynamic module object itself and NULL the caller's pointer.   */
+   /*------------------------------------------------------------------------*/
+   DLIF_free(doomed);
+   *ppdm = NULL;
+}
+
+/*****************************************************************************/
+/* file_header_magic_number_is_valid() */
+/* */
+/* Given an object file header, check the magic number to ensure that it */
+/* is an object file format that we recognize. This implementation of */
+/* the dynamic loader core will handle ELF object file format. */
+/* */
+/*****************************************************************************/
+static BOOL file_header_magic_number_is_valid(struct Elf32_Ehdr* header)
+{
+   /*------------------------------------------------------------------------*/
+   /* header - ELF file header whose e_ident magic bytes are examined.       */
+   /*                                                                        */
+   /* Returns TRUE when all four magic bytes match ELFMAG0..ELFMAG3;         */
+   /* otherwise reports an error and returns FALSE.                          */
+   /*------------------------------------------------------------------------*/
+
+   /*------------------------------------------------------------------------*/
+   /* Compare each identification byte with "!=".  Writing the test as       */
+   /* "!byte == MAGIC" would negate the byte first (giving 0 or 1) and then  */
+   /* compare that with the magic value, so the check could never reject a   */
+   /* file.                                                                  */
+   /*------------------------------------------------------------------------*/
+   if (header->e_ident[EI_MAG0] != ELFMAG0 ||
+       header->e_ident[EI_MAG1] != ELFMAG1 ||
+       header->e_ident[EI_MAG2] != ELFMAG2 ||
+       header->e_ident[EI_MAG3] != ELFMAG3)
+   {
+      DLIF_error(DLET_FILE, "Invalid ELF magic number.\n");
+      return FALSE;
+   }
+
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* file_header_machine_is_valid() */
+/* */
+/* Check if the machine specified in the file header is supported by the */
+/* loader. If the loader was compiled with support for all targets, */
+/* the machine will be initially set to EM_NONE. Once a module has been */
+/* loaded, all remaining modules must have the same machine value. */
+/*****************************************************************************/
+static int file_header_machine_is_valid(Elf32_Half e_machine)
+{
+   /*------------------------------------------------------------------------*/
+   /* A machine id is accepted only if support for it was compiled in        */
+   /* (ARM_TARGET and/or C60_TARGET).                                        */
+   /*------------------------------------------------------------------------*/
+#ifdef ARM_TARGET
+   if (e_machine == EM_ARM)
+      return TRUE;
+#endif
+
+#ifdef C60_TARGET
+   if (e_machine == EM_TI_C6000)
+      return TRUE;
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* Any other machine value is rejected.                                   */
+   /*------------------------------------------------------------------------*/
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* is_valid_elf_object_file() */
+/* */
+/* Check file size against anticipated end location of string table, */
+/* symbol table, program header tables, etc. If we find anything untoward, */
+/* then we declare that the ELF file is corrupt and the load is aborted. */
+/* */
+/*****************************************************************************/
+static BOOL is_valid_elf_object_file(LOADER_FILE_DESC *fd,
+                                     DLIMP_Dynamic_Module *dyn_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* fd         - object file; its size is obtained by seeking to the end.  */
+   /* dyn_module - module record whose table sizes, name and program         */
+   /*              headers have already been filled in.                      */
+   /*                                                                        */
+   /* Returns TRUE if every sanity check passes, FALSE otherwise.            */
+   /*------------------------------------------------------------------------*/
+   uint32_t fsz;
+   int i;
+
+   /*------------------------------------------------------------------------*/
+   /* Get file size.                                                         */
+   /*------------------------------------------------------------------------*/
+   DLIF_fseek(fd, 0, LOADER_SEEK_END);
+   fsz = DLIF_ftell(fd);
+
+   /*------------------------------------------------------------------------*/
+   /* Check for invalid table sizes (string table, symbol table, and         */
+   /* program header tables).                                                */
+   /* NOTE(review): symnum is compared as a raw count, not as a byte size -- */
+   /* confirm whether it should be scaled by the symbol entry size.          */
+   /*------------------------------------------------------------------------*/
+   if (!((dyn_module->strsz < fsz) &&
+         (dyn_module->symnum < fsz) &&
+         (dyn_module->phnum * sizeof(struct Elf32_Phdr)) < fsz))
+   {
+      DLIF_error(DLET_FILE, "Invalid ELF table bounds.\n");
+      return FALSE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Check for null so_name string in file with dynamic information.  A     */
+   /* missing (NULL) name is treated the same as an empty one so that        */
+   /* strcmp() is never handed a NULL pointer.                               */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module->dyntab &&
+       (dyn_module->name == NULL || !strcmp(dyn_module->name, "")))
+   {
+      DLIF_error(DLET_MISC, "Dynamic file lacks SO_NAME identifier.\n");
+      return FALSE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Check for invalid program header information.                          */
+   /*------------------------------------------------------------------------*/
+   for (i = 0; i < dyn_module->phnum; i++)
+   {
+      struct Elf32_Phdr* phdr = dyn_module->phdr + i;
+
+      /*---------------------------------------------------------------------*/
+      /* Sanity check for relative sizes of filesz and memsz.                */
+      /*---------------------------------------------------------------------*/
+      if (!(phdr->p_type != PT_LOAD || phdr->p_filesz <= phdr->p_memsz))
+      {
+         DLIF_error(DLET_MISC,
+                    "Invalid file or memory size for segment %d.\n", i);
+         return FALSE;
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* Check that segment file offset doesn't go off the end of the file.  */
+      /* This is "p_offset + p_filesz < fsz" written without the addition,   */
+      /* so a header with huge values cannot wrap around and sneak past the  */
+      /* test.                                                               */
+      /*---------------------------------------------------------------------*/
+      if (phdr->p_offset >= fsz || phdr->p_filesz >= fsz - phdr->p_offset)
+      {
+         DLIF_error(DLET_FILE,
+                    "File location of segment %d is past the end of file.\n", i);
+         return FALSE;
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Check that a ET_DYN-type file is relocatable.                          */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module->fhdr.e_type == ET_DYN && !dyn_module->symtab) return FALSE;
+
+   /*------------------------------------------------------------------------*/
+   /* All checks passed.                                                     */
+   /*------------------------------------------------------------------------*/
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* process_eiosabi() */
+/* */
+/* Check the EI_OSABI field to validate it and set any parameters based on */
+/* it. */
+/*****************************************************************************/
+static BOOL process_eiosabi(DLIMP_Dynamic_Module* dyn_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* The EI_OSABI handling lives with the current target; call its handler  */
+   /* and return whatever it reports.                                        */
+   /*------------------------------------------------------------------------*/
+   BOOL osabi_ok = cur_target->process_eiosabi(dyn_module);
+
+   return osabi_ok;
+}
+
+/*****************************************************************************/
+/* dload_file_header() */
+/* */
+/* Read ELF file header. Store critical information in the provided */
+/* DLIMP_Dynamic_Module record. Check file header for validity. */
+/* */
+/*****************************************************************************/
+static BOOL dload_file_header(LOADER_FILE_DESC *fd,
+                              DLIMP_Dynamic_Module *dyn_module)
+{
+   struct Elf32_Ehdr *ehdr = &(dyn_module->fhdr);
+
+   /*------------------------------------------------------------------------*/
+   /* Pull one ELF file header's worth of data from the input file into the  */
+   /* module record.                                                         */
+   /*------------------------------------------------------------------------*/
+   DLIF_fread(ehdr, sizeof(struct Elf32_Ehdr), 1, fd);
+
+   /*------------------------------------------------------------------------*/
+   /* The file is "wrong endian" when its EI_DATA byte differs from what     */
+   /* DLIMP_get_endian() reports; in that case byte swap the header fields.  */
+   /*------------------------------------------------------------------------*/
+   dyn_module->wrong_endian = (ehdr->e_ident[EI_DATA] != DLIMP_get_endian());
+
+   if (dyn_module->wrong_endian)
+   {
+      DLIMP_change_ehdr_endian(ehdr);
+   }
+
+#if LOADER_DEBUG
+   /*------------------------------------------------------------------------*/
+   /* Debug aid: show the identification characters and the entry point.     */
+   /*------------------------------------------------------------------------*/
+   if (debugging_on)
+   {
+      DLIF_trace("ELF: %c%c%c\n", ehdr->e_ident[1],
+                                  ehdr->e_ident[2],
+                                  ehdr->e_ident[3]);
+      DLIF_trace("ELF file header entry point: %x\n",
+                 ehdr->e_entry);
+   }
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* Reject files whose magic number is not recognized.                     */
+   /*------------------------------------------------------------------------*/
+   if (!file_header_magic_number_is_valid(ehdr))
+   {
+      DLIF_error(DLET_FILE, "Invalid ELF file header magic number.\n");
+      return FALSE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Reject files built for a machine this loader does not support.         */
+   /*------------------------------------------------------------------------*/
+   if (!file_header_machine_is_valid(ehdr->e_machine))
+   {
+      DLIF_error(DLET_FILE, "Invalid ELF file target machine.\n");
+      return FALSE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Only executables (ET_EXEC) and dynamic objects (ET_DYN) are accepted.  */
+   /*------------------------------------------------------------------------*/
+   if (!((ehdr->e_type == ET_EXEC) || (ehdr->e_type == ET_DYN)))
+   {
+      DLIF_error(DLET_FILE, "Invalid ELF file type.\n");
+      return FALSE;
+   }
+
+#if LOADER_DEBUG || LOADER_PROFILE
+   /*------------------------------------------------------------------------*/
+   /* Header processing is finished: stop the profiling clock, report the    */
+   /* cycle count, and restart the clock for the steps that follow.          */
+   /*------------------------------------------------------------------------*/
+   if (debugging_on || profiling_on)
+   {
+      DLIF_trace("done.\n");
+      if (profiling_on)
+      {
+         profile_stop_clock();
+         DLIF_trace("Took %lu cycles.\n",
+                    (unsigned long)profile_cycle_count());
+         profile_start_clock();
+      }
+   }
+#endif
+
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* dload_program_header_table() */
+/* */
+/* Make a local copy of the ELF object file's program header table in the */
+/* dynamic module data structure. */
+/* */
+/*****************************************************************************/
+static void dload_program_header_table(LOADER_FILE_DESC *fd,
+                                       DLIMP_Dynamic_Module *dyn_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* Copy the program header table described by dyn_module->fhdr from the   */
+   /* file into dyn_module->phdr and set dyn_module->phnum to the number of  */
+   /* usable entries.  On any failure an error is reported and phnum is set  */
+   /* so that callers never index entries that were not read.                */
+   /*------------------------------------------------------------------------*/
+   struct Elf32_Ehdr *fhdr = &(dyn_module->fhdr);
+   size_t             table_size;
+   size_t             entries_read;
+
+   /*------------------------------------------------------------------------*/
+   /* The table is indexed below as an array of struct Elf32_Phdr, so any    */
+   /* other entry size would make every access past entry 0 land on the      */
+   /* wrong bytes (or outside the allocation).                               */
+   /*------------------------------------------------------------------------*/
+   if (fhdr->e_phnum != 0 && fhdr->e_phentsize != sizeof(struct Elf32_Phdr))
+   {
+      DLIF_error(DLET_FILE,
+                 "Unexpected program header entry size in ELF file.\n");
+      dyn_module->phdr  = NULL;
+      dyn_module->phnum = 0;
+      return;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Compute the size in size_t: the two 16-bit header fields would         */
+   /* otherwise be multiplied as int, which can overflow.                    */
+   /*------------------------------------------------------------------------*/
+   table_size = (size_t)fhdr->e_phnum * fhdr->e_phentsize;
+   dyn_module->phdr = (struct Elf32_Phdr*)(DLIF_malloc(table_size));
+   if (dyn_module->phdr == NULL)
+   {
+      /* An empty table may legitimately yield NULL; only report real OOM. */
+      if (table_size != 0)
+         DLIF_error(DLET_MEMORY,
+                    "Failed to allocate program header table.\n");
+      dyn_module->phnum = 0;
+      return;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Read the program header tables from the object file.  Only entries     */
+   /* that were actually read are counted in phnum.                          */
+   /*------------------------------------------------------------------------*/
+   DLIF_fseek(fd, fhdr->e_phoff, LOADER_SEEK_SET);
+   entries_read = DLIF_fread(dyn_module->phdr, fhdr->e_phentsize,
+                             fhdr->e_phnum, fd);
+   if (entries_read > fhdr->e_phnum)
+      entries_read = fhdr->e_phnum;
+   if (entries_read != fhdr->e_phnum)
+      DLIF_error(DLET_FILE, "Failed to read program header table.\n");
+   dyn_module->phnum = entries_read;
+
+   /*------------------------------------------------------------------------*/
+   /* Byte swap the program header tables if the target endian-ness is not   */
+   /* the same as the host endian-ness.                                      */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module->wrong_endian)
+   {
+      size_t i;
+      for (i = 0; i < entries_read; i++)
+         DLIMP_change_phdr_endian(dyn_module->phdr + i);
+   }
+}
+
+/*****************************************************************************/
+/* dload_headers() */
+/* */
+/* Read ELF object file header and program header table information into */
+/* the given dynamic module data structure. If the object file contains */
+/* dynamic information, read in the dynamic tags, dynamic symbol table, */
+/* and global string table. Check to make sure that we are not already */
+/* in the process of loading the module (circular dependencies), then */
+/* perform some level of sanity checking on the content of the file to */
+/* provide some assurance that the file is not corrupted. */
+/* */
+/*****************************************************************************/
+static BOOL dload_headers(LOADER_FILE_DESC *fd,
+                          DLIMP_Dynamic_Module *dyn_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* Read the ELF file header and program header table into dyn_module,     */
+   /* then select the virtual target (cur_target) from e_machine.  Returns   */
+   /* TRUE on success, FALSE if the file header is rejected or no target     */
+   /* object exists for the machine type.                                    */
+   /*------------------------------------------------------------------------*/
+#if LOADER_DEBUG || LOADER_PROFILE
+   /* Announce progress; the profile clock only runs when profiling is on. */
+   if (debugging_on || profiling_on)
+      DLIF_trace("\nReading file headers ...\n");
+   if (profiling_on)
+      profile_start_clock();
+#endif
+
+   if (dload_file_header(fd, dyn_module))
+   {
+      /* File header accepted: pull in the program header table. */
+      dload_program_header_table(fd, dyn_module);
+
+      /* e_machine picks the object used to reach target specific code. */
+      cur_target = get_vt_obj(dyn_module->fhdr.e_machine);
+      if (cur_target)
+         return TRUE;
+
+      DLIF_error(DLET_FILE, "Attempt to load invalid ELF file, '%s'.\n",
+                 dyn_module->name);
+   }
+
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* find_dynamic_segment() */
+/* */
+/* Find the dynamic segment in the given ELF object file, if there is */
+/* one. If the segment is found, then the segment ID output parameter */
+/* is set to the index of the dynamic segment in the program header */
+/* table. If the dynamic segment is not found, the dynamic module's */
+/* relocatable flag is set to FALSE, and return FALSE. */
+/* */
+/*****************************************************************************/
+static BOOL find_dynamic_segment(DLIMP_Dynamic_Module *dyn_module,
+                                 Elf32_Word *dyn_seg_idx)
+{
+   /*------------------------------------------------------------------------*/
+   /* Look for a PT_DYNAMIC entry in the module's program header table.      */
+   /* Found:     *dyn_seg_idx receives its index, relocatable is TRUE and    */
+   /*            TRUE is returned.                                           */
+   /* Not found: relocatable is set to FALSE and FALSE is returned.          */
+   /*------------------------------------------------------------------------*/
+   int seg_no;
+
+   /* Both arguments are required; a missing one is an internal error. */
+   if (!dyn_module || !dyn_seg_idx)
+   {
+      DLIF_error(DLET_MISC, "Internal error: find_dynamic_segment() needs "
+                            "non-NULL arguments.\n");
+      DLIF_exit(1);
+   }
+
+   /* Assume relocatable until the scan proves otherwise. */
+   dyn_module->relocatable = TRUE;
+
+   seg_no = 0;
+   while (seg_no < dyn_module->phnum)
+   {
+      if (dyn_module->phdr[seg_no].p_type == PT_DYNAMIC)
+      {
+         *dyn_seg_idx = seg_no;
+         return TRUE;
+      }
+      seg_no++;
+   }
+
+   /* No dynamic segment: the object module is not relocatable. */
+   dyn_module->relocatable = FALSE;
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* copy_dynamic_table() */
+/* */
+/* Make a local copy of the dynamic table read from the dynamic segment */
+/* in the ELF object file. */
+/* */
+/*****************************************************************************/
+static void copy_dynamic_table(LOADER_FILE_DESC *fd,
+                               DLIMP_Dynamic_Module *dyn_module,
+                               Elf32_Word dyn_seg_idx)
+{
+   /*------------------------------------------------------------------------*/
+   /* Read the dynamic table held in program segment dyn_seg_idx into a      */
+   /* newly allocated dyn_module->dyntab, byte swapping entries if needed.   */
+   /* The copy always ends with a DT_NULL entry supplied by the loader, so   */
+   /* code that walks the table until DT_NULL stops inside the allocation    */
+   /* even when the file's own table is unterminated or truncated.           */
+   /*------------------------------------------------------------------------*/
+   Elf32_Word num_elem =
+      dyn_module->phdr[dyn_seg_idx].p_filesz / sizeof(struct Elf32_Dyn);
+   size_t     num_read;
+   size_t     i;
+
+   /*------------------------------------------------------------------------*/
+   /* Allocate room for every whole entry in the segment plus one sentinel.  */
+   /* Guard the size computation against wrapping size_t.                    */
+   /*------------------------------------------------------------------------*/
+   if (num_elem >= ((size_t)-1) / sizeof(struct Elf32_Dyn))
+   {
+      DLIF_error(DLET_FILE, "Dynamic table in ELF file is too large.\n");
+      dyn_module->dyntab = NULL;
+      return;
+   }
+
+   dyn_module->dyntab =
+      DLIF_malloc(((size_t)num_elem + 1) * sizeof(struct Elf32_Dyn));
+   if (dyn_module->dyntab == NULL)
+   {
+      /* NOTE(review): this function cannot return a status; callers must */
+      /* not walk a NULL dyntab -- confirm how they should be told.       */
+      DLIF_error(DLET_MEMORY, "Failed to allocate dynamic table.\n");
+      return;
+   }
+
+   DLIF_fseek(fd, dyn_module->phdr[dyn_seg_idx].p_offset, LOADER_SEEK_SET);
+   num_read = DLIF_fread(dyn_module->dyntab, sizeof(struct Elf32_Dyn),
+                         num_elem, fd);
+   if (num_read > num_elem)
+      num_read = num_elem;
+   if (num_read != num_elem)
+      DLIF_warning(DLWT_MISC, "Dynamic table in ELF file is truncated.\n");
+
+   /*------------------------------------------------------------------------*/
+   /* Terminate right after the last entry that was really read, so that     */
+   /* uninitialized entries are never interpreted.                           */
+   /*------------------------------------------------------------------------*/
+   dyn_module->dyntab[num_read].d_tag      = DT_NULL;
+   dyn_module->dyntab[num_read].d_un.d_val = 0;
+
+   /*------------------------------------------------------------------------*/
+   /* If necessary, byte swap each entry in the dynamic table.               */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module->wrong_endian)
+   {
+      for (i = 0; i < num_read; i++)
+         DLIMP_change_dynent_endian(&dyn_module->dyntab[i]);
+   }
+}
+
+/*****************************************************************************/
+/* process_target_dynamic_tag() */
+/* */
+/* Process a target specific dynamic tag entry. Returns TRUE if the tag */
+/* was handled and FALSE if it was not recognized. */
+/*****************************************************************************/
+static BOOL process_target_dynamic_tag(DLIMP_Dynamic_Module* dyn_module, int i)
+{
+   /* Hand dynamic table entry i to the current target's tag handler and */
+   /* pass its verdict straight back to the caller.                      */
+   BOOL handled = cur_target->process_dynamic_tag(dyn_module, i);
+
+   return handled;
+}
+
+/*****************************************************************************/
+/* process_dynamic_table() */
+/* */
+/* Process dynamic tag entries from the dynamic table. At the conclusion */
+/* of this function, we should have made a copy of the global symbols */
+/* and the global symbol names. */
+/* */
+/*****************************************************************************/
+static BOOL process_dynamic_table(LOADER_FILE_DESC *fd,
+                                  DLIMP_Dynamic_Module *dyn_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* Walk the local copy of the dynamic table (dyn_module->dyntab), record  */
+   /* the tags the loader needs, then read from the file: the dynamic string */
+   /* table, the symbol count (taken from the hash table header), the        */
+   /* dynamic symbol table and the module name.                              */
+   /*                                                                        */
+   /* Returns TRUE on success.  Returns FALSE, after a warning or an error,  */
+   /* when a mandatory tag is missing, an allocation fails, a read comes up  */
+   /* short, or DT_SONAME points outside the string table.                   */
+   /*------------------------------------------------------------------------*/
+   int        i;
+   BOOL       soname_found  = FALSE;
+   Elf32_Addr soname_offset = 0;
+   Elf32_Addr strtab_offset = 0;
+   Elf32_Addr hash_offset   = 0;
+   Elf32_Addr symtab_offset = 0;
+   size_t     strtab_bytes;
+
+   /*------------------------------------------------------------------------*/
+   /* Iterate over the dynamic table in order to process dynamic tags.       */
+   /* See ELF TIS Specification for details on the meaning of each dynamic   */
+   /* tag.  The C6000 ELF ABI Specification provides more details about the  */
+   /* TI specific C6000 ELF ABI tags.                                        */
+   /*------------------------------------------------------------------------*/
+   for (i = 0; dyn_module->dyntab[i].d_tag != DT_NULL; i++)
+   {
+      switch(dyn_module->dyntab[i].d_tag)
+      {
+         /*------------------------------------------------------------------*/
+         /* DT_SONAME: Offset of the module name in the string table.  Only  */
+         /*            the offset is kept here so that it does not matter    */
+         /*            whether DT_SONAME or DT_STRTAB is seen first.         */
+         /*------------------------------------------------------------------*/
+         case DT_SONAME:
+#if LOADER_DEBUG
+            if (debugging_on) DLIF_trace("Found SO_NAME.\n");
+#endif
+            soname_found = TRUE;
+            soname_offset = dyn_module->dyntab[i].d_un.d_ptr;
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* DT_STRSZ: Contains the size of the string table.                 */
+         /*------------------------------------------------------------------*/
+         case DT_STRSZ:
+            dyn_module->strsz = dyn_module->dyntab[i].d_un.d_val;
+
+#if LOADER_DEBUG
+            if (debugging_on)
+               DLIF_trace("Found string table Size: 0x%x\n", dyn_module->strsz);
+#endif
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* DT_STRTAB: Contains the file offset of the string table.  The    */
+         /*            table itself is read after the loop, once both the    */
+         /*            offset and the size are known.                        */
+         /*------------------------------------------------------------------*/
+         case DT_STRTAB:
+            strtab_offset = dyn_module->dyntab[i].d_un.d_ptr;
+#if LOADER_DEBUG
+            if (debugging_on)
+               DLIF_trace("Found string table: 0x%x\n", strtab_offset);
+#endif
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* DT_HASH: Contains the file offset of the symbol hash table.      */
+         /*------------------------------------------------------------------*/
+         case DT_HASH:
+            hash_offset = dyn_module->dyntab[i].d_un.d_ptr;
+#if LOADER_DEBUG
+            if (debugging_on)
+               DLIF_trace("Found symbol hash table: 0x%x\n", hash_offset);
+#endif
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* DT_SYMTAB: Contains the file offset of the symbol table.         */
+         /*------------------------------------------------------------------*/
+         case DT_SYMTAB:
+            symtab_offset = dyn_module->dyntab[i].d_un.d_ptr;
+#if LOADER_DEBUG
+            if (debugging_on)
+               DLIF_trace("Found symbol table: 0x%x\n", symtab_offset);
+#endif
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* DSO Initialization Model Dynamic Tags: store the index of the    */
+         /* entry in the local dynamic table, or the array size, in          */
+         /* dyn_module.                                                      */
+         /*------------------------------------------------------------------*/
+         case DT_PREINIT_ARRAY:
+            dyn_module->preinit_array_idx = i;
+            break;
+
+         case DT_PREINIT_ARRAYSZ:
+            dyn_module->preinit_arraysz = dyn_module->dyntab[i].d_un.d_val;
+            break;
+
+         case DT_INIT:
+            dyn_module->init_idx = i;
+            break;
+
+         case DT_INIT_ARRAY:
+            dyn_module->init_array_idx = i;
+            break;
+
+         case DT_INIT_ARRAYSZ:
+            dyn_module->init_arraysz = dyn_module->dyntab[i].d_un.d_val;
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* Termination tags are deliberately ignored here.  This            */
+         /* information will be copied over to the loaded module object      */
+         /* after placement has been completed and the information in the    */
+         /* dynamic table has been relocated.                                */
+         /*------------------------------------------------------------------*/
+         case DT_FINI_ARRAY:
+         case DT_FINI_ARRAYSZ:
+         case DT_FINI:
+            break;
+
+         /*------------------------------------------------------------------*/
+         /* Anything else is offered to the target specific handler; tags    */
+         /* it does not recognize are only traced, never treated as errors.  */
+         /*------------------------------------------------------------------*/
+         default:
+         {
+            if (!process_target_dynamic_tag(dyn_module, i))
+            {
+#if LOADER_DEBUG
+               if (debugging_on)
+                  DLIF_trace("Unrecognized dynamic tag: 0x%X\n",
+                             dyn_module->dyntab[i].d_tag);
+#endif
+            }
+
+            break;
+         }
+
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* The string table is mandatory: both its offset and size are needed.    */
+   /*------------------------------------------------------------------------*/
+   if (!strtab_offset || !dyn_module->strsz)
+   {
+      DLIF_warning(DLWT_MISC,
+                   "Mandatory dynamic tag DT_STRTAB/DT_STRSZ not found!\n");
+      return FALSE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Read the string table.  One extra byte is allocated and set to NUL so  */
+   /* that string functions applied to any offset inside the table cannot    */
+   /* run past the end of the buffer, even if the file's table is not        */
+   /* terminated.                                                            */
+   /*------------------------------------------------------------------------*/
+   strtab_bytes = (size_t)dyn_module->strsz + 1;
+   if (strtab_bytes == 0)
+   {
+      DLIF_error(DLET_FILE, "Dynamic string table is too large.\n");
+      return FALSE;
+   }
+
+   DLIF_fseek(fd, strtab_offset, LOADER_SEEK_SET);
+   dyn_module->strtab = DLIF_malloc(strtab_bytes);
+   if (dyn_module->strtab == NULL)
+   {
+      DLIF_error(DLET_MEMORY, "Failed to allocate dynamic string table.\n");
+      return FALSE;
+   }
+
+   if (DLIF_fread(dyn_module->strtab, sizeof(uint8_t), dyn_module->strsz, fd)
+       != dyn_module->strsz)
+   {
+      DLIF_error(DLET_FILE, "Failed to read dynamic string table.\n");
+      return FALSE;
+   }
+   dyn_module->strtab[dyn_module->strsz] = '\0';
+
+   /*------------------------------------------------------------------------*/
+   /* The symbol hash table is mandatory; it is the only source of the       */
+   /* number of dynamic symbols.                                             */
+   /*------------------------------------------------------------------------*/
+   if (!hash_offset)
+   {
+      DLIF_warning(DLWT_MISC, "Mandatory dynamic tag DT_HASH is not found!\n");
+      return FALSE;
+   }
+   else
+   {
+      /*---------------------------------------------------------------------*/
+      /* Hash table layout: word 0 is nbucket, word 1 is nchain, followed by */
+      /* bucket[0..nbucket-1] and chain[0..nchain-1].  Only the first two    */
+      /* words are read here.                                                */
+      /*---------------------------------------------------------------------*/
+      Elf32_Word hash_nbucket;
+      Elf32_Word hash_nchain;
+
+      DLIF_fseek(fd, hash_offset, LOADER_SEEK_SET);
+      if (DLIF_fread(&(hash_nbucket), sizeof(Elf32_Word), 1, fd) != 1 ||
+          DLIF_fread(&(hash_nchain), sizeof(Elf32_Word), 1, fd) != 1)
+      {
+         /* Never size the symbol table from words that were not read. */
+         DLIF_error(DLET_FILE, "Failed to read symbol hash table header.\n");
+         return FALSE;
+      }
+
+      if (dyn_module->wrong_endian)
+      {
+         DLIMP_change_endian32((int32_t*)(&(hash_nbucket)));
+         DLIMP_change_endian32((int32_t*)(&(hash_nchain)));
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* The number of entries in the dynamic symbol table is not encoded    */
+      /* anywhere in the elf file.  However, the nchain is guaranteed to be  */
+      /* the same as the number of symbols.  Use nchain to set the symnum.   */
+      /*---------------------------------------------------------------------*/
+      dyn_module->symnum = hash_nchain;
+#if LOADER_DEBUG
+      if (debugging_on) DLIF_trace("symnum=%d\n", hash_nchain);
+#endif
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Read dynamic symbol table (mandatory).                                 */
+   /*------------------------------------------------------------------------*/
+   if (!symtab_offset)
+   {
+      DLIF_warning(DLWT_MISC,
+                   "Mandatory dynamic tag DT_SYMTAB is not found!\n");
+      return FALSE;
+   }
+   else
+   {
+      int j = 0;
+      DLIF_fseek(fd, symtab_offset, LOADER_SEEK_SET);
+      dyn_module->symtab =
+                 DLIF_malloc(dyn_module->symnum * sizeof(struct Elf32_Sym));
+
+      /* An empty table may legitimately allocate as NULL. */
+      if (dyn_module->symtab == NULL && dyn_module->symnum != 0)
+      {
+         DLIF_error(DLET_MEMORY, "Failed to allocate dynamic symbol table.\n");
+         return FALSE;
+      }
+
+      if (DLIF_fread(dyn_module->symtab, sizeof(struct Elf32_Sym),
+                     dyn_module->symnum, fd) != dyn_module->symnum)
+      {
+         DLIF_error(DLET_FILE, "Failed to read dynamic symbol table.\n");
+         return FALSE;
+      }
+
+      if (dyn_module->wrong_endian)
+      {
+         for (j = 0; j < dyn_module->symnum; j++)
+            DLIMP_change_sym_endian(dyn_module->symtab + j);
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* The st_name field of an Elf32_Sym entity is an offset into the      */
+      /* string table.  Convert it into a pointer to the string.             */
+      /* NOTE(review): the pointer is squeezed through Elf32_Word, which     */
+      /* presumably loses bits on hosts with pointers wider than 32 bits,    */
+      /* and st_name itself is not range checked -- confirm.                 */
+      /*---------------------------------------------------------------------*/
+      for (j = 0; j < dyn_module->symnum; j++)
+         dyn_module->symtab[j].st_name += (Elf32_Word) dyn_module->strtab;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Read the SONAME.  Without DT_SONAME the module gets an empty name.     */
+   /*------------------------------------------------------------------------*/
+   if (!soname_found)
+   {
+      DLIF_warning(DLWT_MISC, "Dynamic tag DT_SONAME is not found!\n");
+      dyn_module->name = DLIF_malloc(sizeof(char));
+      if (dyn_module->name == NULL)
+      {
+         DLIF_error(DLET_MEMORY, "Failed to allocate module name.\n");
+         return FALSE;
+      }
+      *dyn_module->name = '\0';
+   }
+   else
+   {
+      /* The name must start inside the string table that was read. */
+      if (soname_offset >= dyn_module->strsz)
+      {
+         DLIF_error(DLET_FILE,
+                    "DT_SONAME offset is outside the dynamic string table.\n");
+         return FALSE;
+      }
+
+      dyn_module->name =
+                DLIF_malloc(strlen(dyn_module->strtab + soname_offset) + 1);
+      if (dyn_module->name == NULL)
+      {
+         DLIF_error(DLET_MEMORY, "Failed to allocate module name.\n");
+         return FALSE;
+      }
+      strcpy(dyn_module->name, dyn_module->strtab + soname_offset);
+
+#if LOADER_DEBUG
+      if (debugging_on)
+         DLIF_trace("Name of dynamic object: %s\n", dyn_module->name);
+#endif
+   }
+
+   return TRUE;
+}
+
+
+/*****************************************************************************/
+/* dload_dynamic_information() */
+/* */
+/* Given a dynamic module with a dynamic segment which is located via */
+/* given dynamic segment index, make a local copy of the dynamic table */
+/* in the dynamic module object, then process the dynamic tag entries in */
+/* the table. */
+/* */
+/*****************************************************************************/
+static BOOL dload_dynamic_information(LOADER_FILE_DESC *fd,
+                                      DLIMP_Dynamic_Module *dyn_module,
+                                      Elf32_Word dyn_seg_idx)
+{
+   /* Two steps: copy the dynamic table found in segment dyn_seg_idx into */
+   /* the dynamic module object, then process its tags.  The result of    */
+   /* the tag processing step is the result of this function.             */
+   BOOL tags_ok;
+
+   copy_dynamic_table(fd, dyn_module, dyn_seg_idx);
+   tags_ok = process_dynamic_table(fd, dyn_module);
+
+   return tags_ok;
+}
+
+/*****************************************************************************/
+/* check_circular_dependency() */
+/* */
+/* Determine whether a dynamic module is already in the process of being */
+/* loaded before we try to start loading it again. If it is already */
+/* being loaded, then the dynamic loader has detected a circular */
+/* dependency. An error will be emitted and the load will be aborted. */
+/* */
+/*****************************************************************************/
+static BOOL check_circular_dependency(DLOAD_HANDLE handle,
+                                      const char *dyn_mod_name)
+{
+   /*------------------------------------------------------------------------*/
+   /* Compare dyn_mod_name with every name on the loader's module            */
+   /* dependency list.  A match means the module is already in the process   */
+   /* of being loaded: report the circular dependency and return FALSE.      */
+   /* Returns TRUE when no entry matches.                                    */
+   /*------------------------------------------------------------------------*/
+   LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+   char **in_progress = (char**)(loader->DLIMP_module_dependency_list.buf);
+   int idx = 0;
+
+   while (idx < loader->DLIMP_module_dependency_list.size)
+   {
+      if (strcmp(dyn_mod_name, in_progress[idx]) == 0)
+      {
+         DLIF_error(DLET_MISC,
+                    "Circular dependency detected, '%s' is already in the "
+                    "process of loading.\n", dyn_mod_name);
+         return FALSE;
+      }
+      idx++;
+   }
+
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* dload_dynamic_segment() */
+/* */
+/* Find the dynamic segment in the given ELF module, if there is one. */
+/* If there is a dynamic segment, then make a local copy of the dynamic */
+/* table in the dynamic module object provided, then process the dynamic */
+/* tag entries in the table. */
+/* */
+/* If there is no dynamic segment, then we return success from this */
+/* function, marking the dynamic module as "not relocatable". */
+/* */
+/*****************************************************************************/
+static BOOL dload_dynamic_segment(DLOAD_HANDLE handle,
+                                  LOADER_FILE_DESC *fd,
+                                  DLIMP_Dynamic_Module *dyn_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* Locate and process the module's dynamic segment.  A module without     */
+   /* one is still a success (TRUE): find_dynamic_segment() has marked it    */
+   /* not relocatable and static loading can proceed.  FALSE is returned     */
+   /* for an unsupported EI_OSABI value, a dynamic table that cannot be      */
+   /* processed, or a circular dependency.                                   */
+   /*------------------------------------------------------------------------*/
+   Elf32_Word dyn_seg_idx = 0;
+
+   if (find_dynamic_segment(dyn_module, &dyn_seg_idx))
+   {
+      /* The OSABI is processed now that relocatability is known. */
+      if (!process_eiosabi(dyn_module))
+      {
+         DLIF_error(DLET_FILE, "Unsupported EI_OSABI value.\n");
+         return FALSE;
+      }
+
+      /* Read the dynamic table from the file and process its tags. */
+      if (!dload_dynamic_information(fd, dyn_module, dyn_seg_idx))
+         return FALSE;
+
+      /* A module that is already being loaded would introduce a circular */
+      /* dependency; the check itself emits the error.                    */
+      if (!check_circular_dependency(handle, dyn_module->name))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* COPY_SEGMENTS() - */
+/* */
+/* Copy all segments into host memory. */
+/*****************************************************************************/
+static void copy_segments(DLOAD_HANDLE handle, LOADER_FILE_DESC* fp,
+                          DLIMP_Dynamic_Module* dyn_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* For every loaded segment of the module, build a memory request and ask */
+   /* the client (DLIF_copy) to copy the segment data from the file into a   */
+   /* host buffer.  The host address handed back in the request is recorded  */
+   /* in the segment.                                                        */
+   /*------------------------------------------------------------------------*/
+   DLIMP_Loaded_Segment* seg =
+       (DLIMP_Loaded_Segment*)(dyn_module->loaded_module->loaded_segments.buf);
+   int s, seg_size = dyn_module->loaded_module->loaded_segments.size;
+   LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+
+   for (s=0; s<seg_size; s++)
+   {
+      struct DLOAD_MEMORY_REQUEST targ_req;
+      targ_req.fp = fp;
+      targ_req.segment = seg[s].obj_desc;
+      targ_req.offset = seg[s].phdr.p_offset;
+      targ_req.flags = DLOAD_SF_relocatable;
+
+      if (seg[s].phdr.p_flags & PF_X) targ_req.flags |= DLOAD_SF_executable;
+      if (seg[s].phdr.p_flags & PF_W) targ_req.flags |= DLOAD_SF_writable;
+
+      targ_req.align = seg[s].phdr.p_align;
+
+      /*---------------------------------------------------------------------*/
+      /* host_address is an output of DLIF_copy().  Give it a defined value  */
+      /* first so that, should the client leave it untouched, the segment    */
+      /* records NULL rather than an indeterminate pointer.                  */
+      /*---------------------------------------------------------------------*/
+      targ_req.host_address = NULL;
+
+      /*---------------------------------------------------------------------*/
+      /* Copy segment data from the file into host buffer where it can       */
+      /* be relocated.                                                       */
+      /*---------------------------------------------------------------------*/
+      DLIF_copy(pHandle->client_handle, &targ_req);
+      seg[s].host_address = targ_req.host_address;
+   }
+}
+
+/*****************************************************************************/
+/* WRITE_SEGMENTS() - */
+/* */
+/* Write all segments to target memory. */
+/*****************************************************************************/
+static void write_segments(DLOAD_HANDLE handle,
+                           LOADER_FILE_DESC* fp,
+                           DLIMP_Dynamic_Module* dyn_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* For every loaded segment of the module, build a memory request that    */
+   /* carries the segment's recorded host address and pass it to the         */
+   /* client's DLIF_write().                                                 */
+   /*------------------------------------------------------------------------*/
+   LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+   DLIMP_Loaded_Segment* segments =
+       (DLIMP_Loaded_Segment*)(dyn_module->loaded_module->loaded_segments.buf);
+   int count = dyn_module->loaded_module->loaded_segments.size;
+   int idx;
+
+   for (idx = 0; idx < count; idx++)
+   {
+      DLIMP_Loaded_Segment* cur = &segments[idx];
+      struct DLOAD_MEMORY_REQUEST targ_req;
+
+      /* Every segment is flagged relocatable; execute and write */
+      /* permissions mirror the program header flags.            */
+      targ_req.flags = DLOAD_SF_relocatable;
+      if (cur->phdr.p_flags & PF_X)
+         targ_req.flags |= DLOAD_SF_executable;
+      if (cur->phdr.p_flags & PF_W)
+         targ_req.flags |= DLOAD_SF_writable;
+
+      targ_req.fp           = fp;
+      targ_req.segment      = cur->obj_desc;
+      targ_req.offset       = cur->phdr.p_offset;
+      targ_req.align        = cur->phdr.p_align;
+      targ_req.host_address = cur->host_address;
+
+      /* Hand the request to the client for writing. */
+      DLIF_write(loader->client_handle, &targ_req);
+   }
+}
+
+/*****************************************************************************/
+/* SEG_HAS_SPACE_FOR_WRITE() - */
+/* */
+/* Check if segment has enough space to receive contents of .args section. */
+/*****************************************************************************/
+static BOOL seg_has_space_for_write(DLIMP_Loaded_Module* lmodule, int sz)
+{
+   /*------------------------------------------------------------------------*/
+   /* Decide whether sz bytes can be written at lmodule->c_args without      */
+   /* leaving the loaded segment that contains that address.  Returns TRUE   */
+   /* only if the address lies in a known segment and the whole write fits;  */
+   /* FALSE for a negative size, an address outside every segment, or a      */
+   /* write that would cross the segment end.                                */
+   /*------------------------------------------------------------------------*/
+   DLIMP_Loaded_Segment* seg =
+                       (DLIMP_Loaded_Segment*)(lmodule->loaded_segments.buf);
+   int s, seg_size = lmodule->loaded_segments.size;
+
+   Elf32_Addr write_address = (Elf32_Addr)lmodule->c_args;
+
+   /* A negative size can never be satisfied. */
+   if (sz < 0) return FALSE;
+
+   for (s=0; s<seg_size; s++)
+   {
+      Elf32_Addr seg_start = seg[s].phdr.p_vaddr;
+      Elf32_Addr seg_len   = seg[s].obj_desc->memsz_in_bytes;
+      Elf32_Addr offset;
+
+      /*---------------------------------------------------------------------*/
+      /* Work with the offset into the segment rather than with absolute end */
+      /* addresses, so that neither "start + length" nor "address + size"    */
+      /* can wrap around.                                                    */
+      /*---------------------------------------------------------------------*/
+      if (write_address < seg_start) continue;
+      offset = write_address - seg_start;
+      if (offset >= seg_len) continue;
+
+      /*---------------------------------------------------------------------*/
+      /* The address lies in this segment; seg_len - offset bytes remain.    */
+      /*---------------------------------------------------------------------*/
+      if ((Elf32_Addr)sz > seg_len - offset)
+      {
+#if LOADER_DEBUG
+         if (debugging_on)
+         {
+            DLIF_trace("Write requires 0x%x bytes\n",write_address + sz);
+            DLIF_trace("Seg boundary at : 0x%x\n",seg_start + seg_len);
+            DLIF_trace("WARNING - Not enough space in segment\n");
+         }
+#endif
+         return FALSE;
+      }
+      return TRUE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Given address doesn't belong to any known segment.                     */
+   /*------------------------------------------------------------------------*/
+   return FALSE;
+}
+
+
+/*****************************************************************************/
+/* DLOAD_initialize() */
+/* */
+/* Construct and initialize data structures internal to the dynamic */
+/* loader core. */
+/* */
+/*---------------------------------------------------------------------------*/
+/* */
+/* This function is deprecated, replaced by DLOAD_create(). */
+/* */
+/*****************************************************************************/
+void DLOAD_initialize(DLOAD_HANDLE handle)
+{
+   /* Intentionally does nothing; the handle is accepted and ignored. */
+   (void)handle;
+}
+
+/*****************************************************************************/
+/* DLOAD_finalize() */
+/* */
+/* Destroy and finalize data structures internal to the dynamic */
+/* loader core. */
+/* */
+/*---------------------------------------------------------------------------*/
+/* */
+/* This function is deprecated, replaced by DLOAD_destroy(). */
+/* */
+/*****************************************************************************/
+void DLOAD_finalize(DLOAD_HANDLE handle)
+{
+   /* Intentionally does nothing; the handle is accepted and ignored. */
+   (void)handle;
+}
+
+/*****************************************************************************/
+/* dload_static_executable() */
+/* */
+/* Account for target memory allocated to static executable and wrap up */
+/* loading. No relocation is necessary. */
+/* */
+/*****************************************************************************/
+static int32_t dload_static_executable(DLOAD_HANDLE handle,
+                                       LOADER_FILE_DESC *fd,
+                                       DLIMP_Dynamic_Module *dyn_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* Load a static executable: set its entry point from the file header,    */
+   /* load its segments and the object, and on success detach the loaded     */
+   /* module from dyn_module.  dyn_module is destroyed in every case.        */
+   /* Returns the loaded module's file handle, or 0 if loading failed.       */
+   /*------------------------------------------------------------------------*/
+   int32_t local_file_handle = 0;
+   BOOL    loaded;
+
+#if LOADER_DEBUG
+   if (debugging_on) DLIF_trace("Starting dload_static_executable() ...\n");
+#endif
+
+   dyn_module->loaded_module->entry_point = dyn_module->fhdr.e_entry;
+
+   /* load_object() is only attempted once the segments have been loaded. */
+   loaded = load_static_segment(handle, fd, dyn_module) &&
+            load_object(fd, dyn_module);
+
+   if (!loaded)
+   {
+      /* Static load failed.  Flag an error. */
+      DLIF_error(DLET_MEMORY,
+                 "Failed to allocate target memory for static executable.\n");
+   }
+   else
+   {
+      /* Detach the loaded module from the dynamic module that created it */
+      /* and take note of its file handle.                                */
+      DLIMP_Loaded_Module *loaded_module = detach_loaded_module(dyn_module);
+      local_file_handle = loaded_module->file_handle;
+   }
+
+   /* Destruct dynamic module object. */
+   delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+
+#if LOADER_DEBUG
+   if (debugging_on) DLIF_trace("Finished dload_static_executable()\n");
+#endif
+
+   return local_file_handle;
+}
+
+#if LOADER_DEBUG || LOADER_PROFILE
+/* Relocation statistics, only compiled into debug or profile builds.       */
+/* process_dynamic_module_relocations() traces both values and then resets  */
+/* them to zero.  Presumably they are updated by the relocation code        */
+/* (DLREL_relocate) -- TODO confirm.                                        */
+int DLREL_relocations;          /* count reported as "Number of relocations" */
+time_t DLREL_total_reloc_time;  /* value reported as "Total reloc time"      */
+#endif
+
+/*****************************************************************************/
+/* process_dynamic_module_relocations()                                      */
+/*                                                                           */
+/*    Relocate one dynamic module in three steps: copy_segments() brings     */
+/*    the segments from the file into host memory, DLREL_relocate()          */
+/*    processes the module's relocation entries, and write_segments()        */
+/*    writes the updated segment buffers out to target memory.               */
+/*                                                                           */
+/*    When built with LOADER_DEBUG or LOADER_PROFILE, progress and timing    */
+/*    traces are emitted and the relocation statistics are reset to zero.    */
+/*                                                                           */
+/*****************************************************************************/
+static void process_dynamic_module_relocations(DLOAD_HANDLE handle,
+                                               LOADER_FILE_DESC *fd,
+                                               DLIMP_Dynamic_Module *dyn_module)
+{
+#if LOADER_DEBUG || LOADER_PROFILE
+   if (debugging_on || profiling_on)
+   {
+      DLIF_trace("Running relocate()...\n");
+      if (profiling_on) profile_start_clock();
+   }
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* Copy segments from file to host memory                                 */
+   /*------------------------------------------------------------------------*/
+   copy_segments(handle, fd, dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Process dynamic relocations.                                           */
+   /*------------------------------------------------------------------------*/
+   DLREL_relocate(handle, fd, dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Write segments from host memory to target memory                       */
+   /*------------------------------------------------------------------------*/
+   write_segments(handle, fd, dyn_module);
+
+#if LOADER_DEBUG || LOADER_PROFILE
+   /*------------------------------------------------------------------------*/
+   /* Report timing and progress information for relocation step.            */
+   /*------------------------------------------------------------------------*/
+   if (debugging_on || profiling_on)
+   {
+      if (profiling_on)
+      {
+         profile_stop_clock();
+         DLIF_trace("Took %lu cycles.\n",
+                    (unsigned long) profile_cycle_count());
+         DLIF_trace("Total reloc time: %lu\n",
+                    (unsigned long) DLREL_total_reloc_time);
+
+         /*------------------------------------------------------------------*/
+         /* The quotient takes its type from time_t, which is not            */
+         /* guaranteed to be long; cast so the argument matches "%ld".       */
+         /*------------------------------------------------------------------*/
+         DLIF_trace("Time per relocation: %ld\n",
+                    (long) (DLREL_relocations
+                               ? DLREL_total_reloc_time / DLREL_relocations
+                               : 0));
+      }
+
+      DLIF_trace("Number of relocations: %d\n", DLREL_relocations);
+      DLIF_trace("\nAbout to run load_object()...");
+
+      /*---------------------------------------------------------------------*/
+      /* Reset the statistics so the next module starts from zero.           */
+      /*---------------------------------------------------------------------*/
+      DLREL_total_reloc_time = DLREL_relocations = 0;
+      if (profiling_on) profile_start_clock();
+   }
+#endif
+
+}
+
+/*****************************************************************************/
+/* store_preinit_data()                                                      */
+/*                                                                           */
+/*    Given a dynamic module object, store pre-initialization function       */
+/*    information.  The user may provide a custom initialization function    */
+/*    that needs to be executed before the compiler generated static         */
+/*    initialization functions are executed.                                 */
+/*                                                                           */
+/*    The record is queued on TI_init_table rather than run immediately,     */
+/*    because pre-init and init functions could reference as-yet             */
+/*    unrelocated symbols from other modules.  It is safer to store the      */
+/*    relevant addresses and execute them only after all modules are         */
+/*    relocated (CQ34088).                                                   */
+/*                                                                           */
+/*    If the record cannot be allocated, an error is reported through        */
+/*    DLIF_error() and nothing is queued.                                    */
+/*                                                                           */
+/*****************************************************************************/
+static void store_preinit_data(DLIMP_Dynamic_Module *dyn_module)
+{
+   IF_single_record *preinit_rec;
+
+   /*------------------------------------------------------------------------*/
+   /* The dyn_module object holds the DT_PREINIT_ARRAYSZ value and the       */
+   /* index of the DT_PREINIT_ARRAY tag in the local copy of the dynamic     */
+   /* table.  Arrays of size 0 will be ignored (CQ36935).                    */
+   /*------------------------------------------------------------------------*/
+   if (!(dyn_module->preinit_arraysz > 0)) return;
+
+   preinit_rec = (IF_single_record *)DLIF_malloc(sizeof(IF_single_record));
+   if (!preinit_rec)
+   {
+      DLIF_error(DLET_MEMORY,
+                 "Failed to allocate pre-initialization record.\n");
+      return;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Retrieve the address of the .preinit_array section from the value      */
+   /* of the DT_PREINIT_ARRAY tag, and store it in the TI_init_table.        */
+   /*------------------------------------------------------------------------*/
+   preinit_rec->size      = dyn_module->preinit_arraysz;
+   preinit_rec->sect_addr = (TARGET_ADDRESS)
+            (dyn_module->dyntab[dyn_module->preinit_array_idx].d_un.d_ptr);
+
+   IF_table_enqueue(&TI_init_table, preinit_rec);
+}
+
+/*****************************************************************************/
+/* store_init_data()                                                         */
+/*                                                                           */
+/*    Given a dynamic module object, save off initialization function(s)     */
+/*    for all global and static data objects that are defined in the         */
+/*    module and require construction.  Up to two records are queued on      */
+/*    TI_init_table: one for DT_INIT (size 0) and one for DT_INIT_ARRAY      */
+/*    (size = DT_INIT_ARRAYSZ).                                              */
+/*                                                                           */
+/*    Records are queued rather than executed because init functions         */
+/*    could reference as-yet unrelocated symbols from other modules; they    */
+/*    are executed only after all modules are relocated.                     */
+/*                                                                           */
+/*    If a record cannot be allocated, an error is reported through          */
+/*    DLIF_error() and that record is skipped.                               */
+/*                                                                           */
+/*****************************************************************************/
+static void store_init_data(DLIMP_Dynamic_Module *dyn_module)
+{
+   /*------------------------------------------------------------------------*/
+   /* Check for presence of a DT_INIT dynamic tag associated with this       */
+   /* module.  The dynamic module holds the index into the local copy of     */
+   /* the dynamic table; -1 means there is no such tag.                      */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module->init_idx != -1)
+   {
+      IF_single_record *init_rec =
+                   (IF_single_record *)DLIF_malloc(sizeof(IF_single_record));
+
+      if (init_rec)
+      {
+         /*------------------------------------------------------------------*/
+         /* A size of 0 marks the record as a single DT_INIT function        */
+         /* address rather than an array of function addresses.              */
+         /*------------------------------------------------------------------*/
+         init_rec->size      = 0;
+         init_rec->sect_addr = (TARGET_ADDRESS)
+                     (dyn_module->dyntab[dyn_module->init_idx].d_un.d_ptr);
+
+         IF_table_enqueue(&TI_init_table, init_rec);
+      }
+      else
+      {
+         DLIF_error(DLET_MEMORY,
+                    "Failed to allocate DT_INIT initialization record.\n");
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Check for presence of DT_INIT_ARRAY and DT_INIT_ARRAYSZ dynamic tags   */
+   /* associated with this module.  The array size must be a positive        */
+   /* number, otherwise the array is ignored (CQ36935).                      */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module->init_arraysz > 0)
+   {
+      IF_single_record *arr_rec =
+                   (IF_single_record *)DLIF_malloc(sizeof(IF_single_record));
+
+      if (arr_rec)
+      {
+         /*------------------------------------------------------------------*/
+         /* Retrieve the address of the .init_array section from the value   */
+         /* of the DT_INIT_ARRAY tag.                                        */
+         /*------------------------------------------------------------------*/
+         arr_rec->size      = dyn_module->init_arraysz;
+         arr_rec->sect_addr = (TARGET_ADDRESS)
+               (dyn_module->dyntab[dyn_module->init_array_idx].d_un.d_ptr);
+
+         IF_table_enqueue(&TI_init_table, arr_rec);
+      }
+      else
+      {
+         DLIF_error(DLET_MEMORY,
+                 "Failed to allocate DT_INIT_ARRAY initialization record.\n");
+      }
+   }
+}
+
+/*****************************************************************************/
+/* execute_module_initialization()                                           */
+/*                                                                           */
+/*    Walk TI_init_table from front to back and ask the client to execute    */
+/*    every pre-initialization and initialization function recorded there    */
+/*    by store_preinit_data() and store_init_data().                         */
+/*                                                                           */
+/*    A record with size 0 holds a single function address (DT_INIT).        */
+/*    Any other record describes an array of function addresses in target    */
+/*    memory, which is copied to the host with DLIF_read() before its        */
+/*    entries are executed in order.  NULL addresses are skipped with a      */
+/*    warning.  If the host copy cannot be allocated, an error is            */
+/*    reported and that record is skipped.                                   */
+/*                                                                           */
+/*****************************************************************************/
+static void execute_module_initialization(DLOAD_HANDLE handle)
+{
+   LOADER_OBJECT       *pHandle  = (LOADER_OBJECT *)handle;
+   IF_table_Queue_Node *curr_ptr = TI_init_table.front_ptr;
+
+   for (; curr_ptr; curr_ptr = curr_ptr->next_ptr)
+   {
+      IF_single_record *val = curr_ptr->value;
+
+      /*---------------------------------------------------------------------*/
+      /* A size of 0 indicates DT_INIT, otherwise this is an ARRAY.          */
+      /*---------------------------------------------------------------------*/
+      if (val->size != 0)
+      {
+         /*------------------------------------------------------------------*/
+         /* Now make a loader-accessible copy of the array section.          */
+         /*------------------------------------------------------------------*/
+         int32_t i;
+         int32_t num_init_fcns = val->size/sizeof(TARGET_ADDRESS);
+         TARGET_ADDRESS *init_array_buf = (TARGET_ADDRESS *)
+                                                     DLIF_malloc(val->size);
+
+         if (!init_array_buf)
+         {
+            DLIF_error(DLET_MEMORY,
+                       "Failed to allocate host copy of init array.\n");
+            continue;
+         }
+
+         DLIF_read(pHandle->client_handle,
+                   init_array_buf, 1, val->size,
+                   (TARGET_ADDRESS)val->sect_addr);
+
+         /*------------------------------------------------------------------*/
+         /* Call each function whose address occupies an entry in the        */
+         /* array, in the order that they appear in the array.  Make sure    */
+         /* that function addresses are valid before execution.              */
+         /*------------------------------------------------------------------*/
+         for (i = 0; i < num_init_fcns; i++)
+         {
+            if (init_array_buf[i])
+            {
+               DLIF_execute(pHandle->client_handle,
+                            (TARGET_ADDRESS)(init_array_buf[i]));
+            }
+            else
+            {
+               DLIF_warning(DLWT_MISC,
+                 "DT_INIT_ARRAY/DT_PREINIT_ARRAY function address is NULL!");
+            }
+         }
+
+         DLIF_free(init_array_buf);
+      }
+      else
+      {
+         if (val->sect_addr)
+         {
+            DLIF_execute(pHandle->client_handle,
+                         (TARGET_ADDRESS)(val->sect_addr));
+         }
+         else
+         {
+            DLIF_warning(DLWT_MISC, "DT_INIT function address is NULL!");
+         }
+      }
+   }
+}
+
+/*****************************************************************************/
+/* adjust_module_init_fini()                                                 */
+/*                                                                           */
+/*    If the dynamic loader need not process the module initialization       */
+/*    and termination (fini section), clear the corresponding module         */
+/*    information: array sizes become zero, array indices become -1 and      */
+/*    the fini addresses become NULL.                                        */
+/*                                                                           */
+/*****************************************************************************/
+static void adjust_module_init_fini(DLIMP_Dynamic_Module *dm)
+{
+   DLIMP_Loaded_Module *loaded;
+
+   /*------------------------------------------------------------------------*/
+   /* The C6x RTS boot code has the function _c_int00 which performs the     */
+   /* C/C++ initialization (processing .init_array) and handles termination  */
+   /* through the at_exit functionality.  If the dynamic executable being    */
+   /* loaded includes _c_int00, the loader assumes that the application      */
+   /* code takes care of all initialization and termination itself.          */
+   /*                                                                        */
+   /* NOTE: Use of __TI_STACK_SIZE is a hack.  The _c_int00 symbol is not    */
+   /*       in the dynamic symbol table.  The right fix is for the linker    */
+   /*       not to generate the init array tags if the build includes the    */
+   /*       RTS boot routine.                                                */
+   /*------------------------------------------------------------------------*/
+   if (dm->fhdr.e_type != ET_EXEC)
+      return;
+
+   if (!DLSYM_lookup_local_symtab("__TI_STACK_SIZE", dm->symtab, dm->symnum,
+                                  NULL))
+      return;
+
+   /*------------------------------------------------------------------------*/
+   /* Initialization side: nothing left for the loader to queue.             */
+   /*------------------------------------------------------------------------*/
+   dm->init_array_idx    = -1;
+   dm->init_arraysz      = 0;
+   dm->preinit_array_idx = -1;
+   dm->preinit_arraysz   = 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Termination side: nothing left for the loader to run at unload.        */
+   /*------------------------------------------------------------------------*/
+   loaded = dm->loaded_module;
+   loaded->fini         = (Elf32_Addr) NULL;
+   loaded->fini_array   = (Elf32_Addr) NULL;
+   loaded->fini_arraysz = 0;
+}
+
+/*****************************************************************************/
+/* relocate_dependency_graph_modules()                                       */
+/*                                                                           */
+/*    For each dynamic module on the dependency stack, process dynamic       */
+/*    relocation entries, then queue the initialization records for all      */
+/*    global and static objects that are defined in the given module.        */
+/*    The stack is emptied from the top (LIFO).  Each dynamic module         */
+/*    object is popped off the stack, relocated, has its init data           */
+/*    stored, and then, after its loaded module object has been detached,    */
+/*    is destructed.                                                         */
+/*                                                                           */
+/*    Returns the file handle of dyn_module's loaded module.  Nothing is     */
+/*    relocated unless dyn_module is the entry at the bottom of the stack    */
+/*    (or the stack has no bottom entry).                                    */
+/*                                                                           */
+/*****************************************************************************/
+static
+int32_t relocate_dependency_graph_modules(DLOAD_HANDLE handle,
+                                          LOADER_FILE_DESC *fd,
+                                          DLIMP_Dynamic_Module *dyn_module)
+{
+   LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+   int32_t root_file_handle = dyn_module->loaded_module->file_handle;
+   dynamic_module_ptr_Stack_Node *bottom =
+                                   loader->DLIMP_dependency_stack.bottom_ptr;
+
+   /*------------------------------------------------------------------------*/
+   /* Processing of relocations is only triggered when this function is      */
+   /* called for the top-level object module (the one at the bottom of the   */
+   /* dependency graph stack).                                               */
+   /*------------------------------------------------------------------------*/
+   if (bottom != NULL && bottom->value != dyn_module)
+      return root_file_handle;
+
+   /*------------------------------------------------------------------------*/
+   /* For the DSBT model: assign DSBT indices, then update the content of    */
+   /* all DSBTs for any module that uses the model.                          */
+   /*------------------------------------------------------------------------*/
+   if (is_dsbt_module(dyn_module))
+   {
+      DLIF_assign_dsbt_indices();
+      DLIF_update_all_dsbts();
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Consume dynamic module objects from the dependency graph stack, from   */
+   /* the dependents down to the root of the dependency graph.               */
+   /*------------------------------------------------------------------------*/
+   while (loader->DLIMP_dependency_stack.size > 0)
+   {
+      DLIMP_Dynamic_Module *module =
+                    dynamic_module_ptr_pop(&loader->DLIMP_dependency_stack);
+
+      /*---------------------------------------------------------------------*/
+      /* Process dynamic relocations associated with this module.            */
+      /*---------------------------------------------------------------------*/
+      process_dynamic_module_relocations(handle, module->fd, module);
+
+      /*---------------------------------------------------------------------*/
+      /* __c_args__ points to the beginning of the .args section, if there   */
+      /* is one.  Record it in the dynamic module and mirror it into the     */
+      /* loaded module, which needs it to write argc/argv to .args at        */
+      /* execution time.                                                     */
+      /*---------------------------------------------------------------------*/
+      DLSYM_lookup_local_symtab("__c_args__", module->symtab,
+                                module->symnum,
+                                (Elf32_Addr *)&module->c_args);
+      module->loaded_module->c_args = module->c_args;
+
+      /*---------------------------------------------------------------------*/
+      /* Pick up the entry point address from the ELF file header.  Only a   */
+      /* single entry point into the ELF file is currently supported.        */
+      /*---------------------------------------------------------------------*/
+      module->loaded_module->entry_point = module->fhdr.e_entry;
+
+      /*---------------------------------------------------------------------*/
+      /* load_object() is deliberately not called here; the original code    */
+      /* notes that it does not do much for this path as of now.             */
+      /*---------------------------------------------------------------------*/
+
+      /*---------------------------------------------------------------------*/
+      /* Queue initialization records, if any, for this module.              */
+      /*---------------------------------------------------------------------*/
+      store_init_data(module);
+
+      /*---------------------------------------------------------------------*/
+      /* Close dependent file descriptors; the caller's own fd is left       */
+      /* open.                                                               */
+      /*---------------------------------------------------------------------*/
+      if (module->fd != fd)
+      {
+         DLIF_fclose(module->fd);
+         module->fd = NULL;
+      }
+
+      /*---------------------------------------------------------------------*/
+      /* Detach the loaded module object from the dynamic module object      */
+      /* that created it, then throw away the dynamic module object.         */
+      /*---------------------------------------------------------------------*/
+      detach_loaded_module(module);
+      delete_DLIMP_Dynamic_Module(handle, &module);
+   }
+
+   return root_file_handle;
+}
+
+/*****************************************************************************/
+/* DLOAD_load()                                                              */
+/*                                                                           */
+/*    Dynamically load the specified file and return a file handle for the   */
+/*    loaded file.  If the load fails, this function will return a value of  */
+/*    zero (0) for the file handle.                                          */
+/*                                                                           */
+/*    The core loader must have read access to the file pointed to by fd.    */
+/*                                                                           */
+/*    Every failure that occurs before the module is registered with the     */
+/*    loader (missing file, unreadable headers, unreadable dynamic segment,  */
+/*    invalid ELF file) destructs the dynamic module object before           */
+/*    returning.                                                             */
+/*                                                                           */
+/*****************************************************************************/
+int32_t DLOAD_load(DLOAD_HANDLE handle, LOADER_FILE_DESC *fd)
+{
+   int32_t fl_handle;
+
+   LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+   DLIMP_Dynamic_Module *dyn_module = new_DLIMP_Dynamic_Module(fd);
+
+   if (!dyn_module)
+      return 0;
+
+#if LOADER_DEBUG
+   /*------------------------------------------------------------------------*/
+   /* Spit out some loader progress information when we begin loading an     */
+   /* object.                                                                */
+   /*------------------------------------------------------------------------*/
+   if (debugging_on) DLIF_trace("Loading file...\n");
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* If no access to a program was provided, there is nothing to do.        */
+   /*------------------------------------------------------------------------*/
+   if (!fd)
+   {
+      DLIF_error(DLET_FILE, "Missing file specification.\n");
+      delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+      return 0;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Read file headers and dynamic information into dynamic module.         */
+   /*------------------------------------------------------------------------*/
+   if (!dload_headers(fd, dyn_module))
+   {
+      delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+      return 0;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Find the dynamic segment, if there is one, and read dynamic            */
+   /* information from the ELF object file into the dynamic module data      */
+   /* structure associated with this file.  On failure the dynamic module    */
+   /* is still owned by this function only, so destruct it here.             */
+   /*------------------------------------------------------------------------*/
+   if (!dload_dynamic_segment(handle, fd, dyn_module))
+   {
+      delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+      return 0;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* Perform sanity checking on the read-in ELF file.  The module name is   */
+   /* reported before the dynamic module object is destructed.               */
+   /*------------------------------------------------------------------------*/
+   if (!is_valid_elf_object_file(fd, dyn_module))
+   {
+      DLIF_error(DLET_FILE, "Attempt to load invalid ELF file, '%s'.\n",
+                 dyn_module->name);
+      delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+      return 0;
+   }
+
+#if LOADER_DEBUG || LOADER_PROFILE
+   /*------------------------------------------------------------------------*/
+   /* Stop clock on initialization of ELF file information.  Start clock on  */
+   /* initialization of ELF module.                                          */
+   /*------------------------------------------------------------------------*/
+   if (debugging_on || profiling_on)
+   {
+      DLIF_trace("Finished dload_dynamic_segment.\n");
+      if (profiling_on)
+      {
+         profile_stop_clock();
+         DLIF_trace("Took %lu cycles.\n",
+                    (unsigned long) profile_cycle_count());
+      }
+   }
+#endif
+
+   /*------------------------------------------------------------------------*/
+   /* Initialize internal ELF module and segment structures.  Sets           */
+   /* loaded_module in *dyn_module.  This also deals with assigning a file   */
+   /* handle and bumping file handle counter.                                */
+   /*------------------------------------------------------------------------*/
+   initialize_loaded_module(handle, dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Append Module structure to loaded object list.                         */
+   /*------------------------------------------------------------------------*/
+   loaded_module_ptr_enqueue(&pHandle->DLIMP_loaded_objects,
+                             dyn_module->loaded_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Support static loading as special case.                                */
+   /*------------------------------------------------------------------------*/
+   if (!dyn_module->relocatable)
+      return dload_static_executable(handle, fd, dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Get space & address for segments, and offset symbols and program       */
+   /* header table to reflect the relocated address.  Also offset the        */
+   /* addresses in the internal Segment structures used by the Module        */
+   /* structure.  Note that this step needs to be performed prior and in     */
+   /* addition to the relocation entry processing.                           */
+   /*                                                                        */
+   /* NOTE(review): from here on the loaded module is already on the         */
+   /* loaded object list, so the failure returns below leave partially       */
+   /* loaded state behind.  Unwinding it needs helpers not visible here;     */
+   /* confirm how callers are expected to recover.                           */
+   /*------------------------------------------------------------------------*/
+   if (!allocate_dynamic_segments_and_relocate_symbols(handle, fd, dyn_module))
+      return 0;
+
+   /*------------------------------------------------------------------------*/
+   /* __c_args__ points to the beginning of the .args section, if there is   */
+   /* one.  __TI_STATIC_BASE points to the beginning of the DP-relative data */
+   /* segment (value to initialize DP).  Record these addresses in the ELF   */
+   /* file internal data object.                                             */
+   /*------------------------------------------------------------------------*/
+   DLSYM_lookup_local_symtab("__c_args__", dyn_module->symtab,
+                             dyn_module->symnum,
+                             (Elf32_Addr *)&dyn_module->c_args);
+
+   DLSYM_lookup_local_symtab("__TI_STATIC_BASE", dyn_module->symtab,
+                             dyn_module->symnum,
+                             (Elf32_Addr *)&dyn_module->static_base);
+   dyn_module->loaded_module->static_base = dyn_module->static_base;
+
+   /*------------------------------------------------------------------------*/
+   /* If the user application performs initialization and termination,       */
+   /* the dynamic loader shouldn't process the init/fini sections.           */
+   /* Check and adjust the init/fini information accordingly.                */
+   /*------------------------------------------------------------------------*/
+   adjust_module_init_fini(dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Queue any user defined pre-initialization functions that may be        */
+   /* associated with a dynamic executable module.  They are executed        */
+   /* later, by execute_module_initialization().                             */
+   /*------------------------------------------------------------------------*/
+   if (dyn_module->fhdr.e_type == ET_EXEC)
+      store_preinit_data(dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* Append current ELF file to list of objects currently loading.          */
+   /* This is used to detect circular dependencies while we are processing   */
+   /* the dependents of this file.                                           */
+   /*------------------------------------------------------------------------*/
+   AL_append(&pHandle->DLIMP_module_dependency_list, &dyn_module->name);
+
+   /*------------------------------------------------------------------------*/
+   /* Push this dynamic module object onto the dependency stack.             */
+   /* All of the modules on the stack will get relocated after all of the    */
+   /* dependent files have been loaded and allocated.                        */
+   /*------------------------------------------------------------------------*/
+   dynamic_module_ptr_push(&pHandle->DLIMP_dependency_stack, dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* If this object file uses the DSBT model, then register a DSBT index    */
+   /* request with the client's DSBT support management.                     */
+   /*------------------------------------------------------------------------*/
+   if (is_dsbt_module(dyn_module) &&
+       !DLIF_register_dsbt_index_request(handle,
+                                         dyn_module->name,
+                                         dyn_module->loaded_module->file_handle,
+                                         dyn_module->dsbt_index))
+      return 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Load this ELF file's dependees (all files on its DT_NEEDED list).      */
+   /* Dependees must be loaded and relocated before processing this module's */
+   /* relocations.                                                           */
+   /*------------------------------------------------------------------------*/
+   if (!dload_and_allocate_dependencies(handle, dyn_module))
+      return 0;
+
+   /*------------------------------------------------------------------------*/
+   /* Remove the current ELF file from the list of files that are in the     */
+   /* process of loading.                                                    */
+   /*------------------------------------------------------------------------*/
+   pHandle->DLIMP_module_dependency_list.size--;
+
+   /*------------------------------------------------------------------------*/
+   /* Process relocation entries.                                            */
+   /*------------------------------------------------------------------------*/
+   fl_handle = relocate_dependency_graph_modules(handle, fd, dyn_module);
+
+   /*------------------------------------------------------------------------*/
+   /* With initialization complete, and all relocations having been resolved */
+   /* do module initialization.                                              */
+   /*------------------------------------------------------------------------*/
+   execute_module_initialization(handle);
+
+   return fl_handle;
+}
+
+/*****************************************************************************/
+/* DLOAD_get_entry_names()                                                   */
+/*                                                                           */
+/*    Build a list of entry point names for a loaded object.  Currently,     */
+/*    any global symbol in the module is considered a valid entry point      */
+/*    regardless of whether it is defined in code or associated with a       */
+/*    data object.  We would need to process the content of the symbol       */
+/*    table entry or its debug information to determine whether it is a      */
+/*    valid entry point or not.                                              */
+/*                                                                           */
+/*    On success returns TRUE, stores the number of names in *entry_pt_cnt   */
+/*    and stores in *entry_pt_names an array of that many strings.  The      */
+/*    array and each string are allocated with DLIF_malloc().                */
+/*                                                                           */
+/*    Returns FALSE if file_handle is not found or an allocation fails.      */
+/*    On allocation failure everything allocated so far is released,         */
+/*    *entry_pt_cnt is set to 0 and *entry_pt_names is set to NULL.          */
+/*                                                                           */
+/*****************************************************************************/
+BOOL DLOAD_get_entry_names(DLOAD_HANDLE handle,
+                           uint32_t file_handle,
+                           int32_t *entry_pt_cnt,
+                           char ***entry_pt_names)
+{
+   /*------------------------------------------------------------------------*/
+   /* Spin through list of loaded files until we find the file handle we     */
+   /* are looking for.  Then build a list of entry points from that file's   */
+   /* symbol table.                                                          */
+   /*------------------------------------------------------------------------*/
+   LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+
+   loaded_module_ptr_Queue_Node* ptr;
+   for (ptr = pHandle->DLIMP_loaded_objects.front_ptr; ptr != NULL;
+        ptr = ptr->next_ptr)
+   {
+      if (ptr->value->file_handle == file_handle)
+      {
+         DLIMP_Loaded_Module *module = ptr->value;
+         struct Elf32_Sym *symtab = (struct Elf32_Sym*)module->gsymtab;
+         char **names;
+         int i;
+
+         /*------------------------------------------------------------------*/
+         /* Any symbol in our file's symbol table is considered a valid      */
+         /* entry point.  An empty table legitimately yields no array, so    */
+         /* a NULL result only counts as a failure when names are needed.    */
+         /*------------------------------------------------------------------*/
+         names = (char **)DLIF_malloc(module->gsymnum * sizeof(char*));
+         if (!names && module->gsymnum > 0)
+         {
+            DLIF_error(DLET_MEMORY,
+                       "Failed to allocate entry point name list.\n");
+            *entry_pt_cnt   = 0;
+            *entry_pt_names = NULL;
+            return FALSE;
+         }
+
+         for (i = 0; i < module->gsymnum; i++)
+         {
+            /*---------------------------------------------------------------*/
+            /* st_name is used as a pointer to the symbol's name string.     */
+            /* Each name gets its own slot in the array.                     */
+            /*---------------------------------------------------------------*/
+            const char *sym_name = (const char *)symtab[i].st_name;
+
+            names[i] = (char *)DLIF_malloc(strlen(sym_name) + 1);
+            if (!names[i])
+            {
+               /*------------------------------------------------------------*/
+               /* Release the names copied so far, then the array itself.    */
+               /*------------------------------------------------------------*/
+               while (i-- > 0) DLIF_free(names[i]);
+               DLIF_free(names);
+
+               DLIF_error(DLET_MEMORY,
+                          "Failed to allocate entry point name.\n");
+               *entry_pt_cnt   = 0;
+               *entry_pt_names = NULL;
+               return FALSE;
+            }
+
+            strcpy(names[i], sym_name);
+         }
+
+         *entry_pt_cnt   = module->gsymnum;
+         *entry_pt_names = names;
+         return TRUE;
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* We didn't find the file we were looking for, return false.             */
+   /*------------------------------------------------------------------------*/
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* DLOAD_prepare_for_execution()                                             */
+/*                                                                           */
+/*    Given a file handle, prepare the matching loaded module for            */
+/*    execution:                                                             */
+/*    - Store the module's entry point in the *sym_val output parameter.     */
+/*    - Write out the given arguments to the .args section contained in      */
+/*      the same module.                                                     */
+/*    - As a test (for the Reference Implementation) read the arguments      */
+/*      back with read_args_from_section().                                  */
+/*                                                                           */
+/*    Returns TRUE on success.  Returns FALSE if no loaded module has the    */
+/*    given file handle, or if the arguments could not be written (in        */
+/*    which case *sym_val has already been set).                             */
+/*                                                                           */
+/*****************************************************************************/
+BOOL DLOAD_prepare_for_execution(DLOAD_HANDLE handle, uint32_t file_handle,
+                                 TARGET_ADDRESS *sym_val,
+                                 int argc, char** argv)
+{
+   LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+   loaded_module_ptr_Queue_Node *node =
+                                    loader->DLIMP_loaded_objects.front_ptr;
+   DLIMP_Loaded_Module *target;
+
+   /*------------------------------------------------------------------------*/
+   /* Spin through the list of loaded files until we reach the file handle   */
+   /* we are looking for, or run off the end of the list.                    */
+   /*------------------------------------------------------------------------*/
+   while (node != NULL && node->value->file_handle != file_handle)
+      node = node->next_ptr;
+
+   /*------------------------------------------------------------------------*/
+   /* We didn't find the file we were looking for, return false.             */
+   /*------------------------------------------------------------------------*/
+   if (node == NULL)
+      return FALSE;
+
+   target   = node->value;
+   *sym_val = (TARGET_ADDRESS)(target->entry_point);
+
+   /*------------------------------------------------------------------------*/
+   /* Write argc, argv to the .args section in this module.                  */
+   /*------------------------------------------------------------------------*/
+   if (!write_arguments_to_args_section(handle, argc, argv, target))
+   {
+      DLIF_error(DLET_MISC, "Couldn't write to .args section\n");
+      return FALSE;
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* For the Reference Implementation we simulate a "boot" (the rts boot    */
+   /* routine reads argc, argv from .args) by reading them back from the     */
+   /* section we just wrote, which serves as a test of the write.            */
+   /*------------------------------------------------------------------------*/
+   read_args_from_section(target);
+   return TRUE;
+}
+
+/*****************************************************************************/
+/* DLOAD_load_arguments()                                                    */
+/*                                                                           */
+/*    Write out the given arguments to the .args section contained in the    */
+/*    loaded module identified by file_handle.                               */
+/*                                                                           */
+/*    Returns TRUE once the arguments have been written.  Returns FALSE if   */
+/*    the write fails or if no loaded module has the given file handle.      */
+/*                                                                           */
+/*****************************************************************************/
+BOOL DLOAD_load_arguments(DLOAD_HANDLE handle, uint32_t file_handle,
+                          int argc, char** argv)
+{
+   /*------------------------------------------------------------------------*/
+   /* Spin through list of loaded files until we find the file handle we     */
+   /* are looking for.  Then write the arguments into that module.           */
+   /*------------------------------------------------------------------------*/
+   DLIMP_Loaded_Module *ep_loaded_module;
+   loaded_module_ptr_Queue_Node* ptr;
+   LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+
+   for (ptr = pHandle->DLIMP_loaded_objects.front_ptr; ptr != NULL;
+        ptr = ptr->next_ptr)
+   {
+      if (ptr->value->file_handle == file_handle)
+      {
+         ep_loaded_module = ptr->value;
+
+         /*------------------------------------------------------------------*/
+         /* Write argc, argv to the .args section in this module.            */
+         /*------------------------------------------------------------------*/
+         if (!write_arguments_to_args_section(handle, argc, argv,
+                                              ep_loaded_module))
+         {
+            DLIF_error(DLET_MISC, "Couldn't write to .args section\n");
+            return FALSE;
+         }
+
+         /*------------------------------------------------------------------*/
+         /* Report success instead of falling through to the "not found"     */
+         /* return below, matching DLOAD_prepare_for_execution().            */
+         /*------------------------------------------------------------------*/
+         return TRUE;
+      }
+   }
+
+   /*------------------------------------------------------------------------*/
+   /* We didn't find the file we were looking for, return false.             */
+   /*------------------------------------------------------------------------*/
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* DLOAD_get_entry_point() */
+/* */
+/* Locate the loaded module whose file handle equals file_handle and */
+/* store its entry point address in *sym_val. */
+/* */
+/* Returns TRUE when a matching module is found. Returns FALSE, leaving */
+/* *sym_val untouched, when no loaded module carries that handle. */
+/* */
+/*****************************************************************************/
+BOOL DLOAD_get_entry_point(DLOAD_HANDLE handle, uint32_t file_handle,
+                           TARGET_ADDRESS *sym_val)
+{
+    LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+    loaded_module_ptr_Queue_Node *node = loader->DLIMP_loaded_objects.front_ptr;
+
+    /*------------------------------------------------------------------------*/
+    /* Walk the loaded-objects queue front to back; first match wins. */
+    /*------------------------------------------------------------------------*/
+    while (node != NULL)
+    {
+        if (node->value->file_handle == file_handle)
+        {
+            *sym_val = (TARGET_ADDRESS)(node->value->entry_point);
+            return TRUE;
+        }
+        node = node->next_ptr;
+    }
+
+    /*------------------------------------------------------------------------*/
+    /* No module with the requested file handle is loaded. */
+    /*------------------------------------------------------------------------*/
+    return FALSE;
+}
+
+/*****************************************************************************/
+/* DLOAD_query_symbol() */
+/* */
+/* Look up the global symbol sym_name in the module identified by */
+/* file_handle and write its value to *sym_val. */
+/* */
+/* Returns TRUE if the symbol was found. Returns FALSE, leaving *sym_val */
+/* untouched, if the module is not loaded or does not define the symbol. */
+/* */
+/*****************************************************************************/
+BOOL DLOAD_query_symbol(DLOAD_HANDLE handle,
+                        uint32_t file_handle,
+                        const char *sym_name,
+                        TARGET_ADDRESS *sym_val)
+{
+    LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+    loaded_module_ptr_Queue_Node *node;
+
+    for (node = loader->DLIMP_loaded_objects.front_ptr; node != NULL;
+         node = node->next_ptr)
+    {
+        DLIMP_Loaded_Module *module = node->value;
+        struct Elf32_Sym *syms;
+        int idx;
+
+        /*--------------------------------------------------------------------*/
+        /* Only the module with the requested file handle is searched. */
+        /*--------------------------------------------------------------------*/
+        if (module->file_handle != file_handle) continue;
+
+        /*--------------------------------------------------------------------*/
+        /* Linear search of the module's global symbol table by name. The */
+        /* st_name field is used here as a pointer to the name string. */
+        /*--------------------------------------------------------------------*/
+        syms = (struct Elf32_Sym*)module->gsymtab;
+        for (idx = 0; idx < module->gsymnum; idx++)
+        {
+            struct Elf32_Sym *sym = &syms[idx];
+
+            if (strcmp(sym_name, (const char *)sym->st_name) != 0) continue;
+
+            *sym_val = (TARGET_ADDRESS) sym->st_value;
+            return TRUE;
+        }
+    }
+
+    /*------------------------------------------------------------------------*/
+    /* Neither the module nor the symbol turned up, return false. */
+    /*------------------------------------------------------------------------*/
+    return FALSE;
+}
+
+
+
+/*****************************************************************************/
+/* unlink_loaded_module() */
+/* */
+/* Take the module held by lm_node off the list of loaded objects and */
+/* hand the module back to the caller so that it can be deconstructed. */
+/* back_ptr is accepted but not consulted: the removal is done by value */
+/* through loaded_module_ptr_remove(). */
+/* */
+/*****************************************************************************/
+static DLIMP_Loaded_Module *unlink_loaded_module(DLOAD_HANDLE handle,
+                                 loaded_module_ptr_Queue_Node *back_ptr,
+                                 loaded_module_ptr_Queue_Node *lm_node)
+{
+    LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+    DLIMP_Loaded_Module *detached = lm_node->value;
+
+    loaded_module_ptr_remove(&loader->DLIMP_loaded_objects, detached);
+    return detached;
+}
+
+/*****************************************************************************/
+/* execute_module_termination() */
+/* */
+/* Execute termination functions associated with this loaded module. */
+/* Termination functions are called in the reverse order as their */
+/* corresponding initialization functions: the .fini_array entries are */
+/* run last-to-first, followed by the DT_FINI function if there is one. */
+/* */
+/* If the host copy of .fini_array cannot be allocated, the error is */
+/* reported through DLIF_error() and the array entries are skipped; the */
+/* DT_FINI function is still executed. */
+/* */
+/*****************************************************************************/
+static void execute_module_termination(DLOAD_HANDLE handle,
+                                       DLIMP_Loaded_Module *loaded_module)
+{
+    LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+
+    /*------------------------------------------------------------------------*/
+    /* If a DT_FINI_ARRAY dynamic tag was encountered for this module, spin */
+    /* through the array in reverse order, calling each function address */
+    /* stored in the array. */
+    /*------------------------------------------------------------------------*/
+    if (loaded_module->fini_arraysz != 0)
+    {
+        /*--------------------------------------------------------------------*/
+        /* Make a loader-accessible copy of the .fini_array section. */
+        /*--------------------------------------------------------------------*/
+        int32_t i;
+        int32_t num_fini_fcns =
+            loaded_module->fini_arraysz/sizeof(TARGET_ADDRESS);
+        TARGET_ADDRESS *fini_array_buf = (TARGET_ADDRESS *)
+            DLIF_malloc(loaded_module->fini_arraysz);
+
+        if (fini_array_buf == NULL)
+        {
+            /*----------------------------------------------------------------*/
+            /* Never read into or index through a failed allocation. */
+            /*----------------------------------------------------------------*/
+            DLIF_error(DLET_MISC,
+                       "Couldn't allocate host copy of .fini_array\n");
+        }
+        else
+        {
+            DLIF_read(pHandle->client_handle,
+                      fini_array_buf, 1, loaded_module->fini_arraysz,
+                      (TARGET_ADDRESS)loaded_module->fini_array);
+
+            /*----------------------------------------------------------------*/
+            /* Spin through the array in reverse order, executing each */
+            /* termination function whose address occupies an entry. */
+            /*----------------------------------------------------------------*/
+            for (i = num_fini_fcns - 1; i >= 0; i--)
+                DLIF_execute(pHandle->client_handle,
+                             (TARGET_ADDRESS)(fini_array_buf[i]));
+
+            DLIF_free(fini_array_buf);
+        }
+    }
+
+    /*------------------------------------------------------------------------*/
+    /* If a DT_FINI dynamic tag was encountered for this module, call the */
+    /* function indicated by the tag's value to complete the termination */
+    /* process for this module. */
+    /*------------------------------------------------------------------------*/
+    if (loaded_module->fini != (Elf32_Addr) NULL)
+        DLIF_execute(pHandle->client_handle,
+                     (TARGET_ADDRESS)loaded_module->fini);
+}
+
+/*****************************************************************************/
+/* remove_loaded_module() */
+/* */
+/* Locate the predecessor of lm_node on the list of loaded objects, */
+/* unlink the module that lm_node holds, then run the module's destructor */
+/* to release the host memory of the loaded module and its segments. */
+/* */
+/*****************************************************************************/
+static void remove_loaded_module(DLOAD_HANDLE handle,
+                                 loaded_module_ptr_Queue_Node *lm_node)
+{
+    LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+    loaded_module_ptr_Queue_Node *back_ptr = NULL;
+    DLIMP_Loaded_Module *lm_object = NULL;
+
+    /*------------------------------------------------------------------------*/
+    /* The front node has no predecessor, so back_ptr stays NULL for it. */
+    /* Otherwise advance until back_ptr sits directly in front of lm_node. */
+    /*------------------------------------------------------------------------*/
+    if (lm_node != loader->DLIMP_loaded_objects.front_ptr)
+    {
+        back_ptr = loader->DLIMP_loaded_objects.front_ptr;
+        while (back_ptr->next_ptr != lm_node)
+            back_ptr = back_ptr->next_ptr;
+    }
+
+    lm_object = unlink_loaded_module(handle, back_ptr, lm_node);
+
+    delete_DLIMP_Loaded_Module(handle, &lm_object);
+}
+
+/*****************************************************************************/
+/* DLOAD_unload() */
+/* */
+/* Drop one use of the module identified by file_handle. When the use */
+/* count reaches zero the module is torn down: its termination functions */
+/* are run, its dependents are unloaded through the client, and the */
+/* module is removed from the list of loaded objects and destroyed. */
+/* */
+/* Return TRUE if the program entry is actually destroyed. This tells */
+/* the client when it needs to remove debug information associated with */
+/* this module (so that the client does not have to maintain a use count */
+/* that mirrors the program entry). Return FALSE otherwise. */
+/* */
+/*****************************************************************************/
+BOOL DLOAD_unload(DLOAD_HANDLE handle, uint32_t file_handle)
+{
+    LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+    loaded_module_ptr_Queue_Node *lm_node;
+
+    for (lm_node = pHandle->DLIMP_loaded_objects.front_ptr; lm_node != NULL;
+         lm_node = lm_node->next_ptr)
+    {
+        DLIMP_Loaded_Module *loaded_module;
+        int *dep_file_handles;
+        int j;
+
+        if (lm_node->value->file_handle != file_handle) continue;
+
+        /*--------------------------------------------------------------------*/
+        /* Release one use; the module stays resident while others remain. */
+        /*--------------------------------------------------------------------*/
+        --lm_node->value->use_count;
+        if (lm_node->value->use_count != 0) continue;
+
+        loaded_module = (DLIMP_Loaded_Module *)lm_node->value;
+
+        /*--------------------------------------------------------------------*/
+        /* Termination functions need to be executed in the reverse order */
+        /* as the corresponding initialization functions, so the */
+        /* user/global/static termination functions of this module run */
+        /* before any of its dependents is unloaded. */
+        /*--------------------------------------------------------------------*/
+        execute_module_termination(handle, loaded_module);
+
+        /*--------------------------------------------------------------------*/
+        /* Unload dependent modules via the client. Client needs to know */
+        /* when a dependent gets unloaded so that it can update debug */
+        /* information. */
+        /*--------------------------------------------------------------------*/
+        dep_file_handles = (int*)(loaded_module->dependencies.buf);
+        for (j = 0; j < loaded_module->dependencies.size; j++)
+            DLIF_unload_dependent(pHandle->client_handle,
+                                  dep_file_handles[j]);
+
+        /*--------------------------------------------------------------------*/
+        /* The predecessor of this node is looked up only now, inside */
+        /* remove_loaded_module(). A back pointer cannot be kept around */
+        /* because DLIF_unload_dependent() might free that node, making the */
+        /* pointer invalid. Turn the Queue template into a doubly linked */
+        /* list if this overhead becomes a problem. */
+        /*--------------------------------------------------------------------*/
+        remove_loaded_module(handle, lm_node);
+
+        /*--------------------------------------------------------------------*/
+        /* Once unloading is done, reset virtual target to NULL. */
+        /*--------------------------------------------------------------------*/
+        cur_target = NULL;
+
+        return TRUE;
+    }
+
+    return FALSE;
+}
+
+/*****************************************************************************/
+/* DLOAD_load_symbols() */
+/* */
+/* Load the symbols from the given file and make symbols available for */
+/* global symbol linkage. */
+/* */
+/* Returns the file handle assigned to the loaded module, or 0 if the */
+/* file could not be processed. On every failure path the dynamic */
+/* module object created here is destroyed before returning. */
+/* */
+/*****************************************************************************/
+int32_t DLOAD_load_symbols(DLOAD_HANDLE handle, LOADER_FILE_DESC *fd)
+{
+    DLIMP_Dynamic_Module *dyn_module = new_DLIMP_Dynamic_Module(fd);
+    DLIMP_Loaded_Module *loaded_module = NULL;
+    LOADER_OBJECT *pHandle = (LOADER_OBJECT *)handle;
+
+    /*------------------------------------------------------------------------*/
+    /* Ensure we have a valid dynamic module object from the constructor. */
+    /*------------------------------------------------------------------------*/
+    if (!dyn_module)
+        return 0;
+
+    /*------------------------------------------------------------------------*/
+    /* If no access to a program was provided, there is nothing to do. */
+    /* Destroy the dynamic module first so that it is not leaked, as the */
+    /* other error paths below do. */
+    /*------------------------------------------------------------------------*/
+    if (!fd)
+    {
+        DLIF_error(DLET_FILE, "Missing file specification.\n");
+        delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+        return 0;
+    }
+
+    /*------------------------------------------------------------------------*/
+    /* No arguments are associated with a symbols-only load. */
+    /*------------------------------------------------------------------------*/
+    dyn_module->argc = 0;
+    dyn_module->argv = NULL;
+
+    /*------------------------------------------------------------------------*/
+    /* Read file headers and dynamic information into dynamic module. */
+    /*------------------------------------------------------------------------*/
+    if (!dload_headers(fd, dyn_module))
+    {
+        delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+        return 0;
+    }
+
+    /*------------------------------------------------------------------------*/
+    /* Find the dynamic segment, if there is one, and read dynamic */
+    /* information from the ELF object file into the dynamic module data */
+    /* structure associated with this file. */
+    /*------------------------------------------------------------------------*/
+    if (!dload_dynamic_segment(handle, fd, dyn_module))
+    {
+        delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+        return 0;
+    }
+
+    /*------------------------------------------------------------------------*/
+    /* Perform sanity checking on the read-in ELF file. */
+    /*------------------------------------------------------------------------*/
+    if (!is_valid_elf_object_file(fd, dyn_module))
+    {
+        DLIF_error(DLET_FILE, "Attempt to load invalid ELF file, '%s'.\n",
+                   dyn_module->name);
+        delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+        return 0;
+    }
+
+    /*------------------------------------------------------------------------*/
+    /* Initialize internal ELF module and segment structures. Sets */
+    /* loaded_module in *dyn_module. This also deals with assigning a file */
+    /* handle and bumping file handle counter. */
+    /*------------------------------------------------------------------------*/
+    initialize_loaded_module(handle, dyn_module);
+
+    /*------------------------------------------------------------------------*/
+    /* Add this module to the loaded module queue. Ownership of the host */
+    /* memory allocated for the loaded module object now belongs to the */
+    /* DLIMP_loaded_objects list; the object is detached from the dynamic */
+    /* module that created it further below. */
+    /*------------------------------------------------------------------------*/
+    loaded_module_ptr_enqueue(&pHandle->DLIMP_loaded_objects,
+                              dyn_module->loaded_module);
+
+    /*------------------------------------------------------------------------*/
+    /* Register a DSBT index request for this module and update its own copy */
+    /* of the DSBT with the contents of the client's master DSBT. The */
+    /* module sits on the dependency stack only for the duration of these */
+    /* calls. */
+    /*------------------------------------------------------------------------*/
+    if (is_dsbt_module(dyn_module))
+    {
+        dynamic_module_ptr_push(&pHandle->DLIMP_dependency_stack, dyn_module);
+        DLIF_register_dsbt_index_request(handle,
+                                         dyn_module->name,
+                                         dyn_module->loaded_module->file_handle,
+                                         dyn_module->dsbt_index);
+        DLIF_assign_dsbt_indices();
+        DLIF_update_all_dsbts();
+        dynamic_module_ptr_pop(&pHandle->DLIMP_dependency_stack);
+    }
+
+    /*------------------------------------------------------------------------*/
+    /* Detach the loaded module from the dynamic module, then free the host */
+    /* memory of the dynamic module by calling its destructor. The */
+    /* destructor runs whether or not the detach produced a module. */
+    /*------------------------------------------------------------------------*/
+    loaded_module = detach_loaded_module(dyn_module);
+    delete_DLIMP_Dynamic_Module(handle, &dyn_module);
+    if (loaded_module == NULL)
+        return 0;
+
+    /*------------------------------------------------------------------------*/
+    /* Return a file handle so that the client can match this file to an ID. */
+    /*------------------------------------------------------------------------*/
+    return loaded_module->file_handle;
+}
+
+/*****************************************************************************/
+/* DSBT Support Functions */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* DLOAD_get_dsbt_size() */
+/* */
+/* Report the amount of space allocated for the DSBT of the module */
+/* identified by file_handle. Only modules currently on the dependency */
+/* stack are searched. The space must be big enough to hold a copy of */
+/* the master DSBT or the client will flag an error. A result of zero */
+/* means the module was not found or is assumed not to use the DSBT */
+/* model. */
+/* */
+/*****************************************************************************/
+uint32_t DLOAD_get_dsbt_size(DLOAD_HANDLE handle, int32_t file_handle)
+{
+    LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+    dynamic_module_ptr_Stack_Node *node = loader->DLIMP_dependency_stack.top_ptr;
+
+    while (node != NULL)
+    {
+        DLIMP_Dynamic_Module *candidate = node->value;
+
+        if (candidate->loaded_module->file_handle == file_handle)
+            return candidate->dsbt_size;
+
+        node = node->next_ptr;
+    }
+
+    return 0;
+}
+
+/*****************************************************************************/
+/* DLOAD_get_static_base() */
+/* */
+/* Store the static base address recorded for the loaded module */
+/* identified by file_handle in *static_base. Returns TRUE on a match, */
+/* FALSE (with *static_base untouched) when no such module is loaded. */
+/* */
+/*****************************************************************************/
+BOOL DLOAD_get_static_base(DLOAD_HANDLE handle, int32_t file_handle,
+                           TARGET_ADDRESS *static_base)
+{
+    LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+    loaded_module_ptr_Queue_Node *node = loader->DLIMP_loaded_objects.front_ptr;
+
+    while (node != NULL)
+    {
+        DLIMP_Loaded_Module *candidate = node->value;
+
+        if (candidate->file_handle == file_handle)
+        {
+            *static_base = (TARGET_ADDRESS)candidate->static_base;
+            return TRUE;
+        }
+
+        node = node->next_ptr;
+    }
+
+    return FALSE;
+}
+
+/*****************************************************************************/
+/* DLOAD_get_dsbt_base() */
+/* */
+/* Store the address of the DSBT of the module identified by file_handle */
+/* in *dsbt_base. The address is read from the module's dynamic table */
+/* entry at index dsbt_base_tagidx. Only modules currently on the */
+/* dependency stack are searched. Returns TRUE on a match, FALSE (with */
+/* *dsbt_base untouched) otherwise. */
+/* */
+/*****************************************************************************/
+BOOL DLOAD_get_dsbt_base(DLOAD_HANDLE handle, int32_t file_handle, TARGET_ADDRESS *dsbt_base)
+{
+    LOADER_OBJECT *loader = (LOADER_OBJECT *)handle;
+    dynamic_module_ptr_Stack_Node *node = loader->DLIMP_dependency_stack.top_ptr;
+
+    while (node != NULL)
+    {
+        DLIMP_Dynamic_Module *candidate = node->value;
+
+        if (candidate->loaded_module->file_handle == file_handle)
+        {
+            *dsbt_base = (TARGET_ADDRESS)
+                candidate->dyntab[candidate->dsbt_base_tagidx].d_un.d_ptr;
+            return TRUE;
+        }
+
+        node = node->next_ptr;
+    }
+
+    return FALSE;
+}
+
+/*****************************************************************************/
+/* DLREL_relocate() - Relocate the ELF object file that is being loaded by */
+/* forwarding the request to the relocate() hook of cur_target. */
+/*****************************************************************************/
+void DLREL_relocate(DLOAD_HANDLE handle, LOADER_FILE_DESC* elf_file,
+ DLIMP_Dynamic_Module* dyn_module)
+
+{
+ cur_target->relocate(handle, elf_file, dyn_module); /* NOTE(review): no NULL check; DLOAD_unload() resets cur_target to NULL */
+}
+
+/*****************************************************************************/
+/* GET_VT_OBJ() - Once file headers have been read, use the e_machine id to */
+/* figure out the virtual target, so we can access target specific funcs. */
+/* Scans vt_arr up to its EM_NONE terminator entry and returns the */
+/* entry whose machine_id equals given_id, or NULL if there is none. */
+/*****************************************************************************/
+static VIRTUAL_TARGET *get_vt_obj(int given_id)
+{
+    VIRTUAL_TARGET *candidate = vt_arr;
+
+    while (candidate->machine_id != EM_NONE)
+    {
+        if (candidate->machine_id == given_id) return candidate;
+        candidate++;
+    }
+
+    return NULL;
+}
+
+#if 0 && LOADER_DEBUG // enable to make available in debugger
+/*****************************************************************************/
+/* DEBUG_QUEUE() - Debug function. Traces the name of every module on the */
+/* loaded-objects queue, tagged with the caller-supplied position string. */
+/*****************************************************************************/
+static void debug_queue(LOADER_OBJECT *pHandle, char* position)
+{
+    loaded_module_ptr_Queue_Node *node;
+
+    /* Nothing is traced unless debugging_on is set. */
+    if (!debugging_on) return;
+
+    DLIF_trace ("\nDEBUG QUEUE : %s, pHandle : 0x%x\n\n", position,
+                (uint32_t)pHandle);
+
+    node = pHandle->DLIMP_loaded_objects.front_ptr;
+    while (node != NULL)
+    {
+        DLIF_trace ("ptr->value->name : %s\n",node->value->name);
+        node = node->next_ptr;
+    }
+    DLIF_trace ("\n");
+}
+#endif
+
+/*****************************************************************************/
+/* READ_ARGS_FROM_SECTION() - This function reads the argc, argv from the */
+/* .args section, and is used to test Reference implementation. The */
+/* values read are stored in global_argc and global_argv. */
+/*****************************************************************************/
+static void read_args_from_section(DLIMP_Loaded_Module* ep_module)
+{
+    /*------------------------------------------------------------------------*/
+    /* Before this function is called, the loader has gotten argv/argc from */
+    /* the module and written it out to the .args section. c_args points to */
+    /* the .args section. */
+    /*------------------------------------------------------------------------*/
+    ARGS_CONTAINER *pargs = (ARGS_CONTAINER *)(ep_module->c_args);
+
+    /*------------------------------------------------------------------------*/
+    /* A NULL or all-ones c_args is treated as "no arguments available". */
+    /*------------------------------------------------------------------------*/
+    if (pargs != NULL && pargs != (ARGS_CONTAINER *)0xFFFFFFFF)
+    {
+        global_argc = pargs->argc;
+        global_argv = pargs->argv;
+        return;
+    }
+
+    global_argc = 0;
+    global_argv = NULL;
+}
diff --git a/src/core/dsp/ocl_load/DLOAD/dload.h b/src/core/dsp/ocl_load/DLOAD/dload.h
new file mode 100644
index 0000000..bb7d427
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/dload.h
@@ -0,0 +1,334 @@
+/*
+* dload.h
+*
+* Define internal data structures used by core dynamic loader.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef DLOAD_H
+#define DLOAD_H
+
+#include "ArrayList.h"
+#include "Queue.h"
+#include "Stack.h"
+#include "elf32.h"
+#include "dload_api.h"
+#include "util.h"
+
+/*---------------------------------------------------------------------------*/
+/* Contains strings with names of files the loader is in process of loading. */
+/* This list is used to keep track of what objects are in the process of */
+/* loading while their dependents are being loaded so that we can detect */
+/* circular dependencies. */
+/*---------------------------------------------------------------------------*/
+extern Array_List DLIMP_module_dependency_list;
+
+/*---------------------------------------------------------------------------*/
+/* DLIMP_Loaded_Segment */
+/* */
+/* This structure represents a segment loaded into target memory. */
+/* */
+/* This data structure should be created using host memory when a module */
+/* is being loaded into target memory. The data structure should persist */
+/* as long as the module stays resident in target memory. It should be */
+/* removed when the last use of the module is unloaded from the target. */
+/*---------------------------------------------------------------------------*/
+typedef struct
+{
+ struct Elf32_Phdr phdr; /* ELF program header kept for segment */
+ Elf32_Addr input_vaddr; /* original segment load addr */
+ BOOL modified; /* NOTE(review): use not visible in this header */
+ struct DLOAD_MEMORY_SEGMENT *obj_desc; /* presumably client segment descriptor - TODO confirm */
+ void * host_address; /* presumably host-side address of segment data - TODO confirm */
+} DLIMP_Loaded_Segment;
+
+/*---------------------------------------------------------------------------*/
+/* DLIMP_Loaded_Module */
+/* */
+/* This structure contains all the information the dynamic loader needs */
+/* to retain after loading an object file's segments into target memory. */
+/* The data structure is created while the object file is being loaded, */
+/* and should persist until the last use of the module is unloaded from */
+/* target memory. */
+/* */
+/* The information contained here is used by the dynamic loader to */
+/* perform dynamic symbol resolution, to track the use count, and to */
+/* finally deallocate the module's segments when the module is unloaded. */
+/*---------------------------------------------------------------------------*/
+typedef struct
+{
+ char *name; /* Local copy of so_name */
+ int32_t file_handle; /* ID matched by the DLOAD_* lookup routines */
+ int32_t use_count; /* Decremented by DLOAD_unload(); module is torn down at 0 */
+ Elf32_Addr entry_point; /* Entry point address into module */
+ struct Elf32_Sym *gsymtab; /* Module's global symbol table */
+ Elf32_Word gsymnum; /* # global symbols */
+ char *gstrtab; /* Module's global symbol names */
+ Elf32_Word gstrsz; /* Size of global string table */
+ Array_List loaded_segments; /* List of DLIMP_Loaded_Segment(s) */
+ Array_List dependencies; /* List of dependent file handles */
+ BOOL direct_dependent_only; /* NOTE(review): meaning not visible here */
+
+ Elf32_Addr fini; /* .fini function/section address */
+ Elf32_Addr fini_array; /* .fini_array term fcn ary addr */
+ int32_t fini_arraysz; /* sizeof .fini_array, in bytes */
+ uint8_t *c_args; /* address of module's .args sect */
+ uint8_t *static_base; /* address of module's STATIC_BASE */
+
+} DLIMP_Loaded_Module;
+
+/*---------------------------------------------------------------------------*/
+/* DLIMP_loaded_objects */
+/* */
+/* A list of loaded module objects (DLIMP_Loaded_Module *) that the */
+/* loader has placed into target memory. */
+/*---------------------------------------------------------------------------*/
+TYPE_QUEUE_DEFINITION(DLIMP_Loaded_Module*, loaded_module_ptr)
+extern loaded_module_ptr_Queue DLIMP_loaded_objects;
+
+/*---------------------------------------------------------------------------*/
+/* DLIMP_Dynamic_Module */
+/* */
+/* This structure represents a dynamic module to be loaded by the dynamic */
+/* loader. It contains all the information necessary to load and relocate */
+/* the module. It actually contains most of the headers, dynamic info, */
+/* dynamic symbol table, string table etc. */
+/* */
+/* This structure is allocated in host memory while an ELF object file is */
+/* being loaded and will be destructed after the file has been */
+/* successfully loaded. To simplify loading and relocation of the object */
+/* file's segments, this data structure maintains a link to the loaded */
+/* module. This link is severed when the load is successfully completed. */
+/* The loaded module data structure will persist until the module is */
+/* actually unloaded from target memory, but this data structure will be */
+/* freed. */
+/* */
+/* If the load of the object file is not successful for any reason, then */
+/* the loaded module will not be detached from the dynamic module. In */
+/* such case, the destructor for the dynamic module will assume */
+/* responsibility for freeing any host memory associated with the loaded */
+/* module and its segments. */
+/*---------------------------------------------------------------------------*/
+typedef struct
+{
+ char *name; /* Local copy of so_name */
+ LOADER_FILE_DESC *fd; /* Access to ELF object file */
+ struct Elf32_Ehdr fhdr; /* ELF Object File Header */
+ struct Elf32_Phdr *phdr; /* ELF Program Header Table */
+ Elf32_Word phnum; /* # entries in program header table */
+ char* strtab; /* String Table */
+ Elf32_Word strsz; /* String Table size in bytes */
+ struct Elf32_Dyn *dyntab; /* Elf Dynamic Table (.dynamic scn) */
+ /* This contains a list of dynamic */
+ /* tags which is terminated by a NULL */
+ /* record. */
+ struct Elf32_Sym *symtab; /* Elf Dynamic Symbol Table */
+ Elf32_Word symnum; /* # symbols in dynamic symbol table */
+ Elf32_Word gsymtab_offset;/* Offset into symbol table where */
+ /* global symbols start. */
+ Elf32_Word gstrtab_offset;/* Offset into string table where */
+ /* global symbol names start. */
+
+ uint8_t *c_args; /* presumably address of module's .args sect - TODO confirm */
+ uint8_t *static_base; /* address of module's STATIC_BASE */
+ int32_t argc; /* argument count recorded for the module */
+ char **argv; /* argument vector recorded for the module */
+ DLIMP_Loaded_Module *loaded_module; /* link to loaded module; severed after a successful load */
+ int32_t wrong_endian; /* NOTE(review): presumably set on host/file endianness mismatch - confirm */
+ BOOL direct_dependent_only; /* NOTE(review): meaning not visible here */
+ BOOL relocatable; /* TRUE if module can be relocated */
+ /* at load-time. FALSE if module is */
+ /* a static executable. */
+ BOOL relocate_entry_point; /* TRUE if the entry point has */
+ /* not been relocated */
+
+ int32_t dsbt_index; /* DSBT index requested/assigned */
+ uint32_t dsbt_size; /* DSBT size for this module; 0 means no DSBT model */
+ int32_t dsbt_base_tagidx;/* Index in dyntab of DSBT base dyn tag */
+
+ int32_t preinit_array_idx; /* DT_PREINIT_ARRAY dyn tag loc */
+ int32_t preinit_arraysz; /* sizeof pre-init array */
+ int32_t init_idx; /* DT_INIT dynamic tag location */
+ int32_t init_array_idx; /* DT_INIT_ARRAY dyn tag location */
+ int32_t init_arraysz; /* sizeof init array */
+
+} DLIMP_Dynamic_Module;
+
+/*---------------------------------------------------------------------------*/
+/* DLIMP_dependency_stack */
+/* */
+/* A LIFO stack of dynamic module objects (DLIMP_Dynamic_Module *) that */
+/* is retained while dependent files are being loaded and allocated. It */
+/* is used to guide which dynamic modules need to be relocated after all */
+/* items in the dependency graph have been allocated. The stack is only */
+/* used when the client asks the core loader to load a dynamic executable */
+/* or library. When relocation is completed, this stack should be empty. */
+/*---------------------------------------------------------------------------*/
+TYPE_STACK_DEFINITION(DLIMP_Dynamic_Module*, dynamic_module_ptr)
+extern dynamic_module_ptr_Stack DLIMP_dependency_stack;
+
+/*---------------------------------------------------------------------------*/
+/* Private Loader Object instance. */
+/*---------------------------------------------------------------------------*/
+typedef struct
+{
+ /*-----------------------------------------------------------------------*/
+ /* Contains filenames (type const char*) the system is in the process of */
+ /* loading. Used to detect cycles in incorrectly compiled ELF binaries. */
+ /*-----------------------------------------------------------------------*/
+ Array_List DLIMP_module_dependency_list;
+
+ /*-----------------------------------------------------------------------*/
+ /* Contains pointers (type DLIMP_Loaded_Module*) to the modules that the */
+ /* system has loaded into target memory. */
+ /*-----------------------------------------------------------------------*/
+ loaded_module_ptr_Queue DLIMP_loaded_objects;
+
+ /*-----------------------------------------------------------------------*/
+ /* Dependency stack - LIFO stack of dynamic modules that are loaded */
+ /* when client asks to load a dynamic executable or library. Note that */
+ /* dependents that have already been loaded with another module will not */
+ /* appear on this stack. */
+ /*-----------------------------------------------------------------------*/
+ dynamic_module_ptr_Stack DLIMP_dependency_stack;
+
+ /*-----------------------------------------------------------------------*/
+ /* Counter for generating unique IDs for file handles. */
+ /* NOTE: File handle is assigned sequentially but is never reclaimed */
+ /* when the modules are unloaded. It is conceivable that a loader */
+ /* running for a long time and loading and unloading modules */
+ /* could wrap around. The loader generates an error in this case. */
+ /* Presumably each loader instance has a list of file handles, one for */
+ /* each file that it loads, and the file handle serves as an index into */
+ /* the list. Therefore even if the same file is loaded by two loader */
+ /* instances, both loader instances have a different file handle for the */
+ /* file - the file is mapped uniquely to its appropriate file handle per */
+ /* loader instance. */
+ /*-----------------------------------------------------------------------*/
+ int32_t file_handle;
+
+ /*-----------------------------------------------------------------------*/
+ /* Client token, passed in via DLOAD_create() */
+ /*-----------------------------------------------------------------------*/
+ void * client_handle;
+} LOADER_OBJECT;
+
+
+/*****************************************************************************/
+/* IF data : Below are the data structures used to store init-fini data. */
+/*****************************************************************************/
+typedef struct
+{
+ TARGET_ADDRESS sect_addr; /* presumably the target address of one section */
+ int32_t size; /* presumably that section's size - TODO confirm the units */
+}
+IF_single_record;
+
+/* Queue type whose elements are IF_single_record pointers (presumably */
+/* generated by the macro as IF_table_Queue). TI_init_table is only declared */
+/* here; its definition is not in this header. */
+TYPE_QUEUE_DEFINITION(IF_single_record*, IF_table)
+extern IF_table_Queue TI_init_table;
+
+
+/*****************************************************************************/
+/* Container used to read in argc, argv from the .args section. */
+/* NOTE(review): this comment originally named the section ".srgs"; ".args" */
+/* is presumed to be the intended name - confirm against the linker setup. */
+/* argv is declared with a single element; presumably the object is */
+/* over-allocated so that argv can hold argc entries - verify at the users. */
+/*****************************************************************************/
+typedef struct { int argc; char *argv[1]; } ARGS_CONTAINER;
+
+
+/*****************************************************************************/
+/* is_dsbt_module() */
+/* */
+/* Report whether a dynamic module uses the DSBT model, which this loader */
+/* recognises by a non-zero dsbt_size in the module record. */
+/*****************************************************************************/
+static inline BOOL is_dsbt_module(DLIMP_Dynamic_Module *dyn_module)
+{
+   const BOOL uses_dsbt = (dyn_module->dsbt_size != 0);
+
+   return uses_dsbt;
+}
+
+/*****************************************************************************/
+/* is_arm_module() */
+/* */
+/* Report whether the ELF file header names ARM (EM_ARM) as its machine. */
+/*****************************************************************************/
+static inline BOOL is_arm_module(struct Elf32_Ehdr* fhdr)
+{
+   const BOOL targets_arm = (fhdr->e_machine == EM_ARM);
+
+   return targets_arm;
+}
+
+/*****************************************************************************/
+/* is_c60_module() */
+/* */
+/* Report whether the ELF file header names C6000 (EM_TI_C6000) as its */
+/* machine. */
+/*****************************************************************************/
+static inline BOOL is_c60_module(struct Elf32_Ehdr* fhdr)
+{
+   const BOOL targets_c60 = (fhdr->e_machine == EM_TI_C6000);
+
+   return targets_c60;
+}
+
+/*---------------------------------------------------------------------------*/
+/* DLIMP_update_dyntag_section_address() */
+/* */
+/* Given the index of a dynamic tag which we happen to know points to a */
+/* section address, find the program header table entry associated with */
+/* the specified address and update the tag value with the real address */
+/* of the section. */
+/* */
+/* NOTE(review): the meaning of the BOOL result is not visible in this */
+/* header; presumably it reports whether the update succeeded. */
+/*---------------------------------------------------------------------------*/
+extern BOOL DLIMP_update_dyntag_section_address(DLIMP_Dynamic_Module *dyn_module,
+ int32_t i);
+
+/*---------------------------------------------------------------------------*/
+/* DLIMP_get_first_dyntag() */
+/* */
+/* Takes a dynamic tag id and a dynamic table. Judging by the name it looks */
+/* up the first table entry carrying that tag. */
+/* NOTE(review): the definition is not in this header; confirm there what */
+/* the uint32_t result is and what is returned when the tag is absent. */
+/*---------------------------------------------------------------------------*/
+extern uint32_t DLIMP_get_first_dyntag(int tag, struct Elf32_Dyn* dyn_table);
+
+/*---------------------------------------------------------------------------*/
+/* Global flags to help manage internal debug and profiling efforts. */
+/* */
+/* The compiler-dependent default selected first is discarded by the #undef */
+/* that follows it, so LOADER_DEBUG and LOADER_PROFILE both end up defined */
+/* as 1 no matter which compiler is in use. */
+/*---------------------------------------------------------------------------*/
+#ifndef __TI_COMPILER_VERSION__
+#define LOADER_DEBUG 1
+#else
+#define LOADER_DEBUG 0
+#endif
+
+/* Drop the default chosen above; the flags are forced on below. */
+#undef LOADER_DEBUG
+
+#define LOADER_DEBUG 1
+#define LOADER_PROFILE 1
+
+/* debugging_on is declared only in builds with LOADER_DEBUG enabled. */
+#if LOADER_DEBUG
+extern BOOL debugging_on;
+#endif
+
+/* profiling_on is declared when either debug or profile support is enabled. */
+#if LOADER_DEBUG || LOADER_PROFILE
+extern BOOL profiling_on;
+#endif
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/dload_endian.c b/src/core/dsp/ocl_load/DLOAD/dload_endian.c
new file mode 100644
index 0000000..ac6413b
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/dload_endian.c
@@ -0,0 +1,151 @@
+/*
+* dload_endian.c
+*
+* Simple helper functions to assist core loader with endian-ness issues
+* when the host endian-ness may be opposite the endian-ness of the target.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#include "dload_endian.h"
+
+/*****************************************************************************/
+/* DLIMP_GET_ENDIAN() - Determine endianness of the host. Uses ELF */
+/* endianness constants. */
+/* */
+/* Returns ELFDATA2LSB when the host stores the least significant byte of */
+/* an integer at the lowest address, ELFDATA2MSB otherwise. */
+/*****************************************************************************/
+int DLIMP_get_endian(void)
+{
+   const int32_t probe = 0x1;
+
+   /*------------------------------------------------------------------------*/
+   /* Look at the lowest-addressed byte only. The read goes through an */
+   /* unsigned char pointer because character types may alias any object; */
+   /* reading the int32_t through an int16_t pointer is not permitted. */
+   /*------------------------------------------------------------------------*/
+   if (*(const unsigned char *)&probe) return ELFDATA2LSB;
+
+   return ELFDATA2MSB;
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_ENDIAN32() - Swap endianness of a 32-bit integer. */
+/* */
+/* to_change points at the value to convert; it is rewritten in place with */
+/* its four bytes in reverse order. */
+/*****************************************************************************/
+void DLIMP_change_endian32(int32_t* to_change)
+{
+   /*------------------------------------------------------------------------*/
+   /* Do all masking and shifting on an unsigned copy. Shifting a signed */
+   /* value so that a 1 lands in the sign bit (low byte >= 0x80 moved up by */
+   /* 24) is undefined, and unsigned arithmetic avoids it. */
+   /*------------------------------------------------------------------------*/
+   const uint32_t v = (uint32_t)*to_change;
+
+   *to_change = (int32_t)(((v & 0x000000FFu) << 24) |
+                          ((v & 0x0000FF00u) <<  8) |
+                          ((v & 0x00FF0000u) >>  8) |
+                          ((v & 0xFF000000u) >> 24));
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_ENDIAN16() - Exchange the two bytes of a 16-bit integer. */
+/* */
+/* to_change points at the value to convert; it is rewritten in place. */
+/*****************************************************************************/
+void DLIMP_change_endian16(int16_t* to_change)
+{
+   const uint16_t v = (uint16_t)*to_change;
+
+   /* Low byte moves up, high byte moves down; keep only the low 16 bits. */
+   *to_change = (int16_t)(uint16_t)((v << 8) | (v >> 8));
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_EHDR_ENDIAN() - Byte-swap, in place, the 16-bit and 32-bit */
+/* fields of an ELF file header listed below. */
+/*****************************************************************************/
+void DLIMP_change_ehdr_endian(struct Elf32_Ehdr* ehdr)
+{
+   /* Leading 16-bit fields. */
+   DLIMP_change_endian16((int16_t *)&ehdr->e_type);
+   DLIMP_change_endian16((int16_t *)&ehdr->e_machine);
+
+   /* 32-bit fields. */
+   DLIMP_change_endian32((int32_t *)&ehdr->e_version);
+   DLIMP_change_endian32((int32_t *)&ehdr->e_entry);
+   DLIMP_change_endian32((int32_t *)&ehdr->e_phoff);
+   DLIMP_change_endian32((int32_t *)&ehdr->e_shoff);
+   DLIMP_change_endian32((int32_t *)&ehdr->e_flags);
+
+   /* Trailing 16-bit fields. */
+   DLIMP_change_endian16((int16_t *)&ehdr->e_ehsize);
+   DLIMP_change_endian16((int16_t *)&ehdr->e_phentsize);
+   DLIMP_change_endian16((int16_t *)&ehdr->e_phnum);
+   DLIMP_change_endian16((int16_t *)&ehdr->e_shentsize);
+   DLIMP_change_endian16((int16_t *)&ehdr->e_shnum);
+   DLIMP_change_endian16((int16_t *)&ehdr->e_shstrndx);
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_PHDR_ENDIAN() - Byte-swap, in place, the eight 32-bit */
+/* fields of an ELF program header listed below. */
+/*****************************************************************************/
+void DLIMP_change_phdr_endian(struct Elf32_Phdr* phdr)
+{
+   DLIMP_change_endian32((int32_t *)&phdr->p_type);
+   DLIMP_change_endian32((int32_t *)&phdr->p_offset);
+   DLIMP_change_endian32((int32_t *)&phdr->p_vaddr);
+   DLIMP_change_endian32((int32_t *)&phdr->p_paddr);
+   DLIMP_change_endian32((int32_t *)&phdr->p_filesz);
+   DLIMP_change_endian32((int32_t *)&phdr->p_memsz);
+   DLIMP_change_endian32((int32_t *)&phdr->p_flags);
+   DLIMP_change_endian32((int32_t *)&phdr->p_align);
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_DYNENT_ENDIAN() - Byte-swap, in place, the tag and the value */
+/* of a dynamic table entry. The value is reached through d_un.d_val. */
+/*****************************************************************************/
+void DLIMP_change_dynent_endian(struct Elf32_Dyn* dyn)
+{
+   DLIMP_change_endian32((int32_t *)&dyn->d_tag);
+   DLIMP_change_endian32((int32_t *)&dyn->d_un.d_val);
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_SYM_ENDIAN() - Byte-swap, in place, the st_name, st_value, */
+/* st_size and st_shndx fields of an ELF symbol table entry. */
+/*****************************************************************************/
+void DLIMP_change_sym_endian(struct Elf32_Sym* sym)
+{
+   /* 32-bit fields. */
+   DLIMP_change_endian32((int32_t *)&sym->st_name);
+   DLIMP_change_endian32((int32_t *)&sym->st_value);
+   DLIMP_change_endian32((int32_t *)&sym->st_size);
+
+   /* 16-bit field. */
+   DLIMP_change_endian16((int16_t *)&sym->st_shndx);
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_RELA_ENDIAN() - Byte-swap, in place, the offset, info and */
+/* addend words of a RELA-type relocation. */
+/*****************************************************************************/
+void DLIMP_change_rela_endian(struct Elf32_Rela* ra)
+{
+   DLIMP_change_endian32((int32_t *)&ra->r_offset);
+   DLIMP_change_endian32((int32_t *)&ra->r_info);
+   DLIMP_change_endian32((int32_t *)&ra->r_addend);
+}
+
+/*****************************************************************************/
+/* DLIMP_CHANGE_REL_ENDIAN() - Byte-swap, in place, the offset and info */
+/* words of a REL-type relocation. */
+/*****************************************************************************/
+void DLIMP_change_rel_endian(struct Elf32_Rel* r)
+{
+   DLIMP_change_endian32((int32_t *)&r->r_offset);
+   DLIMP_change_endian32((int32_t *)&r->r_info);
+}
diff --git a/src/core/dsp/ocl_load/DLOAD/dload_endian.h b/src/core/dsp/ocl_load/DLOAD/dload_endian.h
new file mode 100644
index 0000000..ee74e11
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/dload_endian.h
@@ -0,0 +1,58 @@
+/*
+* dload_endian.h
+*
+* Specification of functions used to assist loader with endian-ness issues.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef DLOAD_ENDIAN_H
+#define DLOAD_ENDIAN_H
+
+#include "elf32.h"
+
+/*---------------------------------------------------------------------------*/
+/* Prototypes for ELF file object reader endianness swap routines. */
+/* */
+/* Every DLIMP_change_* routine rewrites the object it is handed in place. */
+/*---------------------------------------------------------------------------*/
+
+/* Host byte order, reported as ELFDATA2LSB or ELFDATA2MSB. */
+int DLIMP_get_endian(void);
+/* Reverse the byte order of a single 32-bit / 16-bit integer. */
+void DLIMP_change_endian32(int32_t* to_change);
+void DLIMP_change_endian16(int16_t* to_change);
+/* File header: swaps e_type through e_shstrndx. */
+void DLIMP_change_ehdr_endian(struct Elf32_Ehdr* to_change);
+/* Program header: swaps p_type through p_align. */
+void DLIMP_change_phdr_endian(struct Elf32_Phdr* to_change);
+/* Dynamic table entry: swaps d_tag and d_un.d_val. */
+void DLIMP_change_dynent_endian(struct Elf32_Dyn* to_change);
+/* Symbol table entry: swaps st_name, st_value, st_size and st_shndx. */
+void DLIMP_change_sym_endian(struct Elf32_Sym* to_change);
+/* RELA relocation: swaps r_offset, r_info and r_addend. */
+void DLIMP_change_rela_endian(struct Elf32_Rela* to_change);
+/* REL relocation: swaps r_offset and r_info. */
+void DLIMP_change_rel_endian(struct Elf32_Rel* to_change);
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/elf32.c b/src/core/dsp/ocl_load/DLOAD/elf32.c
new file mode 100644
index 0000000..082ba01
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/elf32.c
@@ -0,0 +1,652 @@
+/*
+* elf32.c
+*
+* Basic Data Structures for 32-Bit ELF Object Format Files
+*
+* The data structures in this file come primarily from this specification:
+*
+* Tool Interface Standard (TIS)
+* Executable and Linking Format (ELF) Specification
+* Version 1.2
+*
+* TIS Committee
+* May 1995
+*
+* Additions and enhancements from this specification are also included:
+*
+* System V Application Binary Interface
+* DRAFT 17
+* December 2003
+*
+* http://sco.com/developers/gabi/2003-12-17/contents.html
+*
+* This is a C implementation of the data base objects that are commonly
+* used in the source for TI development tools that support ELF.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#include "elf32.h"
+
+/*---------------------------------------------------------------------------*/
+/* Dynamic Tag Database */
+/* */
+/* One struct EDYN_TAG record per dynamic tag. As labelled on each */
+/* initializer, a record holds the tag's name string, its DT_* value, a */
+/* d_untype classification (VAL, PTR, IGNORED or UNSPECIFIED) and the */
+/* requirement level for executables (d_exec_req) and for shared objects */
+/* (d_shared_req). The last record is a sentinel with d_tag_value == -1. */
+/*---------------------------------------------------------------------------*/
+
+const struct EDYN_TAG EDYN_TAG_DB[] =
+{
+ /* EDYN_TAG_NULL */
+ {
+ /* d_tag_name */ "DT_NULL",
+ /* d_tag_value */ DT_NULL,
+ /* d_untype */ EDYN_UNTYPE_IGNORED,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_MANDATORY
+ },
+
+ /* EDYN_TAG_NEEDED */
+ {
+ /* d_tag_name */ "DT_NEEDED",
+ /* d_tag_value */ DT_NEEDED,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_PLTRELSZ */
+ {
+ /* d_tag_name */ "DT_PLTRELSZ",
+ /* d_tag_value */ DT_PLTRELSZ,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_PLTGOT */
+ {
+ /* d_tag_name */ "DT_PLTGOT",
+ /* d_tag_value */ DT_PLTGOT,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_HASH */
+ {
+ /* d_tag_name */ "DT_HASH",
+ /* d_tag_value */ DT_HASH,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_MANDATORY
+ },
+
+ /* EDYN_TAG_STRTAB */
+ {
+ /* d_tag_name */ "DT_STRTAB",
+ /* d_tag_value */ DT_STRTAB,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_MANDATORY
+ },
+
+ /* EDYN_TAG_SYMTAB */
+ {
+ /* d_tag_name */ "DT_SYMTAB",
+ /* d_tag_value */ DT_SYMTAB,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_MANDATORY
+ },
+
+ /* EDYN_TAG_RELA */
+ {
+ /* d_tag_name */ "DT_RELA",
+ /* d_tag_value */ DT_RELA,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_RELASZ */
+ {
+ /* d_tag_name */ "DT_RELASZ",
+ /* d_tag_value */ DT_RELASZ,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_RELAENT */
+ {
+ /* d_tag_name */ "DT_RELAENT",
+ /* d_tag_value */ DT_RELAENT,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_STRSZ */
+ {
+ /* d_tag_name */ "DT_STRSZ",
+ /* d_tag_value */ DT_STRSZ,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_MANDATORY
+ },
+
+ /* EDYN_TAG_SYMENT */
+ {
+ /* d_tag_name */ "DT_SYMENT",
+ /* d_tag_value */ DT_SYMENT,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_MANDATORY
+ },
+
+ /* EDYN_TAG_INIT */
+ {
+ /* d_tag_name */ "DT_INIT",
+ /* d_tag_value */ DT_INIT,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_FINI */
+ {
+ /* d_tag_name */ "DT_FINI",
+ /* d_tag_value */ DT_FINI,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_SONAME */
+ {
+ /* d_tag_name */ "DT_SONAME",
+ /* d_tag_value */ DT_SONAME,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_IGNORED,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_RPATH */
+ {
+ /* d_tag_name */ "DT_RPATH",
+ /* d_tag_value */ DT_RPATH,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_IGNORED
+ },
+
+ /* EDYN_TAG_SYMBOLIC */
+ {
+ /* d_tag_name */ "DT_SYMBOLIC",
+ /* d_tag_value */ DT_SYMBOLIC,
+ /* d_untype */ EDYN_UNTYPE_IGNORED,
+ /* d_exec_req */ EDYN_TAGREQ_IGNORED,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_REL */
+ {
+ /* d_tag_name */ "DT_REL",
+ /* d_tag_value */ DT_REL,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_RELSZ */
+ {
+ /* d_tag_name */ "DT_RELSZ",
+ /* d_tag_value */ DT_RELSZ,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_RELENT */
+ {
+ /* d_tag_name */ "DT_RELENT",
+ /* d_tag_value */ DT_RELENT,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_MANDATORY,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_PLTREL */
+ {
+ /* d_tag_name */ "DT_PLTREL",
+ /* d_tag_value */ DT_PLTREL,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_DEBUG */
+ {
+ /* d_tag_name */ "DT_DEBUG",
+ /* d_tag_value */ DT_DEBUG,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_IGNORED
+ },
+
+ /* EDYN_TAG_TEXTREL */
+ {
+ /* d_tag_name */ "DT_TEXTREL",
+ /* d_tag_value */ DT_TEXTREL,
+ /* d_untype */ EDYN_UNTYPE_IGNORED,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_JMPREL */
+ {
+ /* d_tag_name */ "DT_JMPREL",
+ /* d_tag_value */ DT_JMPREL,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_BIND_NOW */
+ {
+ /* d_tag_name */ "DT_BIND_NOW",
+ /* d_tag_value */ DT_BIND_NOW,
+ /* d_untype */ EDYN_UNTYPE_IGNORED,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_INIT_ARRAY */
+ {
+ /* d_tag_name */ "DT_INIT_ARRAY",
+ /* d_tag_value */ DT_INIT_ARRAY,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_FINI_ARRAY */
+ {
+ /* d_tag_name */ "DT_FINI_ARRAY",
+ /* d_tag_value */ DT_FINI_ARRAY,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_INIT_ARRAYSZ */
+ {
+ /* d_tag_name */ "DT_INIT_ARRAYSZ",
+ /* d_tag_value */ DT_INIT_ARRAYSZ,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_FINI_ARRAYSZ */
+ {
+ /* d_tag_name */ "DT_FINI_ARRAYSZ",
+ /* d_tag_value */ DT_FINI_ARRAYSZ,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_RUNPATH */
+ {
+ /* d_tag_name */ "DT_RUNPATH",
+ /* d_tag_value */ DT_RUNPATH,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_FLAGS */
+ {
+ /* d_tag_name */ "DT_FLAGS",
+ /* d_tag_value */ DT_FLAGS,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_OPTIONAL
+ },
+
+ /* EDYN_TAG_ENCODING */
+ {
+ /* d_tag_name */ "DT_ENCODING",
+ /* d_tag_value */ DT_ENCODING,
+ /* d_untype */ EDYN_UNTYPE_UNSPECIFIED,
+ /* d_exec_req */ EDYN_TAGREQ_UNSPECIFIED,
+ /* d_shared_req */ EDYN_TAGREQ_UNSPECIFIED
+ },
+
+ /* EDYN_TAG_PREINIT_ARRAY */
+ {
+ /* d_tag_name */ "DT_PREINIT_ARRAY",
+ /* d_tag_value */ DT_PREINIT_ARRAY,
+ /* d_untype */ EDYN_UNTYPE_PTR,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_IGNORED
+ },
+
+ /* EDYN_TAG_PREINIT_ARRAYSZ */
+ {
+ /* d_tag_name */ "DT_PREINIT_ARRAYSZ",
+ /* d_tag_value */ DT_PREINIT_ARRAYSZ,
+ /* d_untype */ EDYN_UNTYPE_VAL,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_IGNORED
+ },
+
+ /* Terminate array with an id of -1 */
+ {
+ /* d_tag_name */ "",
+ /* d_tag_value */ -1,
+ /* d_untype */ EDYN_UNTYPE_UNSPECIFIED,
+ /* d_exec_req */ EDYN_TAGREQ_OPTIONAL,
+ /* d_shared_req */ EDYN_TAGREQ_IGNORED
+ }
+};
+
+/*---------------------------------------------------------------------------*/
+/* Special Section Database */
+/* */
+/* One struct ESCN record per special section. As labelled on each */
+/* initializer, a record holds the section name, its sh_type, its */
+/* sh_entsize (0 or the sizeof of the entry type) and its sh_flags. The */
+/* .build.attributes record is compiled out with #if 0. The last record is */
+/* a sentinel whose name is a null pointer. */
+/*---------------------------------------------------------------------------*/
+const struct ESCN ESCN_DB[] =
+{
+ /* .bss */
+ {
+ /* name */ ESCN_BSS_name,
+ /* sh_type */ SHT_NOBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_WRITE
+ },
+
+ /* .comment */
+ {
+ /* name */ ESCN_COMMENT_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ 0
+ },
+
+ /* .data */
+ {
+ /* name */ ESCN_DATA_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_WRITE
+ },
+
+ /* .data1 */
+ {
+ /* name */ ESCN_DATA1_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_WRITE
+ },
+
+ /* .debug */
+ {
+ /* name */ ESCN_DEBUG_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ 0
+ },
+
+ /* .dynamic */
+ {
+ /* name */ ESCN_DYNAMIC_name,
+ /* sh_type */ SHT_DYNAMIC,
+ /* sh_entsize */ sizeof(struct Elf32_Dyn),
+ /* sh_flags */ SHF_ALLOC
+ },
+
+ /* .dynstr */
+ {
+ /* name */ ESCN_DYNSTR_name,
+ /* sh_type */ SHT_STRTAB,
+ /* sh_entsize */ sizeof(char),
+ /* sh_flags */ SHF_ALLOC + SHF_STRINGS
+ },
+
+ /* .dynsym */
+ {
+ /* name */ ESCN_DYNSYM_name,
+ /* sh_type */ SHT_DYNSYM,
+ /* sh_entsize */ sizeof(struct Elf32_Sym),
+ /* sh_flags */ SHF_ALLOC
+ },
+
+ /* .fini */
+ {
+ /* name */ ESCN_FINI_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_EXECINSTR
+ },
+
+ /* .fini_array */
+ {
+ /* name */ ESCN_FINI_ARRAY_name,
+ /* sh_type */ SHT_FINI_ARRAY,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_WRITE
+ },
+
+ /* .got */
+ {
+ /* name */ ESCN_GOT_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ 0
+ },
+
+ /* .hash */
+ {
+ /* name */ ESCN_HASH_name,
+ /* sh_type */ SHT_HASH,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC
+ },
+
+ /* .init */
+ {
+ /* name */ ESCN_INIT_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_EXECINSTR
+ },
+
+ /* .init_array */
+ {
+ /* name */ ESCN_INIT_ARRAY_name,
+ /* sh_type */ SHT_INIT_ARRAY,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_WRITE
+ },
+
+ /* .interp */
+ {
+ /* name */ ESCN_INTERP_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ 0
+ },
+
+ /* .line */
+ {
+ /* name */ ESCN_LINE_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ 0
+ },
+
+ /* .note */
+ {
+ /* name */ ESCN_NOTE_name,
+ /* sh_type */ SHT_NOTE,
+ /* sh_entsize */ 0,
+ /* sh_flags */ 0
+ },
+
+ /* .plt */
+ {
+ /* name */ ESCN_PLT_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ 0
+ },
+
+ /* .preinit_array */
+ {
+ /* name */ ESCN_PREINIT_ARRAY_name,
+ /* sh_type */ SHT_PREINIT_ARRAY,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_WRITE
+ },
+
+ /* .rel */
+ {
+ /* name */ ESCN_REL_name,
+ /* sh_type */ SHT_REL,
+ /* sh_entsize */ sizeof(struct Elf32_Rel),
+ /* sh_flags */ 0
+ },
+
+ /* .rela */
+ {
+ /* name */ ESCN_RELA_name,
+ /* sh_type */ SHT_RELA,
+ /* sh_entsize */ sizeof(struct Elf32_Rela),
+ /* sh_flags */ 0
+ },
+
+ /* .rodata */
+ {
+ /* name */ ESCN_RODATA_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC
+ },
+
+ /* .rodata1 */
+ {
+ /* name */ ESCN_RODATA1_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC
+ },
+
+ /* .shstrtab */
+ {
+ /* name */ ESCN_SHSTRTAB_name,
+ /* sh_type */ SHT_STRTAB,
+ /* sh_entsize */ sizeof(char),
+ /* sh_flags */ SHF_STRINGS
+ },
+
+ /* .strtab */
+ {
+ /* name */ ESCN_STRTAB_name,
+ /* sh_type */ SHT_STRTAB,
+ /* sh_entsize */ sizeof(char),
+ /* sh_flags */ SHF_STRINGS
+ },
+
+ /* .symtab */
+ {
+ /* name */ ESCN_SYMTAB_name,
+ /* sh_type */ SHT_SYMTAB,
+ /* sh_entsize */ sizeof(struct Elf32_Sym),
+ /* sh_flags */ 0
+ },
+
+ /* .symtab_shndx */
+ {
+ /* name */ ESCN_SYMTAB_SHNDX_name,
+ /* sh_type */ SHT_SYMTAB_SHNDX,
+ /* sh_entsize */ sizeof(Elf32_Word),
+ /* sh_flags */ 0
+ },
+
+ /* .tbss */
+ {
+ /* name */ ESCN_TBSS_name,
+ /* sh_type */ SHT_NOBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_WRITE + SHF_TLS
+ },
+
+ /* .tdata */
+ {
+ /* name */ ESCN_TDATA_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_WRITE + SHF_TLS
+ },
+
+ /* .tdata1 */
+ {
+ /* name */ ESCN_TDATA1_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_WRITE + SHF_TLS
+ },
+
+ /* .text */
+ {
+ /* name */ ESCN_TEXT_name,
+ /* sh_type */ SHT_PROGBITS,
+ /* sh_entsize */ 0,
+ /* sh_flags */ SHF_ALLOC + SHF_EXECINSTR
+ },
+#if 0
+ /* .build.attributes */
+ {
+ /* name */ ESCN_ATTRIBUTES_name,
+ /* sh_type */ SHT_ATTRIBUTES,
+ /* sh_entsize */ 0,
+ /* sh_flags */ 0
+ },
+#endif
+ /* Terminate array with a NULL name field */
+ {
+ /* name */ (const char*)0,
+ /* sh_type */ 0,
+ /* sh_entsize */ 0,
+ /* sh_flags */ 0
+ }
+};
+
diff --git a/src/core/dsp/ocl_load/DLOAD/elf32.h b/src/core/dsp/ocl_load/DLOAD/elf32.h
new file mode 100644
index 0000000..67358d6
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/elf32.h
@@ -0,0 +1,756 @@
+/*
+* elf32.h
+*
+* Basic Data Structures for 32-bit ELF Object Format Files
+*
+* The data structures in this file come primarily from this specification:
+*
+* Tool Interface Standard (TIS)
+* Executable and Linking Format (ELF) Specification
+* Version 1.2
+*
+* TIS Committee
+* May 1995
+*
+* Additions and enhancements from this specification are also included:
+*
+* System V Application Binary Interface
+* DRAFT 17
+* December 2003
+*
+* http://sco.com/developers/gabi/2003-12-17/contents.html
+*
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef ELF32_H
+#define ELF32_H
+
+#include <inttypes.h>
+
+/*---------------------------------------------------------------------------*/
+/* 32-Bit Data Types (Figure 1-2, page 1-2) */
+/*---------------------------------------------------------------------------*/
+typedef uint32_t Elf32_Addr; /* Unsigned program address */
+typedef uint16_t Elf32_Half; /* Unsigned medium integer */
+typedef uint32_t Elf32_Off; /* Unsigned file offset */
+typedef int32_t Elf32_Sword; /* Signed large integer */
+typedef uint32_t Elf32_Word; /* Unsigned large integer */
+
+
+/*****************************************************************************/
+/* ELF Header */
+/* PP. 1-4 */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* ELF Identification Indexes (indexes into Elf32_Ehdr.e_ident[] below) */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ EI_MAG0 = 0, /* File identification, byte 0 (expected value: ELFMAG0) */
+ EI_MAG1 = 1, /* File identification, byte 1 (expected value: ELFMAG1) */
+ EI_MAG2 = 2, /* File identification, byte 2 (expected value: ELFMAG2) */
+ EI_MAG3 = 3, /* File identification, byte 3 (expected value: ELFMAG3) */
+ EI_CLASS = 4, /* File class (ELFCLASS* values) */
+ EI_DATA = 5, /* Data encoding (ELFDATA* values) */
+ EI_VERSION = 6, /* File version (EV_* values) */
+ EI_OSABI = 7, /* Operating system / ABI (ELFOSABI_* values) */
+ EI_ABIVERSION = 8, /* ABI version */
+ EI_PAD = 9, /* Start of padding bytes */
+ EI_NIDENT = 16 /* Size of Elf32_Ehdr.e_ident[] */
+};
+
+
+/*---------------------------------------------------------------------------*/
+/* ELF Header Data Structure */
+/*---------------------------------------------------------------------------*/
+struct Elf32_Ehdr
+{
+ uint8_t e_ident[EI_NIDENT]; /* Identification bytes (magic, class, ...), indexed by EI_* */
+ Elf32_Half e_type; /* Object File Type (ET_* values) */
+ Elf32_Half e_machine; /* Target Processor (EM_* values) */
+ Elf32_Word e_version; /* Object File Version (EV_* values) */
+ Elf32_Addr e_entry; /* Entry Point */
+ Elf32_Off e_phoff; /* Program Header Table Offset */
+ Elf32_Off e_shoff; /* Section Header Table Offset */
+ Elf32_Word e_flags; /* Processor-Specific Flags */
+ Elf32_Half e_ehsize; /* Size of ELF header */
+ Elf32_Half e_phentsize; /* Size of a Program Header */
+ Elf32_Half e_phnum; /* # Entries in Program Header Table */
+ Elf32_Half e_shentsize; /* Size of a Section Header */
+ Elf32_Half e_shnum; /* # Entries in Section Header Table */
+ Elf32_Half e_shstrndx; /* Section header index of the Section Name String Table */
+};
+
+
+/*---------------------------------------------------------------------------*/
+/* Object File Types (value of "e_type") */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ ET_NONE = 0, /* No file type */
+ ET_REL = 1, /* Relocatable file */
+ ET_EXEC = 2, /* Executable file */
+ ET_DYN = 3, /* Shared object file */
+ ET_CORE = 4, /* Core file */
+ ET_LOOS = 0xfe00, /* First OS-specific value */
+ ET_HIOS = 0xfeff, ET_HIPS = ET_HIOS, /* Last OS-specific value; ET_HIPS is the legacy misspelling, kept as an alias */
+ ET_LOPROC = 0xff00, /* First processor-specific value */
+ ET_HIPROC = 0xffff /* Last processor-specific value */
+};
+
+
+/*---------------------------------------------------------------------------*/
+/* Target Processors (value of "e_machine") */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ EM_NONE = 0, /* No machine */
+ EM_M32 = 1, /* AT&T WE 32100 */
+ EM_SPARC = 2, /* SPARC */
+ EM_386 = 3, /* Intel 80386 */
+ EM_68K = 4, /* Motorola 68000 */
+ EM_88K = 5, /* Motorola 88000 */
+ EM_860 = 7, /* Intel 80860 */
+ EM_MIPS = 8, /* MIPS I Architecture */
+ EM_S370 = 9, /* IBM System/370 Processor */
+ EM_MIPS_RS3_LE = 10, /* MIPS RS3000 Little-endian */
+ EM_PARISC = 15, /* Hewlett-Packard PA-RISC */
+ EM_VPP500 = 17, /* Fujitsu VPP500 */
+ EM_SPARC32PLUS = 18, /* Enhanced instruction set SPARC */
+ EM_960 = 19, /* Intel 80960 */
+ EM_PPC = 20, /* PowerPC */
+ EM_PPC64 = 21, /* 64-bit PowerPC */
+ EM_S390 = 22, /* IBM System/390 Processor */
+ EM_V800 = 36, /* NEC V800 */
+ EM_FR20 = 37, /* Fujitsu FR20 */
+ EM_RH32 = 38, /* TRW RH-32 */
+ EM_RCE = 39, /* Motorola RCE */
+ EM_ARM = 40, /* Advanced RISC Machines ARM */
+ EM_ALPHA = 41, /* Digital Alpha */
+ EM_SH = 42, /* Hitachi SH */
+ EM_SPARCV9 = 43, /* SPARC Version 9 */
+ EM_TRICORE = 44, /* Siemens TriCore embedded processor */
+ EM_ARC = 45, /* Argonaut RISC Core, Argonaut Technologies Inc. */
+ EM_H8_300 = 46, /* Hitachi H8/300 */
+ EM_H8_300H = 47, /* Hitachi H8/300H */
+ EM_H8S = 48, /* Hitachi H8S */
+ EM_H8_500 = 49, /* Hitachi H8/500 */
+ EM_IA_64 = 50, /* Intel IA-64 processor architecture */
+ EM_MIPS_X = 51, /* Stanford MIPS-X */
+ EM_COLDFIRE = 52, /* Motorola ColdFire */
+ EM_68HC12 = 53, /* Motorola M68HC12 */
+ EM_MMA = 54, /* Fujitsu MMA Multimedia Accelerator */
+ EM_PCP = 55, /* Siemens PCP */
+ EM_NCPU = 56, /* Sony nCPU embedded RISC processor */
+ EM_NDR1 = 57, /* Denso NDR1 microprocessor */
+ EM_STARCORE = 58, /* Motorola Star*Core processor */
+ EM_ME16 = 59, /* Toyota ME16 processor */
+ EM_ST100 = 60, /* STMicroelectronics ST100 processor */
+ EM_TINYJ = 61, /* Advanced Logic Corp. TinyJ embedded processor family */
+ EM_X86_64 = 62, /* AMD x86-64 architecture */
+ EM_PDSP = 63, /* Sony DSP Processor */
+ EM_PDP10 = 64, /* Digital Equipment Corp. PDP-10 */
+ EM_PDP11 = 65, /* Digital Equipment Corp. PDP-11 */
+ EM_FX66 = 66, /* Siemens FX66 microcontroller */
+ EM_ST9PLUS = 67, /* STMicroelectronics ST9+ 8/16 bit microcontroller */
+ EM_ST7 = 68, /* STMicroelectronics ST7 8-bit microcontroller */
+ EM_68HC16 = 69, /* Motorola MC68HC16 Microcontroller */
+ EM_68HC11 = 70, /* Motorola MC68HC11 Microcontroller */
+ EM_68HC08 = 71, /* Motorola MC68HC08 Microcontroller */
+ EM_68HC05 = 72, /* Motorola MC68HC05 Microcontroller */
+ EM_SVX = 73, /* Silicon Graphics SVx */
+ EM_ST19 = 74, /* STMicroelectronics ST19 8-bit microcontroller */
+ EM_VAX = 75, /* Digital VAX */
+ EM_CRIS = 76, /* Axis Communications 32-bit embedded processor */
+ EM_JAVELIN = 77, /* Infineon Technologies 32-bit embedded processor */
+ EM_FIREPATH = 78, /* Element 14 64-bit DSP Processor */
+ EM_ZSP = 79, /* LSI Logic 16-bit DSP Processor */
+ EM_MMIX = 80, /* Donald Knuth's educational 64-bit processor */
+ EM_HUANY = 81, /* Harvard University machine-independent object files */
+ EM_PRISM = 82, /* SiTera Prism */
+ EM_AVR = 83, /* Atmel AVR 8-bit microcontroller */
+ EM_FR30 = 84, /* Fujitsu FR30 */
+ EM_D10V = 85, /* Mitsubishi D10V */
+ EM_D30V = 86, /* Mitsubishi D30V */
+ EM_V850 = 87, /* NEC v850 */
+ EM_M32R = 88, /* Mitsubishi M32R */
+ EM_MN10300 = 89, /* Matsushita MN10300 */
+ EM_MN10200 = 90, /* Matsushita MN10200 */
+ EM_PJ = 91, /* picoJava */
+ EM_OPENRISC = 92, /* OpenRISC 32-bit embedded processor */
+ EM_ARC_A5 = 93, /* ARC Cores Tangent-A5 */
+ EM_XTENSA = 94, /* Tensilica Xtensa Architecture */
+ EM_VIDEOCORE = 95, /* Alphamosaic VideoCore processor */
+ EM_TMM_GPP = 96, /* Thompson Multimedia General Purpose Processor */
+ EM_NS32K = 97, /* National Semiconductor 32000 series */
+ EM_TPC = 98, /* Tenor Network TPC processor */
+ EM_SNP1K = 99, /* Trebia SNP 1000 processor */
+ EM_ST200 = 100, /* STMicroelectronics (www.st.com) ST200 microcontroller */
+ EM_IP2K = 101, /* Ubicom IP2xxx microcontroller family */
+ EM_MAX = 102, /* MAX Processor */
+ EM_CR = 103, /* National Semiconductor CompactRISC microprocessor */
+ EM_F2MC16 = 104, /* Fujitsu F2MC16 */
+ EM_MSP430 = 105, /* Texas Instruments embedded microcontroller msp430 */
+ EM_BLACKFIN = 106, /* Analog Devices Blackfin (DSP) processor */
+ EM_SE_C33 = 107, /* S1C33 Family of Seiko Epson processors */
+ EM_SEP = 108, /* Sharp embedded microprocessor */
+ EM_ARCA = 109, /* Arca RISC Microprocessor */
+ EM_UNICORE = 110, /* Microprocessor series from PKU-Unity Ltd. and MPRC of Peking University */
+
+ /*------------------------------------------------------------------------*/
+ /* ELF Magic Numbers Reserved For Texas Instruments */
+ /* */
+ /* The magic numbers 140-159 were reserved through SCO to be included */
+ /* in the official ELF specification. Please see Don Darling */
+ /* regarding any changes or allocation of the numbers below. */
+ /* */
+ /* When we allocate a number for use, SCO needs to be notified so they */
+ /* can update the ELF specification accordingly. */
+ /*------------------------------------------------------------------------*/
+ EM_TI_C6000 = 140, /* Texas Instruments TMS320C6000 DSP family */
+ EM_TI_UNUSED02 = 141, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED03 = 142, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED04 = 143, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED05 = 144, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED06 = 145, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED07 = 146, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED08 = 147, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED09 = 148, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED10 = 149, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED11 = 150, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED12 = 151, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED13 = 152, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED14 = 153, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED15 = 154, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED16 = 155, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED17 = 156, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED18 = 157, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED19 = 158, /* Reserved for Texas Instruments; unused */
+ EM_TI_UNUSED20 = 159 /* Reserved for Texas Instruments; unused */
+};
+
+
+/*---------------------------------------------------------------------------*/
+/* Object File Version (value of "e_version") */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ EV_NONE = 0, /* Invalid version */
+ EV_CURRENT = 1 /* Current version of the ELF object file format */
+};
+
+
+/*****************************************************************************/
+/* ELF Identification */
+/* PP. 1-6 */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* Identification Values for ELF Files */
+/*---------------------------------------------------------------------------*/
+
+/* EI_MAG0 to EI_MAG3 */
+enum
+{
+ ELFMAG0 = 0x7f, /* e_ident[EI_MAG0]: 0x7f lead byte of the magic number */
+ ELFMAG1 = 'E', /* e_ident[EI_MAG1] */
+ ELFMAG2 = 'L', /* e_ident[EI_MAG2] */
+ ELFMAG3 = 'F' /* e_ident[EI_MAG3] */
+};
+
+/* EI_CLASS */
+enum
+{
+ ELFCLASSNONE = 0, /* Invalid class */
+ ELFCLASS32 = 1, /* 32-bit objects (the class described by the Elf32_* types) */
+ ELFCLASS64 = 2 /* 64-bit objects */
+};
+
+/* EI_DATA */
+enum
+{
+ ELFDATANONE = 0, /* Invalid data encoding */
+ ELFDATA2LSB = 1, /* Two's complement, little-endian data */
+ ELFDATA2MSB = 2 /* Two's complement, big-endian data */
+};
+
+/* EI_OSABI */
+enum
+{
+ ELFOSABI_NONE = 0, /* No extensions or unspecified */
+ ELFOSABI_HPUX = 1, /* Hewlett-Packard HP-UX */
+ ELFOSABI_NETBSD = 2, /* NetBSD */
+ ELFOSABI_LINUX = 3, /* Linux */
+ ELFOSABI_SOLARIS = 6, /* Sun Solaris */
+ ELFOSABI_AIX = 7, /* AIX */
+ ELFOSABI_IRIX = 8, /* IRIX */
+ ELFOSABI_FREEBSD = 9, /* FreeBSD */
+ ELFOSABI_TRU64 = 10, /* Compaq TRU64 UNIX */
+ ELFOSABI_MODESTO = 11, /* Novell Modesto */
+ ELFOSABI_OPENBSD = 12, /* OpenBSD */
+ ELFOSABI_OPENVMS = 13, /* OpenVMS */
+ ELFOSABI_NSK = 14, /* Hewlett-Packard Non-Stop Kernel */
+ ELFOSABI_AROS = 15 /* Amiga Research OS */
+};
+
+/*****************************************************************************/
+/* Program Header */
+/* PP. 2-2 */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* Program Header Data Structure */
+/*---------------------------------------------------------------------------*/
+struct Elf32_Phdr
+{
+ Elf32_Word p_type; /* Segment type (PT_* values) */
+ Elf32_Off p_offset; /* Segment file offset */
+ Elf32_Addr p_vaddr; /* Segment virtual address */
+ Elf32_Addr p_paddr; /* Segment physical address */
+ Elf32_Word p_filesz; /* Segment file image size */
+ Elf32_Word p_memsz; /* Segment memory image size */
+ Elf32_Word p_flags; /* Segment flags (PF_* bits) */
+ Elf32_Word p_align; /* Segment alignment */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Segment Types (value of "p_type") */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ PT_NULL = 0, /* Unused table entry */
+ PT_LOAD = 1, /* Loadable segment */
+ PT_DYNAMIC = 2, /* Dynamic linking information */
+ PT_INTERP = 3, /* Interpreter path string location */
+ PT_NOTE = 4, /* Location and size of auxiliary information */
+ PT_SHLIB = 5, /* Reserved by the ELF specification; semantics unspecified */
+ PT_PHDR = 6, /* Location and size of program header table */
+ PT_TLS = 7, /* Specifies the Thread-Local Storage template */
+ PT_LOOS = 0x60000000, /* First OS-specific value */
+ PT_HIOS = 0x6fffffff, /* Last OS-specific value */
+ PT_LOPROC = 0x70000000, /* First processor-specific value */
+ PT_HIPROC = 0x7fffffff /* Last processor-specific value */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Segment Permissions (value of "p_flags") */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ PF_X = 0x1, /* Execute */
+ PF_W = 0x2, /* Write */
+ PF_R = 0x4, /* Read */
+ PF_MASKOS = 0x0ff00000, /* OS-specific mask */
+ PF_MASKPROC = 0xf0000000 /* Processor-specific mask. NOTE: exceeds INT_MAX for 32-bit int; strict ISO C before C23 requires enumerators to fit in int */
+};
+
+/*****************************************************************************/
+/* Sections */
+/* PP. 1-9 */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* Section Header Data Structure */
+/*---------------------------------------------------------------------------*/
+struct Elf32_Shdr
+{
+ Elf32_Word sh_name; /* Section name (offset into string section) */
+ Elf32_Word sh_type; /* Section type (SHT_* values) */
+ Elf32_Word sh_flags; /* Section flags (SHF_* bits) */
+ Elf32_Addr sh_addr; /* Address in memory image */
+ Elf32_Off sh_offset; /* File offset of section data */
+ Elf32_Word sh_size; /* Size of the section in bytes */
+ Elf32_Word sh_link; /* Section header table index link; meaning depends on section type */
+ Elf32_Word sh_info; /* Extra information depending on section type */
+ Elf32_Word sh_addralign; /* Address alignment constraints */
+ Elf32_Word sh_entsize; /* Size of fixed-size entries in section */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Special Section Indexes */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ SHN_UNDEF = 0, /* Referenced by undefined values */
+ SHN_LORESERVE = 0xff00, /* First reserved index */
+ SHN_LOPROC = 0xff00, /* First processor-specific index */
+ SHN_HIPROC = 0xff1f, /* Last processor-specific index */
+ SHN_LOOS = 0xff20, /* First OS-specific index */
+ SHN_HIOS = 0xff3f, /* Last OS-specific index */
+ SHN_ABS = 0xfff1, /* Referenced by absolute values */
+ SHN_COMMON = 0xfff2, /* Referenced by common values */
+ SHN_XINDEX = 0xffff, /* Escape value: the real index is stored in a SHT_SYMTAB_SHNDX section */
+ SHN_HIRESERVE = 0xffff /* Last reserved index */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Section Types (value of "sh_type") */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ SHT_NULL = 0, /* Inactive section */
+ SHT_PROGBITS = 1, /* Application-specific information */
+ SHT_SYMTAB = 2, /* Symbol table */
+ SHT_STRTAB = 3, /* String table */
+ SHT_RELA = 4, /* Relocation entries (explicit addends) */
+ SHT_HASH = 5, /* Symbol hash table */
+ SHT_DYNAMIC = 6, /* Dynamic linking information */
+ SHT_NOTE = 7, /* Miscellaneous information */
+ SHT_NOBITS = 8, /* Contains no data in file */
+ SHT_REL = 9, /* Relocation entries (no expl. addends) */
+ SHT_SHLIB = 10, /* Reserved by the ELF specification; semantics unspecified */
+ SHT_DYNSYM = 11, /* Dynamic symbol table */
+ SHT_INIT_ARRAY = 14, /* Pointers to initialization functions */
+ SHT_FINI_ARRAY = 15, /* Pointers to termination functions */
+ SHT_PREINIT_ARRAY = 16, /* Pointers to pre-init functions */
+ SHT_GROUP = 17, /* Section group */
+ SHT_SYMTAB_SHNDX = 18, /* Section indexes for SHN_XINDEX refs. */
+ SHT_LOOS = 0x60000000, /* First OS-specific type */
+ SHT_HIOS = 0x6fffffff, /* Last OS-specific type */
+ SHT_LOPROC = 0x70000000, /* First processor-specific type */
+ SHT_HIPROC = 0x7fffffff, /* Last processor-specific type */
+ SHT_LOUSER = 0x80000000, /* First application-specific type. NOTE: exceeds INT_MAX for 32-bit int (not a strict pre-C23 ISO C enumerator) */
+ SHT_HIUSER = 0xffffffff /* Last application-specific type. NOTE: exceeds INT_MAX for 32-bit int (not a strict pre-C23 ISO C enumerator) */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Section Attribute Flags (value of "sh_flags") */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ SHF_WRITE = 0x1, /* Writable during process execution */
+ SHF_ALLOC = 0x2, /* Loaded into processor memory */
+ SHF_EXECINSTR = 0x4, /* Contains executable instructions */
+ SHF_MERGE = 0x10, /* Can be merged */
+ SHF_STRINGS = 0x20, /* Contains null-terminated strings */
+ SHF_INFO_LINK = 0x40, /* sh_info contains a section index */
+ SHF_LINK_ORDER = 0x80, /* Maintain section ordering */
+ SHF_OS_NONCONFORMING = 0x100, /* OS-specific processing required */
+ SHF_GROUP = 0x200, /* Member of a section group */
+ SHF_TLS = 0x400, /* Contains Thread-Local Storage */
+ SHF_MASKOS = 0x0ff00000, /* Mask of OS-specific flags */
+ SHF_MASKPROC = 0xf0000000 /* Mask for processor-specific flags. NOTE: exceeds INT_MAX for 32-bit int (not a strict pre-C23 ISO C enumerator) */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Section Group Flags */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ GRP_COMDAT = 0x1, /* Common data; only one is kept by linker */
+ GRP_MASKOS = 0x0ff00000, /* Mask for OS-specific group flags */
+ GRP_MASKPROC = 0xf0000000 /* Mask for processor-specific group flags. NOTE: exceeds INT_MAX for 32-bit int (not a strict pre-C23 ISO C enumerator) */
+};
+
+
+/*****************************************************************************/
+/* Symbol Table */
+/* PP. 1-18 */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* Symbol Table Entry Data Structure */
+/*---------------------------------------------------------------------------*/
+struct Elf32_Sym
+{
+ Elf32_Word st_name; /* String table offset for symbol name */
+ Elf32_Addr st_value; /* Symbol value */
+ Elf32_Word st_size; /* Symbol size */
+ uint8_t st_info; /* Symbol type and binding (see ELF32_ST_TYPE / ELF32_ST_BIND) */
+ uint8_t st_other; /* Symbol visibility (see ELF32_ST_VISIBILITY) */
+ Elf32_Half st_shndx; /* Index of the defining section, or a special SHN_* value */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Undefined Symbol Index */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ STN_UNDEF = 0 /* Index 0: the first symbol table entry is always undefined */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Symbol Binding and Type Utility Functions. */
+/*---------------------------------------------------------------------------*/
+static inline uint8_t ELF32_ST_BIND(uint8_t i) { return (uint8_t)(i >> 4); } /* high nibble of st_info: STB_* binding */
+static inline uint8_t ELF32_ST_TYPE(uint8_t i) { return (uint8_t)(i & 0x0f); } /* low nibble of st_info: STT_* type */
+static inline uint8_t ELF32_ST_INFO(uint8_t b, uint8_t t)
+ { return (uint8_t)((b << 4) | (t & 0x0f)); } /* pack binding b and type t into one st_info byte */
+static inline uint8_t ELF32_ST_VISIBILITY(uint8_t o) { return (uint8_t)(o & 0x03); } /* low two bits of st_other: STV_* */
+
+
+/*---------------------------------------------------------------------------*/
+/* Symbol Binding (value returned by ELF32_ST_BIND()) */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ STB_LOCAL = 0, /* Symbol does not have external linkage (not visible outside its object file) */
+ STB_GLOBAL = 1, /* Symbol has external linkage */
+ STB_WEAK = 2, /* Symbol has weak external linkage */
+ STB_LOOS = 10, /* First OS-specific binding */
+ STB_HIOS = 12, /* Last OS-specific binding */
+ STB_LOPROC = 13, /* First processor-specific binding */
+ STB_HIPROC = 15 /* Last processor-specific binding */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Symbol Types (value returned by ELF32_ST_TYPE()) */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ STT_NOTYPE = 0, /* Unspecified type */
+ STT_OBJECT = 1, /* Associated with a data object */
+ STT_FUNC = 2, /* Associated with a function or other executable code */
+ STT_SECTION = 3, /* Associated with a section */
+ STT_FILE = 4, /* Associated with a source file */
+ STT_COMMON = 5, /* Labels an uninitialized common block */
+ STT_TLS = 6, /* Specifies a thread-local storage entity */
+ STT_LOOS = 10, /* First OS-specific type */
+ STT_HIOS = 12, /* Last OS-specific type */
+ STT_LOPROC = 13, /* First processor-specific type */
+ STT_HIPROC = 15 /* Last processor-specific type */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Symbol Visibility (value returned by ELF32_ST_VISIBILITY()) */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ STV_DEFAULT = 0, /* Visibility specified by the symbol's binding type (STB_*) */
+ STV_INTERNAL = 1, /* Like STV_HIDDEN, with processor-specific semantics */
+ STV_HIDDEN = 2, /* Not visible to other components */
+ STV_PROTECTED = 3 /* Visible in other components but not preemptable */
+};
+
+/*****************************************************************************/
+/* Relocation */
+/* PP. 1-22 */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* Relocation Entries Data Structures */
+/*---------------------------------------------------------------------------*/
+struct Elf32_Rel
+{
+ Elf32_Addr r_offset; /* Offset of the relocatable value in the section */
+ Elf32_Word r_info; /* Symbol table index and relocation type (see ELF32_R_SYM / ELF32_R_TYPE) */
+};
+
+struct Elf32_Rela
+{
+ Elf32_Addr r_offset; /* Offset of the relocatable value in the section */
+ Elf32_Word r_info; /* Symbol table index and relocation type (see ELF32_R_SYM / ELF32_R_TYPE) */
+ Elf32_Sword r_addend; /* Signed constant addend used to compute new value */
+};
+
+/*---------------------------------------------------------------------------*/
+/* Relocation Symbol and Type Utility Functions. */
+/*---------------------------------------------------------------------------*/
+static inline uint32_t ELF32_R_SYM(uint32_t i) { return i >> 8; } /* symbol table index: upper 24 bits of r_info */
+static inline uint8_t ELF32_R_TYPE(uint32_t i) { return (uint8_t)(i & 0xFFu); } /* relocation type: low byte of r_info */
+static inline uint32_t ELF32_R_INFO(uint32_t s, uint8_t t)
+ { return (s << 8) | (uint32_t)t; } /* pack symbol index s and type t into r_info */
+
+
+/*****************************************************************************/
+/* Dynamic Section */
+/* PP. 2-8 */
+/*****************************************************************************/
+struct Elf32_Dyn
+{
+ Elf32_Sword d_tag; /* DT_* tag; determines which d_un member applies */
+ union
+ {
+ Elf32_Word d_val; /* Integer value */
+ Elf32_Addr d_ptr; /* Address value */
+ } d_un;
+};
+
+/* Name Value d_un Executable Shared Obj. */
+/* ---- ----- ---- ---------- ----------- */
+enum
+{
+ DT_NULL = 0, /* ignored mandatory mandatory */
+ DT_NEEDED = 1, /* d_val optional optional */
+ DT_PLTRELSZ = 2, /* d_val optional optional */
+ DT_PLTGOT = 3, /* d_ptr optional optional */
+ DT_HASH = 4, /* d_ptr mandatory mandatory */
+ DT_STRTAB = 5, /* d_ptr mandatory mandatory */
+ DT_SYMTAB = 6, /* d_ptr mandatory mandatory */
+ DT_RELA = 7, /* d_ptr mandatory optional */
+ DT_RELASZ = 8, /* d_val mandatory optional */
+ DT_RELAENT = 9, /* d_val mandatory optional */
+ DT_STRSZ = 10, /* d_val mandatory mandatory */
+ DT_SYMENT = 11, /* d_val mandatory mandatory */
+ DT_INIT = 12, /* d_ptr optional optional */
+ DT_FINI = 13, /* d_ptr optional optional */
+ DT_SONAME = 14, /* d_val ignored optional */
+ DT_RPATH = 15, /* d_val optional ignored */
+ DT_SYMBOLIC = 16, /* ignored ignored optional */
+ DT_REL = 17, /* d_ptr mandatory optional */
+ DT_RELSZ = 18, /* d_val mandatory optional */
+ DT_RELENT = 19, /* d_val mandatory optional */
+ DT_PLTREL = 20, /* d_val optional optional */
+ DT_DEBUG = 21, /* d_ptr optional ignored */
+ DT_TEXTREL = 22, /* ignored optional optional */
+ DT_JMPREL = 23, /* d_ptr optional optional */
+ DT_BIND_NOW = 24, /* ignored optional optional */
+ DT_INIT_ARRAY = 25, /* d_ptr optional optional */
+ DT_FINI_ARRAY = 26, /* d_ptr optional optional */
+ DT_INIT_ARRAYSZ = 27, /* d_val optional optional */
+ DT_FINI_ARRAYSZ = 28, /* d_val optional optional */
+ DT_RUNPATH = 29, /* d_val optional optional */
+ DT_FLAGS = 30, /* d_val optional optional */
+ DT_ENCODING = 32, /* unspecified unspecified unspecified (range marker; intentionally the same value as DT_PREINIT_ARRAY) */
+ DT_PREINIT_ARRAY = 32, /* d_ptr optional ignored */
+ DT_PREINIT_ARRAYSZ = 33, /* d_val optional ignored */
+ DT_LOOS = 0x60000000, /* unspecified unspecified unspecified */
+ DT_HIOS = 0x6ffff000, /* unspecified unspecified unspecified */
+ DT_LOPROC = 0x70000000, /* unspecified unspecified unspecified */
+ DT_HIPROC = 0x7fffffff /* unspecified unspecified unspecified */
+};
+
+
+/*---------------------------------------------------------------------------*/
+/* DT_FLAGS values. */
+/*---------------------------------------------------------------------------*/
+enum
+{
+ DF_ORIGIN = 0x01, /* loaded object may reference $ORIGIN subst. string */
+ DF_SYMBOLIC = 0x02, /* changes dynamic linker symbol resolution */
+ DF_TEXTREL = 0x04, /* do not allow relocation of non-writable segments */
+ DF_BIND_NOW = 0x08, /* do not use lazy binding */
+ DF_STATIC_TLS = 0x10, /* do not load this file dynamically */
+ DF_DIRECT_DEPENDENT = 0x20, /* limit global symbol lookup to dependent list */
+ DF_WORLD = 0x40 /* Linux style global symbol lookup, breadth-first */
+};
+
+
+/*---------------------------------------------------------------------------*/
+/* Dynamic Tag Database. */
+/*---------------------------------------------------------------------------*/
+
+/* Specifiers for which d_un union member to use */
+
+enum
+{
+ EDYN_UNTYPE_IGNORED, /* d_un is ignored for this tag */
+ EDYN_UNTYPE_VAL, /* tag uses d_un.d_val */
+ EDYN_UNTYPE_PTR, /* tag uses d_un.d_ptr */
+ EDYN_UNTYPE_UNSPECIFIED /* d_un interpretation is unspecified */
+};
+
+
+/* Specifiers for executable/shared object file requirements */
+
+enum
+{
+ EDYN_TAGREQ_IGNORED, /* tag is ignored in this kind of file */
+ EDYN_TAGREQ_MANDATORY, /* tag is mandatory in this kind of file */
+ EDYN_TAGREQ_OPTIONAL, /* tag is optional in this kind of file */
+ EDYN_TAGREQ_UNSPECIFIED /* requirement is unspecified */
+};
+
+
+/* Data structure for one dynamic tag database entry */
+
+struct EDYN_TAG
+{
+ const char* d_tag_name; /* tag name string */
+ Elf32_Sword d_tag_value; /* DT_* tag value */
+ Elf32_Word d_untype; /* which d_un union member to use (EDYN_UNTYPE_*) */
+ Elf32_Word d_exec_req; /* requirement for executable files (EDYN_TAGREQ_*) */
+ Elf32_Word d_shared_req; /* requirement for shared object files (EDYN_TAGREQ_*) */
+};
+
+extern const struct EDYN_TAG EDYN_TAG_DB[]; /* dynamic tag database; defined outside this header */
+
+/*****************************************************************************/
+/* Special Section Database */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* Special Section Names */
+/*---------------------------------------------------------------------------*/
+#define ESCN_BSS_name ".bss"
+#define ESCN_COMMENT_name ".comment"
+#define ESCN_DATA1_name ".data1"
+#define ESCN_DATA_name ".data"
+#define ESCN_DEBUG_name ".debug"
+#define ESCN_DYNAMIC_name ".dynamic"
+#define ESCN_DYNSTR_name ".dynstr"
+#define ESCN_DYNSYM_name ".dynsym"
+#define ESCN_FINI_ARRAY_name ".fini_array"
+#define ESCN_FINI_name ".fini"
+#define ESCN_GOT_name ".got"
+#define ESCN_HASH_name ".hash"
+#define ESCN_INIT_ARRAY_name ".init_array"
+#define ESCN_INIT_name ".init"
+#define ESCN_INTERP_name ".interp"
+#define ESCN_LINE_name ".line"
+#define ESCN_NOTE_name ".note"
+#define ESCN_PLT_name ".plt"
+#define ESCN_PREINIT_ARRAY_name ".preinit_array"
+#define ESCN_RELA_name ".rela"
+#define ESCN_REL_name ".rel"
+#define ESCN_RODATA1_name ".rodata1"
+#define ESCN_RODATA_name ".rodata"
+#define ESCN_SHSTRTAB_name ".shstrtab"
+#define ESCN_STRTAB_name ".strtab"
+#define ESCN_SYMTAB_SHNDX_name ".symtab_shndx"
+#define ESCN_SYMTAB_name ".symtab"
+#define ESCN_TBSS_name ".tbss"
+#define ESCN_TDATA1_name ".tdata1"
+#define ESCN_TDATA_name ".tdata"
+#define ESCN_TEXT_name ".text"
+#define ESCN_ATTRIBUTES_name "__TI_build_attributes" /* TI-specific name; its ESCN_DB entry is compiled out with #if 0 */
+#define ESCN_ICODE_name "__TI_ICODE" /* presumably TI-specific (__TI_ prefix) -- confirm with the toolchain docs */
+#define ESCN_XREF_name "__TI_XREF" /* presumably TI-specific (__TI_ prefix) -- confirm with the toolchain docs */
+
+/*---------------------------------------------------------------------------*/
+/* Special Section Information Data Structure. */
+/*---------------------------------------------------------------------------*/
+struct ESCN
+{
+ const char *name; /* special section name (one of the ESCN_*_name strings) */
+ Elf32_Word sh_type; /* section type for that name (SHT_* value) */
+ Elf32_Word sh_entsize; /* entry size for that name (0 when the table lists none) */
+ Elf32_Word sh_flags; /* section flags for that name (sum of SHF_* bits) */
+};
+
+extern const struct ESCN ESCN_DB[]; /* special section table; the last entry has a NULL name */
+
+#endif /* ELF32_H */
diff --git a/src/core/dsp/ocl_load/DLOAD/relocate.h b/src/core/dsp/ocl_load/DLOAD/relocate.h
new file mode 100644
index 0000000..ee21aa9
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/relocate.h
@@ -0,0 +1,64 @@
+/*
+* relocate.h
+*
+* Declare names and IDs of all C6x-specific relocation types supported
+* in the dynamic loader.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef RELOCATE_H
+#define RELOCATE_H
+
+#include <inttypes.h>
+#include "elf32.h"
+#include "dload.h"
+#include "dload_api.h"
+
+/*---------------------------------------------------------------------------*/
+/* Declare some globals that are used for internal debugging and profiling. */
+/* */
+/* The declarations are visible only when LOADER_DEBUG or LOADER_PROFILE */
+/* evaluates to non-zero (an undefined macro counts as 0 inside #if). */
+/* These are extern declarations only; the objects are defined elsewhere. */
+/*---------------------------------------------------------------------------*/
+#if LOADER_DEBUG || LOADER_PROFILE
+#include <time.h>
+extern int DLREL_relocations; /* presumably a relocation count -- confirm */
+extern time_t DLREL_total_reloc_time; /* presumably accumulated time -- confirm */
+#endif
+
+
+/*---------------------------------------------------------------------------*/
+/* Landing point for core loader's relocation processor. */
+/* */
+/* handle - loader core handle (DLOAD_HANDLE). */
+/* fd - loader file descriptor; presumably the object file being */
+/* relocated -- confirm against the implementation. */
+/* dyn_module - dynamic module record the relocations belong to. */
+/* */
+/* Returns nothing; how failures are reported is not visible in this header. */
+/*---------------------------------------------------------------------------*/
+void DLREL_relocate(DLOAD_HANDLE handle, LOADER_FILE_DESC *fd,
+ DLIMP_Dynamic_Module *dyn_module);
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/symtab.h b/src/core/dsp/ocl_load/DLOAD/symtab.h
new file mode 100644
index 0000000..1f06584
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/symtab.h
@@ -0,0 +1,72 @@
+/*
+* symtab.h
+*
+* Specification of functions used by the core loader to create, maintain,
+* and destroy internal symbol tables.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef SYMTAB_H
+#define SYMTAB_H
+
+#include "ArrayList.h"
+#include "dload.h"
+
+/*****************************************************************************/
+/* This is the top-level application file handle. It should only be needed */
+/* under the Linux and DSBT models. */
+/*****************************************************************************/
+extern int32_t DLIMP_application_handle;
+
+/*---------------------------------------------------------------------------*/
+/* Core Loader Symbol Table Management Functions */
+/* */
+/* The three lookup functions return BOOL and take an Elf32_Addr pointer, */
+/* sym_value. Presumably the return value says whether the symbol was */
+/* found and *sym_value receives its address on success -- TODO confirm */
+/* against the implementation. */
+/*---------------------------------------------------------------------------*/
+
+/* Look up a symbol by its index (sym_index) relative to dyn_module. */
+BOOL DLSYM_canonical_lookup(DLOAD_HANDLE handle,
+ int32_t sym_index,
+ DLIMP_Dynamic_Module *dyn_module,
+ Elf32_Addr *sym_value);
+
+/* Look up a symbol by name. The role of pentry is not visible here; */
+/* presumably it identifies the loaded module making the request -- confirm. */
+BOOL DLSYM_global_lookup(DLOAD_HANDLE handle,
+ const char *sym_name,
+ DLIMP_Loaded_Module *pentry,
+ Elf32_Addr *sym_value);
+
+/* Look up a symbol by name in the caller-supplied table 'symtab'; symnum is */
+/* presumably the number of entries in that table -- confirm. */
+BOOL DLSYM_lookup_local_symtab(const char *sym_name,
+ struct Elf32_Sym *symtab,
+ Elf32_Word symnum,
+ Elf32_Addr *sym_value);
+
+/* Copy global symbols for dyn_module; the destination is not visible here. */
+void DLSYM_copy_globals(DLIMP_Dynamic_Module *dyn_module);
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/util.h b/src/core/dsp/ocl_load/DLOAD/util.h
new file mode 100644
index 0000000..24c5b3f
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/util.h
@@ -0,0 +1,89 @@
+/*
+* util.h
+*
+* Definition of some useful string comparison routines (not
+* provided on all platforms) and a few generic macros.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef UTIL_H
+#define UTIL_H
+
+#include <ctype.h>
+#include <stddef.h>
+
+#if !defined(__linux)
+
+/*****************************************************************************/
+/* STRCASECMP() - Case-insensitive strcmp. */
+/* */
+/* Compares the NUL-terminated strings s1 and s2 byte by byte after folding */
+/* each byte with tolower(). */
+/* */
+/* Returns 0 if the strings are equal ignoring case; otherwise returns the */
+/* difference between the first pair of folded bytes that differ (negative */
+/* when s1 orders before s2, positive when it orders after). */
+/*****************************************************************************/
+static int strcasecmp(const char* s1, const char* s2)
+{
+    /* Bytes are held as unsigned char: passing a negative plain char to */
+    /* tolower() is undefined behavior, and the ordering of bytes >= 0x80 */
+    /* must be based on their unsigned values. */
+    unsigned char c1, c2;
+
+    do { c1 = (unsigned char)*s1++; c2 = (unsigned char)*s2++; }
+    while (c1 && c2 && (tolower(c1) == tolower(c2)));
+
+    return tolower(c1) - tolower(c2);
+}
+
+/*****************************************************************************/
+/* STRNCASECMP() - Case-insensitive strncmp. */
+/* */
+/* Compares at most n bytes of the strings s1 and s2 after folding each */
+/* byte with tolower(). Comparison stops early at a NUL in either string. */
+/* */
+/* Returns 0 if n is 0 or the compared bytes are equal ignoring case; */
+/* otherwise returns the difference between the first pair of folded bytes */
+/* that differ. */
+/*****************************************************************************/
+static int strncasecmp(const char* s1, const char* s2, size_t n)
+{
+    /* Bytes are held as unsigned char: passing a negative plain char to */
+    /* tolower() is undefined behavior, and the ordering of bytes >= 0x80 */
+    /* must be based on their unsigned values. */
+    unsigned char c1, c2;
+
+    if (!n) return 0;
+
+    /* n is decremented after each pair is read, so exactly n pairs are */
+    /* examined at most. */
+    do { c1 = (unsigned char)*s1++; c2 = (unsigned char)*s2++; }
+    while (--n && c1 && c2 && (tolower(c1) == tolower(c2)));
+
+    return tolower(c1) - tolower(c2);
+}
+
+#endif
+
+/*****************************************************************************/
+/* Define MIN and MAX macros. */
+/* */
+/* Each argument appears twice in the expansion, so an argument with side */
+/* effects (e.g. i++) may be evaluated twice. When the two operands compare */
+/* equal, both MIN and MAX yield x. */
+/*****************************************************************************/
+#define MIN(x,y) (((x) > (y)) ? (y) : (x))
+#define MAX(x,y) (((x) >= (y)) ? (x) : (y))
+
+/*****************************************************************************/
+/* C implementation of 'bool' type. */
+/* */
+/* BOOL is a plain int, so any non-zero value is truthy in a condition but */
+/* only the value 1 compares equal to TRUE. Test with "if (x)" rather than */
+/* "if (x == TRUE)". */
+/*****************************************************************************/
+typedef int BOOL;
+#define TRUE 1
+#define FALSE 0
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/version.h b/src/core/dsp/ocl_load/DLOAD/version.h
new file mode 100644
index 0000000..e36d1a9
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/version.h
@@ -0,0 +1,63 @@
+/*
+* version.h
+*
+* Dynamic Loader source version identification.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef _VERSION_H_
+#define _VERSION_H_
+
+/*****************************************************************************/
+/* VERSION NUMBER COMPONENTS - ALWAYS INCREASING!! */
+/* Initial version ID is 1.0.0. Successive version ID's will be incremented */
+/* by automated processes during release port. */
+/* */
+/* NOTE(review): DLOAD_API/api_version_change.log records a 2.0.0 entry, */
+/* while the components below still read 1.0.0 -- confirm which is current. */
+/*****************************************************************************/
+#define VERSION_MAJOR 1
+#define VERSION_MINOR 0
+#define VERSION_PATCH 0
+
+/******************************************************************************/
+/* Macros used to convert version macros into strings. */
+/* */
+/* MKCSTR stringifies its argument exactly as written. MKMSTR adds one */
+/* level of indirection so that a macro argument is expanded before it is */
+/* stringified, e.g. MKMSTR(VERSION_MAJOR) yields "1". */
+/******************************************************************************/
+#define MKCSTR(_str) #_str
+#define MKMSTR(_str) MKCSTR(_str)
+
+/******************************************************************************/
+/* VERSION string construction macros. */
+/* */
+/* VERSTR is "<major>.<minor>.<patch>", built by adjacent string literal */
+/* concatenation. VERSION prefixes it with the product name. */
+/* */
+/* The space between the string literal and VERSTR keeps this header valid */
+/* when compiled as C++11 or later, where "..."VERSTR would be lexed as a */
+/* string literal with a user-defined suffix. */
+/******************************************************************************/
+#define VERSTR MKMSTR(VERSION_MAJOR) "." MKMSTR(VERSION_MINOR) "." MKMSTR(VERSION_PATCH)
+#define VERSION "Texas Instruments Dynamic Loader API/Core v" VERSTR
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD/virtual_targets.h b/src/core/dsp/ocl_load/DLOAD/virtual_targets.h
new file mode 100644
index 0000000..1d44b4d
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD/virtual_targets.h
@@ -0,0 +1,90 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef VIRTUAL_TARGETS_H
+#define VIRTUAL_TARGETS_H
+
+#include "dload.h"
+#include "elf32.h"
+
+#ifdef C60_TARGET
+#include "c60_dynamic.h"
+#include "c60_reloc.h"
+#endif
+
+#ifdef ARM_TARGET
+#include "arm_dynamic.h"
+#include "arm_reloc.h"
+#endif
+
+/*****************************************************************************/
+/* Define a virtual target class to give access to target specific functions */
+/* */
+/* machine_id holds one of the EM_* machine constants (see the table */
+/* below); the remaining members are pointers to that target's handlers. */
+/*****************************************************************************/
+typedef struct vtarget
+{
+    int machine_id;
+
+    BOOL (*relocate_dynamic_tag_info)(DLIMP_Dynamic_Module *dyn_module, int i);
+    BOOL (*process_eiosabi)(DLIMP_Dynamic_Module* dyn_module);
+    BOOL (*process_dynamic_tag)(DLIMP_Dynamic_Module *dyn_module, int i);
+    void (*relocate)(DLOAD_HANDLE handle, LOADER_FILE_DESC *elf_file,
+                     DLIMP_Dynamic_Module *dyn_module);
+
+} VIRTUAL_TARGET;
+
+
+
+/*****************************************************************************/
+/* Populate this for each target supported. */
+/* */
+/* One entry is compiled in per enabled target (C60_TARGET, ARM_TARGET). */
+/* The table always ends with an EM_NONE entry whose handlers are all null. */
+/* */
+/* NOTE(review): this is a definition with external linkage inside a header. */
+/* The include guard prevents a double definition within one translation */
+/* unit, but including this header from more than one source file would */
+/* still produce duplicate definitions of vt_arr at link time -- confirm */
+/* that only one source file includes it. */
+/*****************************************************************************/
+VIRTUAL_TARGET vt_arr[] = {
+
+#ifdef C60_TARGET
+    {
+        EM_TI_C6000,
+        DLDYN_c60_relocate_dynamic_tag_info,
+        DLDYN_c60_process_eiosabi,
+        DLDYN_c60_process_dynamic_tag,
+        DLREL_c60_relocate
+    },
+#endif
+#ifdef ARM_TARGET
+    {
+        EM_ARM,
+        DLDYN_arm_relocate_dynamic_tag_info,
+        DLDYN_arm_process_eiosabi,
+        DLDYN_arm_process_dynamic_tag,
+        DLREL_arm_relocate
+    },
+#endif
+    {
+        EM_NONE,
+        0,
+        0,
+        0,
+        0
+    }
+};
+
+#endif /* VIRTUAL_TARGETS_H */
+
+
diff --git a/src/core/dsp/ocl_load/DLOAD_API/api_version_change.log b/src/core/dsp/ocl_load/DLOAD_API/api_version_change.log
new file mode 100644
index 0000000..689cfe6
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD_API/api_version_change.log
@@ -0,0 +1,33 @@
+
+ Dynamic Loader API and Loader Core - Version Number Change Log
+ ==============================================================
+
+ Version Number Description
+ --------------------------
+
+ The version number associated with the Dynamic Loader API and the Loader Core
+ sources has three components:
+
+ <major version>.<minor version>.<patch version>
+
+ major version - is incremented if there is a change to the API that creates a
+ compatibility discontinuity.
+
+ minor version - is incremented if functionality is added to the API without
+ causing a compatibility discontinuity.
+
+ patch version - is incremented if a defect has been repaired, a performance
+ enhancement has been added, or the source code has been
+ refactored in some way. There should not be a compatibility
+ discontinuity created by an increment to the patch version.
+
+ Version Number Change Log
+ -------------------------
+
+ 1.0.0 - 17 July 2009 - Initial release of dynamic loader API and loader
+ core sources.
+
+ 2.0.0 - 1 Feb 2013 - Add client handle to several DLIF functions.
+ - Add DLIF_exit() for loader abort.
+
+
diff --git a/src/core/dsp/ocl_load/DLOAD_API/dload_api.h b/src/core/dsp/ocl_load/DLOAD_API/dload_api.h
new file mode 100644
index 0000000..95de10f
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD_API/dload_api.h
@@ -0,0 +1,700 @@
+/*
+* dload_api.h
+*
+* Dynamic Loader API Specification
+* --------------------------------
+*
+* Client-side of API is assumed to be platform dependent, but object file
+* format independent.
+*
+* Core Loader side of API is assumed to be platform independent, but
+* object file format dependent and target dependent.
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef DLOAD_API_H
+#define DLOAD_API_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include "util.h"
+
+extern int debugging_on;
+
+/*****************************************************************************/
+/* Specification of Loader File Descriptor. If client side of the loader */
+/* supports virtual memory, this may need to be updated to facilitate the */
+/* use of mmap(). */
+/*****************************************************************************/
+typedef FILE LOADER_FILE_DESC;
+
+static const int LOADER_SEEK_SET = SEEK_SET;
+static const int LOADER_SEEK_CUR = SEEK_CUR;
+static const int LOADER_SEEK_END = SEEK_END;
+
+/*****************************************************************************/
+/* TARGET_ADDRESS - type suitable for storing target memory address values. */
+/*****************************************************************************/
+typedef uint32_t TARGET_ADDRESS;
+
+/*****************************************************************************/
+/* Define DLOAD Object Handle */
+/*****************************************************************************/
+typedef void * DLOAD_HANDLE;
+
+/*****************************************************************************/
+/* Core Loader Provided API Functions (Core Loader Entry Points) */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_version() */
+/* */
+/* Return a string constant representation for the version ID of the */
+/* dynamic loader's core loader source code. */
+/* */
+/*---------------------------------------------------------------------------*/
+#include "version.h"
+#define DLOAD_version() VERSION
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_create() */
+/* */
+/* Construct and initialize the dynamic loader core's handle. */
+/* */
+/*---------------------------------------------------------------------------*/
+DLOAD_HANDLE DLOAD_create(void * client_handle);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_destroy() */
+/* */
+/* Destroy and finalize the dynamic loader core's handle. */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLOAD_destroy(DLOAD_HANDLE handle);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_initialize() */
+/* */
+/* Construct and initialize data structures internal to the dynamic */
+/* loader core. */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLOAD_initialize(DLOAD_HANDLE handle);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_finalize() */
+/* */
+/* Destroy and finalize data structures internal to the dynamic */
+/* loader core. */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLOAD_finalize(DLOAD_HANDLE handle);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_load_symbols() */
+/* */
+/* Load externally visible symbols from the specified file so that they */
+/* can be linked against when another object file is subsequently loaded. */
+/* External symbols will be made available for global symbol linkage. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_load_symbols(DLOAD_HANDLE handle, LOADER_FILE_DESC* fp);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_load() */
+/* */
+/* Dynamically load the specified file and return a file handle for the */
+/* loaded file. If the load fails, this function will return a value */
+/* zero (0). */
+/* */
+/* The core loader must have read access to the file pointed by fp. */
+/* */
+/*---------------------------------------------------------------------------*/
+int DLOAD_load(DLOAD_HANDLE handle, LOADER_FILE_DESC* fp);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_unload() */
+/* */
+/* Given a file handle ID, unload all object segments associated with */
+/* the identified file and any of its dependents that are not still in */
+/* use. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_unload(DLOAD_HANDLE handle, uint32_t pseudopid);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_get_entry_names_info() */
+/* */
+/* Given a file handle, return the number of entry points that are */
+/* available in the specified file as well as the max name length. This */
+/* can then be used by the client to allocate the appropriate amount of */
+/* memory needed to call DLOAD_get_entry_names() */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_get_entry_names_info(DLOAD_HANDLE handle, uint32_t file_handle,
+ int32_t *entry_pt_cnt,
+ int32_t *entry_pt_max_name_len);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_get_entry_names() */
+/* */
+/* Given a file handle, build a list of entry point names that are */
+/* available in the specified file. This can be used when querying */
+/* the list of global functions available in a shared library. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_get_entry_names(DLOAD_HANDLE handle, uint32_t file_handle,
+ int32_t* entry_pt_cnt, char*** entry_pt_names);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_query_symbol() */
+/* */
+/* Query the value of a symbol that is defined by an object file that */
+/* has previously been loaded. Boolean return value will be false if */
+/* the symbol is not found. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_query_symbol(DLOAD_HANDLE handle, uint32_t file_handle,
+ const char *sym_name, TARGET_ADDRESS *sym_val);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_get_entry_point() */
+/* */
+/* Given a file handle, return the entry point target address associated */
+/* with that object file. The entry point address value is written to */
+/* *sym_val. The return value of the function indicates whether the */
+/* file with the specified handle was found or not. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_get_entry_point(DLOAD_HANDLE handle, uint32_t file_handle,
+ TARGET_ADDRESS *sym_val);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_load_arguments() */
+/* */
+/* Given a file handle, find the object file associated with that handle */
+/* and copy the argc/argv information from the client into that object */
+/* file's .args section. The return value indicates whether the operation */
+/* was successful. If there are no loaded object files which match the */
+/* handle or if there is insufficient space in the .args section to hold */
+/* the specified argc/argv information, the function will return false. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_load_arguments(DLOAD_HANDLE handle, uint32_t file_handle,
+ int argc, char** argv);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_prepare_for_execution() */
+/* */
+/* Given a file handle, prepare for execution : */
+/* - Return entry point associated with that module in the *sym_val */
+/* output parameter. */
+/* - Write out the given arguments to the .args section contained in the */
+/* same module. */
+/* - As a test (for the Reference implementation) read the arguments */
+/* using the DLIF_read_arguments() function and set global argc,argv. */
+/* */
+/* The return value of the function indicates whether the file with the */
+/* specified handle was found or not. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_prepare_for_execution(DLOAD_HANDLE handle, uint32_t file_handle,
+ TARGET_ADDRESS *sym_val,
+ int argc, char** argv);
+
+
+/*****************************************************************************/
+/* Client Provided API Functions */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* File I/O */
+/* */
+/* The client side of the dynamic loader must provide basic file I/O */
+/* capabilities so that the core loader has random access into any */
+/* object file that it is asked to load. */
+/* */
+/* The client side of the dynamic loader must provide a definition of */
+/* the LOADER_FILE_DESC in dload_filedefs.h. This allows the core loader */
+/* to be independent of how the client accesses raw data in an object */
+/* file. (In this version of the header, LOADER_FILE_DESC is instead */
+/* typedef'd to FILE near the top of this file.) */
+/* */
+/*---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_fseek() */
+/* */
+/* Seek to a position in a file (accessed via 'stream') based on the */
+/* values for offset and origin. */
+/* */
+/*---------------------------------------------------------------------------*/
+int DLIF_fseek(LOADER_FILE_DESC *stream, int32_t offset, int origin);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_ftell() */
+/* */
+/* Return the current file position in the file identified in the */
+/* LOADER_FILE_DESC pointed to by 'stream'. */
+/* */
+/*---------------------------------------------------------------------------*/
+int32_t DLIF_ftell(LOADER_FILE_DESC *stream);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_fread() */
+/* */
+/* Read 'size' * 'nmemb' bytes of data from the file identified in the */
+/* LOADER_FILE_DESC object pointed to by 'stream', and write that data */
+/* into the memory accessed via 'ptr'. */
+/* */
+/*---------------------------------------------------------------------------*/
+size_t DLIF_fread(void *ptr, size_t size, size_t nmemb,
+ LOADER_FILE_DESC *stream);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_fclose() */
+/* */
+/* Close a file that was opened on behalf of the core loader. Ownership */
+/* of the file pointer in question belongs to the core loader, but the */
+/* client has exclusive access to the file system. */
+/* */
+/*---------------------------------------------------------------------------*/
+int DLIF_fclose(LOADER_FILE_DESC *fd);
+
+/*---------------------------------------------------------------------------*/
+/* Host Memory Management */
+/* */
+/* Allocate and free host memory as needed for the dynamic loader's */
+/* internal data structures. If the dynamic loader resides on the */
+/* target architecture, then this memory is allocated from a target */
+/* memory heap that must be managed separately from memory that is */
+/* allocated for a dynamically loaded object file. */
+/* */
+/*---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_malloc() */
+/* */
+/* Allocate 'size' bytes of memory space that is usable as scratch space */
+/* (appropriate for the loader's internal data structures) by the dynamic */
+/* loader. */
+/* */
+/* If allocation fails, this function must not return. */
+/* */
+/*---------------------------------------------------------------------------*/
+void* DLIF_malloc(size_t size);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_free() */
+/* */
+/* Free memory space that was previously allocated by DLIF_malloc(). */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLIF_free(void* ptr);
+
+/*---------------------------------------------------------------------------*/
+/* Target Memory Allocator Interface */
+/* */
+/* The client side of the dynamic loader must create and maintain an */
+/* infrastructure to manage target memory. The client must keep track */
+/* of what target memory is associated with each object segment, */
+/* allocating target memory for newly loaded objects and release target */
+/* memory that is associated with objects that are being unloaded from */
+/* the target architecture. */
+/* */
+/* The two client-supplied functions, DLIF_allocate() and DLIF_release(), */
+/* are used by the core loader to interface into the client side's */
+/* target memory allocator infrastructure. */
+/* */
+/*---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_SEGMENT_FLAGS - segment characteristics. */
+/*---------------------------------------------------------------------------*/
+typedef uint32_t DLOAD_SEGMENT_FLAGS;
+static const int DLOAD_SF_executable = 0x1; /* Memory must be executable */
+static const int DLOAD_SF_relocatable = 0x2; /* Segment must be relocatable */
+static const int DLOAD_SF_writable = 0x4; /* Memory must be writable */
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_MEMORY_SEGMENT - Define structure to represent placement and size */
+/* details of a segment to be loaded. */
+/*---------------------------------------------------------------------------*/
+struct DLOAD_MEMORY_SEGMENT
+{
+ uint32_t target_page; /* requested/returned memory page */
+ TARGET_ADDRESS target_address; /* requested/returned address */
+ uint32_t objsz_in_bytes; /* size of init'd part of segment */
+ uint32_t memsz_in_bytes; /* size of memory block for segment */
+/* 'flags' is not a member here; DLOAD_MEMORY_REQUEST carries the request flags */
+};
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_MEMORY_REQUEST - Define structure to represent a target memory */
+/* request made by the core loader on behalf of a segment that the */
+/* loader needs to relocate and write into target memory. */
+/*---------------------------------------------------------------------------*/
+struct DLOAD_MEMORY_REQUEST
+{
+ LOADER_FILE_DESC *fp; /* file being loaded */
+ struct DLOAD_MEMORY_SEGMENT *segment; /* segment object holding the */
+ void *host_address; /* host pointer returned from DLIF_copy() */
+ BOOL is_loaded; /* returned as true if segment */
+ /* is already in target memory */
+ uint32_t offset; /* file offset of segment's */
+ /* raw data */
+ uint32_t flip_endian; /* target endianness is opposite of host's */
+ DLOAD_SEGMENT_FLAGS flags; /* allocation request flags */
+ uint32_t align; /* alignment of target memory block */
+};
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_initMem() */
+/* */
+/* Given an address and size, initialize the memory used to load the */
+/* dynamic segments. This should be called by the client before */
+/* beginning dynamic loading. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_initMem(void* client_handle, uint32_t dynMemAddr, uint32_t size);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_deinitMem() */
+/* */
+/* De-initialize the memory used to load the dynamic segments. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_deinitMem(void* client_handle);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_allocate() */
+/* */
+/* Given a DLOAD_MEMORY_REQUEST created by the core loader, allocate */
+/* target memory to fulfill the request using the target memory */
+/* management infrastructure on the client side of the dynamic loader. */
+/* The contents of the DLOAD_MEMORY_REQUEST will be updated per the */
+/* details of a successful allocation. The allocated page and address */
+/* can be found in the DLOAD_MEMORY_SEGMENT attached to the request. */
+/* The boolean return value reflects whether the allocation was */
+/* successful or not. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_allocate(void* client_handle, struct DLOAD_MEMORY_REQUEST *req);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_release() */
+/* */
+/* Given a DLOAD_MEMORY_SEGMENT description, free the target memory */
+/* associated with the segment using the target memory management */
+/* infrastructure on the client side of the dynamic loader. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_release(void* client_handle, struct DLOAD_MEMORY_SEGMENT* ptr);
+
+/*---------------------------------------------------------------------------*/
+/* Target Memory Access / Write Services */
+/* */
+/* The client side's target memory allocator infrastructure communicates */
+/* with the core loader through the DLOAD_MEMORY_REQUEST and */
+/* DLOAD_MEMORY_SEGMENT data structures defined above. To complete the */
+/* loading of an object segment, the segment may need to be relocated */
+/* before it is actually written to target memory in the space that was */
+/* allocated for it by DLIF_allocate(). */
+/* */
+/* The client side of the dynamic loader provides two functions to help */
+/* complete the process of loading an object segment, DLIF_copy() and */
+/* DLIF_write(). */
+/* */
+/* These functions help to make the core loader truly independent of */
+/* whether it is running on the host or target architecture and how the */
+/* client provides for reading/writing from/to target memory. */
+/* */
+/*---------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
+/* DLIF_copy() */
+/* */
+/* Copy segment data from the object file described in the 'fp' and */
+/* 'offset' of the DLOAD_MEMORY_REQUEST into host accessible memory so */
+/* that it can be relocated or otherwise manipulated by the core loader. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_copy(void* client_handle, struct DLOAD_MEMORY_REQUEST* req);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_write() */
+/* */
+/* Once the segment data described in the DLOAD_MEMORY_REQUEST is ready */
+/* (relocated, if needed), write the segment contents to the target */
+/* memory identified in the DLOAD_MEMORY_SEGMENT attached to the request. */
+/* */
+/* After the segment contents have been written to target memory, the */
+/* core loader should discard the DLOAD_MEMORY_REQUEST object, but retain */
+/* the DLOAD_MEMORY_SEGMENT object so that the target memory associated */
+/* with the segment can be released when the segment is unloaded. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_write(void* client_handle, struct DLOAD_MEMORY_REQUEST* req);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_read() */
+/* */
+/* Given a host accessible buffer, read content of indicated target */
+/* memory address into the buffer. */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_read(void* client_handle,
+ void *ptr, size_t size, size_t nmemb, TARGET_ADDRESS src);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_memcpy() */
+/* */
+/* Given a host accessible buffer, copy content from specified buffer */
+/* into target memory. */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_memcpy(void* client_handle, void *to, void *from, size_t size);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_execute() */
+/* */
+/* Start execution on the target architecture from given 'exec_addr'. */
+/* If the dynamic loader is running on the target architecture, this can */
+/* be effected as a simple function call. */
+/* */
+/*---------------------------------------------------------------------------*/
+int32_t DLIF_execute(void* client_handle, TARGET_ADDRESS exec_addr);
+
+/*---------------------------------------------------------------------------*/
+/* Loading and Unloading of Dependent Files */
+/* */
+/* The dynamic loader core loader must coordinate loading and unloading */
+/* dependent object files with the client side of the dynamic loader. */
+/* This allows the client to keep its bookkeeping information up to date */
+/* with what is currently loaded on the target architecture. */
+/* */
+/* For instance, the client may need to interact with a file system or */
+/* registry. The client may also need to update debug information in */
+/* synch with the loading and unloading of shared objects. */
+/* */
+/*---------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
+/* DLIF_load_dependent() */
+/* */
+/* Ask client to find and open a dependent file identified by the */
+/* 'so_name' parameter, then, if necessary, initiate a DLOAD_load() */
+/* call to actually load the shared object onto the target. A */
+/* successful load will return a file handle ID that the client can */
+/* associate with the newly loaded file. */
+/* */
+/*---------------------------------------------------------------------------*/
+int DLIF_load_dependent(void* client_handle, const char* so_name);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_unload_dependent() */
+/* */
+/* Ask client to unload a dependent file identified by the 'file_handle' */
+/* parameter. Initiate a call to DLOAD_unload() to actually free up */
+/* the target memory that was occupied by the object file. */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLIF_unload_dependent(void* client_handle, uint32_t file_handle);
+
+/*---------------------------------------------------------------------------*/
+/* Error/Warning Registration Functions */
+/* */
+/* The client will maintain an error/warning log. This will allow the */
+/* core loader to register errors and warnings in the load during a */
+/* given dynamic load. The client is required to check the log after */
+/* each load attempt to report any problems. */
+/* */
+/*---------------------------------------------------------------------------*/
+
+
+/*---------------------------------------------------------------------------*/
+/* Loader Warning Types */
+/*---------------------------------------------------------------------------*/
+typedef enum {
+ DLWT_MISC = 0, /* Miscellaneous warning */
+ DLWT_FILE /* Warning missing/invalid file information */
+} LOADER_WARNING_TYPE;
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_warning() */
+/* */
+/* Log a warning message with the client's error/warning handling */
+/* infrastructure. */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLIF_warning(LOADER_WARNING_TYPE wtype, const char *fmt, ...);
+
+/*---------------------------------------------------------------------------*/
+/* Loader Error Types */
+/*---------------------------------------------------------------------------*/
+typedef enum {
+ DLET_MISC = 0, /* Miscellaneous error */
+ DLET_FILE, /* Error reading/processing file */
+ DLET_SYMBOL, /* Symbol resolution error */
+ DLET_RELOC, /* Relocation error */
+ DLET_MEMORY, /* Host memory allocation/free error */
+ DLET_TRGMEM, /* Target memory allocation/free error */
+ DLET_DEBUG /* Shared object or DLL debug error */
+} LOADER_ERROR_TYPE;
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_error() */
+/* */
+/* Log an error message with the client's error/warning handling */
+/* infrastructure. */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLIF_error(LOADER_ERROR_TYPE etype, const char *fmt, ...);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_exit() */
+/* */
+/* Abort the loader following a fatal error. */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLIF_exit(int code);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_trace() */
+/* */
+/* Log a message with the client's trace handling infrastructure. */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLIF_trace(const char *fmt, ...);
+
+/*---------------------------------------------------------------------------*/
+/* Dynamic Static Base Table (DSBT) Support Functions */
+/*---------------------------------------------------------------------------*/
+#define DSBT_INDEX_INVALID (-1) /* parenthesized: negative literal is safe in any expression */
+#define DSBT_DSBT_BASE_INVALID 0
+#define DSBT_STATIC_BASE_INVALID 0
+
+/*****************************************************************************/
+/* Core Loader Side of DSBT Support */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_get_dsbt_size() */
+/* */
+/* Query the size of the DSBT associated with a specified file. The */
+/* client will check the size of a module's DSBT before it writes a copy */
+/* of the master DSBT to the module's DSBT. If the module's DSBT is not */
+/* big enough, an error will be emitted and the load will fail. */
+/* */
+/*---------------------------------------------------------------------------*/
+uint32_t DLOAD_get_dsbt_size(DLOAD_HANDLE handle, int32_t file_handle);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_get_dsbt_base() */
+/* */
+/* Find DSBT address for specified file. The client will query for this */
+/* address after allocation and symbol relocation has been completed. */
+/* The client will write a copy of the master DSBT to the returned DSBT */
+/* address if the module's DSBT size is big enough. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_get_dsbt_base(DLOAD_HANDLE handle, int32_t file_handle,
+ TARGET_ADDRESS *dsbt_base);
+
+/*---------------------------------------------------------------------------*/
+/* DLOAD_get_static_base() */
+/* */
+/* Find static base for a specified file. The client will query for this */
+/* address after allocation and symbol relocation has been completed. */
+/* The client will use the returned static base value to fill the slot */
+/* in the master DSBT that is associated with this module. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLOAD_get_static_base(DLOAD_HANDLE handle, int32_t file_handle,
+ TARGET_ADDRESS *static_base);
+
+
+/*****************************************************************************/
+/* Client Side of DSBT Support */
+/*****************************************************************************/
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_register_dsbt_index_request() */
+/* */
+/* Register a request for a DSBT index with the client. A module can */
+/* make a specific DSBT index request or it can allow the client to */
+/* assign a DSBT index on its behalf (requested_dsbt_index == -1). The */
+/* client implementation of this function must check that a specific DSBT */
+/* index request does not conflict with a previous specific DSBT index */
+/* request. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_register_dsbt_index_request(DLOAD_HANDLE handle,
+ const char *requestor_name,
+ int32_t requestor_file_handle,
+ int32_t requested_dsbt_index);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_assign_dsbt_indices() */
+/* */
+/* Bind each module that registered a request for a DSBT index to a */
+/* specific slot in the DSBT. Specific requests for DSBT indices will be */
+/* honored first. Any general requests that remain will be assigned to */
+/* the first available slot in the DSBT. */
+/* */
+/*---------------------------------------------------------------------------*/
+void DLIF_assign_dsbt_indices(void);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_get_dsbt_index() */
+/* */
+/* Given a module that uses the DSBT model, return the identity of the */
+/* DSBT slot that was assigned to it by the client. This function can */
+/* only be called after the client has assigned DSBT indices to all */
+/* loaded object modules that use the DSBT model. The implementation of */
+/* this function will check that a proper DSBT index has been assigned to */
+/* the specified module and return an invalid index (-1) if there is a problem. */
+/* */
+/*---------------------------------------------------------------------------*/
+int32_t DLIF_get_dsbt_index(int32_t file_handle);
+
+/*---------------------------------------------------------------------------*/
+/* DLIF_update_all_dsbts() */
+/* */
+/* Populate the client's model of the master DSBT with the static base */
+/* for each assigned slot in the DSBT, then write a copy of the master */
+/* DSBT to each module's DSBT location. The implementation of this */
+/* function must check the size of each module's DSBT to make sure that */
+/* it is large enough to hold a copy of the master DSBT. The function */
+/* will return FALSE if there is a problem. */
+/* */
+/*---------------------------------------------------------------------------*/
+BOOL DLIF_update_all_dsbts(void);
+
+#endif
diff --git a/src/core/dsp/ocl_load/DLOAD_SYM/symtab.c b/src/core/dsp/ocl_load/DLOAD_SYM/symtab.c
new file mode 100644
index 0000000..fbcdbeb
--- /dev/null
+++ b/src/core/dsp/ocl_load/DLOAD_SYM/symtab.c
@@ -0,0 +1,417 @@
+/*
+* symtab.c
+*
+* Symbol table creation, maintenance, and management. This module also
+* contains implementations of local and global symbol table lookup
+* algorithms, as appropriate for the platform that we are running on
+* (assumed to be DSP Bridge or Linux model, indicated by
+* direct_dependent_only flag in a given Module).
+*
+* Copyright (C) 2009-2014 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#include "elf32.h"
+#include "ArrayList.h"
+
+/*---------------------------------------------------------------------------*/
+/* Set up a Queue of Int32 type data objects. */
+/*---------------------------------------------------------------------------*/
+#include "Queue.h"
+TYPE_QUEUE_DEFINITION(int32_t, Int32)
+TYPE_QUEUE_IMPLEMENTATION(int32_t, Int32)
+
+#include "symtab.h"
+#include "dload_api.h"
+#include <string.h>
+
+/*---------------------------------------------------------------------------*/
+/* Holds the handle of the ET_EXEC-type module loaded, if any. */
+/*---------------------------------------------------------------------------*/
+int32_t DLIMP_application_handle = 0;
+
+/*---------------------------------------------------------------------------*/
+/* Function prototypes */
+/*---------------------------------------------------------------------------*/
+BOOL DLSYM_lookup_global_symtab(const char *sym_name, struct Elf32_Sym *symtab,
+ Elf32_Word symnum, Elf32_Addr *sym_value);
+
+/*****************************************************************************/
+/* DLSYM_COPY_GLOBALS() - Snapshot the global part of a dynamic module's */
+/* symbol table, and the matching tail of its string table, into */
+/* dyn_module->loaded_module (gsymtab/gsymnum and gstrtab/gstrsz). */
+/* Any gsymtab/gstrtab already held by the loaded module is freed first. */
+/*****************************************************************************/
+void DLSYM_copy_globals(DLIMP_Dynamic_Module *dyn_module)
+{
+   DLIMP_Loaded_Module *module = dyn_module->loaded_module;
+   Elf32_Word first_global, num_globals, n;
+
+#if LOADER_DEBUG
+   if (debugging_on)
+      DLIF_trace("DLSYM_copy_globals:\n");
+#endif
+
+   /*---------------------------------------------------------------------*/
+   /* Local symbols come before global symbols in the dynamic symbol */
+   /* table. Dividing gsymtab_offset by the entry size gives the index of */
+   /* the first global symbol; everything from there to the end is kept. */
+   /*---------------------------------------------------------------------*/
+   first_global = dyn_module->gsymtab_offset / sizeof(struct Elf32_Sym);
+   num_globals = dyn_module->symnum - first_global;
+
+   /*---------------------------------------------------------------------*/
+   /* Replace the loaded module's global symbol table with a fresh copy. */
+   /*---------------------------------------------------------------------*/
+   if (module->gsymtab)
+   {
+      DLIF_free(module->gsymtab);
+      module->gsymtab = NULL;
+   }
+
+   if (num_globals > 0)
+   {
+      module->gsymtab = DLIF_malloc(sizeof(struct Elf32_Sym) * num_globals);
+
+      memcpy(module->gsymtab,
+             &dyn_module->symtab[first_global],
+             sizeof(struct Elf32_Sym) * num_globals);
+   }
+   module->gsymnum = num_globals;
+
+   /*---------------------------------------------------------------------*/
+   /* Replace the loaded module's string table with the part of the */
+   /* dynamic string table that starts at gstrtab_offset. */
+   /*---------------------------------------------------------------------*/
+   if (module->gstrtab)
+   {
+      DLIF_free(module->gstrtab);
+      module->gstrtab = NULL;
+   }
+
+   module->gstrsz = dyn_module->strsz - dyn_module->gstrtab_offset;
+   if (module->gstrsz)
+   {
+      module->gstrtab = DLIF_malloc(module->gstrsz);
+
+      memcpy(module->gstrtab,
+             dyn_module->strtab + dyn_module->gstrtab_offset,
+             module->gstrsz);
+   }
+
+   /*---------------------------------------------------------------------*/
+   /* Re-point each copied entry's st_name at its name inside the new */
+   /* string table. st_name carries a full address (usable as char *), */
+   /* not a string table offset, both before and after this fix-up. */
+   /*---------------------------------------------------------------------*/
+   for (n = 0; n < num_globals; n++)
+   {
+      struct Elf32_Sym *src = &dyn_module->symtab[first_global + n];
+      struct Elf32_Sym *dst = &((struct Elf32_Sym*)(module->gsymtab))[n];
+      Elf32_Word src_offset = src->st_name - (Elf32_Addr) dyn_module->strtab;
+      Elf32_Word dst_offset = src_offset - dyn_module->gstrtab_offset;
+
+      dst->st_name = dst_offset + (Elf32_Addr)module->gstrtab;
+
+#if LOADER_DEBUG
+      if (debugging_on) DLIF_trace("Copying symbol: %s\n",
+                                   (char*)src->st_name);
+#endif
+   }
+}
+
+/*****************************************************************************/
+/* BREADTH_FIRST_LOOKUP() - Perform a breadth-first search of the Module */
+/* dependency graph to find specified symbol name (sym_name). */
+/* phandle - loader object whose DLIMP_loaded_objects list is consulted. */
+/* sym_name - name of the symbol to look for. */
+/* handle - file handle of the Module at which the search starts. */
+/* sym_value - passed through to DLSYM_lookup_global_symtab(). */
+/* Returns TRUE if a visited Module's symbol table has it, else FALSE. */
+/*****************************************************************************/
+static BOOL breadth_first_lookup(DLOAD_HANDLE phandle,
+                                 const char* sym_name,
+                                 int handle,
+                                 Elf32_Addr *sym_value)
+{
+   LOADER_OBJECT *dHandle = (LOADER_OBJECT *)phandle;
+   Int32_Queue file_handle_queue = TYPE_QUEUE_INITIALIZER;
+   BOOL found = FALSE;
+
+   /*---------------------------------------------------------------------*/
+   /* Seed the queue with the file handle at which the search starts. */
+   /*---------------------------------------------------------------------*/
+   Int32_enqueue(&file_handle_queue, handle);
+
+   while (!found && file_handle_queue.size)
+   {
+      int i;
+      int *dependencies;
+      loaded_module_ptr_Queue_Node *mod_node;
+
+      /*------------------------------------------------------------------*/
+      /* Pluck off the file handle at the front of the queue; it is next. */
+      /*------------------------------------------------------------------*/
+      handle = Int32_dequeue(&file_handle_queue);
+
+      /*------------------------------------------------------------------*/
+      /* Locate the Module for this handle by following next_ptr links */
+      /* from the front of the loaded-objects list. A handle with no */
+      /* matching Module is skipped (assumes a NULL next_ptr ends the list).*/
+      /*------------------------------------------------------------------*/
+      for (mod_node = dHandle->DLIMP_loaded_objects.front_ptr;
+           mod_node != NULL && mod_node->value->file_handle != handle;
+           mod_node = mod_node->next_ptr);
+      if (mod_node == NULL) continue;
+
+      /*------------------------------------------------------------------*/
+      /* Search this Module's global symbol table. On a miss, queue the */
+      /* handles of this Module's own dependents for a later visit. */
+      /*------------------------------------------------------------------*/
+      if (DLSYM_lookup_global_symtab(sym_name, mod_node->value->gsymtab,
+                                     mod_node->value->gsymnum, sym_value))
+         found = TRUE;
+      else
+      {
+         dependencies = (int*)(mod_node->value->dependencies.buf);
+         for (i = 0; i < mod_node->value->dependencies.size; i++)
+            Int32_enqueue(&file_handle_queue, dependencies[i]);
+      }
+   }
+
+   /*---------------------------------------------------------------------*/
+   /* Dequeue any handles still pending so an early hit leaves none queued.*/
+   /*---------------------------------------------------------------------*/
+   while (file_handle_queue.size)
+      Int32_dequeue(&file_handle_queue);
+   return found;
+}
+
+/*****************************************************************************/
+/* DLSYM_global_lookup() - Search the global symbol table to find the */
+/* definition of the given symbol name. */
+/* handle - loader object whose loaded-objects list is searched. */
+/* sym_name - name of the symbol to resolve. */
+/* loaded_module - requesting Module; its direct_dependent_only flag */
+/* selects the lookup algorithm. */
+/* sym_value - passed through to the symbol table lookup. */
+/* Returns TRUE if found; else logs a DLET_SYMBOL error and returns FALSE. */
+/*****************************************************************************/
+BOOL DLSYM_global_lookup(DLOAD_HANDLE handle,
+                         const char *sym_name,
+                         DLIMP_Loaded_Module *loaded_module,
+                         Elf32_Addr *sym_value)
+{
+   int i = 0;
+   loaded_module_ptr_Queue_Node* node;
+   LOADER_OBJECT *dHandle = (LOADER_OBJECT *)handle;
+
+#if LOADER_DEBUG
+   if (debugging_on)
+      DLIF_trace("DLSYM_global_lookup: %s\n", sym_name);
+#endif
+
+   /*---------------------------------------------------------------------*/
+   /* We will choose a different lookup algorithm based on what kind of */
+   /* platform we are supporting. In the direct_dependent_only case, the */
+   /* explicit children of the specified Module are searched. */
+   /*---------------------------------------------------------------------*/
+   if (loaded_module->direct_dependent_only)
+   {
+      int* child_handle = (int*)(loaded_module->dependencies.buf);
+
+      /*------------------------------------------------------------------*/
+      /* Spin through this Module's dependencies (anything on its */
+      /* DT_NEEDED list), searching each dependent's symbol table. A */
+      /* dependent missing from the loaded list (NULL-ended) is skipped. */
+      /*------------------------------------------------------------------*/
+      for (i = 0; i < loaded_module->dependencies.size; i++)
+      {
+         for (node = dHandle->DLIMP_loaded_objects.front_ptr;
+              node != NULL && node->value->file_handle != child_handle[i];
+              node = node->next_ptr);
+         if (node == NULL) continue;
+
+         if (DLSYM_lookup_global_symtab(sym_name, node->value->gsymtab,
+                                        node->value->gsymnum, sym_value))
+            return TRUE;
+      }
+   }
+
+   /*---------------------------------------------------------------------*/
+   /* In the LINUX model, we will use a breadth-first global symbol lookup */
+   /* algorithm. First, the application's global symbol table is searched, */
+   /* followed by its children, followed by their children, and so on. */
+   /* It is up to the client of this module to set the application handle. */
+   /*---------------------------------------------------------------------*/
+   else
+   {
+      if (breadth_first_lookup(handle, sym_name, DLIMP_application_handle,
+                               sym_value))
+         return TRUE;
+   }
+
+   /*---------------------------------------------------------------------*/
+   /* If we got this far, then symbol was not found. */
+   /*---------------------------------------------------------------------*/
+   DLIF_error(DLET_SYMBOL, "Could not resolve symbol %s!\n", sym_name);
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* DLSYM_lookup_symtab() - Scan the first 'symnum' entries of 'symtab' for a */
+/* defined symbol named 'sym_name'. Only STB_LOCAL entries qualify when */
+/* require_local_binding is set; only other bindings qualify when it is */
+/* clear. On a hit, st_value is stored and TRUE returned; on a miss, 0 is */
+/* stored and FALSE returned. A NULL sym_value is never written through. */
+/*****************************************************************************/
+static BOOL DLSYM_lookup_symtab(const char *sym_name, struct Elf32_Sym *symtab,
+                                Elf32_Word symnum, Elf32_Addr *sym_value,
+                                BOOL require_local_binding)
+{
+   Elf32_Addr sym_idx;
+
+#if LOADER_DEBUG
+   if (debugging_on)
+      DLIF_trace("DLSYM_lookup_symtab, sym to find : %s\n", sym_name);
+#endif
+
+   for (sym_idx = 0; sym_idx < symnum; sym_idx++)
+   {
+      struct Elf32_Sym *entry = &symtab[sym_idx];
+      BOOL is_local = (ELF32_ST_BIND(entry->st_info) == STB_LOCAL);
+
+#if LOADER_DEBUG
+      if (debugging_on)
+         DLIF_trace("\tPotential symbol match : %s\n",
+                    (char*)entry->st_name);
+#endif
+      /* Reject undefined entries, then wrong bindings, then wrong names. */
+      if (entry->st_shndx == SHN_UNDEF) continue;
+      if (require_local_binding ? !is_local : is_local) continue;
+      if (strcmp(sym_name, (char*)(entry->st_name)) != 0) continue;
+      if (sym_value) *sym_value = entry->st_value;
+      return TRUE;
+   }
+   if (sym_value) *sym_value = 0;
+   return FALSE;
+}
+
+/*****************************************************************************/
+/* DLSYM_lookup_global_symtab() - Look up 'sym_name' among the entries of the */
+/* given symbol table whose binding is not STB_LOCAL. Returns whatever */
+/* DLSYM_lookup_symtab() returns (TRUE on a hit, with *sym_value set). */
+/*****************************************************************************/
+BOOL DLSYM_lookup_global_symtab(const char *sym_name, struct Elf32_Sym *symtab,
+                                Elf32_Word symnum, Elf32_Addr *sym_value)
+{
+   const BOOL local_only = FALSE;
+   return DLSYM_lookup_symtab(sym_name, symtab, symnum, sym_value, local_only);
+}
+
+/*****************************************************************************/
+/* DLSYM_lookup_local_symtab() - Look up 'sym_name' among the entries of the */
+/* given symbol table whose binding is STB_LOCAL. Returns whatever */
+/* DLSYM_lookup_symtab() returns (TRUE on a hit, with *sym_value set). */
+/*****************************************************************************/
+BOOL DLSYM_lookup_local_symtab(const char *sym_name, struct Elf32_Sym *symtab,
+                               Elf32_Word symnum, Elf32_Addr *sym_value)
+{
+   const BOOL local_only = TRUE;
+   return DLSYM_lookup_symtab(sym_name, symtab, symnum, sym_value, local_only);
+}
+
+/*****************************************************************************/
+/* CANONICAL_SYMBOL_LOOKUP() - Resolve the symbol at index 'sym_index' of */
+/* the dynamic module's symbol table. */
+/* */
+/* A local symbol, or one whose visibility is not STV_DEFAULT, must be */
+/* defined in this module and resolves to its own st_value. Any other */
+/* symbol (a pre-emptable definition or an undefined reference) is */
+/* resolved by DLSYM_global_lookup(). */
+/* */
+/* On the same-module path the value is stored only if sym_value != NULL. */
+/* Returns TRUE if the symbol was resolved, FALSE otherwise. */
+/*****************************************************************************/
+BOOL DLSYM_canonical_lookup(DLOAD_HANDLE handle, int sym_index,
+                            DLIMP_Dynamic_Module *dyn_module,
+                            Elf32_Addr *sym_value)
+{
+   /*---------------------------------------------------------------------*/
+   /* Lookup the symbol table to get the symbol characteristics. */
+   /*---------------------------------------------------------------------*/
+   struct Elf32_Sym *sym = &dyn_module->symtab[sym_index];
+   const char *sym_name = (char *)sym->st_name;
+   int32_t st_bind = ELF32_ST_BIND(sym->st_info);
+   int32_t st_vis = ELF32_ST_VISIBILITY(sym->st_other);
+   BOOL is_def;
+
+#if LOADER_DEBUG
+   if (debugging_on)
+      DLIF_trace("DLSYM_canonical_lookup: %d, %s\n", sym_index, sym_name);
+#endif
+
+   /*---------------------------------------------------------------------*/
+   /* A non-local symbol with default visibility is either a pre-emptable */
+   /* definition or an undefined symbol, so it needs a global lookup. */
+   /*---------------------------------------------------------------------*/
+   if (st_bind != STB_LOCAL && st_vis == STV_DEFAULT)
+      return DLSYM_global_lookup(handle, sym_name, dyn_module->loaded_module,
+                                 sym_value);
+
+   /*---------------------------------------------------------------------*/
+   /* Otherwise the symbol is local or cannot be pre-empted, so its */
+   /* definition must be in this module: the section index must not be */
+   /* SHN_UNDEF and must be either below SHN_LORESERVE or one of SHN_ABS, */
+   /* SHN_COMMON or SHN_XINDEX. */
+   /*---------------------------------------------------------------------*/
+   is_def = (sym->st_shndx != SHN_UNDEF &&
+             (sym->st_shndx < SHN_LORESERVE ||
+              sym->st_shndx == SHN_ABS ||
+              sym->st_shndx == SHN_COMMON ||
+              sym->st_shndx == SHN_XINDEX));
+
+   /*---------------------------------------------------------------------*/
+   /* If we don't find the definition it is an error. */
+   /*---------------------------------------------------------------------*/
+   if (!is_def)
+   {
+      DLIF_error(DLET_SYMBOL,
+                 "Local/non-imported symbol %s definition is not found "
+                 "in module %s!\n", sym_name, dyn_module->name);
+      return FALSE;
+   }
+
+   if (sym_value) *sym_value = sym->st_value;
+   return TRUE;
+}
+
diff --git a/src/core/dsp/ocl_load/README b/src/core/dsp/ocl_load/README
new file mode 100644
index 0000000..19165f6
--- /dev/null
+++ b/src/core/dsp/ocl_load/README
@@ -0,0 +1,8 @@
+
+This program is dependent on these Standard CVS modules
+
+C60_DLOAD_DYN:
+C60_DLOAD_REL:
+DLOAD:
+DLOAD_API:
+DLOAD_SYM:
diff --git a/src/core/dsp/ocl_load/Stack.h b/src/core/dsp/ocl_load/Stack.h
new file mode 100644
index 0000000..e958674
--- /dev/null
+++ b/src/core/dsp/ocl_load/Stack.h
@@ -0,0 +1,182 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+/*
+* Stack.h
+*
+* Interface to Stack
+* ------------------
+*
+* This is an implementation of a type-independent stack implemented as
+* a singly linked list class for C. It's basically a template class, but
+* uses macros instead, so that it can be compiled with a C-only compiler.
+*
+* To define a Stack class:
+* #include "Stack.h"
+* TYPE_STACK_DEFINITION(object_type,Class_Identifier)
+*
+* In a separate C file:
+* #include "Stack.h"
+* TYPE_STACK_DEFINITION(object_type,Class_Identifier)
+* TYPE_STACK_IMPLEMENTATION(object_type,Class_Identifier)
+*
+* Now, to create a stack:
+* Class_Identifier_Stack name;
+* Get it initialized to zero everywhere somehow, maybe like this:
+* Class_Identifier_initialize_stack(&name);
+*
+* To add to the stack:
+* Class_Identifier_push(&name, object);
+*
+* To access the top of the stack:
+* Class_Identifier_Stack_Node *tos = name.top_ptr;
+* do_something_to_(tos->value);
+*
+* To delete from the stack:
+* if (name.size > 0) Class_Identifier_pop(&name);
+*
+* Copyright (C) 2009 Texas Instruments Incorporated - http://www.ti.com/
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* Neither the name of Texas Instruments Incorporated nor the names of
+* its contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+#ifndef STACK_H
+#define STACK_H
+
+#include <inttypes.h>
+#include "dload_api.h"
+
+/*****************************************************************************/
+/* TYPE_STACK_DEFINITION() - Define structure specifications for a last-in, */
+/* first-out linked list of t_name objects. */
+/* */
+/* t - type of the value stored in each node */
+/* t_name - identifier used as the prefix of every generated name */
+/* */
+/* Generates: */
+/* struct t_name_Stack_Node_ / typedef t_name_Stack_Node */
+/* (a value plus the next_ptr link to the node below it) */
+/* typedef t_name_Stack (top_ptr, bottom_ptr and element count "size") */
+/* extern prototypes for t_name_initialize_stack(), t_name_push() and */
+/* t_name_pop(), whose bodies come from TYPE_STACK_IMPLEMENTATION(). */
+/*****************************************************************************/
+#define TYPE_STACK_DEFINITION(t, t_name) \
+struct t_name##_Stack_Node_ \
+{ \
+ t value; \
+ struct t_name##_Stack_Node_* next_ptr; \
+}; \
+typedef struct t_name##_Stack_Node_ t_name##_Stack_Node; \
+ \
+typedef struct \
+{ \
+ t_name##_Stack_Node* top_ptr; \
+ t_name##_Stack_Node* bottom_ptr; \
+ int size; \
+} t_name##_Stack; \
+ \
+extern void t_name##_initialize_stack(t_name##_Stack* stack); \
+extern void t_name##_push(t_name##_Stack* stack, t to_push); \
+extern t t_name##_pop(t_name##_Stack* stack);
+
+/*****************************************************************************/
+/* TYPE_STACK_INITIALIZER - Static initializer for an empty stack object: */
+/* NULL top_ptr, NULL bottom_ptr and a size of 0 (in member order). */
+/*****************************************************************************/
+#define TYPE_STACK_INITIALIZER {NULL, NULL, 0 }
+
+/*****************************************************************************/
+/* TYPE_STACK_IMPLEMENTATION() - Define member functions of new LIFO linked  */
+/* list "class" of t_name objects.                                           */
+/*                                                                           */
+/* <type>_initialize_stack() - resets the stack to empty.  Nodes that are    */
+/*    still linked are NOT freed; pop them first if they must be released.   */
+/* <type>_push() - pushes a <t> type object to the top of the stack.  The    */
+/*    node is allocated with DLIF_malloc(); if that fails the stack is left  */
+/*    completely unchanged (size included), so a caller that must detect the */
+/*    failure can compare stack->size before and after the call.             */
+/* <type>_pop() - pop a <t> type object from the top of the stack            */
+/*    and provide access to it to the caller.  The stack must not be empty:  */
+/*    check stack->size > 0 before calling.                                  */
+/*****************************************************************************/
+#define TYPE_STACK_IMPLEMENTATION(t, t_name)                                  \
+void t_name##_initialize_stack (t_name##_Stack* stack)                        \
+{                                                                             \
+   stack->top_ptr = stack->bottom_ptr = NULL;                                 \
+   stack->size = 0;                                                           \
+}                                                                             \
+                                                                              \
+void t_name##_push(t_name##_Stack* stack, t to_push)                          \
+{                                                                             \
+   /* Allocate first so a failed allocation cannot corrupt the stack. */      \
+   t_name##_Stack_Node* node =                                                \
+      (t_name##_Stack_Node*)(DLIF_malloc(sizeof(t_name##_Stack_Node)));       \
+   if (!node) return;                                                         \
+                                                                              \
+   node->value    = to_push;                                                  \
+   node->next_ptr = stack->top_ptr;   /* NULL when the stack was empty */     \
+                                                                              \
+   /* The first node pushed is also the bottom of the stack. */               \
+   if (!stack->top_ptr) stack->bottom_ptr = node;                             \
+                                                                              \
+   stack->top_ptr = node;                                                     \
+   stack->size++;                                                             \
+}                                                                             \
+                                                                              \
+t t_name##_pop(t_name##_Stack* stack)                                         \
+{                                                                             \
+   t to_ret;                                                                  \
+   t_name##_Stack_Node* next_ptr = stack->top_ptr->next_ptr;                  \
+                                                                              \
+   stack->size--;                                                             \
+   to_ret = stack->top_ptr->value;                                            \
+   DLIF_free((void*)(stack->top_ptr));                                        \
+                                                                              \
+   /* Removing the last node must also clear bottom_ptr. */                   \
+   if(!stack->size)                                                           \
+      stack->top_ptr = stack->bottom_ptr = NULL;                              \
+   else                                                                       \
+      stack->top_ptr = next_ptr;                                              \
+                                                                              \
+   return to_ret;                                                             \
+}
+
+#endif
diff --git a/src/core/dsp/ocl_load/ocl_load.c b/src/core/dsp/ocl_load/ocl_load.c
new file mode 100644
index 0000000..c53a137
--- /dev/null
+++ b/src/core/dsp/ocl_load/ocl_load.c
@@ -0,0 +1,139 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "dload_api.h"
+
+/* NOTE(review): both Stack.h macro names are defined here with empty bodies,
+ * presumably so that any TYPE_STACK_* use reaching this file expands to
+ * nothing -- confirm against dload_api.h and the build. */
+#define TYPE_STACK_DEFINITION(t, t_name)
+#define TYPE_STACK_IMPLEMENTATION(t, t_name)
+
+/* Runtime flags, both off by default.  Neither is read inside this file. */
+int debugging_on = FALSE;
+int profiling_on = FALSE;
+
+/* Never assigned or read inside this file.
+ * NOTE(review): presumably set by whichever driver links this file in. */
+int global_argc;
+char **global_argv;
+
+/* Loader file hook: reposition 'stream'; forwards directly to fseek(). */
+int DLIF_fseek(LOADER_FILE_DESC *stream, int32_t offset, int origin)
+{
+    return fseek(stream, offset, origin);
+}
+
+
+/* Loader file hook: read 'nmemb' items of 'size' bytes from 'stream' into
+ * 'ptr'; forwards directly to fread() and returns its item count. */
+size_t DLIF_fread(void *ptr, size_t size, size_t nmemb,
+                  LOADER_FILE_DESC *stream)
+{
+    return fread(ptr, size, nmemb, stream);
+}
+
+/* Loader file hook: current position of 'stream', as reported by ftell(). */
+int32_t DLIF_ftell(LOADER_FILE_DESC *stream)
+{
+    return ftell(stream);
+}
+
+/* Loader file hook: close 'fd' and hand back fclose()'s status. */
+int32_t DLIF_fclose(LOADER_FILE_DESC *fd)
+{
+    return fclose(fd);
+}
+
+/* Loader host-memory hook: allocate 'size' bytes with malloc(). */
+void* DLIF_malloc(size_t size)
+{
+    return malloc(size);
+}
+
+/* Loader host-memory hook: release a block obtained from DLIF_malloc(). */
+void DLIF_free(void* ptr)
+{
+    free(ptr);
+}
+
+/*****************************************************************************/
+/* DLIF_COPY() - Copy data from file to host-accessible memory.              */
+/*    Allocates a zero-filled buffer of segment->memsz_in_bytes, reads the   */
+/*    segment's objsz_in_bytes from the file at targ_req->offset into it,    */
+/*    and returns the buffer in the host_address field of the                */
+/*    DLOAD_MEMORY_REQUEST object.                                           */
+/*                                                                           */
+/*    Returns TRUE on success.  Returns FALSE, leaving host_address          */
+/*    untouched and nothing allocated, when the file image is larger than    */
+/*    the memory image, the buffer cannot be allocated, or the seek/read     */
+/*    fails.  client_handle is unused.                                       */
+/*****************************************************************************/
+BOOL DLIF_copy(void* client_handle, struct DLOAD_MEMORY_REQUEST* targ_req)
+{
+    struct DLOAD_MEMORY_SEGMENT *obj_desc = targ_req->segment;
+    LOADER_FILE_DESC            *f        = targ_req->fp;
+    void                        *buf;
+    int                          seek_failed;
+
+    (void)client_handle;
+
+    /* The file image is read into the memory image; it must fit. */
+    if (obj_desc->objsz_in_bytes > obj_desc->memsz_in_bytes)
+        return FALSE;
+
+    /* calloc() may legitimately return NULL for a zero-sized request. */
+    buf = calloc(obj_desc->memsz_in_bytes, 1);
+    if (!buf && obj_desc->memsz_in_bytes)
+        return FALSE;
+
+    seek_failed = (fseek(f, targ_req->offset, SEEK_SET) != 0);
+
+    /* Only a segment with file contents needs the seek and read to succeed. */
+    if (obj_desc->objsz_in_bytes &&
+        (seek_failed || fread(buf, obj_desc->objsz_in_bytes, 1, f) != 1))
+    {
+        free(buf);
+        return FALSE;
+    }
+
+    targ_req->host_address = buf;
+
+    return TRUE;
+}
+
+/* Loader hook for reading target memory.  Not supported by this client:
+ * asserts in debug builds and reports failure when assertions are compiled
+ * out, instead of falling off the end of a non-void function. */
+BOOL DLIF_read(void* client_handle,
+               void *ptr, size_t size, size_t nmemb, TARGET_ADDRESS src)
+{
+    assert(0);
+    return FALSE;
+}
+
+/* Loader hook: copy 'size' bytes from 'from' to 'to' in host memory.
+ * Returns 1 on success.  Returns 0 when 'to' is NULL, or when 'from' is NULL
+ * and there is something to copy; the pointers are validated before
+ * memcpy() is called rather than by testing its result afterwards.
+ * client_handle is unused. */
+BOOL DLIF_memcpy(void* client_handle,
+                 void *to, void *from, size_t size)
+{
+    (void)client_handle;
+
+    if (!to) return 0;
+    if (size == 0) return 1;
+    if (!from) return 0;
+
+    memcpy(to, from, size);
+    return 1;
+}
+
+/* Loader hook for starting execution at a target address.  Not supported by
+ * this client: asserts, and returns 1 if assertions are compiled out. */
+int32_t DLIF_execute(void* client_handle, TARGET_ADDRESS exec_addr)
+{
+    assert(0);
+    return 1;
+}
+
+
+
+
+/* Loader hook for requesting a DSBT index.  Not supported by this client:
+ * asserts in debug builds and reports failure when assertions are compiled
+ * out, instead of falling off the end of a non-void function. */
+BOOL DLIF_register_dsbt_index_request(DLOAD_HANDLE handle,
+                                      const char *requestor_name,
+                                      int32_t requestor_file_handle,
+                                      int32_t requested_dsbt_index)
+{
+    assert(0);
+    return FALSE;
+}
+
+/* DSBT hook, not supported by this client: asserts. */
+void DLIF_assign_dsbt_indices(void)
+{
+    assert(0);
+}
+
+/* DSBT hook, not supported by this client: asserts, and yields
+ * DSBT_INDEX_INVALID if assertions are compiled out. */
+int32_t DLIF_get_dsbt_index(int32_t file_handle)
+{
+    assert(0);
+    return DSBT_INDEX_INVALID;
+}
+
+/* DSBT hook, not supported by this client: asserts, and yields TRUE if
+ * assertions are compiled out. */
+BOOL DLIF_update_all_dsbts()
+{
+    assert(0);
+    return TRUE;
+}
+
+/* Print a loader warning to stdout: a fixed banner followed by the
+ * printf-style message.  'wtype' is accepted but not used. */
+void DLIF_warning(LOADER_WARNING_TYPE wtype, const char *fmt, ...)
+{
+    va_list args;
+
+    fputs("<< D L O A D >> WARNING: ", stdout);
+
+    va_start(args, fmt);
+    vprintf(fmt, args);
+    va_end(args);
+}
+
+void DLIF_error(LOADER_ERROR_TYPE etype, const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap,fmt);
+ printf("<< D L O A D >> ERROR: ");
+ vprintf(fmt,ap);
+ va_end(ap);
+}
+
+/* Print a printf-style loader trace message to stdout, with no banner. */
+void DLIF_trace(const char *fmt, ...)
+{
+    va_list args;
+
+    va_start(args, fmt);
+    vprintf(fmt, args);
+    va_end(args);
+}
+
+/* Terminate the process with exit status 'ecode'.  The parameter is given
+ * an explicit int type; an untyped parameter is not valid C since C99. */
+void DLIF_exit(int ecode)
+{
+    exit(ecode);
+}
+
diff --git a/src/core/dsp/program.cpp b/src/core/dsp/program.cpp
new file mode 100644
index 0000000..6495ec9
--- /dev/null
+++ b/src/core/dsp/program.cpp
@@ -0,0 +1,633 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "program.h"
+#include "device.h"
+#include "kernel.h"
+
+#include "../program.h"
+
+#include <llvm/PassManager.h>
+#include <llvm/Analysis/Passes.h>
+#include <llvm/Analysis/Verifier.h>
+#include <llvm/Transforms/Scalar.h>
+#include <llvm/Transforms/IPO.h>
+#include <llvm/Transforms/Utils/UnifyFunctionExitNodes.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+#include "wga.h"
+
+#include <llvm/LinkAllPasses.h>
+#include <WorkitemHandlerChooser.h>
+#include <BreakConstantGEPs.h>
+#include <Flatten.h>
+#include <PHIsToAllocas.h>
+#include <IsolateRegions.h>
+#include <VariableUniformityAnalysis.h>
+#include <ImplicitLoopBarriers.h>
+#include <LoopBarriers.h>
+#include <BarrierTailReplication.h>
+#include <CanonicalizeBarriers.h>
+#include <WorkItemAliasAnalysis.h>
+#include <WorkitemReplication.h>
+#include <WorkitemLoops.h>
+#include <AllocasToEntry.h>
+#include <Workgroup.h>
+#include <TargetAddressSpaces.h>
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <elf.h>
+
+#include "genfile_cache.h"
+
+genfile_cache * genfile_cache::pInstance = 0;
+
+/* Current time from CLOCK_MONOTONIC; if that clock cannot be read, fall back
+ * to CLOCK_REALTIME. */
+timespec getTime()
+{
+    struct timespec now;
+    const int rc = clock_gettime(CLOCK_MONOTONIC, &now);
+
+    if (rc != 0)
+        clock_gettime(CLOCK_REALTIME, &now);
+
+    return now;
+}
+
+/* Convert a timespec to seconds as a double (tv_sec plus tv_nsec / 1e9). */
+double ts_to_double(const timespec &t)
+{
+    const double frac_secs = ((double)t.tv_nsec) / 1000000000.0;
+    return frac_secs + (double)t.tv_sec;
+}
+
+/* Elapsed seconds from 'start' to 'end'. */
+double tsdiff (const timespec& start, const timespec& end)
+{
+    const double end_secs   = ts_to_double(end);
+    const double start_secs = ts_to_double(start);
+    return end_secs - start_secs;
+}
+
+
+using namespace Coal;
+
+/**
+ * Create the DSP-side state for \p program on \p device.
+ *
+ * Nothing is built or loaded here.  Two environment variables are read; only
+ * their presence matters, not their value:
+ *  - TI_OCL_KEEP_FILES        : sets p_keep_files
+ *  - TI_OCL_CACHE_KERNELS_OFF : clears p_cache_kernels (on by default)
+ *
+ * p_outfile starts as an empty string, so the destructor never hands
+ * uninitialized bytes to unlink() when the program is destroyed before
+ * being built.
+ */
+DSPProgram::DSPProgram(DSPDevice *device, Program *program)
+: DeviceProgram(), p_device(device), p_program(program), p_program_handle(-1),
+  p_loaded(false), p_keep_files(false), p_cache_kernels(true)
+{
+    p_outfile[0] = '\0';
+
+    if (getenv("TI_OCL_KEEP_FILES"))        p_keep_files    = true;
+    if (getenv("TI_OCL_CACHE_KERNELS_OFF")) p_cache_kernels = false;
+}
+
+/**
+ * Unload the program from the device if load() succeeded, and delete the
+ * generated outfile unless it is being kept (TI_OCL_KEEP_FILES) or may be
+ * shared through the kernel cache.
+ */
+DSPProgram::~DSPProgram()
+{
+    // Until load() succeeds p_program_handle is -1 (or 0 after a failed
+    // load); there is nothing on the device to unload for those values.
+    if (p_loaded) p_device->unload(p_program_handle);
+
+    if (!p_keep_files && !p_cache_kernels) unlink(p_outfile);
+}
+
+/* File-scope pointer to the segment list of the program currently being
+ * loaded.  load() aims it at p_segments_written only while p_device->load()
+ * runs and resets it to NULL afterwards.
+ * NOTE(review): presumably consumed by the loader's write callback elsewhere
+ * in the DSP backend -- confirm. */
+DSPProgram::segment_list *segments;
+
+/**
+ * Load the built outfile (p_outfile) onto the device, then mail the device a
+ * CACHEINV message listing every segment that was written, so the newly
+ * populated areas are not stale in device caches.
+ *
+ * When TI_OCL_DEBUG_KERNEL is set, one line per loaded segment is printed.
+ *
+ * @return false if the device returned a zero program handle, true otherwise.
+ *         The cache-invalidate message is only queued; this function does
+ *         not wait for the response.
+ */
+bool DSPProgram::load()
+{
+    segments = &p_segments_written;
+    p_program_handle = p_device->load(p_outfile);
+
+    // Reset on success and failure alike, so the global never keeps
+    // pointing into this object after the load attempt.
+    segments = NULL;
+
+    if (!p_program_handle) return false;
+
+    p_loaded = true;
+
+    char *debug_kernel = getenv("TI_OCL_DEBUG_KERNEL");
+
+    /*-------------------------------------------------------------------------
+    * ensure that the newly populated areas are not stale in device caches
+    *------------------------------------------------------------------------*/
+    Msg_t msg;
+    int segNum = p_segments_written.size();
+
+    // Each segment takes two slots in the flush buffer: address, then size.
+    assert(segNum <= MAX_FLUSH_BUF_SIZE/2);
+
+    msg.command = CACHEINV;
+    msg.u.k.flush.numBuffers = segNum;
+    msg.u.k.flush.num_mpaxs = 0;
+    for (int i=0; i < segNum; ++i)
+    {
+        msg.u.k.flush.buffers[2*i]   = p_segments_written[i].ptr;
+        msg.u.k.flush.buffers[2*i+1] = p_segments_written[i].size;
+
+        // The description below is only needed for the debug trace.
+        if (!debug_kernel) continue;
+
+        uint32_t flags = p_segments_written[i].flags &
+                         (DLOAD_SF_executable | DLOAD_SF_writable);
+
+        const char *seg_desc;
+        switch (flags)
+        {
+            case 0:                   seg_desc = "Read Only";             break;
+            case DLOAD_SF_executable: seg_desc = "Executable";            break;
+            case DLOAD_SF_writable:   seg_desc = "Writable";              break;
+            default:                  seg_desc = "Writable & Executable"; break;
+        }
+
+        printf("%s segment loaded to 0x%08x with size 0x%x\n",
+               seg_desc, p_segments_written[i].ptr, p_segments_written[i].size);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Send the command.  We do not wait for the ready response here; the wait
+    * is handled by the standard wait loop in the worker thread.
+    *------------------------------------------------------------------------*/
+    p_device->mail_to(msg);
+
+    return true;
+}
+
+/** True once load() has successfully placed the program on the device. */
+bool DSPProgram::is_loaded() const
+{
+    return p_loaded;
+}
+
+/** The DSP backend never asks for the standard library to be linked in. */
+bool DSPProgram::linkStdLib() const
+{
+    return false;
+}
+
+/** Path of the generated outfile (the p_outfile buffer of this object). */
+const char* DSPProgram::outfile_name() const
+{
+    return p_outfile;
+}
+
+/**
+ * Return the static base address the loader reports for this program,
+ * loading the program first if that has not happened yet.
+ *
+ * @return the address written by DLOAD_get_static_base(), or 0 when the
+ *         program cannot be loaded or the loader does not provide a value
+ *         (previously an uninitialized value was returned in those cases).
+ */
+DSPDevicePtr DSPProgram::data_page_ptr()
+{
+    DSPDevicePtr base = 0;
+
+    if (!is_loaded() && !load()) return base;
+
+    DLOAD_get_static_base(p_device->dload_handle(), p_program_handle, &base);
+    return base;
+}
+
+/**
+ * Populate an LLVM pass manager with the pipeline applied to DSP kernels.
+ *
+ * @param manager    pass manager that receives the passes, in the order added
+ * @param optimize   when true, the general LLVM optimization passes are added
+ * @param hasBarrier when true, the pocl barrier-handling passes are added
+ *                   first; the flag is also forwarded to the TI work-group
+ *                   aggregation pass
+ *
+ * The exit-node unification pass and the TI work-group aggregation pass are
+ * always added.  pocl::AllocasToEntry is added at the very end only when
+ * hasBarrier is false, because the barrier pipeline already contains it.
+ */
+void DSPProgram::createOptimizationPasses(llvm::PassManager *manager,
+ bool optimize, bool hasBarrier)
+{
+ // Barrier pipeline: the order of these passes is significant.
+ if (hasBarrier)
+ {
+ manager->add(new llvm::DominatorTree());
+ manager->add(new pocl::WorkitemHandlerChooser());
+ manager->add(new BreakConstantGEPs()); // from pocl
+ // add(new GenerateHeader()); // no need
+ manager->add(new pocl::Flatten());
+ manager->add( llvm::createAlwaysInlinerPass());
+ manager->add( llvm::createGlobalDCEPass());
+ manager->add( llvm::createCFGSimplificationPass());
+ manager->add( llvm::createLoopSimplifyPass());
+ manager->add(new pocl::PHIsToAllocas());
+ manager->add( llvm::createRegionInfoPass());
+ manager->add(new pocl::IsolateRegions());
+ manager->add(new pocl::VariableUniformityAnalysis()); // TODO
+ manager->add(new pocl::ImplicitLoopBarriers());
+ manager->add(new pocl::LoopBarriers());
+ manager->add(new pocl::BarrierTailReplication());
+ manager->add(new pocl::CanonicalizeBarriers());
+ manager->add(new pocl::IsolateRegions());
+ manager->add(new pocl::WorkItemAliasAnalysis());
+ // add(new pocl::WorkitemReplication()); // no need
+ manager->add(new pocl::WorkitemLoops());
+ manager->add(new pocl::AllocasToEntry());
+ // add(new pocl::Workgroup()); // no need
+ manager->add(new pocl::TargetAddressSpaces());
+ }
+
+ // General optimization pipeline, independent of the barrier handling.
+ if (optimize)
+ {
+ /*
+ * Inspired by code from "The LLVM Compiler Infrastructure"
+ */
+ manager->add(llvm::createDeadArgEliminationPass());
+ manager->add(llvm::createInstructionCombiningPass());
+ manager->add(llvm::createFunctionInliningPass());
+ manager->add(llvm::createPruneEHPass()); // Remove dead EH info.
+ manager->add(llvm::createGlobalOptimizerPass());
+ manager->add(llvm::createGlobalDCEPass()); // Remove dead functions.
+ manager->add(llvm::createArgumentPromotionPass());
+ manager->add(llvm::createInstructionCombiningPass());
+ manager->add(llvm::createJumpThreadingPass());
+
+ //ASW TODO maybe turn off re: pete. might gen bad xlator input
+ //manager->add(llvm::createScalarReplAggregatesPass());
+
+ manager->add(llvm::createFunctionAttrsPass()); // Add nocapture.
+ manager->add(llvm::createGlobalsModRefPass()); // IP alias analysis.
+ manager->add(llvm::createLICMPass()); // Hoist loop invariants.
+ manager->add(llvm::createGVNPass()); // Remove redundancies.
+ manager->add(llvm::createMemCpyOptPass()); // Remove dead memcpys.
+ manager->add(llvm::createDeadStoreEliminationPass());
+ manager->add(llvm::createInstructionCombiningPass());
+ manager->add(llvm::createJumpThreadingPass());
+ manager->add(llvm::createCFGSimplificationPass());
+ }
+
+ // Always run: single exit node per function, then work-group aggregation.
+ manager->add(llvm::createUnifyFunctionExitNodesPass());
+ manager->add(llvm::createTIOpenclWorkGroupAggregationPass(hasBarrier));
+
+ /*-------------------------------------------------------------------------
+ * Borrow the pocl alloca hoister for the TI simplistic WGA pass as well
+ *------------------------------------------------------------------------*/
+ if (!hasBarrier)
+ manager->add(new pocl::AllocasToEntry());
+}
+
+
+/**
+ * Pick out of a whitespace-separated option string the tokens that look like
+ * object files, libraries or linker command files.
+ *
+ * A token is kept when it CONTAINS any of the patterns below, anywhere in
+ * the token.
+ * NOTE(review): this is a substring match, not a suffix match, so a token
+ * such as "-I/home/.android/inc" is kept as well -- confirm this is wanted.
+ *
+ * @return the kept tokens in their original order, each followed by a space.
+ */
+std::string process_cl6x_options(std::string options)
+{
+    static const char *const patterns[] =
+    {
+        ".obj", ".dll", ".ae66", ".a66", ".out", ".lib",
+        ".o",   ".o66", ".oe66", ".a",   ".cmd"
+    };
+    const size_t n_patterns = sizeof(patterns) / sizeof(patterns[0]);
+
+    std::istringstream words(options);
+    std::string        word;
+    std::string        kept;
+
+    while (words >> word)
+    {
+        bool forward = false;
+        for (size_t i = 0; i < n_patterns && !forward; ++i)
+            forward = (word.find(patterns[i]) != std::string::npos);
+
+        if (forward)
+        {
+            kept += word;
+            kept += " ";
+        }
+    }
+
+    return kept;
+}
+
+/******************************************************************************
+* Find the C6000 CGT installation.
+* Returns the value of the TI_OCL_CGT_INSTALL environment variable.  If the
+* variable is not set, an explanation is printed to stdout and the process
+* is aborted.
+******************************************************************************/
+char *get_cgt_install()
+{
+    char *cgt_dir = getenv("TI_OCL_CGT_INSTALL");
+
+    if (cgt_dir == NULL)
+    {
+        std::cout
+            << "The environment variable TI_OCL_CGT_INSTALL must be set to a "
+            << std::endl
+            << "directory path where the C6000 compiler tools are installed. "
+            << std::endl;
+
+        abort();
+    }
+
+    return cgt_dir;
+}
+
+/******************************************************************************
+* Find the OpenCL installation.
+* Returns the value of the TI_OCL_INSTALL environment variable.  If the
+* variable is not set, an explanation is printed to stdout and the process
+* is aborted.
+******************************************************************************/
+char *get_ocl_install()
+{
+    char *ocl_dir = getenv("TI_OCL_INSTALL");
+
+    if (ocl_dir == NULL)
+    {
+        std::cout
+            << "The environment variable TI_OCL_INSTALL must be set to a "
+            << std::endl
+            << "directory path where the TI OpenCL product is installed. "
+            << std::endl;
+
+        abort();
+    }
+
+    return ocl_dir;
+}
+
+/**
+ * Directory holding the DSP-side OpenCL files.
+ *
+ * Resolved on first use and remembered in a function-local static:
+ * /usr/share/ti/opencl/dsp if stat() succeeds on it, otherwise
+ * "<TI_OCL_INSTALL>/dsp" (get_ocl_install() aborts if the variable is unset).
+ */
+std::string get_ocl_dsp()
+{
+    static std::string cached_dir;
+
+    if (!cached_dir.empty()) return cached_dir;
+
+    const char *const system_dir = "/usr/share/ti/opencl/dsp";
+    struct stat info;
+
+    if (stat(system_dir, &info) != 0)
+        cached_dir = std::string(get_ocl_install()) + "/dsp";
+    else
+        cached_dir = std::string(system_dir);
+
+    return cached_dir;
+}
+
+/******************************************************************************
+* run_cl6x
+*
+* Compile and link the LLVM bitcode file 'filename' into 'filename'.out by
+* running the cl6x command line through system(); unless TI_OCL_CL6X_DEBUG is
+* set, the result is then stripped with strip6x.
+*
+*   filename     - path of the bitcode file; also the stem of generated files
+*   llvm_bitcode - when non-NULL, its bytes are written as .byte directives
+*                  into 'filename'_bc.asm (section ".llvmir") and that file is
+*                  added to the compile
+*   keep_files   - adds "-mw -k --z" and asks the linker for a .map file
+*   options      - user options; only the tokens selected by
+*                  process_cl6x_options() are appended, last on the line
+*
+* Environment: TI_OCL_CL6X_DEBUG (-g -o0 instead of -o3, no strip),
+*              TI_OCL_SOFTWARE_PIPELINE_OFF (-mu), TI_OCL_CGT_INSTALL.
+*
+* Returns the system() status of cl6x if it is non-zero; otherwise the
+* status of strip6x, or 0 when stripping is skipped.  (The function used to
+* reach its end without returning a value.)
+******************************************************************************/
+static int run_cl6x(char *filename, std::string *llvm_bitcode,
+                    bool keep_files, std::string options)
+{
+    std::string command("cl6x --f -q --abi=eabi --use_g3 -mv6600 -mt -mo "
+                        "-ft=/tmp -fs=/tmp -fr=/tmp ");
+
+    if (keep_files) command += "-mw -k --z ";
+
+    /*-------------------------------------------------------------------------
+    * Turned off for now to workaround a timing bug. Plan to re-enable later
+    *------------------------------------------------------------------------*/
+    command += "--disable:sploop ";
+
+    char *cl6x_debug = getenv("TI_OCL_CL6X_DEBUG");
+
+    if (cl6x_debug) command += "-g -o0 ";
+    else            command += "-o3 ";
+
+    char *no_sp = getenv("TI_OCL_SOFTWARE_PIPELINE_OFF");
+    if (no_sp) command += "-mu ";
+
+    char *cgt_install = get_cgt_install();
+
+    command += "-I"; command += cgt_install; command += "/include ";
+    command += "-I"; command += cgt_install; command += "/lib ";
+    command += "-I"; command += get_ocl_dsp().c_str(); command += " ";
+
+    command += "--bc_file="; command += filename; command += " ";
+
+    /*-------------------------------------------------------------------------
+    * Encode LLVM bitcode as bytes in the .llvmir section of the .asm file
+    *------------------------------------------------------------------------*/
+    if (llvm_bitcode != NULL)
+    {
+        // Built as a std::string so no fixed-size buffer limits the path.
+        std::string bitasm_name(filename);
+        bitasm_name += "_bc.asm";
+
+        std::ofstream outasmfile(bitasm_name.c_str(), std::ios::out);
+        outasmfile << "\t.sect \".llvmir\"\n" << "\t.retain";
+        int nbytes = llvm_bitcode->size();
+        for (int i = 0; i < nbytes; i++)
+        {
+            // Ten values per .byte directive.
+            if (i % 10 == 0)
+                outasmfile << "\n\t.byte " << (int) llvm_bitcode->at(i);
+            else
+                outasmfile << ", " << (int) llvm_bitcode->at(i);
+        }
+        outasmfile.close();
+
+        command += bitasm_name; command += " ";
+    }
+
+    command += "-z -ldsp.syms -o ";
+    command += filename; command += ".out ";
+
+    if (keep_files)
+        { command += "-m "; command += filename; command += ".map "; }
+
+    /*-------------------------------------------------------------------------
+    * Any libraries or object files need to go last to resolve references
+    *------------------------------------------------------------------------*/
+    command += process_cl6x_options(options);
+
+    int status = system(command.c_str());
+
+    // There is nothing worth stripping when the compile/link step failed.
+    if (status == 0 && !cl6x_debug)
+    {
+        std::string strip_command("strip6x ");
+        strip_command += filename; strip_command += ".out";
+        status = system(strip_command.c_str());
+    }
+
+    return status;
+}
+
+/**
+ * Extract llvm bitcode and native binary from MixedBinary
+ *
+ * @param binary_str candidate mixed binary; NULL, or anything that does not
+ *                   start with the ELF magic, yields false
+ * @param bitcode    when non-NULL, receives the contents of the ".llvmir"
+ *                   section
+ * @param native     when non-NULL, receives a copy of the whole of
+ *                   *binary_str
+ * @return true on success.  false if the input is not an ELF image, or if
+ *         bitcode was requested and the ".llvmir" section is absent or any
+ *         header, name or section extent lies outside the input.  When false
+ *         is returned neither output has been modified.
+ *
+ * All offsets taken from the image are range-checked before use, so a
+ * truncated or malformed input is rejected rather than read out of bounds.
+ */
+bool DSPProgram::ExtractMixedBinary(std::string *binary_str,
+                                    std::string *bitcode, std::string *native)
+{
+    if (binary_str == NULL) return false;
+
+    const size_t total = binary_str->size();
+    const char  *image = binary_str->data();
+
+    if (total < (size_t)SELFMAG || memcmp(image, ELFMAG, SELFMAG) != 0)
+        return false;
+
+    /*-------------------------------------------------------------------------
+    * Parse ELF file format, extract ".llvmir" section into bitcode
+    * Valid Assumptions: 1. cl6x only creates 32-bit ELF files (for now)
+    *                    2. cl6x ELF file has the same endianness as the host
+    *------------------------------------------------------------------------*/
+    if (bitcode != NULL)
+    {
+        Elf32_Ehdr ehdr;   /* memcpy into here to guarantee proper alignment */
+        if (total < sizeof(ehdr)) return false;
+        memcpy(&ehdr, image, sizeof(ehdr));
+
+        const size_t n_sects    = ehdr.e_shnum;
+        const size_t shoff      = ehdr.e_shoff;
+        const size_t shstr_sect = ehdr.e_shstrndx;
+        const size_t shsize     = sizeof(Elf32_Shdr);
+
+        /* The whole section header table must lie inside the image. */
+        if (shstr_sect >= n_sects || shoff > total ||
+            n_sects > (total - shoff) / shsize)
+            return false;
+
+        Elf32_Shdr shdr;   /* memcpy into here to guarantee proper alignment */
+        memcpy(&shdr, image + shoff + shstr_sect * shsize, shsize);
+        if (shdr.sh_offset > total || shdr.sh_size > total - shdr.sh_offset)
+            return false;
+
+        const char  *strtab     = image + shdr.sh_offset;
+        const size_t strtab_len = shdr.sh_size;
+
+        /* sizeof includes the terminating NUL, so memcmp is an exact match. */
+        static const char wanted[] = ".llvmir";
+
+        bool found = false;
+        for (size_t i = 0; i < n_sects && !found; i++)
+        {
+            if (i == shstr_sect) continue;
+            memcpy(&shdr, image + shoff + i * shsize, shsize);
+
+            found = shdr.sh_name < strtab_len &&
+                    strtab_len - shdr.sh_name >= sizeof(wanted) &&
+                    memcmp(strtab + shdr.sh_name, wanted, sizeof(wanted)) == 0;
+        }
+        if (!found) return false;
+
+        /* shdr now describes ".llvmir"; its contents must be in range too. */
+        if (shdr.sh_offset > total || shdr.sh_size > total - shdr.sh_offset)
+            return false;
+
+        bitcode->assign(image + shdr.sh_offset, shdr.sh_size);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Return the c6x ELF file in binary_str as native binary
+    *------------------------------------------------------------------------*/
+    if (native != NULL)
+    {
+        native->clear();
+        native->append(*binary_str);
+    }
+
+    return true;
+}
+
+
+/**
+ * Write native binary into file, create temporary filename in p_outfile
+ *
+ * mkstemp() reserves a unique /tmp/openclXXXXXX name; the binary is written
+ * to that name with ".out" appended, and that path is stored in p_outfile.
+ *
+ * On failure (NULL input, no temporary name available, or the write/close
+ * failing) "ERROR: Binary write out failure" is printed to stdout.  If no
+ * temporary name could be created p_outfile is left unchanged.  Stream
+ * errors are checked explicitly, because the stream is not configured to
+ * throw and a catch block would never see them.
+ */
+void DSPProgram::WriteNativeOut(std::string *native)
+{
+    char name_out[] = "/tmp/openclXXXXXX";
+    int  fOutfile   = (native != NULL) ? mkstemp(name_out) : -1;
+
+    if (fOutfile == -1)
+    {
+        std::cout << "ERROR: Binary write out failure" << std::endl;
+        return;
+    }
+
+    strcpy(p_outfile, name_out);
+    strcat(p_outfile, ".out");
+
+    std::ofstream outfile(p_outfile, std::ios::out | std::ios::binary);
+    outfile.write(native->data(), native->size());
+    outfile.close();
+    close(fOutfile);
+
+    // fail() covers an unopened stream, a failed write and a failed close.
+    if (outfile.fail())
+        std::cout << "ERROR: Binary write out failure" << std::endl;
+}
+
+/**
+ * Native binary is stored in file, filename in p_outfile
+ * Input: binary_str contains only the bitcode
+ * Output: binary_str contains c6x ELF file with bitcode in ".llvmir" section
+ *
+ * A NULL binary_str is ignored.  If the file cannot be opened, sized or read
+ * in full, "ERROR: Binary read in failure" is printed to stdout and
+ * *binary_str is left unchanged.
+ */
+void DSPProgram::ReadEmbeddedBinary(std::string *binary_str)
+{
+    if (binary_str == NULL) return;
+
+    bool ok = false;
+
+    try
+    {
+        std::ifstream is(p_outfile, std::ios::binary);
+        if (is)
+        {
+            is.seekg(0, std::ios::end);
+            const std::streamoff length = is.tellg();   // -1 on failure
+            is.seekg(0, std::ios::beg);
+
+            if (length >= 0)
+            {
+                // Read straight into a string: no raw buffer that could leak.
+                std::string contents;
+                contents.resize(length);
+
+                if (length == 0 || is.read(&contents[0], length))
+                {
+                    binary_str->swap(contents);
+                    ok = true;
+                }
+            }
+        }
+    }
+    catch(...) { ok = false; }   // e.g. allocation failure in resize()
+
+    if (!ok) std::cout << "ERROR: Binary read in failure" << std::endl;
+}
+
+/**
+ * Produce the C6x native binary for this program and record its path in
+ * p_outfile.
+ *
+ * @param module     LLVM module to build from (stored in p_module)
+ * @param binary_str in: mixed C6x binary or LLVM bitcode (may be NULL);
+ *                   out: the C6x ELF file, when it had to be read back
+ * @return true when a native binary is available; false when the temporary
+ *         file cannot be created or the compiler produced no readable
+ *         output file.  (Both cases used to return true, and the missing
+ *         output was still entered into the kernel cache.)
+ */
+bool DSPProgram::build(llvm::Module *module, std::string *binary_str)
+{
+    p_module = module;
+
+    /*------------------------------------------------------------------------
+    * The input binary_str could be any of the following:
+    * 1. Mixed C6x binary embedded with LLVM bitcode, extract C6x native
+    *    binary and return. There is no need to rebuild from LLVM module.
+    * 2. LLVM bitcode, proceed to the regular build:
+    *    2.1 return a corresponding cached c6x binary, if found
+    *    2.2 invoke c6x compiler toolchain, embed LLVM bitcode, build
+    * In either case, put c6x binary in binary_str when return
+    *------------------------------------------------------------------------*/
+    std::string native;
+    if (ExtractMixedBinary(binary_str, NULL, &native))
+    {
+        WriteNativeOut(&native);
+        return true;
+    }
+
+    if (p_cache_kernels)
+    {
+        std::string cached_outfile = genfile_cache::instance()->lookup
+            (p_module, p_program->deviceDependentCompilerOptions(p_device));
+
+        if (!cached_outfile.empty())
+        {
+            strcpy(p_outfile, cached_outfile.c_str());
+            ReadEmbeddedBinary(binary_str);
+            return true;
+        }
+    }
+
+    char name_template[] = "/tmp/openclXXXXXX";
+    int pFile = mkstemp(name_template);
+
+    // Without a temporary file there is nothing to hand to the compiler.
+    if (pFile == -1) return false;
+
+    strcpy(p_outfile, name_template);
+    strcat(p_outfile, ".out");
+
+    const std::string stem(name_template);
+
+    if (p_keep_files)
+    {
+        //write out the source as well
+        std::ofstream out((stem + ".cl").c_str());
+        out << p_program->source();
+        out.close();
+    }
+
+    {
+        // Scoped so the stream is gone before pFile is closed below; the
+        // stream does not own (and so does not close) the descriptor.
+        llvm::raw_fd_ostream ostream(pFile, false);
+        llvm::WriteBitcodeToFile(p_module, ostream);
+        ostream.flush();
+    }
+
+    run_cl6x(name_template, binary_str, p_keep_files,
+             p_program->deviceDependentCompilerOptions(p_device));
+
+    if (!p_keep_files)
+    {
+        // Remove the bitcode file and the compiler's intermediate files.
+        unlink(name_template);
+        unlink((stem + ".obj").c_str());
+
+        if (binary_str != NULL)
+        {
+            unlink((stem + "_bc.asm").c_str());
+            unlink((stem + "_bc.obj").c_str());
+        }
+    }
+
+    close(pFile);
+
+    // Judge success by the output file itself: a failed compile must be
+    // neither cached nor reported as a successful build.
+    if (access(p_outfile, R_OK) != 0) return false;
+
+    if (p_cache_kernels)
+        genfile_cache::instance()->remember(p_outfile, p_module,
+                    p_program->deviceDependentCompilerOptions(p_device));
+
+    ReadEmbeddedBinary(binary_str);
+
+    return true;
+}
+
+/**
+ * Ask the dynamic loader for the device address of \p symname in this
+ * program.
+ *
+ * @return the address reported by DLOAD_query_symbol(), or 0 if the loader
+ *         does not find the symbol.
+ */
+DSPDevicePtr DSPProgram::query_symbol(const char *symname)
+{
+    DSPDevicePtr sym_addr;
+
+    if (DLOAD_query_symbol(p_device->dload_handle(), p_program_handle,
+                           symname, &sym_addr))
+        return sym_addr;
+
+    return 0;
+}
+
diff --git a/src/core/dsp/program.h b/src/core/dsp/program.h
new file mode 100644
index 0000000..63c1858
--- /dev/null
+++ b/src/core/dsp/program.h
@@ -0,0 +1,92 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef __DSP_PROGRAM_H__
+#define __DSP_PROGRAM_H__
+
+#include "device.h"
+#include "../deviceinterface.h"
+#include <vector>
+
+// Forward declarations of LLVM classes, so that this header can name them
+// without including the corresponding LLVM headers.
+// NOTE(review): llvm::PassManager is used by createOptimizationPasses() below
+// but is not forward-declared here - presumably it arrives through device.h
+// or deviceinterface.h; confirm.
+namespace llvm
+{
+ class ExecutionEngine;
+ class Module;
+}
+
+namespace Coal
+{
+
+class DSPDevice;
+class Program;
+
+/**
+ * DeviceProgram implementation for the DSP device.  Declares the build and
+ * load entry points together with the helpers that split, write and read the
+ * binary associated with a program.
+ */
+class DSPProgram : public DeviceProgram
+{
+ public:
+ /** One segment record: a device address, a size and a flags word. */
+ struct seg_desc
+ {
+ // NOTE(review): s is taken as int but stored in the unsigned member size.
+ seg_desc(DSPDevicePtr p, int s, uint32_t f) :
+ ptr(p), size(s), flags(f) {}
+ DSPDevicePtr ptr;
+ unsigned size;
+ uint32_t flags;
+ };
+
+ typedef std::vector<seg_desc> segment_list;
+
+ public:
+ DSPProgram(DSPDevice *device, Program *program);
+ ~DSPProgram();
+
+ bool linkStdLib() const;
+ const char* outfile_name() const;
+ void createOptimizationPasses(llvm::PassManager *manager,
+ bool optimize, bool hasBarrier=false);
+ // Builds the program from an LLVM module; binary_str is handed on to
+ // ExtractMixedBinary() and ReadEmbeddedBinary().
+ bool build(llvm::Module *module, std::string *binary_str);
+ bool ExtractMixedBinary(std::string *binary_str,
+ std::string *bitcode, std::string *native);
+ void WriteNativeOut(std::string *native);
+ void ReadEmbeddedBinary(std::string *binary_str);
+
+ // Returns the address of symname, or 0 when the lookup fails.
+ DSPDevicePtr query_symbol(const char *symname);
+ DSPDevicePtr data_page_ptr();
+ bool load();
+ bool is_loaded() const;
+
+ private:
+ DSPDevice *p_device;
+ Program *p_program;
+ llvm::Module *p_module;
+ int p_program_handle;
+ // Output file name.  build() fills this fixed 32-byte buffer with
+ // strcpy/strcat, so a longer path (e.g. a cached file name) would overflow.
+ char p_outfile[32];
+ bool p_loaded;
+ segment_list p_segments_written;
+ bool p_keep_files;      // when set, build() keeps its temporary files
+ bool p_cache_kernels;   // when set, build() consults/updates genfile_cache
+};
+}
+#endif
diff --git a/src/core/dsp/shmem.cpp b/src/core/dsp/shmem.cpp
new file mode 100644
index 0000000..6aec2f8
--- /dev/null
+++ b/src/core/dsp/shmem.cpp
@@ -0,0 +1,539 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "shmem.h"
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <ti/cmem.h>
+
+/* Print a message followed by a newline.  x must be a string literal because
+ * it is concatenated with "\n" and used as the printf format string. */
+#define REPORT(x) printf(x "\n")
+
+/* When status is non-zero, print msg as an error and terminate the process.
+ * The do/while(0) wrapper makes the macro expand to exactly one statement, so
+ * it can be used as the body of an unbraced if that has an else branch. */
+#define ERR(status, msg) \
+    do { if (status) { printf("ERROR: %s\n", msg); exit(-1); } } while (0)
+
+/******************************************************************************
+* shmem::shmem
+******************************************************************************/
+/* Start unconfigured: no region, no file descriptor, no transport handle.
+ * The page size is taken from sysconf(_SC_PAGE_SIZE) once, here. */
+shmem::shmem()
+    : p_dsp_addr(0),
+      p_size(0),
+      p_page_size(sysconf(_SC_PAGE_SIZE)),
+      p_mmap_fd(-1),
+      p_mpm_transport_handle(NULL)
+{
+}
+
+/******************************************************************************
+* shmem::~shmem
+******************************************************************************/
+shmem::~shmem()
+{
+    /*-------------------------------------------------------------------------
+    * Descriptor left over from the /dev/mem implementation; nothing in this
+    * file assigns it any more, but close it if it was ever set.
+    *------------------------------------------------------------------------*/
+    if (p_mmap_fd != -1) close(p_mmap_fd);
+
+    /*-------------------------------------------------------------------------
+    * Release the transport handle obtained by configure_base(); without this
+    * every configured region leaks its mpm_transport handle.
+    *------------------------------------------------------------------------*/
+    if (p_mpm_transport_handle != NULL)
+    {
+        mpm_transport_close(p_mpm_transport_handle);
+        p_mpm_transport_handle = NULL;
+    }
+}
+
+/******************************************************************************
+* shmem::configure_base
+******************************************************************************/
+/* Open the mpm_transport device that serves dsp_addr and, on success, record
+ * the region [dsp_addr, dsp_addr + size).  On any failure a message is printed
+ * and the object is left without a transport handle. */
+void shmem::configure_base(DSPDevicePtr64 dsp_addr, uint64_t size)
+{
+    /*-------------------------------------------------------------------------
+    * If the sysconf for the page size failed.  p_page_size is unsigned, so a
+    * sysconf result of -1 is stored as the all-ones value; test for that
+    * explicitly as well as for 0.
+    *------------------------------------------------------------------------*/
+    if (p_page_size == 0 || p_page_size == (uint32_t)-1)
+    {
+        REPORT("Failed to get PAGE_SIZE");
+        return;
+    }
+
+    /*-------------------------------------------------------------------------
+    * core1-core7's l2 go through /dev/dsp{1-7}
+    * everything else (core0's l2, msmc, global addr) go through /dev/dsp0
+    *------------------------------------------------------------------------*/
+    char devname[16] = "dsp0";
+    if (0x11800000 <= dsp_addr && dsp_addr < 0x17900000)
+        devname[3] = ((dsp_addr >> 24) - 0x10) + '0';
+
+    mpm_transport_open_t mpm_transport_open_cfg;
+    mpm_transport_open_cfg.open_mode = (O_SYNC|O_RDWR);
+    p_mpm_transport_handle = mpm_transport_open(devname,
+                                                &mpm_transport_open_cfg);
+
+    /*-------------------------------------------------------------------------
+    * If the open failed
+    *------------------------------------------------------------------------*/
+    if (p_mpm_transport_handle == NULL)
+    {
+        printf("Failed to open /dev/%s\n", devname);
+        return;
+    }
+
+    p_dsp_addr = dsp_addr;
+    p_size     = size;
+}
+
+
+/******************************************************************************
+* shmem_persistent::shmem_persistent
+******************************************************************************/
+/* True when x is an exact multiple of y.  The test masks x with (y)-1, so it
+ * is only meaningful when y is a power of two. */
+#define MULTIPLE_OF_POW2(x, y) (((x) & ((y) - 1)) == 0)
+
+/* Nothing is mapped until configure() succeeds. */
+shmem_persistent::shmem_persistent()
+    : p_host_addr(0),
+      p_xlate_dsp_to_host_offset(0)
+{
+}
+
+/******************************************************************************
+* shmem_persistent::configure
+******************************************************************************/
+/* Map the whole region [dsp_addr, dsp_addr + size) once and remember the
+ * offset that converts a DSP address inside it to a host address.  Both the
+ * address and the size must be multiples of the page size. */
+void shmem_persistent::configure(DSPDevicePtr64 dsp_addr, uint64_t size)
+{
+    configure_base(dsp_addr, size);
+
+    /*-------------------------------------------------------------------------
+    * The base class leaves the handle NULL when it could not open the device
+    *------------------------------------------------------------------------*/
+    if (p_mpm_transport_handle == NULL) return;
+
+    const bool addr_aligned = MULTIPLE_OF_POW2(dsp_addr, p_page_size);
+    if (!addr_aligned)
+    {
+        REPORT("Mapped region addr is not a multiple of page size");
+        return;
+    }
+
+    const bool size_aligned = MULTIPLE_OF_POW2(size, p_page_size);
+    if (!size_aligned)
+    {
+        REPORT("Mapped region size is not a multiple of page size");
+        return;
+    }
+
+    mpm_transport_mmap_t map_cfg;
+    map_cfg.mmap_prot  = (PROT_READ|PROT_WRITE);
+    map_cfg.mmap_flags = MAP_SHARED;
+
+    void *mapped = (void *)mpm_transport_mmap(p_mpm_transport_handle,
+                                              dsp_addr, size, &map_cfg);
+
+    /*-------------------------------------------------------------------------
+    * (void *) -1 is the failure value checked for here
+    *------------------------------------------------------------------------*/
+    if (mapped == (void *) -1)
+    {
+        REPORT("Failed to mmap");
+        p_host_addr = 0;
+        return;
+    }
+
+    p_host_addr                = mapped;
+    p_xlate_dsp_to_host_offset = (void*)((int64_t)p_host_addr - dsp_addr);
+}
+
+/******************************************************************************
+* shmem_persistent::~shmem_persistent
+******************************************************************************/
+/* Undo the mapping made by configure(), if there is one. */
+shmem_persistent::~shmem_persistent()
+{
+    if (!p_host_addr) return;
+
+    mpm_transport_munmap(p_mpm_transport_handle, p_host_addr, p_size);
+}
+
+/******************************************************************************
+* shmem_persistent::map
+******************************************************************************/
+/* Translate a DSP address inside the configured region to its host address.
+ * Returns 0 when nothing is mapped or when [dsp_addr, dsp_addr + size) is not
+ * fully inside the region.  is_read is not used. */
+void *shmem_persistent::map(DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read)
+{
+    if (!p_host_addr) return 0;
+
+    const bool inside = dsp_addr >= p_dsp_addr &&
+                        dsp_addr + size <= p_dsp_addr + p_size;
+    if (!inside)
+    {
+        REPORT("Attempting to map a region outside a defined area");
+        return 0;
+    }
+
+    return dsp_addr + (char*)p_xlate_dsp_to_host_offset;
+}
+
+/******************************************************************************
+* shmem_persistent::unmap
+******************************************************************************/
+// Intentionally does nothing: the region stays mapped until the destructor
+// runs, and the former msync call is disabled.  All parameters are unused.
+void shmem_persistent::unmap(void* host_addr, uint32_t size, bool is_write)
+{
+ // if (host_addr) msync(host_addr, size, MS_SYNC);
+}
+
+
+
+/******************************************************************************
+* shmem_ondemand::shmem_ondemand
+******************************************************************************/
+// No state beyond the shmem base class; all setup happens in configure().
+shmem_ondemand::shmem_ondemand()
+{ }
+
+/******************************************************************************
+* shmem_ondemand::~shmem_ondemand
+******************************************************************************/
+// Nothing to release here; the body is empty.
+shmem_ondemand::~shmem_ondemand()
+{
+}
+
+/******************************************************************************
+* shmem_ondemand::configure
+******************************************************************************/
+/* Only the base-class setup is needed: individual ranges are mapped later,
+ * one map() call at a time. */
+void shmem_ondemand::configure(DSPDevicePtr64 dsp_addr, uint64_t size)
+{
+    this->configure_base(dsp_addr, size);
+}
+
+
+/******************************************************************************
+* shmem_ondemand::map
+******************************************************************************/
+/* Map [dsp_addr, dsp_addr + size) into the host address space and return the
+ * host address, or 0 on failure.  Both dsp_addr and size must be multiples of
+ * the page size and the range must lie inside the configured region.
+ * is_read is not used. */
+void *shmem_ondemand::map(DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read)
+{
+    if (!MULTIPLE_OF_POW2(dsp_addr, p_page_size))
+    {
+        REPORT("Mapped region addr is not a multiple of page size");
+        return 0;
+    }
+
+    /*-------------------------------------------------------------------------
+    * This check is on the size, so report it as such (the message used to be
+    * a copy of the address one).
+    *------------------------------------------------------------------------*/
+    if (!MULTIPLE_OF_POW2(size, p_page_size))
+    {
+        REPORT("Mapped region size is not a multiple of page size");
+        return 0;
+    }
+
+    if (dsp_addr < p_dsp_addr || dsp_addr + size > p_dsp_addr + p_size)
+    {
+        REPORT("Attempting to map a region outside a defined area");
+        return 0;
+    }
+
+    /*-------------------------------------------------------------------------
+    * configure_base() leaves the handle NULL when the device could not be
+    * opened; do not hand a NULL handle to the transport layer.
+    *------------------------------------------------------------------------*/
+    if (p_mpm_transport_handle == NULL)
+    {
+        REPORT("Failed to mmap");
+        return 0;
+    }
+
+    mpm_transport_mmap_t mpm_transport_mmap_cfg;
+    mpm_transport_mmap_cfg.mmap_prot  = (PROT_READ|PROT_WRITE);
+    mpm_transport_mmap_cfg.mmap_flags = MAP_SHARED;
+
+    void * host_addr = mpm_transport_mmap(p_mpm_transport_handle,
+                                          dsp_addr, size,
+                                          &mpm_transport_mmap_cfg);
+
+    /*-------------------------------------------------------------------------
+    * (void *) -1 is the failure value checked for here
+    *------------------------------------------------------------------------*/
+    if (host_addr == (void *) -1)
+    {
+        REPORT("Failed to mmap");
+        return 0;
+    }
+
+    return host_addr;
+}
+
+/******************************************************************************
+* shmem_ondemand::unmap
+******************************************************************************/
+// Currently a no-op: the munmap call is disabled, so this function does not
+// release the mapping created by map().  All parameters are unused.
+// NOTE(review): confirm whether mpm_transport_munmap should be called here,
+// as ~shmem_persistent() does for its mapping.
+void shmem_ondemand::unmap(void* host_addr, uint32_t size, bool is_write)
+{
+ // if (host_addr) munmap(host_addr, size);
+}
+
+/******************************************************************************
+* shmem_cmem_persistent::shmem_cmem_persistent
+******************************************************************************/
+/* Remember which CMEM block this object is for; nothing is mapped until
+ * configure() is called. */
+shmem_cmem_persistent::shmem_cmem_persistent(int cmem_block)
+    : p_host_addr(0),
+      p_xlate_dsp_to_host_offset(0),
+      p_cmem_block(cmem_block)
+{
+}
+
+/******************************************************************************
+* shmem_cmem_persistent::cmem_init
+* TODO: remove addr3, size3 once uboot is updated, so that we don't have
+* fragmented CMEM blocks for DDR
+******************************************************************************/
+/*-----------------------------------------------------------------------------
+* Print the example "modprobe cmemk" command lines that are shown to the user
+* when the CMEM configuration is missing or incomplete.
+*----------------------------------------------------------------------------*/
+static void print_cmem_example_commands()
+{
+    static const char * const examples[] =
+    {
+        "For available CMEM DDR block size: ~1.5GB:\n"
+        "modprobe cmemk "
+        "phys_start=0x823000000 phys_end=0x880000000 pools=1x1560281088 "
+        "phys_start_1=0x0C040000 phys_end_1=0x0C500000 "
+        "allowOverlap=1",
+
+        "For available CMEM DDR block size: ~3.5GB:\n"
+        "modprobe cmemk "
+        "phys_start=0x823000000 phys_end=0x900000000 pools=1x3707764736 "
+        "phys_start_1=0x0C040000 phys_end_1=0x0C500000 "
+        "allowOverlap=1",
+
+        "For available CMEM DDR block size: ~7.5GB:\n"
+        "modprobe cmemk "
+        "phys_start=0x823000000 phys_end=0xA00000000 pools=1x8002732032 "
+        "phys_start_1=0x0C040000 phys_end_1=0x0C500000 "
+        "allowOverlap=1"
+    };
+
+    for (unsigned i = 0; i < sizeof(examples) / sizeof(examples[0]); ++i)
+        printf("%s\n\n", examples[i]);
+}
+
+/*-----------------------------------------------------------------------------
+* Initialize CMEM and claim its blocks for OpenCL.
+*
+*   addr1/size1  receive the physical base and size of CMEM block 0
+*   addr2/size2  receive the physical base and size of CMEM block 1
+*   addr3/size3  receive the physical base and size of CMEM block 2, or 0/0
+*                when fewer than three blocks exist
+*
+* Any failure prints a message and terminates the process.
+*----------------------------------------------------------------------------*/
+void shmem_cmem_persistent::cmem_init(DSPDevicePtr64 *addr1, uint64_t *size1,
+                                      DSPDevicePtr   *addr2, uint32_t *size2,
+                                      DSPDevicePtr64 *addr3, uint64_t *size3)
+{
+    /*-------------------------------------------------------------------------
+    * First initialize the CMEM module
+    *------------------------------------------------------------------------*/
+    if (CMEM_init() == -1)
+    {
+        printf("\nThe cmemk kernel module does not appear to be installed.\n\n"
+               "Commands such as the following run as root would "
+               "install cmemk\n"
+               "and allow OpenCL to proceed properly. The actual memory "
+               "address values for\n"
+               "your system may differ.\n\n");
+        print_cmem_example_commands();
+        exit(-1);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Check that the expected number of CMEM blocks is present
+    *------------------------------------------------------------------------*/
+    int num_Blocks = 0;
+    CMEM_getNumBlocks(&num_Blocks);
+    if (num_Blocks < 2)
+    {
+        printf("\nOpenCL needs at least two CMEM blocks to operate properly.\n"
+               "One for DDR, the other for MSMC. Example commands:\n");
+        print_cmem_example_commands();
+        exit(-1);
+    }
+
+    CMEM_BlockAttrs pattrs0 = {0, 0};
+    CMEM_BlockAttrs pattrs1 = {0, 0};
+    CMEM_BlockAttrs pattrs2 = {0, 0};
+
+    CMEM_getBlockAttrs(0, &pattrs0);
+    CMEM_getBlockAttrs(1, &pattrs1);
+    if (num_Blocks > 2)
+        CMEM_getBlockAttrs(2, &pattrs2);
+
+    /*-------------------------------------------------------------------------
+    * Return 36-bit addr, and up to 7.5G memory size
+    *------------------------------------------------------------------------*/
+    *addr1 = (DSPDevicePtr64) pattrs0.phys_base;
+    *size1 = (uint64_t)       pattrs0.size;
+    // Persistent CMEM should start within 0x8:2200_0000 - 0x8:4000_0000
+    if (*addr1 >= MPAX_USER_MAPPED_DSP_ADDR)
+    {
+        // Arguments are cast so that they match %llx on every host ABI.
+        printf("Unable to allocate OCL persistent CMem from 0x%llx\n",
+               (unsigned long long) pattrs0.phys_base);
+        exit(EXIT_FAILURE);
+    }
+
+    *addr2 = pattrs1.phys_base;
+    *size2 = pattrs1.size;
+    if (*addr2 < MSMC_OCL_START_ADDR || *addr2 >= MSMC_OCL_END_ADDR)
+    {
+        printf("Unable to allocate OCL MSMC memory from 0x%llx\n",
+               (unsigned long long) pattrs1.phys_base);
+        exit(EXIT_FAILURE);
+    }
+
+    /*-------------------------------------------------------------------------
+    * Grab all available CMEM physical address, to be managed by OCL.  Each
+    * allocation is expected to come back at the base address of its block.
+    *------------------------------------------------------------------------*/
+    DSPDevicePtr64 alloc_dsp_addr = 0;
+    CMEM_AllocParams params = CMEM_DEFAULTPARAMS;
+    params.flags = CMEM_CACHED;
+    params.type  = CMEM_POOL;
+    alloc_dsp_addr = CMEM_allocPoolPhys2(0, 0, &params);
+    if (!alloc_dsp_addr || alloc_dsp_addr != *addr1)
+    {
+        printf("Failed to allocate 0x%llx from CMem 0, allocated=0x%llx\n",
+               (unsigned long long) *size1,
+               (unsigned long long) alloc_dsp_addr);
+        exit(EXIT_FAILURE);
+    }
+
+    params.type = CMEM_HEAP;
+    alloc_dsp_addr = CMEM_allocPhys2(1, *size2, &params);
+    if (!alloc_dsp_addr || alloc_dsp_addr != *addr2)
+    {
+        printf("Failed to allocate 0x%x from CMem 1, allocated=0x%llx\n",
+               (unsigned) *size2,
+               (unsigned long long) alloc_dsp_addr);
+        exit(EXIT_FAILURE);
+    }
+
+    if (num_Blocks > 2)
+    {
+        *addr3 = pattrs2.phys_base;
+        *size3 = pattrs2.size;
+        params.type = CMEM_POOL;
+        alloc_dsp_addr = CMEM_allocPoolPhys2(2, 0, &params);
+        if (!alloc_dsp_addr || alloc_dsp_addr != *addr3)
+        {
+            printf("Failed to allocate 0x%llx from CMem 2, allocated=0x%llx\n",
+                   (unsigned long long) *size3,
+                   (unsigned long long) alloc_dsp_addr);
+            exit(EXIT_FAILURE);
+        }
+    }
+    else
+    {
+        *addr3 = 0;
+        *size3 = 0;
+    }
+}
+
+/******************************************************************************
+* shmem_cmem_persistent::cmem_exit
+******************************************************************************/
+/* Finalize the CMEM module; a failure is fatal. */
+void shmem_cmem_persistent::cmem_exit()
+{
+    const bool failed = (CMEM_exit() == -1);
+
+    ERR(failed, "Failed to finalize CMEM");
+}
+
+/******************************************************************************
+* shmem_cmem_persistent::configure
+******************************************************************************/
+/* Record the region and map it through CMEM_map.  A DSP address in
+ * [0xA0000000, 0xFFFFFFFF) is first rebased onto 0x820000000 to obtain the
+ * address handed to CMEM.  A failed mapping is fatal. */
+void shmem_cmem_persistent::configure(DSPDevicePtr64 dsp_addr, uint64_t size)
+{
+    p_dsp_addr = dsp_addr;
+    p_size     = size;
+
+    const bool needs_rebase = (p_dsp_addr >= 0xA0000000 &&
+                               p_dsp_addr <  0xFFFFFFFF);
+    DSPDevicePtr64 cmem_addr = needs_rebase
+                             ? p_dsp_addr - 0xA0000000 + 0x820000000ULL
+                             : p_dsp_addr;
+
+    p_host_addr = CMEM_map(cmem_addr, size);
+    ERR(!p_host_addr,
+        "Cannot map CMEM physical memory into the Host virtual address space.\n"
+        " This is typically due to Linux system memory being near capacity.");
+
+    p_xlate_dsp_to_host_offset = (int64_t)p_host_addr - dsp_addr;
+}
+
+/******************************************************************************
+* shmem_cmem_persistent::~shmem_cmem_persistent
+******************************************************************************/
+/* Unmap the region mapped by configure() and free its CMEM physical memory.
+ * An object that was never configured (p_dsp_addr == 0) is left alone. */
+shmem_cmem_persistent::~shmem_cmem_persistent()
+{
+    if (p_dsp_addr == 0) return;
+
+    if (p_host_addr != NULL) CMEM_unmap(p_host_addr, p_size);
+
+    CMEM_AllocParams params = CMEM_DEFAULTPARAMS;
+    params.flags = CMEM_CACHED;
+
+    /*-------------------------------------------------------------------------
+    * Rebase with exactly the same range test as configure() (inclusive lower
+    * bound).  The previous '>' here meant a region starting at 0xA0000000 was
+    * mapped at the rebased address but freed at the un-rebased one.
+    *------------------------------------------------------------------------*/
+    DSPDevicePtr64 cmem_addr = p_dsp_addr;
+    if (p_dsp_addr >= 0xA0000000 && p_dsp_addr < 0xFFFFFFFF)
+        cmem_addr = p_dsp_addr - 0xA0000000 + 0x820000000ULL;
+
+    CMEM_freePhys(cmem_addr, &params);
+}
+
+/******************************************************************************
+* shmem_cmem_persistent::map: dsp_addr (phys) -> host_addr (virt)
+******************************************************************************/
+/* Translate a DSP address inside the configured region to its host address.
+ * A request outside the region (or before configure()) goes to ERR, which
+ * terminates the process.  When is_read is set, CMEM_cacheInv is called on
+ * the returned range. */
+void *shmem_cmem_persistent::map(DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read)
+{
+    const bool outside = !p_host_addr ||
+                         dsp_addr < p_dsp_addr ||
+                         dsp_addr + size > p_dsp_addr + p_size;
+    if (outside)
+    {
+        ERR(1, "Attempting to cmem_map a region outside a defined area");
+        return NULL;
+    }
+
+    char *xlate     = (char*)p_xlate_dsp_to_host_offset;
+    void *host_addr = dsp_addr + xlate;
+
+    if (is_read) CMEM_cacheInv(host_addr, size);
+
+    return host_addr;
+}
+
+/******************************************************************************
+* shmem_cmem_persistent::unmap: flush host side writes
+******************************************************************************/
+/* The mapping itself is kept; only call CMEM_cacheWb on the range when the
+ * caller says it wrote to it. */
+void shmem_cmem_persistent::unmap(void* host_addr, uint32_t size, bool is_write)
+{
+    if (!host_addr || !is_write) return;
+
+    CMEM_cacheWb(host_addr, size);
+}
+
+
+/******************************************************************************
+* shmem_cmem_ondemand::configure
+******************************************************************************/
+/* Only records the region; nothing is mapped here. */
+void shmem_cmem_ondemand::configure(DSPDevicePtr64 dsp_addr, uint64_t size)
+{
+    p_size     = size;
+    p_dsp_addr = dsp_addr;
+}
+
+/******************************************************************************
+* shmem_cmem_ondemand::map: dsp_addr (phys) -> host_addr (virt)
+******************************************************************************/
+/* Map [dsp_addr, dsp_addr + size) with CMEM_map and return the host address.
+ * A failed mapping is fatal.  When is_read is set, CMEM_cacheInv is called on
+ * the new mapping. */
+void *shmem_cmem_ondemand::map(DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read)
+{
+    void *mapped = CMEM_map(dsp_addr, size);
+
+    ERR(!mapped, "Failed to map CMEM address (ondemand)");
+
+    if (is_read)
+        CMEM_cacheInv(mapped, size);
+
+    return mapped;
+}
+
+/******************************************************************************
+* shmem_cmem_ondemand::unmap: flush host side writes, then unmap
+******************************************************************************/
+/* Call CMEM_cacheWb on the range if the caller wrote to it, then remove the
+ * mapping.  A null host_addr is ignored. */
+void shmem_cmem_ondemand::unmap(void* host_addr, uint32_t size, bool is_write)
+{
+    if (!host_addr) return;
+
+    if (is_write) CMEM_cacheWb(host_addr, size);
+
+    CMEM_unmap(host_addr, size);
+}
+
+/******************************************************************************
+* shmem_cmem_ondemand::cmem_malloc: allocate CMEM physical address
+* 64-bit size: could be allocating a buffer, then accessing smaller subbuffers
+******************************************************************************/
+/* Allocate size bytes from CMEM block 0 (heap type, CMEM_CACHED flag) and
+ * return the physical address.  A failed allocation is fatal. */
+DSPDevicePtr64 shmem_cmem_ondemand::cmem_malloc(uint64_t size)
+{
+    CMEM_AllocParams params = CMEM_DEFAULTPARAMS;
+    params.flags = CMEM_CACHED;
+    params.type  = CMEM_HEAP;
+
+    DSPDevicePtr64 addr = CMEM_allocPhys2(0, size, &params);
+    if (!addr)
+    {
+        // uint64_t is not unsigned long long on every host ABI; cast so the
+        // argument always matches %llx.
+        printf("Failed to allocate space 0x%llx from CMem\n",
+               (unsigned long long) size);
+        exit(EXIT_FAILURE);
+    }
+    return addr;
+}
+
+/******************************************************************************
+* shmem_cmem_ondemand::cmem_free: free allocated CMEM physical address
+******************************************************************************/
+/* Free a physical address with CMEM_freePhys, using the same heap type and
+ * CMEM_CACHED flag that cmem_malloc() sets. */
+void shmem_cmem_ondemand::cmem_free(DSPDevicePtr64 addr)
+{
+    CMEM_AllocParams free_params = CMEM_DEFAULTPARAMS;
+
+    free_params.type  = CMEM_HEAP;
+    free_params.flags = CMEM_CACHED;
+
+    CMEM_freePhys(addr, &free_params);
+}
+
diff --git a/src/core/dsp/shmem.h b/src/core/dsp/shmem.h
new file mode 100644
index 0000000..03504a0
--- /dev/null
+++ b/src/core/dsp/shmem.h
@@ -0,0 +1,134 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <stdint.h>
+#ifndef _SHMEM_H
+#define _SHMEM_H
+
+extern "C"
+{
+ #include <mpm_transport.h>
+}
+#include "dspmem.h"
+
+/*=============================================================================
+* Abstract class for Shared memory
+*============================================================================*/
+// Base class for a window of DSP memory that can be made addressable from
+// the host.  Derived classes supply configure(), map() and unmap().
+class shmem
+{
+ public:
+ shmem ();
+ virtual ~shmem ();
+ // Opens the mpm_transport device for dsp_addr and records the region.
+ virtual void configure_base(DSPDevicePtr64 dsp_addr, uint64_t size);
+ // Defines the region [dsp_addr, dsp_addr + size) this object manages.
+ virtual void configure (DSPDevicePtr64 dsp_addr, uint64_t size) = 0;
+ // Returns a host address for the DSP range.  The non-CMEM implementations
+ // return 0 on failure; the CMEM ones terminate the process instead.
+ virtual void *map (DSPDevicePtr64 dsp_addr, uint32_t size,
+ bool is_read=false) = 0;
+ // Counterpart of map() for a host address it returned.
+ virtual void unmap (void* host_addr, uint32_t size,
+ bool is_write=false) = 0;
+ // NOTE(review): declared here, but shmem.cpp contains no definition of
+ // page_size() - confirm it is defined elsewhere or unused.
+ uint32_t page_size ();
+ DSPDevicePtr64 start () { return p_dsp_addr; }
+ int64_t size () { return p_size; }
+
+ protected:
+ DSPDevicePtr64 p_dsp_addr;   // first DSP address of the region
+ int64_t p_size;              // region size as given to configure
+ uint32_t p_page_size;        // sysconf(_SC_PAGE_SIZE), read in the constructor
+ int32_t p_mmap_fd;           // initialized to -1; closed in the destructor if set
+ mpm_transport_h p_mpm_transport_handle;  // NULL until configure_base() succeeds
+
+};
+
+/*=============================================================================
+* Persistent implementation of shmem
+*============================================================================*/
+// Maps the whole region once in configure() and keeps it mapped until the
+// destructor; map() only translates addresses inside that region.
+class shmem_persistent : public shmem
+{
+ public:
+ shmem_persistent ();
+ ~shmem_persistent ();
+ void configure(DSPDevicePtr64 dsp_addr, uint64_t size);
+ virtual void *map (DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read=false);
+ virtual void unmap (void* host_addr, uint32_t size, bool is_write=false);
+
+ private:
+ void * p_host_addr;                  // host address of the mapping, 0 if none
+ void * p_xlate_dsp_to_host_offset;   // p_host_addr minus the region's DSP address
+};
+
+/*=============================================================================
+* On Demand implementation of shmem
+*============================================================================*/
+// Creates a new mapping on every map() call instead of mapping the whole
+// region up front.
+class shmem_ondemand : public shmem
+{
+ public:
+ shmem_ondemand ();
+ ~shmem_ondemand ();
+ void configure(DSPDevicePtr64 dsp_addr, uint64_t size);
+ virtual void *map (DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read=false);
+ virtual void unmap (void* host_addr, uint32_t size, bool is_write=false);
+};
+
+/*=============================================================================
+* Persistent implementation of shmem using CMem
+*============================================================================*/
+// CMEM-backed region that is mapped once in configure().  map() and unmap()
+// translate addresses and issue CMEM cache operations on request.
+class shmem_cmem_persistent : public shmem
+{
+ public:
+ shmem_cmem_persistent (int cmem_block);
+ ~shmem_cmem_persistent ();
+ void configure(DSPDevicePtr64 dsp_addr, uint64_t size);
+ virtual void *map (DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read=false);
+ virtual void unmap (void* host_addr, uint32_t size, bool is_write=false);
+
+ // Process-wide CMEM setup: outputs the base/size of CMEM blocks 0, 1 and
+ // (if present, otherwise 0/0) 2.  Terminates the process on failure.
+ static void cmem_init(DSPDevicePtr64* addr1, uint64_t* size1,
+ DSPDevicePtr* addr2, uint32_t* size2,
+ DSPDevicePtr64* addr3, uint64_t* size3);
+ // Finalizes CMEM; terminates the process if that fails.
+ static void cmem_exit();
+
+ private:
+ void * p_host_addr;                   // host address returned by CMEM_map
+ int64_t p_xlate_dsp_to_host_offset;   // p_host_addr minus the region's DSP address
+ int p_cmem_block;                     // constructor argument; not read in shmem.cpp
+};
+
+/*=============================================================================
+* Ondemand implementation of shmem using CMem
+*============================================================================*/
+// CMEM-backed region where every map() call creates a mapping with CMEM_map
+// and every unmap() call removes it again.
+class shmem_cmem_ondemand : public shmem
+{
+ public:
+ shmem_cmem_ondemand () {}
+ ~shmem_cmem_ondemand () {}
+ void configure(DSPDevicePtr64 dsp_addr, uint64_t size);
+ virtual void *map (DSPDevicePtr64 dsp_addr, uint32_t size, bool is_read=false);
+ virtual void unmap (void* host_addr, uint32_t size, bool is_write=false);
+
+ // Allocate / free CMEM physical memory; cmem_malloc terminates the process
+ // when the allocation fails.
+ static DSPDevicePtr64 cmem_malloc(uint64_t size);
+ static void cmem_free (DSPDevicePtr64 addr);
+};
+
+#endif // _SHMEM_H
diff --git a/src/core/dsp/source_cache.h b/src/core/dsp/source_cache.h
new file mode 100644
index 0000000..66b4400
--- /dev/null
+++ b/src/core/dsp/source_cache.h
@@ -0,0 +1,114 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef _source_cache_
+#define _source_cache_
+
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+
+#include <boost/lexical_cast.hpp>
+#include <boost/crc.hpp>
+
+#include <sys/stat.h>
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <stdint.h>
+#include "u_locks_pthread.h"
+#include "database.h"
+
+/*=============================================================================
+* Singleton that records program sources, together with their CRC-32, in the
+* "programs" table of a per-user database file under /tmp.
+*============================================================================*/
+class source_cache
+{
+  public:
+    /*-------------------------------------------------------------------------
+    * Insert (crc32(source), source) into the programs table.  The source is
+    * written as a single-quoted SQL string literal with embedded quotes
+    * doubled, so program text that contains quote characters no longer
+    * breaks (or alters) the statement.
+    *------------------------------------------------------------------------*/
+    void remember(std::string source)
+    {
+        uint32_t hash = get_crc(source);
+        std::string query("insert into programs(hash, source) values("
+                          + boost::lexical_cast<std::string>(hash)
+                          + ", '"
+                          + sql_quote(source)
+                          + "');");
+
+        p_database.query(query.c_str());
+    }
+
+    /*-------------------------------------------------------------------------
+    * Thread safe instance function for singleton behavior
+    *------------------------------------------------------------------------*/
+    static source_cache* instance ()
+    {
+        static Mutex Cache_instance_mutex;
+        source_cache* tmp = pInstance;
+
+        __sync_synchronize();
+
+        if (tmp == 0)
+        {
+            ScopedLock lck(Cache_instance_mutex);
+
+            // Re-check under the lock: another thread may have created it.
+            tmp = pInstance;
+            if (tmp == 0)
+            {
+                // USER may be unset; building a std::string from a NULL
+                // pointer is undefined, so fall back to a fixed suffix.
+                const char *user = getenv("USER");
+                std::string suffix(user ? user : "unknown");
+
+                tmp = new source_cache("/tmp/opencl_source_" + suffix);
+                __sync_synchronize();
+                pInstance = tmp;
+            }
+        }
+        return tmp;
+    }
+
+
+  private:
+    static source_cache* pInstance;
+    std::string          p_dbname;
+    Database             p_database;
+
+  private:
+    /*-------------------------------------------------------------------------
+    * Open the database db_name and make sure the programs table exists.
+    *------------------------------------------------------------------------*/
+    source_cache(std::string db_name) : p_dbname(db_name), p_database(db_name.c_str())
+    {
+        p_database.query("create table if not exists "
+                         "programs(hash integer, source string);");
+    }
+
+    /*-------------------------------------------------------------------------
+    * CRC-32 over the bytes of my_string.
+    *------------------------------------------------------------------------*/
+    uint32_t get_crc(std::string& my_string)
+    {
+        boost::crc_32_type result;
+        result.process_bytes(my_string.data(), my_string.length());
+        return result.checksum();
+    }
+
+    /*-------------------------------------------------------------------------
+    * Return text with every single quote doubled, i.e. escaped for use inside
+    * a single-quoted SQL string literal.
+    *------------------------------------------------------------------------*/
+    static std::string sql_quote(const std::string& text)
+    {
+        std::string quoted;
+        quoted.reserve(text.size());
+        for (std::string::size_type i = 0; i < text.size(); ++i)
+        {
+            if (text[i] == '\'') quoted += '\'';
+            quoted += text[i];
+        }
+        return quoted;
+    }
+
+    source_cache(const source_cache&);            // copy ctor disallowed
+    source_cache& operator=(const source_cache&); // assignment disallowed
+};
+
+#endif // _source_cache_
+
+
diff --git a/src/core/dsp/u_concurrent_map.h b/src/core/dsp/u_concurrent_map.h
new file mode 100644
index 0000000..014c0b6
--- /dev/null
+++ b/src/core/dsp/u_concurrent_map.h
@@ -0,0 +1,137 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+/**************************************************************************//**
+*
+* @file u_concurrent_map.h
+* @brief TI implementation class that implements a thread safe map.
+*
+******************************************************************************/
+#ifndef _U_CONCURRENT_MAP_H_
+#define _U_CONCURRENT_MAP_H_
+
+#include <iostream>
+#include <map>
+#include "u_lockable.h"
+
+/**************************************************************************//**
+* @class concurrent_map
+*
+* @brief A thread safe map implementation
+*
+* @details This implementation wraps a standard stl map with some locking
+*          capability to make the member functions mutually exclusive
+*          regions.  It derives from the class Lockable which defines a type
+*          Lock that can be used to define an object in a scope.  The result
+*          will be that the remainder of the scope (or until unlock is
+*          called) is a mutually exclusive region.
+*
+*          The element count is always taken from the underlying map, so it
+*          stays correct when push() overwrites an existing key.
+*
+******************************************************************************/
+template<typename I, typename T>
+class concurrent_map : public Lockable
+{
+public:
+    concurrent_map() : M() {}
+    ~concurrent_map() {}
+
+    /**********************************************************************//**
+    * @brief Place an object in the map.
+    * @param index is the key under which the item is stored
+    * @param data  is the item to store; it replaces any item already stored
+    *              under the same key
+    ***************************************************************************/
+    void push(I index, T const data)
+    {
+        Lock lock(this);
+        M[index] = data;
+    }
+
+    /**********************************************************************//**
+    * @brief How many elements are in the map.
+    * @returns The number of distinct keys currently stored in the map.
+    ***************************************************************************/
+    int size() const
+    {
+        Lock lock(this);
+        return static_cast<int>(M.size());
+    }
+
+    /**********************************************************************//**
+    * @brief Determine if the map is empty.
+    * @returns true if the map is empty, otherwise false.
+    ***************************************************************************/
+    bool empty() const
+    {
+        Lock lock(this);
+        return M.empty();
+    }
+
+    /**********************************************************************//**
+    * @brief Attempt to remove the item stored under a key.
+    * @param idx is the key to look up
+    * @param popped_value is an output parameter that receives the removed
+    *        item; it is left untouched when the key is not present
+    * @returns true if a value is popped, otherwise false
+    ***************************************************************************/
+    bool try_pop(I idx, T& popped_value)
+    {
+        Lock lock(this);
+
+        typename std::map<I,T>::iterator it = M.find(idx);
+        if (it == M.end()) return false;
+
+        popped_value = it->second;
+        M.erase(it);
+        return true;
+    }
+
+    /**********************************************************************//**
+    * @brief Print every "key ==> value" pair to std::cout, one per line.
+    * @details The lock is held for the whole traversal so that concurrent
+    *          push/try_pop calls cannot invalidate the iterator.
+    ***************************************************************************/
+    void dump()
+    {
+        Lock lock(this);
+        for (typename std::map<I,T>::const_iterator i = M.begin(); i != M.end(); ++i)
+            std::cout << i->first << " ==> " << i->second << std::endl;
+    }
+
+    /*-------------------------------------------------------------------------
+    * The class's data
+    *------------------------------------------------------------------------*/
+private:
+    std::map<I,T> M;  //!< standard stl map
+
+    /*-------------------------------------------------------------------------
+    * Prevent copy construction and assignment
+    *------------------------------------------------------------------------*/
+private:
+    concurrent_map(const concurrent_map&);
+    concurrent_map& operator=(const concurrent_map&);
+};
+
+#endif //_U_CONCURRENT_MAP_H_
diff --git a/src/core/dsp/u_concurrent_stack.h b/src/core/dsp/u_concurrent_stack.h
new file mode 100644
index 0000000..6e9755b
--- /dev/null
+++ b/src/core/dsp/u_concurrent_stack.h
@@ -0,0 +1,124 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+/**************************************************************************//**
+*
+* @file u_concurrent_stack.h
+* @brief TI implementation class that implements a thread safe stack.
+*
+******************************************************************************/
+#ifndef _U_CONCURRENT_STACK_H_
+#define _U_CONCURRENT_STACK_H_
+
+#include <iostream>
+#include <stack>
+#include "u_lockable.h"
+
+/**************************************************************************//**
+* @class concurrent_stack
+*
+* @brief A stack whose member functions each run under the object's lock.
+*
+* @details A standard stl stack is wrapped so that every public member
+*          function first creates a Lock (the type inherited from Lockable)
+*          on this object.  The lock is held until the member function
+*          returns, which makes the member functions mutually exclusive.
+*
+******************************************************************************/
+template<typename T>
+class concurrent_stack : public Lockable
+{
+public:
+    concurrent_stack()
+        : S(),
+          num_elements(0)
+    {
+    }
+
+    ~concurrent_stack()
+    {
+    }
+
+    /**********************************************************************//**
+    * @brief Push an object onto the top of the stack.
+    * @param data is the item to push on the stack
+    ***************************************************************************/
+    void push(T const data)
+    {
+        Lock guard(this);
+        S.push(data);
+        ++num_elements;
+    }
+
+    /**********************************************************************//**
+    * @brief Report the number of elements held.
+    * @returns The number of elements in the stack.
+    ***************************************************************************/
+    int size() const
+    {
+        Lock guard(this);
+        return num_elements;
+    }
+
+    /**********************************************************************//**
+    * @brief Report whether the stack holds no elements.
+    * @returns true if the stack is empty, otherwise false.
+    ***************************************************************************/
+    bool empty() const
+    {
+        Lock guard(this);
+        const bool nothing_stored = (num_elements == 0);
+        return nothing_stored;
+    }
+
+    /**********************************************************************//**
+    * @brief Attempt to pop the top item off the stack.
+    * @param popped_value is an output parameter that receives the popped
+    *        object; it is not modified when the stack is empty.
+    * @returns true if a value is popped, otherwise false
+    ***************************************************************************/
+    bool pop(T& popped_value)
+    {
+        Lock guard(this);
+
+        if (num_elements != 0)
+        {
+            popped_value = S.top();
+            S.pop();
+            --num_elements;
+            return true;
+        }
+
+        return false;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Storage: the wrapped stack and a count kept in step with it
+    *------------------------------------------------------------------------*/
+private:
+    std::stack<T> S;            //!< standard stl stack
+    int           num_elements; //!< incremented by push, decremented by pop
+
+    /*-------------------------------------------------------------------------
+    * Copying is not supported: declared private and left undefined
+    *------------------------------------------------------------------------*/
+private:
+    concurrent_stack(const concurrent_stack&);
+    concurrent_stack& operator=(const concurrent_stack&);
+};
+
+#endif //_U_CONCURRENT_STACK_H_
diff --git a/src/core/dsp/u_lockable.h b/src/core/dsp/u_lockable.h
new file mode 100644
index 0000000..803197f
--- /dev/null
+++ b/src/core/dsp/u_lockable.h
@@ -0,0 +1,109 @@
+/******************************************************************************
+* The Loki Library
+* Copyright (c) 2001 by Andrei Alexandrescu
+* Copyright (c) 2010-2014, Texas Instruments Incorporated
+*
+* This code accompanies the book:
+* Alexandrescu, Andrei. "Modern C++ Design: Generic Programming and Design
+* Patterns Applied". Copyright (c) 2001. Addison-Wesley.
+* Permission to use, copy, modify, distribute and sell this software for any
+* purpose is hereby granted without fee, provided that the above copyright
+* notice appear in all copies and that both that copyright notice and this
+* permission notice appear in supporting documentation.
+* The author or Addison-Wesley Longman make no representations about the
+* suitability of this software for any purpose. It is provided "as is"
+* without express or implied warranty.
+******************************************************************************/
+
+/**************************************************************************//**
+*
+* @file u_lockable.h
+*
+* @brief Defines a base class that provides a derived class with a Lock type.
+*
+* @version 1.00.00
+*
+* @note The Lockable class is a modified version of the ObjectLevelLockable
+* class from the LOKI library. The copyright from that library is
+* included at the top of this file.
+*
+******************************************************************************/
+#ifndef _U_LOCKABLE_H_
+#define _U_LOCKABLE_H_
+#include "u_locks_pthread.h"
+
+/**************************************************************************//**
+* @brief   used as a base class to give your derived class a Lock type.
+* @details Have a class derive from this class and you can lock member
+*          functions of your class by defining a lock like this
+*              Lock lock(this);
+*          The host's mutex is locked when the Lock is constructed and is
+*          unlocked exactly once: either by an explicit call to unlock() or,
+*          failing that, when the Lock is destroyed.
+******************************************************************************/
+class Lockable
+{
+  public:
+    Lockable() : mutex() {}                 //!< Default Constructor
+    Lockable(const Lockable&) : mutex() {}  //!< Copy gets its own new mutex
+    ~Lockable() {}                          //!< Destructor
+
+    /**********************************************************************//**
+    * @brief The Lock type defined by inheriting from Lockable.
+    **************************************************************************/
+    class Lock
+    {
+      public:
+
+        /*******************************************************************//**
+        * @brief Constructing a Lock object will lock the parent object's mutex
+        * @param host_ the object whose mutex is locked; must not be null
+        ***********************************************************************/
+        explicit Lock(const Lockable* host_) : host(*host_), locked(true)
+        { host.mutex.Lock(); }
+
+        /*******************************************************************//**
+        * @brief Destructing a Lock object will unlock the parent object's
+        *        mutex, unless unlock() has already released it.
+        ***********************************************************************/
+        ~Lock() { if (locked) host.mutex.Unlock(); }
+
+        /*******************************************************************//**
+        * @brief Unlock the parent object's mutex before the end of the scope.
+        * @details Safe to call more than once.  The release is remembered so
+        *          that the destructor does not unlock the mutex a second
+        *          time, which could release a lock since taken by another
+        *          thread.  Code that re-locks the mutex by hand through
+        *          raw() after calling unlock() must also unlock it by hand.
+        ***********************************************************************/
+        void unlock()
+        {
+            if (locked)
+            {
+                locked = false;
+                host.mutex.Unlock();
+            }
+        }
+
+        /*******************************************************************//**
+        * @brief Return a raw pointer to the parent object's mutex
+        ***********************************************************************/
+        Mutex* raw() { return &host.mutex; }
+
+      private:
+        const Lockable& host;    //!< a reference back to the parent object
+        bool            locked;  //!< true while this Lock owes an Unlock()
+
+      private: // prevent copy construction and assignment
+        Lock(const Lock&);
+        Lock& operator=(const Lock&);
+    };
+
+  protected:
+    mutable Mutex mutex;  //!< mutable so const member functions can lock
+};
+
+/*-----------------------------------------------------------------------------
+* Drop-in stand-in for Lockable whose Lock type does nothing.  Deriving from
+* this class instead turns locking off without changing client code that
+* declares "Lock lock(this);" or calls lock.unlock().
+*----------------------------------------------------------------------------*/
+class Lockable_off
+{
+  public:
+    Lockable_off()
+    {
+    }
+
+    /*-------------------------------------------------------------------------
+    * Lock type with the same constructor and unlock() shape as Lockable::Lock;
+    * both operations are empty.
+    *------------------------------------------------------------------------*/
+    class Lock
+    {
+      public:
+
+        // The host pointer is accepted for source compatibility and ignored.
+        explicit Lock(const Lockable_off*)
+        {
+        }
+
+        // Nothing was locked, so there is nothing to release.
+        void unlock()
+        {
+        }
+
+      private:
+        // Not copyable: declared private and left undefined.
+        Lock(const Lock&);
+        Lock& operator=(const Lock&);
+    };
+};
+
+#endif
diff --git a/src/core/dsp/u_locks_pthread.h b/src/core/dsp/u_locks_pthread.h
new file mode 100644
index 0000000..4663a57
--- /dev/null
+++ b/src/core/dsp/u_locks_pthread.h
@@ -0,0 +1,137 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+/**************************************************************************//**
+*
+* @file u_locks_pthread.h
+*
+* @brief TI implementation classes for mutual exclusion and locking.
+*
+* @ingroup Utilities
+*
+* @version 1.00.00
+*
+******************************************************************************/
+#ifndef _U_LOCKS_PTHREAD_H_
+#define _U_LOCKS_PTHREAD_H_
+
+#include <pthread.h>
+
+/**************************************************************************//**
+* @brief   Thin C++ wrapper that owns one pthread mutex.
+*
+* @details The pthread mutex is initialized with default attributes when the
+*          object is constructed and destroyed when the object is destructed.
+*          Lock() and Unlock() forward directly to the pthread calls; their
+*          return codes are not examined.
+*
+******************************************************************************/
+class Mutex
+{
+  public:
+    //! Initialize the underlying pthread mutex with default attributes
+    Mutex()
+    {
+        pthread_mutex_init(&handle, 0);
+    }
+
+    //! Destroy the underlying pthread mutex
+    ~Mutex()
+    {
+        pthread_mutex_destroy(&handle);
+    }
+
+    //! Acquire the mutex
+    void Lock()
+    {
+        pthread_mutex_lock(&handle);
+    }
+
+    //! Release the mutex
+    void Unlock()
+    {
+        pthread_mutex_unlock(&handle);
+    }
+
+    //! Expose the address of the underlying pthread mutex
+    pthread_mutex_t* raw()
+    {
+        return &handle;
+    }
+
+  private:
+    pthread_mutex_t handle;  //!< The underlying pthread mutex
+
+  private: // not copyable: declared private and left undefined
+    Mutex(const Mutex &);
+    Mutex & operator = (const Mutex &);
+};
+
+/**************************************************************************//**
+* @brief   Simple condition variable implemented using the pthreads library.
+*
+* @details Condition variables are synchronization primitives that enable
+*          threads to wait until a particular condition occurs.  Condition
+*          variables enable threads to atomically release a lock and sleep.
+*          Condition variables support operations that "wake one" or
+*          "wake all" waiting threads.  After a thread is woken, it
+*          re-acquires the lock it released when the thread entered the
+*          sleeping state.
+*
+*          All members forward directly to the corresponding pthread_cond_*
+*          call; return codes are not examined.
+*
+******************************************************************************/
+class CondVar
+{
+  public:
+
+    CondVar()  { pthread_cond_init    (&cond, 0); }  //!< Constructor
+    ~CondVar() { pthread_cond_destroy (&cond);    }  //!< Destructor
+
+    /**********************************************************************//**
+    * @brief Signal 1 of N threads waiting on the condition variable
+    **************************************************************************/
+    void notify_one() { pthread_cond_signal   (&cond); }
+
+    /**********************************************************************//**
+    * @brief Signal all N threads waiting on the condition variable
+    **************************************************************************/
+    void notify_all() { pthread_cond_broadcast(&cond); }
+
+    /**********************************************************************//**
+    * @brief Wait on the condition variable and release the passed mutex.
+    * @param m mutex that the calling thread currently holds; it is released
+    *          while waiting and held again when wait() returns.
+    * @note  POSIX permits pthread_cond_wait to return without a matching
+    *        notification, so callers should re-test their condition in a
+    *        loop around wait().
+    **************************************************************************/
+    void wait(Mutex* m) { pthread_cond_wait(&cond, m->raw()); }
+
+  private:
+    pthread_cond_t cond; //!< The underlying pthread condition variable
+
+  private: // prevent copy construction and assignment (same form as Mutex)
+    CondVar(const CondVar&);
+    CondVar& operator=(const CondVar&);
+};
+
+/**************************************************************************//**
+* @brief   Holds a Mutex locked for the lifetime of the object.
+*
+* @details Declare one of these in a scope, passing a mutex reference, and
+*          the mutex is locked until the object goes out of scope.  Because
+*          the unlock happens in the destructor, the mutex is released on
+*          every exit path, including exceptions and early returns.
+*
+******************************************************************************/
+class ScopedLock
+{
+  public:
+    //! Lock the given mutex and remember it for the destructor
+    ScopedLock(Mutex &m)
+        : mutex(m)
+    {
+        mutex.Lock();
+    }
+
+    //! Unlock the mutex that was locked by the constructor
+    ~ScopedLock()
+    {
+        mutex.Unlock();
+    }
+
+  private:
+    Mutex& mutex;  //!< The mutex held by this object
+
+  private: // not copyable: declared private and left undefined
+    ScopedLock(const ScopedLock&);
+    ScopedLock& operator=(const ScopedLock&);
+};
+
+#endif
diff --git a/src/core/dsp/utils.h b/src/core/dsp/utils.h
new file mode 100644
index 0000000..f125ebd
--- /dev/null
+++ b/src/core/dsp/utils.h
@@ -0,0 +1,85 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef __UTILS_H
+#define __UTILS_H
+
+/**
+ * \brief Increment a n-component vector given a maximum value
+ *
+ * This function is used to increment a vector for which a set of maximum values
+ * each of its elements can reach before the next one is incremented. Element 0
+ * varies fastest: it is incremented first, and when it would exceed its maximum
+ * it wraps to zero and the increment carries into element 1, and so on.
+ *
+ * For example, if \p dims is \c 3, \p vec starts at <tt>{0, 0, 0}</tt> and
+ * \p maxs is <tt>{2, 3, 1}</tt>, repeatedly calling this function with the
+ * same vector will produce the following results :
+ *
+ * \code
+ * {1, 0, 0}
+ * {2, 0, 0}
+ * {0, 1, 0}
+ * {1, 1, 0}
+ * {2, 1, 0}
+ * {0, 2, 0}
+ * {1, 2, 0}
+ * {2, 2, 0}
+ * ...
+ * \endcode
+ *
+ * Until \p vec reaches <tt>{2, 3, 1}</tt>. The call after that wraps every
+ * element, leaving \p vec at <tt>{0, 0, 0}</tt>, and returns true.
+ *
+ * \param dims number of elements in the vectors
+ * \param vec vector whose elements will be incremented
+ * \param maxs vector containing a maximum value above which each corresponding
+ *        element of \p vec cannot go.
+ * \return false if the increment was ok (or \p dims is 0), true if \p vec was
+ *         already at its maximum value and has been wrapped back to all zeros.
+ */
+template<typename T>
+bool incVec(unsigned long dims, T *vec, T *maxs)
+{
+    bool overflow = false;
+
+    // The index has the same type as dims so the comparison cannot be
+    // defeated by a narrower counter wrapping around.
+    for (unsigned long i = 0; i < dims; ++i)
+    {
+        vec[i] += 1;
+
+        if (vec[i] > maxs[i])
+        {
+            // This element wrapped: reset it and carry into the next one.
+            vec[i] = 0;
+            overflow = true;
+        }
+        else
+        {
+            // The increment was absorbed here; no further carry is needed.
+            overflow = false;
+            break;
+        }
+    }
+
+    return overflow;
+}
+#endif
diff --git a/src/core/dsp/wga.cpp b/src/core/dsp/wga.cpp
new file mode 100644
index 0000000..8269898
--- /dev/null
+++ b/src/core/dsp/wga.cpp
@@ -0,0 +1,464 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "wga.h"
+#include <iostream>
+#include <llvm/Pass.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Module.h>
+#include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/DataLayout.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Support/InstIterator.h>
+#include <llvm/IR/IntrinsicInst.h>
+#include "llvm/Support/CFG.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "boost/assign/std/set.hpp"
+#include <stdio.h>
+
+using namespace std;
+using namespace boost::assign;
+
+namespace llvm
+{
+
+/******************************************************************************
+* createTIOpenclWorkGroupAggregationPass
+******************************************************************************/
+Pass *createTIOpenclWorkGroupAggregationPass(bool is_pocl_mode)
+{
+    // Hand back a newly allocated pass configured for the requested mode
+    return new TIOpenclWorkGroupAggregation(is_pocl_mode);
+}
+
+/**************************************************************************
+* Constructor
+**************************************************************************/
+TIOpenclWorkGroupAggregation::TIOpenclWorkGroupAggregation(bool pocl_mode)
+    : FunctionPass(ID), is_pocl_mode(pocl_mode)
+{
+    // No induction variable is known until add_loop() records one
+    unsigned int d = MAX_DIMENSIONS;
+    while (d-- > 0) IVPhi[d] = 0;
+}
+
+/**************************************************************************
+* Get index variable
+* 1. Original mode, only one loop inserted: return IVPhi[]
+* 2. pocl mode, multiple loops inserted: return a new LoadInst
+**************************************************************************/
+llvm::Instruction* TIOpenclWorkGroupAggregation::get_IV(Function &F,
+                                                        CallInst *call)
+{
+    /*-------------------------------------------------------------------------
+    * Returns the value that replaces a get_local_id(arg) call.
+    *   call : the get_local_id call; its first operand selects the dimension.
+    * Constant dimension: the matching IVPhi[] entry (original mode) or a new,
+    * not yet inserted, load of _local_id_{x,y,z} (pocl mode).
+    * Non-constant dimension: compare/select instructions are built so that
+    * the result is (arg == 2) ? ivz : (arg == 1 ? ivy : ivx).
+    *------------------------------------------------------------------------*/
+    llvm::Value *ivx, *ivy, *ivz;
+    Value *arg = call->getArgOperand(0);
+    Module *M = F.getParent();
+
+    bool     is_const_dim = false;
+    uint32_t dim          = 0;
+
+    if (ConstantInt * constInt = dyn_cast<ConstantInt>(arg))
+    {
+        is_const_dim = true;
+        dim = constInt->getSExtValue();
+
+        // Clamp exactly like findNeededLoopNest() does, so an out-of-range
+        // constant can neither index past IVPhi[] nor leave the global
+        // pointer below unset.
+        if (dim > MAX_DIMENSIONS - 1) dim = MAX_DIMENSIONS - 1;
+    }
+
+    if (is_pocl_mode)
+    {
+        if (is_const_dim)
+        {
+            llvm::GlobalValue *iv;
+            if (dim == 2)      iv = M->getNamedGlobal("_local_id_z");
+            else if (dim == 1) iv = M->getNamedGlobal("_local_id_y");
+            else               iv = M->getNamedGlobal("_local_id_x");
+            return new LoadInst(iv);
+        }
+
+        // NOTE(review): these are the globals themselves, not loads of them,
+        // unlike the constant path above -- confirm this is what the
+        // non-constant pocl path intends.
+        ivx = M->getNamedGlobal("_local_id_x");
+        ivy = M->getNamedGlobal("_local_id_y");
+        ivz = M->getNamedGlobal("_local_id_z");
+    }
+    else
+    {
+        if (is_const_dim) return IVPhi[dim];
+
+        ivx = IVPhi[0];
+        ivy = IVPhi[1];
+        ivz = IVPhi[2];
+    }
+
+    // not constant arg: return (arg == 2) ? ivz : (arg == 1 ? ivy : ivx)
+    Type *Int32 = Type::getInt32Ty(F.getContext());
+    Value *one = ConstantInt::get(Int32, 1);
+    Value *two = ConstantInt::get(Int32, 2);
+
+    // inner select: arg == 1 picks y, anything else falls back to x
+    llvm::Value *is_y = new ICmpInst(call, ICmpInst::ICMP_EQ, arg, one);
+    llvm::Value *sel_yx = SelectInst::Create(is_y, ivy, ivx, "", call);
+
+    // outer select: arg == 2 picks z, otherwise the inner result
+    llvm::Value *is_z = new ICmpInst(call, ICmpInst::ICMP_EQ, arg, two);
+
+    // In pocl mode the final select is left uninserted because the caller
+    // places it with ReplaceInstWithInst(); otherwise it goes before the call.
+    return SelectInst::Create(is_z, ivz, sel_yx, "",
+                              is_pocl_mode ? NULL : call);
+}
+
+/**************************************************************************
+* runOnFunction(Function &F)
+**************************************************************************/
+bool TIOpenclWorkGroupAggregation::runOnFunction(Function &F)
+{
+    /*-------------------------------------------------------------------------
+    * Tracks whether any step touched the IR; a FunctionPass must report true
+    * whenever it modified the function, not only when calls were rewritten.
+    *------------------------------------------------------------------------*/
+    bool changed = false;
+
+    if (!is_pocl_mode)
+    {
+        /*---------------------------------------------------------------------
+        * Determine how many dimensions are referenced using OpenCL getXXX
+        * functions, then add a loop nest for each dimension referenced that
+        * requires a workitem id.
+        *--------------------------------------------------------------------*/
+        unsigned int dims = findNeededLoopNest(F);
+        for (unsigned int i = 0; i < dims; ++i) add_loop(F, i);
+        if (dims > 0) changed = true;
+    }
+    else
+    {
+        /*---------------------------------------------------------------------
+        * rewrite the alloca() generated during pocl llvm work-group
+        * aggregation
+        *--------------------------------------------------------------------*/
+        if (rewrite_allocas(F)) changed = true;
+    }
+
+    /*-------------------------------------------------------------------------
+    * rewrite the OpenCL getXXX dimension query functions to reference the info
+    * packet for the workgroup.
+    *------------------------------------------------------------------------*/
+    if (rewrite_ocl_funcs(F)) changed = true;
+
+    return changed;
+}
+
+/******************************************************************************
+* getAnalysisUsage(AnalysisUsage &Info) const
+******************************************************************************/
+void TIOpenclWorkGroupAggregation::getAnalysisUsage(AnalysisUsage &Info) const
+{
+ /*-------------------------------------------------------------------------
+ * This will ensure that all returns go through a single exit node, which
+ * our WGA loop generation algorithm depends on.
+ * (findExitBlock() asserts when more than one block has no successors.)
+ *------------------------------------------------------------------------*/
+ Info.addRequired<UnifyFunctionExitNodes>();
+}
+
+/**************************************************************************
+* findNeededLoopNest(Function &F)
+**************************************************************************/
+unsigned int TIOpenclWorkGroupAggregation::findNeededLoopNest(Function &F)
+{
+    // Highest (dimension index + 1) seen so far among the id queries
+    unsigned int needed = 0;
+
+    for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I)
+    {
+        CallInst *ci = dyn_cast<CallInst>(&*I);
+        if (!ci) continue;
+
+        // Calls without a directly known callee cannot be matched by name
+        if (!ci->getCalledFunction()) continue;
+        string callee(ci->getCalledFunction()->getName());
+
+        bool is_id_query = (callee == "get_local_id") ||
+                           (callee == "get_global_id");
+        if (!is_id_query) continue;
+
+        ConstantInt *cdim = dyn_cast<ConstantInt>(ci->getArgOperand(0));
+
+        /*---------------------------------------------------------------------
+        * A dimension that is not a compile time constant could be anything:
+        * assume the worst case and report that 3 loop levels are needed.
+        *--------------------------------------------------------------------*/
+        if (!cdim) return 3;
+
+        // Out-of-range indices are clamped to the last supported dimension
+        unsigned int idx = cdim->getSExtValue();
+        idx = min(MAX_DIMENSIONS-1, idx);
+        needed = max(needed, idx + 1);
+    }
+
+    return needed;
+}
+
+/**************************************************************************
+* createLoadGlobal
+* Create an aligned 32 bit load from a global address.
+**************************************************************************/
+Instruction* TIOpenclWorkGroupAggregation::createLoadGlobal
+    (int32_t idx, Module* M, Instruction *before, const char *name)
+{
+    IntegerType *i32 = IntegerType::getInt32Ty(getGlobalContext());
+
+    // Make sure the module declares kernel_config_l2 as [64 x i32], then
+    // fetch it back as a GlobalVariable
+    M->getOrInsertGlobal("kernel_config_l2", ArrayType::get(i32, 64));
+    GlobalVariable *config = M->getNamedGlobal("kernel_config_l2");
+
+    // Address of kernel_config_l2[idx]: leading 0 steps through the pointer
+    std::vector<Value*> gep_idx;
+    gep_idx.push_back(ConstantInt::get(i32, 0));
+    gep_idx.push_back(ConstantInt::get(i32, idx));
+    Constant *addr = ConstantExpr::getInBoundsGetElementPtr(config, gep_idx);
+
+    // 4-byte aligned load, inserted in front of 'before' when one is given
+    LoadInst *load = new LoadInst(addr, name, before);
+    load->setAlignment(4);
+    return load;
+}
+
+/******************************************************************************
+* findDim
+******************************************************************************/
+unsigned int TIOpenclWorkGroupAggregation::findDim(class CallInst* call)
+{
+    // The dimension is only known when the first operand is a constant;
+    // 100 is the placeholder returned for anything else
+    ConstantInt *cdim = dyn_cast<ConstantInt>(call->getArgOperand(0));
+    if (!cdim) return 100;
+    return cdim->getSExtValue();
+}
+
+/**************************************************************************
+* rewrite allocas to _wg_alloca(sizeinbytes)
+**************************************************************************/
+bool TIOpenclWorkGroupAggregation::rewrite_allocas(Function &F)
+{
+ // Replaces every alloca in F with inttoptr(_wg_alloca(bytes)), stores the
+ // per-core start address into the global _wg_alloca_start at function entry
+ // and records the accumulated size as a "_wi_alloca_size=N" function
+ // attribute. Returns false (F untouched) when F contains no alloca.
+ int wi_alloca_size = 0;
+ Module *M = F.getParent();
+ AllocaInst *alloca;
+
+ // Collect first: the allocas are replaced further down
+ std::vector<AllocaInst *> allocas;
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I)
+ if ((alloca = dyn_cast<AllocaInst>(&*I)) != NULL)
+ allocas.push_back(alloca);
+ if (allocas.empty()) return false;
+
+ // Declare (or find) i32 _wg_alloca(i32)
+ DataLayout dataLayout(M);
+ FunctionType *ft = FunctionType::get
+ (/*Result=*/ IntegerType::get(M->getContext(), 32),
+ /*Params=*/ IntegerType::get(M->getContext(), 32),
+ /*isVarArg=*/ false);
+ // NOTE(review): dyn_cast yields NULL if the module already holds a
+ // _wg_alloca of a different type; the result is used unchecked below.
+ Function *wg_alloca = dyn_cast<Function>(
+ M->getOrInsertFunction("_wg_alloca", ft));
+ Type *Int32 = Type::getInt32Ty(M->getContext());
+
+ for (std::vector<AllocaInst *>::iterator I = allocas.begin();
+ I != allocas.end(); ++I)
+ {
+ alloca = *I;
+
+ // get number of elements, element type size, compute total size
+ Value *numElems = alloca->getArraySize();
+ // YUAN TODO: skip regular constant numElems?
+
+ Type *elementType = alloca->getAllocatedType();
+ // getTypeSizeInBits(), what about uchar3 type?
+ uint64_t esBytes = dataLayout.getTypeStoreSize(elementType);
+ Value *esize = ConstantInt::get(Int32, (uint32_t) esBytes);
+ // bytes requested at run time = element store size * array size
+ Instruction *alloca_size = BinaryOperator::Create(
+ Instruction::Mul, esize, numElems, "", alloca);
+ SmallVector<Value *, 4> args;
+ args.push_back(alloca_size);
+
+ // create function call: _wg_alloca(alloca_size)
+ CallInst *f_alloca = CallInst::Create(
+ wg_alloca, ArrayRef<Value *>(args), "", alloca);
+
+ // cast to alloca type
+ Instruction * new_alloca = new IntToPtrInst(
+ f_alloca, alloca->getType());
+
+ // replace AllocaInst with new _wg_alloca()
+ ReplaceInstWithInst(alloca, new_alloca);
+
+ // accumulate element type size
+ // (round the running total up to the preferred alignment, then add the
+ // store size of ONE element; the array size is not factored in here)
+ unsigned align = dataLayout.getPrefTypeAlignment(elementType);
+ wi_alloca_size = (wi_alloca_size + align - 1) & (~(align-1));
+ wi_alloca_size += esBytes;
+ }
+
+ // initialize _wg_alloca_start and _wg_alloca_size
+ // _wg_alloca_size = load(packetaddr+offset);
+ // _wg_alloca_start = load(packetaddr+offset) + __core_num() * _wg_alloca_size;
+ // (the loads below read kernel_config_l2[17] for the size and
+ // kernel_config_l2[16] for the start, via createLoadGlobal)
+ Instruction *inspt = F.getEntryBlock().getFirstNonPHI();
+ FunctionType *core_num_ft = FunctionType::get
+ (/*Result=*/ IntegerType::get(M->getContext(), 32),
+ /*isVarArg=*/ false);
+ Function *core_num = dyn_cast<Function>(
+ M->getOrInsertFunction("__core_num", core_num_ft));
+ Instruction *f_core_num = CallInst::Create(core_num, "", inspt);
+
+ Instruction *wg_alloca_size = createLoadGlobal(17, M, inspt);
+
+ Instruction *shift = BinaryOperator::Create(Instruction::Mul, f_core_num,
+ wg_alloca_size, "", inspt);
+
+ Instruction *start = createLoadGlobal(16, M, inspt);
+
+ Instruction *core_start = BinaryOperator::Create(
+ Instruction::Add, start, shift, "", inspt);
+ // _wg_alloca_start is an i32 global placed in section "far"
+ Value *gv = M->getOrInsertGlobal("_wg_alloca_start", Int32);
+ GlobalVariable *wg_gv = M->getNamedGlobal("_wg_alloca_start");
+ wg_gv->setSection(StringRef("far"));
+ // 'store' is not referenced again; the instruction is already inserted
+ // before inspt by the constructor
+ Instruction *store = new StoreInst(core_start, gv, inspt);
+
+ // put total orig_wi_size into attributes data in the function
+ char *s_wi_alloca_size = new char[32]; // we have to leak this
+ snprintf(s_wi_alloca_size, 32, "_wi_alloca_size=%d", wi_alloca_size);
+ F.addFnAttr(StringRef(s_wi_alloca_size));
+
+ return true;
+}
+
+/**************************************************************************
+* rewrite_ocl_funcs
+**************************************************************************/
+bool TIOpenclWorkGroupAggregation::rewrite_ocl_funcs(Function &F)
+{
+    Module *M = F.getParent();
+
+    // Gather the work-item queries up front, so that the instruction
+    // iterator is no longer in use once calls start being replaced
+    std::vector<CallInst *> pending;
+    for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I)
+    {
+        CallInst *ci = dyn_cast<CallInst>(&*I);
+        if (ci == NULL || ci->getCalledFunction() == NULL) continue;
+
+        string callee(ci->getCalledFunction()->getName());
+        if (callee == "get_local_id" || callee == "get_local_size")
+            pending.push_back(ci);
+    }
+
+    // Nothing to rewrite: report the function as unchanged
+    if (pending.empty()) return false;
+
+    for (std::vector<CallInst *>::size_type n = 0; n < pending.size(); ++n)
+    {
+        CallInst *ci = pending[n];
+        string callee(ci->getCalledFunction()->getName());
+
+        if (callee == "get_local_size")
+        {
+            // remaining get_local_size() are generated by pocl,
+            // arguments guaranteed to be constants: 0, 1, or 2
+            ReplaceInstWithInst(ci, createLoadGlobal(4+findDim(ci), M));
+            continue;
+        }
+
+        // get_local_id: substitute the induction variable for the call
+        Instruction *iv = get_IV(F, ci);
+        if (is_pocl_mode)
+        {
+            // iv is a new, uninserted instruction that takes the call's place
+            ReplaceInstWithInst(ci, iv);
+        }
+        else
+        {
+            // iv already lives in the function: only redirect the uses
+            BasicBlock::iterator BI(ci);
+            ReplaceInstWithValue(ci->getParent()->getInstList(), BI, iv);
+        }
+    }
+
+    return true;
+}
+
+BasicBlock* TIOpenclWorkGroupAggregation::findExitBlock(Function &F)
+{
+    BasicBlock *exit = 0;
+
+    /*-------------------------------------------------------------------------
+    * Locate the single block whose terminator has no successors; a second
+    * such block trips the assert.
+    *------------------------------------------------------------------------*/
+    for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B)
+    {
+        if (B->getTerminator()->getNumSuccessors() != 0) continue;
+
+        if (exit) { assert(false); }
+        else      { exit = &(*B); }
+    }
+
+    /*-------------------------------------------------------------------------
+    * Split the return off into its own block, unless it already is the only
+    * instruction at the front of the block.
+    *------------------------------------------------------------------------*/
+    Instruction *ret = exit->getTerminator();
+    if (ret == &exit->front()) return exit;
+
+    return SplitBlock(exit, ret, this);
+}
+
+/**************************************************************************
+* add_loop(Function &F)
+**************************************************************************/
+void TIOpenclWorkGroupAggregation::add_loop(Function &F, int dimIdx)
+{
+ // Wraps the current body of F in one loop whose counter runs from 0 up to
+ // (but not including) the value loaded from kernel_config_l2[4+dimIdx],
+ // and records the counter's phi node in IVPhi[dimIdx].
+ LLVMContext &ctx = F.getContext();
+ Type *Int32 = Type::getInt32Ty(ctx);
+ Value *zero = ConstantInt::get(Int32, 0);
+ Value *one = ConstantInt::get(Int32, 1);
+ Module *M = F.getParent();
+
+ BasicBlock* exit = findExitBlock(F);
+ BasicBlock* entry = &(F.getEntryBlock());
+
+ // Split at the first instruction of the entry and exit blocks, so 'entry'
+ // and 'bodyend' keep only a branch and the original code moves into
+ // 'bodytop' / 'exit'.
+ BasicBlock* bodytop = SplitBlock(entry, &entry->front(), this);
+ BasicBlock* bodyend = exit;
+ exit = SplitBlock(bodyend, &exit->front(), this);
+
+ exit->setName(".exit");
+ entry->setName(".entry");
+ bodytop->setName(".bodyTop");
+ bodyend->setName(".bodyEnd");
+
+ /*----------------------------------------------------------------------
+ * Populate the branch around
+ * (skip the body altogether unless the upper bound is > 0)
+ *---------------------------------------------------------------------*/
+ Instruction *branch = entry->getTerminator();
+ Instruction *ld_upper_bnd = createLoadGlobal(4+dimIdx, M, branch);
+
+ Instruction *cmp = CmpInst::Create (Instruction::ICmp, CmpInst::ICMP_SGT,
+ ld_upper_bnd, zero, "", branch);
+
+ Instruction *cbr = BranchInst::Create(bodytop, exit, cmp);
+ ReplaceInstWithInst(branch, cbr);
+
+ /*----------------------------------------------------------------------
+ * Add the phi node to the top of the body
+ * (0 when arriving from entry; the back-edge value is added below)
+ *---------------------------------------------------------------------*/
+ PHINode *phi = PHINode::Create(Int32, 0, "", &bodytop->front());
+ phi->addIncoming(zero, entry);
+
+ /*----------------------------------------------------------------------
+ * Add the loop control to the bottom of the bodyend
+ * (increment, reload the bound, branch back while inc < bound)
+ *---------------------------------------------------------------------*/
+ branch = bodyend->getTerminator();
+ Instruction *inc = BinaryOperator::Create(Instruction::Add,
+ phi, one, Twine(), branch);
+
+ Instruction *ld_upper_bnd2 = createLoadGlobal(4+dimIdx, M, branch);
+ Instruction *cmp2 = CmpInst::Create (Instruction::ICmp, CmpInst::ICMP_SLT,
+ inc, ld_upper_bnd2, "", branch);
+
+ Instruction *cbr2 = BranchInst::Create(bodytop, exit, cmp2);
+ ReplaceInstWithInst(branch, cbr2);
+
+ phi->addIncoming(inc, bodyend);
+ IVPhi[dimIdx] = phi;
+
+ // YUAN TODO: maybe handled better later
+ // Higher dimensions alias this phi until add_loop() is called for them,
+ // so every IVPhi[] entry is non-null after the first loop is added.
+ if (dimIdx < 1) IVPhi[1] = phi;
+ if (dimIdx < 2) IVPhi[2] = phi;
+}
+
+// Storage for the pass identifier handed to FunctionPass in the constructor
+char TIOpenclWorkGroupAggregation::ID = 0;
+// Registers the pass under the command line name "wga"
+static RegisterPass<TIOpenclWorkGroupAggregation>
+ X("wga", "Work Group Aggregation", false, false);
+
+}
diff --git a/src/core/dsp/wga.h b/src/core/dsp/wga.h
new file mode 100644
index 0000000..8728fea
--- /dev/null
+++ b/src/core/dsp/wga.h
@@ -0,0 +1,72 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef __TIOPENCLWORKGROUPAGGREGATIONPASS_H
+#define __TIOPENCLWORKGROUPAGGREGATIONPASS_H
+
+#include <string>
+#include <set>
+#include "boost/tuple/tuple.hpp"
+#include <llvm/Pass.h>
+#include <llvm/IR/Instruction.h>
+
+#define MAX_DIMENSIONS 3u
+
+namespace llvm
+{
+
+/**
+ * Function pass that rewrites OpenCL work-item queries in a kernel.
+ *
+ * In the original (non-pocl) mode it adds one loop per referenced dimension
+ * and replaces get_local_id() with the loop counters; in pocl mode it rewrites
+ * allocas into _wg_alloca() calls and replaces get_local_id() with loads of
+ * the _local_id_{x,y,z} globals.
+ */
+class TIOpenclWorkGroupAggregation : public FunctionPass
+{
+ public:
+ static char ID;
+
+ TIOpenclWorkGroupAggregation(bool pocl_mode = false);
+ virtual bool runOnFunction(Function &F);
+ virtual void getAnalysisUsage(AnalysisUsage &Info) const;
+
+ private:
+ // Loop counter phi per dimension, filled in by add_loop()
+ Instruction* IVPhi[MAX_DIMENSIONS];
+ // Selects between the original and the pocl rewriting strategy
+ bool is_pocl_mode;
+
+ private:
+ // Builds a load of kernel_config_l2[idx], inserted before 'before' if given
+ Instruction* createLoadGlobal(int32_t idx, Module* m, Instruction *before=0,
+ const char *name=0);
+
+ // Returns the block holding only the function's single return
+ BasicBlock* findExitBlock (Function &F);
+ // Number of loop levels required by get_local_id/get_global_id calls
+ unsigned int findNeededLoopNest(Function &F);
+ // Constant first argument of 'call', or 100 when it is not a constant
+ unsigned int findDim (class CallInst* call);
+ // Replaces get_local_id/get_local_size calls; true if any was rewritten
+ bool rewrite_ocl_funcs (Function &F);
+ // Wraps the function body in a loop over dimension dimIdx
+ void add_loop (Function &F, int dimIdx);
+ // Value that stands in for the get_local_id() call 'call'
+ Instruction* get_IV(Function &F, CallInst *call);
+ // Replaces allocas with _wg_alloca() calls; true if any was rewritten
+ bool rewrite_allocas(Function &F);
+};
+
+Pass *createTIOpenclWorkGroupAggregationPass(bool is_pocl_mode = false);
+
+}
+
+#endif // __TIOPENCLWORKGROUPAGGREGATIONPASS_H
diff --git a/src/core/dsp/worker.cpp b/src/core/dsp/worker.cpp
new file mode 100644
index 0000000..79223f0
--- /dev/null
+++ b/src/core/dsp/worker.cpp
@@ -0,0 +1,519 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "device.h"
+#include "buffer.h"
+#include "kernel.h"
+#include "driver.h"
+
+#include "../commandqueue.h"
+#include "../events.h"
+#include "../memobject.h"
+#include "../kernel.h"
+
+#include <stdlib.h>
+#include <iostream>
+#include <string.h>
+
+#include "u_locks_pthread.h"
+
+using namespace Coal;
+
+// Prints "ERROR: <msg>" and terminates the process with exit(-1) when status
+// is non-zero. NOTE(review): expands to a bare if statement, so take care when
+// using it as the unbraced body of an if/else.
+#define ERR(status, msg) if (status) { printf("ERROR: %s\n", msg); exit(-1); }
+
+/******************************************************************************
+* handle_event_completion
+******************************************************************************/
+void handle_event_completion(DSPDevice *device)
+{
+    int kernel_id = device->mail_from();
+
+    /*-------------------------------------------------------------------------
+    * A negative id marks a false completion message (printf traffic, etc.):
+    * there is nothing to complete.
+    *------------------------------------------------------------------------*/
+    if (kernel_id < 0) return;
+
+    /*-------------------------------------------------------------------------
+    * Fetch the event that was pending on this kernel id; a completion without
+    * a matching event is treated as fatal.
+    *------------------------------------------------------------------------*/
+    Event *event;
+    if (!device->get_complete_pending(kernel_id, event))
+    {
+        std::cout << "Completion status received for kernel Id " << kernel_id <<
+                     " but no pending event found for that id" << std::endl;
+        exit(-1);
+    }
+
+    // Free the temporary buffers held by the kernel's device data
+    KernelEvent    *kernel_event = (KernelEvent *) event;
+    DSPKernelEvent *dsp_event    = (DSPKernelEvent *) kernel_event->deviceData();
+    dsp_event->free_tmp_bufs();
+
+    // Look up the owning queue (if any) and its properties
+    CommandQueue *queue = 0;
+    event->info(CL_EVENT_COMMAND_QUEUE, sizeof(CommandQueue *), &queue, 0);
+
+    cl_command_queue_properties queue_props = 0;
+    if (queue)
+    {
+        queue->info(CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties),
+                    &queue_props, 0);
+    }
+
+    // an event may be released once it is Complete, so the end timestamp
+    // has to be recorded before the status changes
+    bool profiling = (queue_props & CL_QUEUE_PROFILING_ENABLE) != 0;
+    if (profiling) event->updateTiming(Event::End);
+
+    event->setStatus(Event::Complete);
+}
+
+
+/******************************************************************************
+* handle_event_dispatch
+******************************************************************************/
+bool handle_event_dispatch(DSPDevice *device)
+{
+ bool stop = false;
+ cl_int errcode;
+ Event * event;
+
+ event = device->getEvent(stop);
+
+ /*---------------------------------------------------------------------
+ * Ensure we have a good event and we don't have to stop
+ *--------------------------------------------------------------------*/
+ if (stop) return true;
+ if (!event) return false;
+
+ /*---------------------------------------------------------------------
+ * Get info about the event and its command queue
+ *--------------------------------------------------------------------*/
+ Event::Type t = event->type();
+ CommandQueue * queue = 0;
+ cl_command_queue_properties queue_props = 0;
+
+ errcode = CL_SUCCESS;
+
+ event->info(CL_EVENT_COMMAND_QUEUE, sizeof(CommandQueue *), &queue, 0);
+
+ if (queue)
+ queue->info(CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties),
+ &queue_props, 0);
+
+ if (queue_props & CL_QUEUE_PROFILING_ENABLE)
+ event->updateTiming(Event::Start);
+
+ /*---------------------------------------------------------------------
+ * Execute the action
+ *--------------------------------------------------------------------*/
+ switch (t)
+ {
+ case Event::ReadBuffer:
+ case Event::WriteBuffer:
+ {
+ ReadWriteBufferEvent *e = (ReadWriteBufferEvent *)event;
+
+ if (e->buffer()->flags() & CL_MEM_USE_HOST_PTR)
+ {
+ if (t == Event::ReadBuffer)
+ memcpy(e->ptr(), e->buffer()->host_ptr(), e->cb());
+ else memcpy(e->buffer()->host_ptr(), e->ptr(), e->cb());
+ break;
+ }
+
+ DSPBuffer *buf = (DSPBuffer *)e->buffer()->deviceBuffer(device);
+ DSPDevicePtr64 data = (DSPDevicePtr64)buf->data() + e->offset();
+
+ if (t == Event::ReadBuffer)
+ Driver::instance()->read(device->dspID(), data,
+ (uint8_t*)e->ptr(), e->cb());
+
+ else
+ Driver::instance()->write(device->dspID(), data,
+ (uint8_t*)e->ptr(), e->cb());
+
+ break;
+ }
+
+ case Event::CopyBuffer:
+ {
+#ifdef DSPC868X
+ std::cerr << "Event type not yet supported" << std::endl;
+#else
+ CopyBufferEvent *e = (CopyBufferEvent *)event;
+
+ DSPDevicePtr64 src_addr;
+ DSPDevicePtr64 dst_addr;
+
+ void *psrc;
+ void *pdst;
+
+ if (e->source()->flags() & CL_MEM_USE_HOST_PTR)
+ psrc = (char*)e->source()->host_ptr() + e->src_offset();
+ else
+ {
+ DSPBuffer *src = (DSPBuffer*)e->source()->deviceBuffer(device);
+ src_addr = (DSPDevicePtr64)src->data() + e->src_offset();
+ psrc = Driver::instance()->map(src_addr, e->cb(), true);
+ }
+
+ if (e->destination()->flags() & CL_MEM_USE_HOST_PTR)
+ pdst = (char *)e->destination()->host_ptr() + e->dst_offset();
+ else
+ {
+ DSPBuffer *dst = (DSPBuffer*)e->destination()->deviceBuffer(device);
+ dst_addr = (DSPDevicePtr64)dst->data() + e->dst_offset();
+ pdst = Driver::instance()->map(dst_addr, e->cb(), false);
+ }
+
+ memcpy(pdst, psrc, e->cb());
+
+ if (!(e->source()->flags() & CL_MEM_USE_HOST_PTR))
+ Driver::instance()->unmap(psrc, src_addr, e->cb(), false);
+
+ if (!(e->destination()->flags() & CL_MEM_USE_HOST_PTR))
+ Driver::instance()->unmap(pdst, dst_addr, e->cb(), true);
+#endif
+ break;
+ }
+
+ case Event::ReadBufferRect:
+ case Event::WriteBufferRect:
+ {
+ ReadWriteBufferRectEvent *e = (ReadWriteBufferRectEvent *)event;
+
+ // Calculate the start points for each block of memory referenced
+ DSPDevicePtr64 buf_start;
+ uint8_t * host_start;
+
+ if (e->buffer()->flags() & CL_MEM_USE_HOST_PTR)
+ buf_start = (DSPDevicePtr64)e->buffer()->host_ptr();
+ else
+ buf_start = ((DSPBuffer *)e->source()->deviceBuffer(device))
+ ->data();
+
+ buf_start += e->src_origin(2) * e->src_slice_pitch() +
+ e->src_origin(1) * e->src_row_pitch() +
+ e->src_origin(0);
+
+ host_start = (uint8_t *)e->ptr() +
+ e->dst_origin(2) * e->dst_slice_pitch() +
+ e->dst_origin(1) * e->dst_row_pitch() +
+ e->dst_origin(0);
+
+ // Map the device/host buffers to the appopriate src/dst operands
+ // based on the requested operation (read vs write)
+ DSPDevicePtr64 src_start, dst_start;
+
+ size_t src_row_pitch, dst_row_pitch;
+ size_t src_slice_pitch, dst_slice_pitch;
+
+ if (t == Event::ReadBufferRect)
+ {
+ src_start = buf_start;
+ src_row_pitch = e->src_row_pitch();
+ src_slice_pitch = e->src_slice_pitch();
+
+ dst_start = (DSPDevicePtr64) host_start;
+ dst_row_pitch = e->dst_row_pitch();
+ dst_slice_pitch = e->dst_slice_pitch();
+ }
+ else
+ {
+ src_start = (DSPDevicePtr64) host_start;
+ src_row_pitch = e->dst_row_pitch();
+ src_slice_pitch = e->dst_slice_pitch();
+
+ dst_start = buf_start;
+ dst_row_pitch = e->src_row_pitch();
+ dst_slice_pitch = e->src_slice_pitch();
+ }
+
+ // The dimensions of the region to be copied gives us our
+ // loop boundaries for copying
+ cl_ulong xdim = e->region(0);
+ cl_ulong ydim = e->region(1);
+ cl_ulong zdim = e->region(2);
+
+ // Set up the start point
+ DSPDevicePtr64 src_cur_slice = src_start;
+ DSPDevicePtr64 dst_cur_slice = dst_start;
+
+ // The outer loop handles each z-axis slice
+ // For 2-D copy, will only iterate once (zdim=1)
+ for(cl_uint z = 0; z < zdim; z++)
+ {
+ DSPDevicePtr64 src_cur_row = src_cur_slice;
+ DSPDevicePtr64 dst_cur_row = dst_cur_slice;
+
+ // The inner loop handles each row of the current slice
+ for(cl_uint y = 0; y < ydim; y++)
+ {
+ // Copy a row
+ if (e->buffer()->flags() & CL_MEM_USE_HOST_PTR)
+ memcpy((void *)dst_cur_row, (void *)src_cur_row, xdim);
+ else
+ {
+ if (t == Event::ReadBufferRect)
+ Driver::instance()->read(device->dspID(),
+ src_cur_row, (uint8_t *)dst_cur_row, xdim);
+ else
+ Driver::instance()->write(device->dspID(),
+ dst_cur_row, (uint8_t *)src_cur_row, xdim);
+ }
+
+ // Proceed to next row
+ src_cur_row += src_row_pitch;
+ dst_cur_row += dst_row_pitch;
+ }
+
+ // Proceed to next slice
+ src_cur_slice += src_slice_pitch;
+ dst_cur_slice += dst_slice_pitch;
+ }
+ break;
+ }
+
+ case Event::CopyBufferRect:
+ {
+#ifdef DSPC868X
+ std::cerr << "Event type not yet supported" << std::endl;
+#else
+ CopyBufferRectEvent *e = (CopyBufferRectEvent *)event;
+
+ // Calculate the offsets into each buffer
+ size_t src_offset, dst_offset;
+
+ src_offset = e->src_origin(2) * e->src_slice_pitch() +
+ e->src_origin(1) * e->src_row_pitch() +
+ e->src_origin(0);
+
+ dst_offset = e->dst_origin(2) * e->dst_slice_pitch() +
+ e->dst_origin(1) * e->dst_row_pitch() +
+ e->dst_origin(0);
+
+ // Set up start points for the copy. If it is a DSP buffer, we'll
+ // need to map the buffer before copying (done in copy loop below)
+ DSPDevicePtr64 src_start, dst_start;
+
+ if (e->source()->flags() & CL_MEM_USE_HOST_PTR)
+ src_start = (DSPDevicePtr64)e->source()->host_ptr() + src_offset;
+ else
+ {
+ DSPBuffer *src = (DSPBuffer*)e->source()->deviceBuffer(device);
+ src_start = src->data() + src_offset;
+ }
+
+ if (e->destination()->flags() & CL_MEM_USE_HOST_PTR)
+ dst_start = (DSPDevicePtr64)e->destination()->host_ptr() + dst_offset;
+ else
+ {
+ DSPBuffer *dst=(DSPBuffer*)e->destination()->deviceBuffer(device);
+ dst_start = dst->data() + dst_offset;
+ }
+
+ // The dimensions of the region to be copied
+ cl_ulong xdim = e->region(0);
+ cl_ulong ydim = e->region(1);
+ cl_ulong zdim = e->region(2);
+
+ // If we need to map memory we will currently map a slice
+ // at a time. So determine the size of a 2D slice
+ size_t src_slice_size = ydim * e->src_row_pitch()-e->src_origin(0);
+ size_t dst_slice_size = ydim * e->dst_row_pitch()-e->dst_origin(0);
+
+ // Set up the initial copy point
+ DSPDevicePtr64 src_cur_slice = src_start;
+ DSPDevicePtr64 dst_cur_slice = dst_start;
+
+ // The outer loop handles each z-axis slice
+ // For 2-D copy, will only iterate once (zdim=1)
+ for(cl_ulong z = 0; z < zdim; z++)
+ {
+ uint8_t *src_cur_row = (uint8_t *)src_cur_slice;
+ uint8_t *dst_cur_row = (uint8_t *)dst_cur_slice;
+ uint8_t *src_cur_mslice, *dst_cur_mslice;
+
+ // If necessary, memory map a slice of buffer
+ if (!(e->source()->flags() & CL_MEM_USE_HOST_PTR))
+ src_cur_row = src_cur_mslice = (uint8_t *)
+ Driver::instance()->map(src_cur_slice, src_slice_size,true);
+
+ if (!(e->destination()->flags() & CL_MEM_USE_HOST_PTR))
+ dst_cur_row = dst_cur_mslice = (uint8_t *)
+ Driver::instance()->map(dst_cur_slice, dst_slice_size,false);
+
+ // The inner loop handles each row of the current slice
+ for(cl_ulong y = 0; y < ydim; y++)
+ {
+ // Copy current row
+ memcpy(dst_cur_row, src_cur_row, xdim);
+
+ // Proceed to next row
+ src_cur_row += e->src_row_pitch();
+ dst_cur_row += e->dst_row_pitch();
+ }
+
+ // If necessary, unmap the current slice
+ if (!(e->source()->flags() & CL_MEM_USE_HOST_PTR))
+ Driver::instance()->unmap(src_cur_mslice, src_cur_slice,
+ src_slice_size, false);
+
+ if (!(e->destination()->flags() & CL_MEM_USE_HOST_PTR))
+ Driver::instance()->unmap(dst_cur_mslice, dst_cur_slice,
+ dst_slice_size, true);
+
+ // Proceed to next slice
+ src_cur_slice += e->src_slice_pitch();
+ dst_cur_slice += e->dst_slice_pitch();
+ }
+#endif
+ break;
+ }
+
+ case Event::ReadImage:
+ case Event::WriteImage:
+ case Event::CopyImage:
+ case Event::CopyBufferToImage:
+ case Event::CopyImageToBuffer:
+ case Event::MapImage:
+ {
+ std::cerr << "Images are not supported" << std::endl;
+ break;
+ }
+
+ case Event::MapBuffer:
+ {
+#ifdef DSPC868X
+ std::cerr << "Event type not yet supported" << std::endl;
+#endif
+ MapBufferEvent *e = (MapBufferEvent *)event;
+
+ /*-----------------------------------------------------------
+ * for USE_HOST_PTR, the buffer store is already on the host and
+ * map should not be needed.
+ -----------------------------------------------------------*/
+ if (e->buffer()->flags() & CL_MEM_USE_HOST_PTR) break;
+
+ clRetainEvent((cl_event) e);
+ if(! e->buffer()->addMapEvent(e))
+ ERR(1, "MapBuffer: Range conflicts with previous maps");
+ if ((e->flags() & CL_MAP_READ) != 0)
+ {
+ DSPBuffer *buf = (DSPBuffer *)e->buffer()->deviceBuffer(device);
+ DSPDevicePtr64 data = (DSPDevicePtr64)buf->data() + e->offset();
+ Driver::instance()->map(data, e->cb(), true);
+ }
+ break;
+ }
+ case Event::UnmapMemObject:
+ {
+#ifdef DSPC868X
+ std::cerr << "Event type not yet supported" << std::endl;
+#endif
+ UnmapBufferEvent *e = (UnmapBufferEvent *)event;
+
+ /*-----------------------------------------------------------
+ * for USE_HOST_PTR, the buffer store is already on the host and
+ * unmap should not be needed.
+ -----------------------------------------------------------*/
+ if (e->buffer()->flags() & CL_MEM_USE_HOST_PTR) break;
+
+ if (e->buffer()->type() != Coal::MemObject::Buffer &&
+ e->buffer()->type() != Coal::MemObject::SubBuffer)
+ ERR(1, "UnmapMemObject: MapImage/Unmap not support yet");
+ MapBufferEvent *mbe = (MapBufferEvent *)
+ e->buffer()->removeMapEvent(e->mapping());
+ if (mbe == NULL)
+ ERR(1, "UnmapMemObject: host_ptr not from previous maps");
+
+ if ((mbe->flags() & CL_MAP_WRITE) != 0)
+ {
+ DSPBuffer *buf = (DSPBuffer *)e->buffer()->deviceBuffer(device);
+ DSPDevicePtr64 buf_dsp_addr = (DSPDevicePtr64)buf->data();
+ Driver::instance()->unmap(e->mapping(), buf_dsp_addr,
+ mbe->cb(), true);
+ }
+ if (queue) queue->releaseEvent(mbe);
+ break;
+ }
+
+ case Event::NativeKernel:
+ {
+ std::cerr << "Native Kernels not supported on the DSP" << std::endl;
+ break;
+ }
+
+ case Event::NDRangeKernel:
+ case Event::TaskKernel:
+ {
+ KernelEvent *e = (KernelEvent *) event;
+ DSPKernelEvent *ke = (DSPKernelEvent *)e->deviceData();
+
+ errcode = ke->run(t);
+
+ /*-----------------------------------------------------------------
+ * Put the event on a pending completion list and its
+ * completion will be handled asynchronously.
+ *----------------------------------------------------------------*/
+ if (errcode == CL_SUCCESS)
+ {
+ device->push_complete_pending(ke->kernel_id(), e);
+ return false;
+ }
+ break;
+ }
+ default: break;
+ }
+
+ /*---------------------------------------------------------------------
+ * Cleanup
+ *--------------------------------------------------------------------*/
+
+ // an event may be released once it is Complete
+ if (queue_props & CL_QUEUE_PROFILING_ENABLE)
+ event->updateTiming(Event::End);
+ event->setStatus((errcode == CL_SUCCESS) ? Event::Complete :
+ (Event::Status)errcode);
+
+ return false;
+}
+
+/******************************************************************************
+* dsp_worker
+******************************************************************************/
+/**
+ * \brief Worker loop servicing one DSP device
+ *
+ * Repeatedly (1) handles kernel completions when some are pending and the
+ * device reports mail, and (2) dispatches the next available event. The loop
+ * ends once a stop has been requested (by the device or by the dispatcher)
+ * and no completion is pending anymore.
+ *
+ * \param data the \c DSPDevice to serve, passed as an untyped pointer
+ * \return always \c NULL
+ */
+void *dsp_worker(void *data)
+{
+    DSPDevice *device = (DSPDevice *)data;
+
+    while (true)
+    {
+        if (device->any_complete_pending() && device->mail_query())
+            handle_event_completion(device);
+
+        bool stop = device->stop();
+
+        if (!stop && device->availableEvent())
+            stop |= handle_event_dispatch(device);
+
+        // Do not leave while kernels are still waiting for their completion
+        if (stop && !device->any_complete_pending()) break;
+    }
+
+    // A non-void function must return a value; flowing off the end is
+    // undefined behaviour.
+    return NULL;
+}
diff --git a/src/core/events.cpp b/src/core/events.cpp
new file mode 100644
index 0000000..629a0c9
--- /dev/null
+++ b/src/core/events.cpp
@@ -0,0 +1,1519 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file events.cpp
+ * \brief Events inheriting \c Coal::Event
+ */
+
+#include "events.h"
+#include "commandqueue.h"
+#include "memobject.h"
+#include "kernel.h"
+#include "deviceinterface.h"
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+using namespace Coal;
+
+/*
+ * Read/Write buffers
+ */
+
+/**
+ * \brief Base constructor of the events working on a memory object
+ *
+ * Retains \p buffer, then checks that it is non-null, that its parent is the
+ * context of \p parent, that it is correctly aligned if it is a sub-buffer,
+ * and finally allocates it for the device of the queue.
+ *
+ * \param parent command queue the event belongs to
+ * \param buffer memory object the event operates on
+ * \param num_events_in_wait_list number of entries of \p event_wait_list
+ * \param event_wait_list events to wait for
+ * \param errcode_ret receives CL_SUCCESS or the error detected
+ */
+BufferEvent::BufferEvent(CommandQueue *parent,
+                         MemObject *buffer,
+                         cl_uint num_events_in_wait_list,
+                         const Event **event_wait_list,
+                         cl_int *errcode_ret)
+: Event(parent, Queued, num_events_in_wait_list, event_wait_list, errcode_ret),
+  p_buffer(buffer)
+{
+    // Taken unconditionally; released again by the destructor
+    clRetainMemObject((cl_mem) p_buffer);
+
+    if (*errcode_ret != CL_SUCCESS)
+        return;
+
+    if (buffer == 0)
+    {
+        *errcode_ret = CL_INVALID_MEM_OBJECT;
+        return;
+    }
+
+    // The queue and the buffer must share the same context
+    Context *queue_ctx = 0;
+    *errcode_ret = parent->info(CL_QUEUE_CONTEXT, sizeof(Context *),
+                                &queue_ctx, 0);
+    if (*errcode_ret != CL_SUCCESS)
+        return;
+
+    if (queue_ctx != (Context *)buffer->parent())
+    {
+        *errcode_ret = CL_INVALID_CONTEXT;
+        return;
+    }
+
+    // Device of the queue: needed for the alignment check and the allocation
+    DeviceInterface *queue_dev = 0;
+    *errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *),
+                                &queue_dev, 0);
+    if (*errcode_ret != CL_SUCCESS)
+        return;
+
+    if (!isSubBufferAligned(buffer, queue_dev))
+        *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET;
+    else if (!buffer->allocate(queue_dev))
+        *errcode_ret = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+}
+
+/** \brief Drops the reference taken on the buffer by the constructor */
+BufferEvent::~BufferEvent()
+{
+    cl_mem retained = (cl_mem) p_buffer;
+    clReleaseMemObject(retained);
+}
+
+/** \brief Memory object given at construction time */
+MemObject *BufferEvent::buffer() const
+{
+    return this->p_buffer;
+}
+
+/**
+ * \brief Checks the offset of a sub-buffer against the device alignment
+ *
+ * Objects that are not of type \c MemObject::SubBuffer are always accepted.
+ *
+ * \param buffer memory object to check
+ * \param device device queried for \c CL_DEVICE_MEM_BASE_ADDR_ALIGN
+ * \return true if \p buffer is not a sub-buffer or if its offset has no bit
+ *         set in (alignment - 1); false otherwise or if the query fails
+ */
+bool BufferEvent::isSubBufferAligned(const MemObject *buffer,
+                                     const DeviceInterface *device)
+{
+    if (buffer->type() != MemObject::SubBuffer)
+        return true;
+
+    // Size the query from the variable itself instead of the non-standard
+    // "uint" typedef, and give it a defined value should the device leave
+    // it untouched.
+    cl_uint align = 0;
+    cl_int rs = device->info(CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(align),
+                             &align, 0);
+
+    if (rs != CL_SUCCESS)
+        return false;
+
+    // NOTE(review): the OpenCL specification expresses this value in bits,
+    // the mask below uses it as-is - confirm what the devices report.
+    size_t mask = 0;
+    if (align != 0) mask = align - 1;
+
+    return (((SubBuffer *)buffer)->offset() & mask) == 0;
+}
+
+/**
+ * \brief Common constructor of the linear read and write buffer events
+ *
+ * \param parent command queue the event belongs to
+ * \param buffer buffer to read from or write to
+ * \param offset offset, in bytes, of the first byte accessed in \p buffer
+ * \param cb number of bytes to transfer
+ * \param ptr host memory, must not be null
+ * \param num_events_in_wait_list number of entries of \p event_wait_list
+ * \param event_wait_list events to wait for
+ * \param errcode_ret receives CL_SUCCESS, or CL_INVALID_VALUE if \p ptr is
+ *        null or if [offset, offset + cb) does not fit in \p buffer
+ */
+ReadWriteBufferEvent::ReadWriteBufferEvent(CommandQueue *parent,
+                                           MemObject *buffer,
+                                           size_t offset,
+                                           size_t cb,
+                                           void *ptr,
+                                           cl_uint num_events_in_wait_list,
+                                           const Event **event_wait_list,
+                                           cl_int *errcode_ret)
+: BufferEvent(parent, buffer, num_events_in_wait_list, event_wait_list, errcode_ret),
+  p_offset(offset), p_cb(cb), p_ptr(ptr)
+{
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // A host pointer is mandatory
+    if (!ptr)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Check for out-of-bounds accesses. Written without "offset + cb" so
+    // that huge values cannot wrap around and slip through the check.
+    const size_t buffer_size = buffer->size();
+
+    if (offset > buffer_size || cb > buffer_size - offset)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+}
+
+/** \brief Offset given at construction time */
+size_t ReadWriteBufferEvent::offset() const
+{
+    return this->p_offset;
+}
+
+/** \brief Byte count given at construction time */
+size_t ReadWriteBufferEvent::cb() const
+{
+    return this->p_cb;
+}
+
+/** \brief Host pointer given at construction time */
+void *ReadWriteBufferEvent::ptr() const
+{
+    return this->p_ptr;
+}
+
+/**
+ * \brief Read-buffer event
+ *
+ * Adds nothing of its own: every argument is forwarded unchanged to
+ * \c ReadWriteBufferEvent.
+ */
+ReadBufferEvent::ReadBufferEvent(CommandQueue *parent,
+                                 MemObject *buffer,
+                                 size_t offset,
+                                 size_t cb,
+                                 void *ptr,
+                                 cl_uint num_events_in_wait_list,
+                                 const Event **event_wait_list,
+                                 cl_int *errcode_ret)
+: ReadWriteBufferEvent(parent, buffer, offset, cb, ptr,
+                       num_events_in_wait_list, event_wait_list,
+                       errcode_ret)
+{
+}
+
+/** \brief Always \c Event::ReadBuffer */
+Event::Type ReadBufferEvent::type() const
+{
+    const Event::Type kind = Event::ReadBuffer;
+    return kind;
+}
+
+/**
+ * \brief Write-buffer event
+ *
+ * Adds nothing of its own: every argument is forwarded unchanged to
+ * \c ReadWriteBufferEvent.
+ */
+WriteBufferEvent::WriteBufferEvent(CommandQueue *parent,
+                                   MemObject *buffer,
+                                   size_t offset,
+                                   size_t cb,
+                                   void *ptr,
+                                   cl_uint num_events_in_wait_list,
+                                   const Event **event_wait_list,
+                                   cl_int *errcode_ret)
+: ReadWriteBufferEvent(parent, buffer, offset, cb, ptr,
+                       num_events_in_wait_list, event_wait_list,
+                       errcode_ret)
+{
+}
+
+/** \brief Always \c Event::WriteBuffer */
+Event::Type WriteBufferEvent::type() const
+{
+    const Event::Type kind = Event::WriteBuffer;
+    return kind;
+}
+
+/**
+ * \brief Event mapping a range of a buffer
+ *
+ * \param parent command queue the event belongs to
+ * \param buffer buffer to map
+ * \param offset offset, in bytes, of the mapped range in \p buffer
+ * \param cb size, in bytes, of the mapped range
+ * \param map_flags combination of CL_MAP_READ and CL_MAP_WRITE
+ * \param num_events_in_wait_list number of entries of \p event_wait_list
+ * \param event_wait_list events to wait for
+ * \param errcode_ret receives CL_SUCCESS, or CL_INVALID_VALUE if
+ *        \p map_flags has other bits set or if the range does not fit in
+ *        \p buffer
+ */
+MapBufferEvent::MapBufferEvent(CommandQueue *parent,
+                               MemObject *buffer,
+                               size_t offset,
+                               size_t cb,
+                               cl_map_flags map_flags,
+                               cl_uint num_events_in_wait_list,
+                               const Event **event_wait_list,
+                               cl_int *errcode_ret)
+: BufferEvent(parent, buffer, num_events_in_wait_list, event_wait_list, errcode_ret),
+  p_offset(offset), p_cb(cb), p_map_flags(map_flags)
+{
+    // ptr() must not return an indeterminate value before setPtr() is called
+    p_ptr = 0;
+
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // Check flags
+    if (map_flags & ~(CL_MAP_READ | CL_MAP_WRITE))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Check for out-of-bounds values. Written without "offset + cb" so
+    // that huge values cannot wrap around and slip through the check.
+    const size_t buffer_size = buffer->size();
+
+    if (offset > buffer_size || cb > buffer_size - offset)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+}
+
+/** \brief Always \c Event::MapBuffer */
+Event::Type MapBufferEvent::type() const
+{
+    const Event::Type kind = Event::MapBuffer;
+    return kind;
+}
+
+/** \brief Offset of the mapped range, as given at construction time */
+size_t MapBufferEvent::offset() const
+{
+    return this->p_offset;
+}
+
+/** \brief Size of the mapped range, as given at construction time */
+size_t MapBufferEvent::cb() const
+{
+    return this->p_cb;
+}
+
+/** \brief Map flags given at construction time */
+cl_map_flags MapBufferEvent::flags() const
+{
+    return this->p_map_flags;
+}
+
+/** \brief Pointer last stored with \c setPtr() */
+void *MapBufferEvent::ptr() const
+{
+    return this->p_ptr;
+}
+
+/** \brief Stores the pointer later returned by \c ptr() */
+void MapBufferEvent::setPtr(void *ptr)
+{
+    this->p_ptr = ptr;
+}
+
+/**
+ * \brief Event mapping a region of an image
+ *
+ * The first component of the origin and of the region is converted from
+ * pixels to bytes using \c image->pixel_size().
+ *
+ * \param parent command queue the event belongs to
+ * \param image image to map
+ * \param map_flags combination of CL_MAP_READ and CL_MAP_WRITE
+ * \param origin first element of the region, may be null (all zeroes)
+ * \param region size of the region, every component must be non-zero
+ * \param num_events_in_wait_list number of entries of \p event_wait_list
+ * \param event_wait_list events to wait for
+ * \param errcode_ret receives CL_SUCCESS, or CL_INVALID_VALUE for invalid
+ *        flags, a null or empty region, or a region outside of the image
+ */
+MapImageEvent::MapImageEvent(CommandQueue *parent,
+                             Image2D *image,
+                             cl_map_flags map_flags,
+                             const size_t origin[3],
+                             const size_t region[3],
+                             cl_uint num_events_in_wait_list,
+                             const Event **event_wait_list,
+                             cl_int *errcode_ret)
+: BufferEvent (parent, image, num_events_in_wait_list, event_wait_list, errcode_ret)
+{
+    // flags() returns p_map_flags, which was never stored before; also give
+    // the members filled in later by the setters a defined value.
+    p_map_flags = map_flags;
+    p_ptr = 0;
+    p_row_pitch = 0;
+    p_slice_pitch = 0;
+
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // Check flags
+    if (map_flags & ~(CL_MAP_READ | CL_MAP_WRITE))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // The region is read unconditionally below
+    if (!region)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Copy the vectors
+    if (origin)
+        std::memcpy(&p_origin, origin, 3 * sizeof(size_t));
+    else
+        std::memset(&p_origin, 0, 3 * sizeof(size_t));
+
+    for (unsigned int i=0; i<3; ++i)
+    {
+        if (!region[i])
+        {
+            *errcode_ret = CL_INVALID_VALUE;
+            return;
+        }
+
+        p_region[i] = region[i];
+    }
+
+    // Multiply the elements (for images)
+    p_region[0] *= image->pixel_size();
+    p_origin[0] *= image->pixel_size();
+
+    // A 2D image has a single slice. The private copies are used because
+    // "origin" itself is allowed to be null.
+    if (image->type() == MemObject::Image2D &&
+        (p_origin[2] != 0 || p_region[2] != 1))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Check for out-of-bounds
+    if ((p_origin[0] + p_region[0]) > image->row_pitch() ||
+        (p_origin[1] + p_region[1]) * image->row_pitch() > image->slice_pitch() ||
+        (p_origin[2] + p_region[2]) * image->slice_pitch() > image->size())
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+}
+
+/** \brief Always \c Event::MapImage */
+Event::Type MapImageEvent::type() const
+{
+    const Event::Type kind = Event::MapImage;
+    return kind;
+}
+
+
+/** \brief Value of the \c p_map_flags member */
+cl_map_flags MapImageEvent::flags() const
+{
+    return this->p_map_flags;
+}
+
+/**
+ * \brief One component of the stored origin
+ * \param index component to return; not range-checked
+ */
+size_t MapImageEvent::origin (unsigned int index) const
+{
+    return this->p_origin[index];
+}
+
+/**
+ * \brief One component of the stored region
+ * \param index component to return; not range-checked
+ */
+size_t MapImageEvent::region (unsigned int index) const
+{
+    return this->p_region[index];
+}
+
+/** \brief Row pitch last stored with \c setRowPitch() */
+size_t MapImageEvent::row_pitch() const
+{
+    return this->p_row_pitch;
+}
+
+/** \brief Slice pitch last stored with \c setSlicePitch() */
+size_t MapImageEvent::slice_pitch() const
+{
+    return this->p_slice_pitch;
+}
+
+/** \brief Pointer last stored with \c setPtr() */
+void *MapImageEvent::ptr() const
+{
+    return this->p_ptr;
+}
+
+/** \brief Stores the value later returned by \c row_pitch() */
+void MapImageEvent::setRowPitch (size_t row_pitch)
+{
+    this->p_row_pitch = row_pitch;
+}
+
+/** \brief Stores the value later returned by \c slice_pitch() */
+void MapImageEvent::setSlicePitch (size_t slice_pitch)
+{
+    this->p_slice_pitch = slice_pitch;
+}
+
+/** \brief Stores the pointer later returned by \c ptr() */
+void MapImageEvent::setPtr (void *ptr)
+{
+    this->p_ptr = ptr;
+}
+
+/**
+ * \brief Event unmapping a previously mapped memory object
+ *
+ * \param parent command queue the event belongs to
+ * \param buffer memory object that was mapped
+ * \param mapped_addr address to unmap, must not be null
+ * \param num_events_in_wait_list number of entries of \p event_wait_list
+ * \param event_wait_list events to wait for
+ * \param errcode_ret receives CL_SUCCESS, or CL_INVALID_VALUE if
+ *        \p mapped_addr is null
+ */
+UnmapBufferEvent::UnmapBufferEvent(CommandQueue *parent,
+                                   MemObject *buffer,
+                                   void *mapped_addr,
+                                   cl_uint num_events_in_wait_list,
+                                   const Event **event_wait_list,
+                                   cl_int *errcode_ret)
+: BufferEvent(parent, buffer, num_events_in_wait_list, event_wait_list, errcode_ret),
+  p_mapping(mapped_addr)
+{
+    // Keep the error reported by the base class, if any
+    if (*errcode_ret == CL_SUCCESS)
+    {
+        // TODO: Check that p_mapping is ok (will be done in the drivers)
+        if (mapped_addr == 0)
+            *errcode_ret = CL_INVALID_VALUE;
+    }
+}
+
+/** \brief Always \c Event::UnmapMemObject */
+Event::Type UnmapBufferEvent::type() const
+{
+    const Event::Type kind = Event::UnmapMemObject;
+    return kind;
+}
+
+/** \brief Mapped address given at construction time */
+void *UnmapBufferEvent::mapping() const
+{
+    return this->p_mapping;
+}
+
+/**
+ * \brief Event copying bytes between two buffers
+ *
+ * The source is handled by the \c BufferEvent base class; the destination is
+ * retained, validated and allocated here.
+ *
+ * \param parent command queue the event belongs to
+ * \param source buffer to read from
+ * \param destination buffer to write to
+ * \param src_offset offset, in bytes, in \p source
+ * \param dst_offset offset, in bytes, in \p destination
+ * \param cb number of bytes to copy
+ * \param num_events_in_wait_list number of entries of \p event_wait_list
+ * \param event_wait_list events to wait for
+ * \param errcode_ret receives CL_SUCCESS, CL_INVALID_MEM_OBJECT,
+ *        CL_INVALID_VALUE (range outside of a buffer), CL_MEM_COPY_OVERLAP,
+ *        CL_MISALIGNED_SUB_BUFFER_OFFSET or CL_MEM_OBJECT_ALLOCATION_FAILURE
+ */
+CopyBufferEvent::CopyBufferEvent(CommandQueue *parent,
+                                 MemObject *source,
+                                 MemObject *destination,
+                                 size_t src_offset,
+                                 size_t dst_offset,
+                                 size_t cb,
+                                 cl_uint num_events_in_wait_list,
+                                 const Event **event_wait_list,
+                                 cl_int *errcode_ret)
+: BufferEvent(parent, source, num_events_in_wait_list, event_wait_list,
+              errcode_ret), p_destination(destination), p_src_offset(src_offset),
+  p_dst_offset(dst_offset), p_cb(cb)
+{
+    // Taken unconditionally; released again by the destructor
+    clRetainMemObject((cl_mem) p_destination);
+
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    if (!destination)
+    {
+        *errcode_ret = CL_INVALID_MEM_OBJECT;
+        return;
+    }
+
+    // Check for out-of-bounds. Written without "offset + cb" so that huge
+    // values cannot wrap around and slip through the check.
+    const size_t src_size = source->size();
+    const size_t dst_size = destination->size();
+
+    if (src_offset > src_size || cb > src_size - src_offset ||
+        dst_offset > dst_size || cb > dst_size - dst_offset)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Check for overlap: [src, src + cb) and [dst, dst + cb) intersect.
+    // Equal offsets do overlap as soon as at least one byte is copied. The
+    // sums cannot wrap, the ranges were validated just above.
+    if (source == destination && cb != 0)
+    {
+        if (src_offset < dst_offset + cb && dst_offset < src_offset + cb)
+        {
+            *errcode_ret = CL_MEM_COPY_OVERLAP;
+            return;
+        }
+    }
+
+    // Check alignment of destination
+    DeviceInterface *device = 0;
+    *errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *),
+                                &device, 0);
+
+    if (*errcode_ret != CL_SUCCESS)
+        return;
+
+    if (!isSubBufferAligned(destination, device))
+    {
+        *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET;
+        return;
+    }
+
+    // Allocate the buffer for the device
+    if (!destination->allocate(device))
+    {
+        *errcode_ret = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+        return;
+    }
+}
+
+/** \brief Drops the reference taken on the destination by the constructor */
+CopyBufferEvent::~CopyBufferEvent()
+{
+    cl_mem retained = (cl_mem) p_destination;
+    clReleaseMemObject(retained);
+}
+
+/** \brief Source buffer, i.e. the buffer held by the base class */
+MemObject *CopyBufferEvent::source() const
+{
+    MemObject *src = buffer();
+    return src;
+}
+
+/** \brief Destination buffer given at construction time */
+MemObject *CopyBufferEvent::destination() const
+{
+    return this->p_destination;
+}
+
+/** \brief Source offset given at construction time */
+size_t CopyBufferEvent::src_offset() const
+{
+    return this->p_src_offset;
+}
+
+/** \brief Destination offset given at construction time */
+size_t CopyBufferEvent::dst_offset() const
+{
+    return this->p_dst_offset;
+}
+
+/** \brief Byte count given at construction time */
+size_t CopyBufferEvent::cb() const
+{
+    return this->p_cb;
+}
+
+/** \brief Always \c Event::CopyBuffer */
+Event::Type CopyBufferEvent::type() const
+{
+    const Event::Type kind = Event::CopyBuffer;
+    return kind;
+}
+
+/*
+ * Native kernel
+ */
+/**
+ * \brief Event running a host function as a native kernel
+ *
+ * The argument block is copied into memory owned by the event (released by
+ * the destructor). Each location listed in \p args_mem_loc is then replaced,
+ * in the copy, by the native global pointer of the corresponding memory
+ * object.
+ *
+ * \param parent command queue the event belongs to
+ * \param user_func function to call, must not be null
+ * \param args argument block, \p cb_args bytes long
+ * \param cb_args size of \p args, in bytes
+ * \param num_mem_objects number of entries of \p mem_list / \p args_mem_loc
+ * \param mem_list memory objects referenced by the argument block
+ * \param args_mem_loc for each memory object, the address inside \p args
+ *        where its pointer must be stored
+ * \param num_events_in_wait_list number of entries of \p event_wait_list
+ * \param event_wait_list events to wait for
+ * \param errcode_ret receives CL_SUCCESS, CL_INVALID_VALUE,
+ *        CL_INVALID_OPERATION (no native kernel support),
+ *        CL_OUT_OF_HOST_MEMORY or CL_INVALID_MEM_OBJECT
+ */
+NativeKernelEvent::NativeKernelEvent(CommandQueue *parent,
+                                     void (*user_func)(void *),
+                                     void *args,
+                                     size_t cb_args,
+                                     cl_uint num_mem_objects,
+                                     const MemObject **mem_list,
+                                     const void **args_mem_loc,
+                                     cl_uint num_events_in_wait_list,
+                                     const Event **event_wait_list,
+                                     cl_int *errcode_ret)
+: Event (parent, Queued, num_events_in_wait_list, event_wait_list, errcode_ret),
+  p_user_func((void *)user_func), p_args(0)
+{
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // Parameters sanity
+    if (!user_func)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    if (!args && (cb_args || num_mem_objects))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    if (args && !cb_args)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    if (num_mem_objects && (!mem_list || !args_mem_loc))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    if (!num_mem_objects && (mem_list || args_mem_loc))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Check that the device can execute a native kernel
+    DeviceInterface *device;
+    cl_device_exec_capabilities caps;
+
+    *errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *),
+                                &device, 0);
+
+    if (*errcode_ret != CL_SUCCESS)
+        return;
+
+    *errcode_ret = device->info(CL_DEVICE_EXECUTION_CAPABILITIES,
+                                sizeof(cl_device_exec_capabilities), &caps, 0);
+
+    if (*errcode_ret != CL_SUCCESS)
+        return;
+
+    if ((caps & CL_EXEC_NATIVE_KERNEL) == 0)
+    {
+        *errcode_ret = CL_INVALID_OPERATION;
+        return;
+    }
+
+    // Copy the arguments in a new list
+    if (cb_args)
+    {
+        p_args = std::malloc(cb_args);
+
+        if (!p_args)
+        {
+            *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+            return;
+        }
+
+        std::memcpy((void *)p_args, (void *)args, cb_args);
+
+        // Replace memory objects with global pointers
+        const char *args_base = (const char *)args;
+
+        for (cl_uint i=0; i<num_mem_objects; ++i)
+        {
+            const MemObject *buffer = mem_list[i];
+            const char *loc = (const char *)args_mem_loc[i];
+
+            if (!buffer)
+            {
+                *errcode_ret = CL_INVALID_MEM_OBJECT;
+                return;
+            }
+
+            // Relocation: loc points into args, the same offset is needed
+            // in p_args. A location below args wraps to a huge offset and
+            // is rejected together with those past the end; without this
+            // check the store below would write outside of the copy.
+            size_t offset = (size_t)(loc - args_base);
+
+            if (cb_args < sizeof(void *) || offset > cb_args - sizeof(void *))
+            {
+                *errcode_ret = CL_INVALID_VALUE;
+                return;
+            }
+
+            // memcpy: nothing guarantees that the location is aligned for
+            // a pointer store
+            void *global_ptr =
+                buffer->deviceBuffer(device)->nativeGlobalPointer();
+            std::memcpy((char *)p_args + offset, &global_ptr,
+                        sizeof(global_ptr));
+        }
+    }
+}
+
+/** \brief Releases the private copy of the argument block, if any */
+NativeKernelEvent::~NativeKernelEvent()
+{
+    // free() accepts a null pointer, no guard is needed
+    std::free((void *)p_args);
+}
+
+/** \brief Always \c Event::NativeKernel */
+Event::Type NativeKernelEvent::type() const
+{
+    const Event::Type kind = Event::NativeKernel;
+    return kind;
+}
+
+/** \brief User function given at construction time, as an untyped pointer */
+void *NativeKernelEvent::function() const
+{
+    return this->p_user_func;
+}
+
+/** \brief Private copy of the argument block, null if there is none */
+void *NativeKernelEvent::args() const
+{
+    return this->p_args;
+}
+
+/*
+ * Kernel event
+ */
+/**
+ * \brief Event executing a kernel over an N-dimensional range
+ *
+ * Retains \p kernel, then validates it against the device of \p parent
+ * (device-dependent kernel present, same context, arguments specified),
+ * validates and stores the work sizes, and finally checks the kernel
+ * arguments (sub-buffer alignment, image dimensions).
+ *
+ * \param parent command queue the event belongs to
+ * \param kernel kernel to execute
+ * \param work_dim number of dimensions used
+ * \param global_work_offset per-dimension offset, may be null (zeroes)
+ * \param global_work_size per-dimension global size, must be non-zero
+ * \param local_work_size per-dimension work-group size; if null a value is
+ *        asked to the device-dependent kernel
+ * \param num_events_in_wait_list number of entries of \p event_wait_list
+ * \param event_wait_list events to wait for
+ * \param errcode_ret receives CL_SUCCESS or the first error detected
+ */
+KernelEvent::KernelEvent(CommandQueue *parent,
+                         Kernel *kernel,
+                         cl_uint work_dim,
+                         const size_t *global_work_offset,
+                         const size_t *global_work_size,
+                         const size_t *local_work_size,
+                         cl_uint num_events_in_wait_list,
+                         const Event **event_wait_list,
+                         cl_int *errcode_ret)
+: Event(parent, Queued, num_events_in_wait_list, event_wait_list, errcode_ret),
+  p_work_dim(work_dim), p_kernel(kernel)
+{
+    clRetainKernel((cl_kernel) p_kernel);
+
+    // deviceKernel() must not return an indeterminate value when the
+    // constructor bails out early
+    p_dev_kernel = 0;
+
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // Sanity checks
+    if (!kernel)
+    {
+        *errcode_ret = CL_INVALID_KERNEL;
+        return;
+    }
+
+    // Check that the kernel was built for parent's device.
+    DeviceInterface *device;
+    Context *k_ctx, *q_ctx;
+    size_t max_work_group_size;
+    cl_uint max_dims = 0;
+
+    *errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *),
+                                &device, 0);
+
+    if (*errcode_ret != CL_SUCCESS)
+        return;
+
+    *errcode_ret = parent->info(CL_QUEUE_CONTEXT, sizeof(Context *), &q_ctx, 0);
+    *errcode_ret |= kernel->info(CL_KERNEL_CONTEXT, sizeof(Context *), &k_ctx, 0);
+    *errcode_ret |= device->info(CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t),
+                                 &max_work_group_size, 0);
+    // max_dims is a cl_uint: announce its real size, not sizeof(size_t)
+    *errcode_ret |= device->info(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
+                                 sizeof(max_dims), &max_dims, 0);
+    *errcode_ret |= device->info(CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                                 max_dims * sizeof(size_t), p_max_work_item_sizes, 0);
+
+    if (*errcode_ret != CL_SUCCESS)
+        return;
+
+    p_dev_kernel = kernel->deviceDependentKernel(device);
+
+    if (!p_dev_kernel)
+    {
+        *errcode_ret = CL_INVALID_PROGRAM_EXECUTABLE;
+        return;
+    }
+
+    // Check that contexts match
+    if (k_ctx != q_ctx)
+    {
+        *errcode_ret = CL_INVALID_CONTEXT;
+        return;
+    }
+
+    // Check args
+    if (!kernel->argsSpecified())
+    {
+        *errcode_ret = CL_INVALID_KERNEL_ARGS;
+        return;
+    }
+
+    // Check dimension
+    if (work_dim == 0 || work_dim > max_dims)
+    {
+        *errcode_ret = CL_INVALID_WORK_DIMENSION;
+        return;
+    }
+
+    // Populate work_offset, work_size and local_work_size
+    size_t work_group_size = 1;
+    boost::tuple <uint,uint,uint> reqd_work_group_size(
+        kernel->reqdWorkGroupSize(kernel->deviceDependentModule(device)));
+
+    uint reqd_x = reqd_work_group_size.get<0>();
+    uint reqd_y = reqd_work_group_size.get<1>();
+    uint reqd_z = reqd_work_group_size.get<2>();
+    bool reqd_any = reqd_x > 0 || reqd_y > 0 || reqd_z > 0;
+
+    if (reqd_any)
+    {
+        // if __attribute__((reqd_work_group_size(X, Y, Z))) is set and local size not specified
+        if (!local_work_size)
+        {
+            *errcode_ret = CL_INVALID_WORK_GROUP_SIZE;
+            return;
+        }
+
+        // if __attribute__((reqd_work_group_size(X, Y, Z))) doesn't match
+        else
+        {
+            if ((                local_work_size[0] != reqd_x) ||
+                (work_dim > 1 && local_work_size[1] != reqd_y) ||
+                (work_dim > 2 && local_work_size[2] != reqd_z))
+            {
+                *errcode_ret = CL_INVALID_WORK_GROUP_SIZE;
+                return;
+            }
+        }
+    }
+
+    cl_uint i;
+    for (i=0; i<work_dim; ++i)
+    {
+        if (global_work_offset)
+        {
+            p_global_work_offset[i] = global_work_offset[i];
+        }
+        else
+        {
+            p_global_work_offset[i] = 0;
+        }
+
+        // Bail out right away: going on would read through a null
+        // global_work_size, and the error code could be overwritten by the
+        // argument checks further down.
+        if (!global_work_size || !global_work_size[i])
+        {
+            *errcode_ret = CL_INVALID_GLOBAL_WORK_SIZE;
+            return;
+        }
+        p_global_work_size[i] = global_work_size[i];
+
+        if (!local_work_size)
+        {
+            // Guess the best value according to the device
+            p_local_work_size[i] =
+                p_dev_kernel->guessWorkGroupSize(work_dim, i, global_work_size[i]);
+        }
+        else
+        {
+            // An empty work-group is invalid and would divide by zero below
+            if (local_work_size[i] == 0)
+            {
+                *errcode_ret = CL_INVALID_WORK_GROUP_SIZE;
+                return;
+            }
+
+            // Check divisibility
+            if ((global_work_size[i] % local_work_size[i]) != 0)
+            {
+                *errcode_ret = CL_INVALID_WORK_GROUP_SIZE;
+                return;
+            }
+
+            // Not too big ?
+            if (local_work_size[i] > p_max_work_item_sizes[i])
+            {
+                *errcode_ret = CL_INVALID_WORK_ITEM_SIZE;
+                return;
+            }
+
+            p_local_work_size[i] = local_work_size[i];
+            work_group_size *= local_work_size[i];
+        }
+    }
+    // initialize missing dimensions
+    for (; i < max_dims; i++)
+    {
+        p_global_work_offset[i] = 0;
+        p_global_work_size[i] = 1;
+        p_local_work_size[i] = 1;
+    }
+
+    // Check we don't ask too much to the device
+    if (work_group_size > max_work_group_size)
+    {
+        *errcode_ret = CL_INVALID_WORK_GROUP_SIZE;
+        return;
+    }
+
+    // Check arguments (buffer alignment, image size, ...)
+    for (unsigned int i=0; i<kernel->numArgs(); ++i)
+    {
+        const Kernel::Arg *a = kernel->arg(i);
+
+        if (a->kind() == Kernel::Arg::Buffer && a->file() != Kernel::Arg::Local)
+        {
+            const MemObject *buffer = *(const MemObject **)(a->value(0));
+
+            // A null buffer has no offset to check; isSubBufferAligned()
+            // dereferences its argument.
+            if (buffer && !BufferEvent::isSubBufferAligned(buffer, device))
+            {
+                *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET;
+                return;
+            }
+        }
+        else if (a->kind() == Kernel::Arg::Image2D)
+        {
+            const Image2D *image = *(const Image2D **)(a->value(0));
+            size_t maxWidth, maxHeight;
+
+            *errcode_ret = device->info(CL_DEVICE_IMAGE2D_MAX_WIDTH,
+                                        sizeof(size_t), &maxWidth, 0);
+            *errcode_ret |= device->info(CL_DEVICE_IMAGE2D_MAX_HEIGHT,
+                                         sizeof(size_t), &maxHeight, 0);
+
+            if (*errcode_ret != CL_SUCCESS)
+                return;
+
+            if (image->width() > maxWidth || image->height() > maxHeight)
+            {
+                *errcode_ret = CL_INVALID_IMAGE_SIZE;
+                return;
+            }
+        }
+        else if (a->kind() == Kernel::Arg::Image3D)
+        {
+            const Image3D *image = *(const Image3D **)a->value(0);
+            size_t maxWidth, maxHeight, maxDepth;
+
+            *errcode_ret = device->info(CL_DEVICE_IMAGE3D_MAX_WIDTH,
+                                        sizeof(size_t), &maxWidth, 0);
+            *errcode_ret |= device->info(CL_DEVICE_IMAGE3D_MAX_HEIGHT,
+                                         sizeof(size_t), &maxHeight, 0);
+            *errcode_ret |= device->info(CL_DEVICE_IMAGE3D_MAX_DEPTH,
+                                         sizeof(size_t), &maxDepth, 0);
+
+            if (*errcode_ret != CL_SUCCESS)
+                return;
+
+            if (image->width() > maxWidth || image->height() > maxHeight ||
+                image->depth() > maxDepth)
+            {
+                *errcode_ret = CL_INVALID_IMAGE_SIZE;
+                return;
+            }
+        }
+    }
+}
+
+/** \brief Drops the reference taken on the kernel by the constructor */
+KernelEvent::~KernelEvent()
+{
+    cl_kernel retained = (cl_kernel) p_kernel;
+    clReleaseKernel(retained);
+}
+
+/** \brief Number of dimensions given at construction time */
+cl_uint KernelEvent::work_dim() const
+{
+    return this->p_work_dim;
+}
+
+/**
+ * \brief Stored global work offset of one dimension
+ * \param dim dimension to return; not range-checked
+ */
+size_t KernelEvent::global_work_offset(cl_uint dim) const
+{
+    return this->p_global_work_offset[dim];
+}
+
+/**
+ * \brief Stored global work size of one dimension
+ * \param dim dimension to return; not range-checked
+ */
+size_t KernelEvent::global_work_size(cl_uint dim) const
+{
+    return this->p_global_work_size[dim];
+}
+
+/**
+ * \brief Stored work-group size of one dimension
+ * \param dim dimension to return; not range-checked
+ */
+size_t KernelEvent::local_work_size(cl_uint dim) const
+{
+    return this->p_local_work_size[dim];
+}
+
+/** \brief Kernel given at construction time */
+Kernel *KernelEvent::kernel() const
+{
+    return this->p_kernel;
+}
+
+/** \brief Device-dependent kernel looked up by the constructor */
+DeviceKernel *KernelEvent::deviceKernel() const
+{
+    return this->p_dev_kernel;
+}
+
+/** \brief Always \c Event::NDRangeKernel */
+Event::Type KernelEvent::type() const
+{
+    const Event::Type kind = Event::NDRangeKernel;
+    return kind;
+}
+
+// Global and local work size handed to KernelEvent by every TaskEvent
+static size_t one = 1;
+
+/**
+ * \brief Task event: a kernel run as a single work-item
+ *
+ * Implemented as a \c KernelEvent with one dimension, no global offset and
+ * a global and local work size of 1.
+ */
+TaskEvent::TaskEvent(CommandQueue *parent,
+                     Kernel *kernel,
+                     cl_uint num_events_in_wait_list,
+                     const Event **event_wait_list,
+                     cl_int *errcode_ret)
+: KernelEvent(parent, kernel,
+              1,        // work_dim
+              0,        // global_work_offset
+              &one,     // global_work_size
+              &one,     // local_work_size
+              num_events_in_wait_list, event_wait_list, errcode_ret)
+{
+    // TODO: CL_INVALID_WORK_GROUP_SIZE if
+    // __attribute__((reqd_work_group_size(X, Y, Z))) != (1, 1, 1)
+}
+
+/** \brief Always \c Event::TaskKernel */
+Event::Type TaskEvent::type() const
+{
+    const Event::Type kind = Event::TaskKernel;
+    return kind;
+}
+
+/*
+ * User event
+ */
+/**
+ * \brief User event
+ *
+ * Built without a command queue, without a wait list and in the
+ * \c Submitted state.
+ *
+ * \param context context the event belongs to
+ * \param errcode_ret receives the error code set by the \c Event base class
+ */
+UserEvent::UserEvent(Context *context, cl_int *errcode_ret)
+: Event(0, Submitted, 0, 0, errcode_ret),
+  p_context(context)
+{
+}
+
+/** \brief Always \c Event::User */
+Event::Type UserEvent::type() const
+{
+    const Event::Type kind = Event::User;
+    return kind;
+}
+
+/** \brief Context given at construction time */
+Context *UserEvent::context() const
+{
+    return this->p_context;
+}
+
+/*
+ * ReadWriteCopyBufferRectEvent
+ */
+/**
+ * \brief Common constructor of the rectangular read, write and copy events
+ *
+ * Stores the origins and the region (their first component multiplied by
+ * \p bytes_per_element) and computes the row and slice pitches: a pitch of 0
+ * selects the tightly packed value, any other value must be at least that
+ * large.
+ *
+ * \param parent command queue the event belongs to
+ * \param source memory object handled by the \c BufferEvent base class
+ * \param src_origin origin in the source, may be null (all zeroes)
+ * \param dst_origin origin in the destination, may be null (all zeroes)
+ * \param region size of the rectangle, every component must be non-zero
+ * \param src_row_pitch source row pitch in bytes, or 0
+ * \param src_slice_pitch source slice pitch in bytes, or 0
+ * \param dst_row_pitch destination row pitch in bytes, or 0
+ * \param dst_slice_pitch destination slice pitch in bytes, or 0
+ * \param bytes_per_element factor applied to the first component
+ * \param num_events_in_wait_list number of entries of \p event_wait_list
+ * \param event_wait_list events to wait for
+ * \param errcode_ret receives CL_SUCCESS, or CL_INVALID_VALUE for a null or
+ *        empty region or a pitch that is too small
+ */
+ReadWriteCopyBufferRectEvent::ReadWriteCopyBufferRectEvent(CommandQueue *parent,
+                                                           MemObject *source,
+                                                           const size_t src_origin[3],
+                                                           const size_t dst_origin[3],
+                                                           const size_t region[3],
+                                                           size_t src_row_pitch,
+                                                           size_t src_slice_pitch,
+                                                           size_t dst_row_pitch,
+                                                           size_t dst_slice_pitch,
+                                                           unsigned int bytes_per_element,
+                                                           cl_uint num_events_in_wait_list,
+                                                           const Event **event_wait_list,
+                                                           cl_int *errcode_ret)
+: BufferEvent (parent, source, num_events_in_wait_list, event_wait_list,
+               errcode_ret)
+{
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    // The region is read unconditionally below
+    if (!region)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Copy the vectors
+    if (src_origin)
+        std::memcpy(&p_src_origin, src_origin, 3 * sizeof(size_t));
+    else
+        std::memset(&p_src_origin, 0, 3 * sizeof(size_t));
+
+    if (dst_origin)
+        std::memcpy(&p_dst_origin, dst_origin, 3 * sizeof(size_t));
+    else
+        std::memset(&p_dst_origin, 0, 3 * sizeof(size_t));
+
+    for (unsigned int i=0; i<3; ++i)
+    {
+        if (!region[i])
+        {
+            *errcode_ret = CL_INVALID_VALUE;
+            return;
+        }
+
+        p_region[i] = region[i];
+    }
+
+    // Multiply the elements (for images)
+    p_region[0] *= bytes_per_element;
+    p_src_origin[0] *= bytes_per_element;
+    p_dst_origin[0] *= bytes_per_element;
+
+    // Compute the pitches: start from the tightly packed value, then accept
+    // the caller's one if it is at least as large
+    p_src_row_pitch = p_region[0];
+
+    if (src_row_pitch)
+    {
+        if (src_row_pitch < p_src_row_pitch)
+        {
+            *errcode_ret = CL_INVALID_VALUE;
+            return;
+        }
+
+        p_src_row_pitch = src_row_pitch;
+    }
+
+    p_src_slice_pitch = p_region[1] * p_src_row_pitch;
+
+    if (src_slice_pitch)
+    {
+        if (src_slice_pitch < p_src_slice_pitch)
+        {
+            *errcode_ret = CL_INVALID_VALUE;
+            return;
+        }
+
+        p_src_slice_pitch = src_slice_pitch;
+    }
+
+    p_dst_row_pitch = p_region[0];
+
+    if (dst_row_pitch)
+    {
+        if (dst_row_pitch < p_dst_row_pitch)
+        {
+            *errcode_ret = CL_INVALID_VALUE;
+            return;
+        }
+
+        p_dst_row_pitch = dst_row_pitch;
+    }
+
+    p_dst_slice_pitch = p_region[1] * p_dst_row_pitch;
+
+    if (dst_slice_pitch)
+    {
+        if (dst_slice_pitch < p_dst_slice_pitch)
+        {
+            *errcode_ret = CL_INVALID_VALUE;
+            return;
+        }
+
+        p_dst_slice_pitch = dst_slice_pitch;
+    }
+}
+
+/**
+ * \brief One component of the stored source origin
+ * \param index component to return; not range-checked
+ */
+size_t ReadWriteCopyBufferRectEvent::src_origin(unsigned int index) const
+{
+    return this->p_src_origin[index];
+}
+
+/**
+ * \brief One component of the stored destination origin
+ * \param index component to return; not range-checked
+ */
+size_t ReadWriteCopyBufferRectEvent::dst_origin(unsigned int index) const
+{
+    return this->p_dst_origin[index];
+}
+
+/**
+ * \brief One component of the stored region
+ * \param index component to return; not range-checked
+ */
+size_t ReadWriteCopyBufferRectEvent::region(unsigned int index) const
+{
+    return this->p_region[index];
+}
+
+/** \brief Source row pitch computed by the constructor */
+size_t ReadWriteCopyBufferRectEvent::src_row_pitch() const
+{
+    return this->p_src_row_pitch;
+}
+
+/** \brief Source slice pitch computed by the constructor */
+size_t ReadWriteCopyBufferRectEvent::src_slice_pitch() const
+{
+    return this->p_src_slice_pitch;
+}
+
+/** \brief Destination row pitch computed by the constructor */
+size_t ReadWriteCopyBufferRectEvent::dst_row_pitch() const
+{
+    return this->p_dst_row_pitch;
+}
+
+/** \brief Destination slice pitch computed by the constructor */
+size_t ReadWriteCopyBufferRectEvent::dst_slice_pitch() const
+{
+    return this->p_dst_slice_pitch;
+}
+
+/** \brief Source object, i.e. the buffer held by the base class */
+MemObject *ReadWriteCopyBufferRectEvent::source() const
+{
+    MemObject *src = buffer();
+    return src;
+}
+
+/**
+ * \brief Event copying a rectangular region between two buffers
+ *
+ * Origins, region and pitches are processed by
+ * \c ReadWriteCopyBufferRectEvent; this constructor checks both rectangles
+ * against the size of their buffer, rejects overlapping copies inside a
+ * single buffer, and validates and allocates the destination.
+ *
+ * NOTE(review): unlike \c CopyBufferEvent, this constructor takes no
+ * reference on \p destination - confirm that this is intended.
+ *
+ * \param parent command queue the event belongs to
+ * \param source buffer to read from
+ * \param destination buffer to write to
+ * \param src_origin origin in \p source, may be null
+ * \param dst_origin origin in \p destination, may be null
+ * \param region size of the rectangle
+ * \param src_row_pitch source row pitch in bytes, or 0
+ * \param src_slice_pitch source slice pitch in bytes, or 0
+ * \param dst_row_pitch destination row pitch in bytes, or 0
+ * \param dst_slice_pitch destination slice pitch in bytes, or 0
+ * \param bytes_per_element factor applied to the first component
+ * \param num_events_in_wait_list number of entries of \p event_wait_list
+ * \param event_wait_list events to wait for
+ * \param errcode_ret receives CL_SUCCESS, CL_INVALID_MEM_OBJECT,
+ *        CL_INVALID_VALUE, CL_MEM_COPY_OVERLAP,
+ *        CL_MISALIGNED_SUB_BUFFER_OFFSET or CL_MEM_OBJECT_ALLOCATION_FAILURE
+ */
+CopyBufferRectEvent::CopyBufferRectEvent(CommandQueue *parent,
+                                         MemObject *source,
+                                         MemObject *destination,
+                                         const size_t src_origin[3],
+                                         const size_t dst_origin[3],
+                                         const size_t region[3],
+                                         size_t src_row_pitch,
+                                         size_t src_slice_pitch,
+                                         size_t dst_row_pitch,
+                                         size_t dst_slice_pitch,
+                                         unsigned int bytes_per_element,
+                                         cl_uint num_events_in_wait_list,
+                                         const Event **event_wait_list,
+                                         cl_int *errcode_ret)
+: ReadWriteCopyBufferRectEvent(parent, source, src_origin, dst_origin, region,
+                               src_row_pitch, src_slice_pitch, dst_row_pitch,
+                               dst_slice_pitch, bytes_per_element,
+                               num_events_in_wait_list, event_wait_list, errcode_ret),
+  p_destination(destination)
+{
+    if (*errcode_ret != CL_SUCCESS) return;
+
+    if (!destination)
+    {
+        *errcode_ret = CL_INVALID_MEM_OBJECT;
+        return;
+    }
+
+    // Check for out-of-bounds
+    if ((p_src_origin[0] + p_region[0]) > p_src_row_pitch ||
+        (p_src_origin[1] + p_region[1]) * p_src_row_pitch > p_src_slice_pitch ||
+        (p_src_origin[2] + p_region[2]) * p_src_slice_pitch > source->size())
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    if ((p_dst_origin[0] + p_region[0]) > p_dst_row_pitch ||
+        (p_dst_origin[1] + p_region[1]) * p_dst_row_pitch > p_dst_slice_pitch ||
+        (p_dst_origin[2] + p_region[2]) * p_dst_slice_pitch > destination->size())
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Check for overlapping
+    if (source == destination)
+    {
+        // The two boxes intersect when their extents intersect on every
+        // axis. Extents starting at the same coordinate do intersect (the
+        // region is never empty), which a strict "<" comparison of the
+        // origins would miss.
+        bool overlapping = true;
+
+        for (unsigned int i=0; i<3 && overlapping; ++i)
+        {
+            if (p_dst_origin[i] >= p_src_origin[i] + p_region[i] ||
+                p_src_origin[i] >= p_dst_origin[i] + p_region[i])
+                overlapping = false;
+        }
+
+        if (overlapping)
+        {
+            *errcode_ret = CL_MEM_COPY_OVERLAP;
+            return;
+        }
+    }
+
+    // Check alignment of destination (source already checked by BufferEvent)
+    DeviceInterface *device = 0;
+    *errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *),
+                                &device, 0);
+
+    if (*errcode_ret != CL_SUCCESS)
+        return;
+
+    if (!isSubBufferAligned(destination, device))
+    {
+        *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET;
+        return;
+    }
+
+    // Allocate the buffer for the device
+    if (!destination->allocate(device))
+    {
+        *errcode_ret = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+        return;
+    }
+}
+
+/** \brief Always \c Event::CopyBufferRect */
+Event::Type CopyBufferRectEvent::type() const
+{
+    const Event::Type kind = Event::CopyBufferRect;
+    return kind;
+}
+
+/** \brief Destination buffer given at construction time */
+MemObject *CopyBufferRectEvent::destination() const
+{
+    return this->p_destination;
+}
+
+/**
+ * \brief Common constructor of the rectangular read and write events
+ *
+ * The buffer side is passed to \c ReadWriteCopyBufferRectEvent as "source"
+ * and the host side as "destination". Only the buffer rectangle is
+ * bounds-checked here.
+ *
+ * \param parent command queue the event belongs to
+ * \param buffer buffer to read from or write to
+ * \param buffer_origin origin in \p buffer
+ * \param host_origin origin in the host memory
+ * \param region size of the rectangle
+ * \param buffer_row_pitch buffer row pitch in bytes, or 0
+ * \param buffer_slice_pitch buffer slice pitch in bytes, or 0
+ * \param host_row_pitch host row pitch in bytes, or 0
+ * \param host_slice_pitch host slice pitch in bytes, or 0
+ * \param ptr host memory, must not be null
+ * \param bytes_per_element factor applied to the first component
+ * \param num_events_in_wait_list number of entries of \p event_wait_list
+ * \param event_wait_list events to wait for
+ * \param errcode_ret receives CL_SUCCESS, or CL_INVALID_VALUE if \p ptr is
+ *        null or the rectangle does not fit in \p buffer
+ */
+ReadWriteBufferRectEvent::ReadWriteBufferRectEvent(CommandQueue *parent,
+                                                   MemObject *buffer,
+                                                   const size_t buffer_origin[3],
+                                                   const size_t host_origin[3],
+                                                   const size_t region[3],
+                                                   size_t buffer_row_pitch,
+                                                   size_t buffer_slice_pitch,
+                                                   size_t host_row_pitch,
+                                                   size_t host_slice_pitch,
+                                                   void *ptr,
+                                                   unsigned int bytes_per_element,
+                                                   cl_uint num_events_in_wait_list,
+                                                   const Event **event_wait_list,
+                                                   cl_int *errcode_ret)
+: ReadWriteCopyBufferRectEvent(parent, buffer, buffer_origin, host_origin, region,
+                               buffer_row_pitch, buffer_slice_pitch,
+                               host_row_pitch, host_slice_pitch, bytes_per_element,
+                               num_events_in_wait_list, event_wait_list, errcode_ret),
+  p_ptr(ptr)
+{
+    // Keep the error reported by the base class, if any
+    if (*errcode_ret != CL_SUCCESS)
+        return;
+
+    if (ptr == 0)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // Row, slice and whole-buffer limits of the buffer-side rectangle
+    if ((p_src_origin[0] + p_region[0]) > p_src_row_pitch ||
+        (p_src_origin[1] + p_region[1]) * p_src_row_pitch > p_src_slice_pitch ||
+        (p_src_origin[2] + p_region[2]) * p_src_slice_pitch > buffer->size())
+        *errcode_ret = CL_INVALID_VALUE;
+}
+
+void *ReadWriteBufferRectEvent::ptr() const
+{
+ // Host pointer given to the constructor.
+ return this->p_ptr;
+}
+
+ReadBufferRectEvent::ReadBufferRectEvent(
+ CommandQueue *parent, MemObject *buffer,
+ const size_t buffer_origin[3], const size_t host_origin[3],
+ const size_t region[3],
+ size_t buffer_row_pitch, size_t buffer_slice_pitch,
+ size_t host_row_pitch, size_t host_slice_pitch,
+ void *ptr,
+ cl_uint num_events_in_wait_list, const Event **event_wait_list,
+ cl_int *errcode_ret)
+: ReadWriteBufferRectEvent(
+ parent, buffer, buffer_origin, host_origin, region,
+ buffer_row_pitch, buffer_slice_pitch,
+ host_row_pitch, host_slice_pitch,
+ ptr,
+ 1, // bytes_per_element is fixed to 1 for buffer rectangles
+ num_events_in_wait_list, event_wait_list, errcode_ret)
+{
+ // Nothing to do here: the base-class constructor does all the work.
+}
+
+Event::Type ReadBufferRectEvent::type() const
+{
+ // Identifies this command as a rectangular buffer read.
+ return Event::ReadBufferRect;
+}
+
+WriteBufferRectEvent::WriteBufferRectEvent(
+ CommandQueue *parent, MemObject *buffer,
+ const size_t buffer_origin[3], const size_t host_origin[3],
+ const size_t region[3],
+ size_t buffer_row_pitch, size_t buffer_slice_pitch,
+ size_t host_row_pitch, size_t host_slice_pitch,
+ void *ptr,
+ cl_uint num_events_in_wait_list, const Event **event_wait_list,
+ cl_int *errcode_ret)
+: ReadWriteBufferRectEvent(
+ parent, buffer, buffer_origin, host_origin, region,
+ buffer_row_pitch, buffer_slice_pitch,
+ host_row_pitch, host_slice_pitch,
+ ptr,
+ 1, // bytes_per_element is fixed to 1 for buffer rectangles
+ num_events_in_wait_list, event_wait_list, errcode_ret)
+{
+ // Nothing to do here: the base-class constructor does all the work.
+}
+
+Event::Type WriteBufferRectEvent::type() const
+{
+ // Identifies this command as a rectangular buffer write.
+ return Event::WriteBufferRect;
+}
+
+ReadWriteImageEvent::ReadWriteImageEvent(
+ CommandQueue *parent, Image2D *image,
+ const size_t origin[3], const size_t region[3],
+ size_t row_pitch, size_t slice_pitch,
+ void *ptr,
+ cl_uint num_events_in_wait_list, const Event **event_wait_list,
+ cl_int *errcode_ret)
+: ReadWriteBufferRectEvent(
+ parent, image,
+ origin, 0, region, // no host origin is supplied for image transfers
+ image->row_pitch(), image->slice_pitch(), // image-side pitches
+ row_pitch, slice_pitch, // host-side pitches
+ ptr,
+ image->pixel_size(), // bytes per element
+ num_events_in_wait_list, event_wait_list, errcode_ret)
+{
+ // The base class already reported a failure: keep its error code.
+ if (*errcode_ret != CL_SUCCESS)
+ return;
+
+ // Only 2D images carry the extra restriction on the third dimension.
+ if (image->type() != MemObject::Image2D)
+ return;
+
+ // For a 2D image the third origin must be 0 and the third extent 1.
+ if (origin[2] == 0 && region[2] == 1)
+ return;
+
+ *errcode_ret = CL_INVALID_VALUE;
+}
+
+ReadImageEvent::ReadImageEvent(
+ CommandQueue *parent, Image2D *image,
+ const size_t origin[3], const size_t region[3],
+ size_t row_pitch, size_t slice_pitch,
+ void *ptr,
+ cl_uint num_events_in_wait_list, const Event **event_wait_list,
+ cl_int *errcode_ret)
+: ReadWriteImageEvent(parent, image, origin, region,
+ row_pitch, slice_pitch, ptr,
+ num_events_in_wait_list, event_wait_list,
+ errcode_ret)
+{
+ // Nothing to do here: the base-class constructor does all the work.
+}
+
+Event::Type ReadImageEvent::type() const
+{
+ // Identifies this command as an image read.
+ return ReadImage;
+}
+
+WriteImageEvent::WriteImageEvent(
+ CommandQueue *parent, Image2D *image,
+ const size_t origin[3], const size_t region[3],
+ size_t row_pitch, size_t slice_pitch,
+ void *ptr,
+ cl_uint num_events_in_wait_list, const Event **event_wait_list,
+ cl_int *errcode_ret)
+: ReadWriteImageEvent(parent, image, origin, region,
+ row_pitch, slice_pitch, ptr,
+ num_events_in_wait_list, event_wait_list,
+ errcode_ret)
+{
+ // Nothing to do here: the base-class constructor does all the work.
+}
+
+Event::Type WriteImageEvent::type() const
+{
+ // Identifies this command as an image write.
+ return WriteImage;
+}
+
+static bool operator!=(const cl_image_format &a, const cl_image_format &b)
+{
+ // Two formats are equal only when both the channel data type and the
+ // channel order agree; anything else counts as different.
+ const bool same =
+ a.image_channel_data_type == b.image_channel_data_type &&
+ a.image_channel_order == b.image_channel_order;
+
+ return !same;
+}
+
+CopyImageEvent::CopyImageEvent(
+ CommandQueue *parent, Image2D *source, Image2D *destination,
+ const size_t src_origin[3], const size_t dst_origin[3],
+ const size_t region[3],
+ cl_uint num_events_in_wait_list, const Event **event_wait_list,
+ cl_int *errcode_ret)
+: CopyBufferRectEvent(
+ parent, source, destination, src_origin, dst_origin, region,
+ source->row_pitch(), source->slice_pitch(),
+ destination->row_pitch(), destination->slice_pitch(),
+ source->pixel_size(), // bytes per element
+ num_events_in_wait_list, event_wait_list, errcode_ret)
+{
+ // The base class already reported a failure: keep its error code.
+ if (*errcode_ret != CL_SUCCESS)
+ return;
+
+ // Checks run in order and stop at the first failure:
+ // 1. a 2D source needs a third origin of 0 and a third extent of 1,
+ // 2. the same holds for a 2D destination,
+ // 3. both images must share the same format.
+ if (source->type() == MemObject::Image2D &&
+ !(src_origin[2] == 0 && region[2] == 1))
+ *errcode_ret = CL_INVALID_VALUE;
+ else if (destination->type() == MemObject::Image2D &&
+ !(dst_origin[2] == 0 && region[2] == 1))
+ *errcode_ret = CL_INVALID_VALUE;
+ else if (source->format() != destination->format())
+ *errcode_ret = CL_IMAGE_FORMAT_MISMATCH;
+}
+
+Event::Type CopyImageEvent::type() const
+{
+ // Identifies this command as an image-to-image copy.
+ return CopyImage;
+}
+
+/*
+ * Build an image-to-buffer copy event.
+ *
+ * The image geometry (row pitch, slice pitch, pixel size) is taken from
+ * \p source; no destination origin or pitches are passed to the base class
+ * (0 is given for each), and the byte offset in the destination buffer is
+ * kept in p_offset.
+ *
+ * On failure *errcode_ret is set to:
+ * - whatever the base-class constructor reported, or
+ * - CL_INVALID_VALUE if the copied bytes do not fit in \p destination
+ * starting at \p dst_offset, or
+ * - CL_INVALID_VALUE if \p source is a 2D image and src_origin[2] != 0 or
+ * region[2] != 1.
+ */
+CopyImageToBufferEvent::CopyImageToBufferEvent(CommandQueue *parent,
+ Image2D *source,
+ MemObject *destination,
+ const size_t src_origin[3],
+ const size_t region[3],
+ size_t dst_offset,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret)
+: CopyBufferRectEvent(parent, source, destination, src_origin, 0, region,
+ source->row_pitch(), source->slice_pitch(), 0, 0,
+ source->pixel_size(), num_events_in_wait_list,
+ event_wait_list, errcode_ret),
+ p_offset(dst_offset)
+{
+ if (*errcode_ret != CL_SUCCESS) return;
+
+ // Check for buffer overflow. The test is written without the sum
+ // dst_offset + dst_cb, which could wrap around size_t for a very large
+ // offset and let an out-of-range copy through.
+ size_t dst_cb = region[2] * region[1] * region[0] * source->pixel_size();
+ size_t dst_size = destination->size();
+
+ if (dst_offset > dst_size || dst_cb > dst_size - dst_offset)
+ {
+ *errcode_ret = CL_INVALID_VALUE;
+ return;
+ }
+
+ // Check validity
+ if (source->type() == MemObject::Image2D &&
+ (src_origin[2] != 0 || region[2] != 1))
+ {
+ *errcode_ret = CL_INVALID_VALUE;
+ return;
+ }
+}
+
+size_t CopyImageToBufferEvent::offset() const
+{
+ // Offset in the destination buffer, as given to the constructor.
+ return this->p_offset;
+}
+
+Event::Type CopyImageToBufferEvent::type() const
+{
+ // Identifies this command as an image-to-buffer copy.
+ return CopyImageToBuffer;
+}
+
+/*
+ * Build a buffer-to-image copy event.
+ *
+ * The image geometry (row pitch, slice pitch, pixel size) is taken from
+ * \p destination; no source origin or pitches are passed to the base class
+ * (0 is given for each), and the byte offset in the source buffer is kept
+ * in p_offset.
+ *
+ * On failure *errcode_ret is set to:
+ * - whatever the base-class constructor reported, or
+ * - CL_INVALID_VALUE if the bytes to read do not fit in \p source starting
+ * at \p src_offset, or
+ * - CL_INVALID_VALUE if \p destination is a 2D image and dst_origin[2] != 0
+ * or region[2] != 1.
+ */
+CopyBufferToImageEvent::CopyBufferToImageEvent(CommandQueue *parent,
+ MemObject *source,
+ Image2D *destination,
+ size_t src_offset,
+ const size_t dst_origin[3],
+ const size_t region[3],
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret)
+: CopyBufferRectEvent(parent, source, destination, 0, dst_origin, region, 0, 0,
+ destination->row_pitch(), destination->slice_pitch(),
+ destination->pixel_size(), num_events_in_wait_list,
+ event_wait_list, errcode_ret),
+ p_offset(src_offset)
+{
+ if (*errcode_ret != CL_SUCCESS) return;
+
+ // Check for buffer overflow. The test is written without the sum
+ // src_offset + src_cb, which could wrap around size_t for a very large
+ // offset and let an out-of-range copy through.
+ size_t src_cb = region[2] * region[1] * region[0] * destination->pixel_size();
+ size_t src_size = source->size();
+
+ if (src_offset > src_size || src_cb > src_size - src_offset)
+ {
+ *errcode_ret = CL_INVALID_VALUE;
+ return;
+ }
+
+ // Check validity
+ if (destination->type() == MemObject::Image2D &&
+ (dst_origin[2] != 0 || region[2] != 1))
+ {
+ *errcode_ret = CL_INVALID_VALUE;
+ return;
+ }
+}
+
+size_t CopyBufferToImageEvent::offset() const
+{
+ // Offset in the source buffer, as given to the constructor.
+ return this->p_offset;
+}
+
+Event::Type CopyBufferToImageEvent::type() const
+{
+ // Identifies this command as a buffer-to-image copy.
+ return CopyBufferToImage;
+}
+
+/*
+ * Barrier
+ */
+
+BarrierEvent::BarrierEvent(CommandQueue *parent,
+ cl_int *errcode_ret)
+: Event(parent,
+ Queued, // initial status
+ 0, 0, // a barrier has no explicit wait list
+ errcode_ret)
+{
+}
+
+Event::Type BarrierEvent::type() const
+{
+ // Identifies this command as a barrier.
+ return Barrier;
+}
+
+/*
+ * WaitForEvents
+ */
+
+WaitForEventsEvent::WaitForEventsEvent(
+ CommandQueue *parent,
+ cl_uint num_events_in_wait_list, const Event **event_wait_list,
+ cl_int *errcode_ret)
+: Event(parent,
+ Queued, // initial status
+ num_events_in_wait_list, event_wait_list,
+ errcode_ret)
+{
+}
+
+Event::Type WaitForEventsEvent::type() const
+{
+ // Identifies this command as a wait-for-events.
+ return WaitForEvents;
+}
+
+/*
+ * Marker
+ */
+MarkerEvent::MarkerEvent(
+ CommandQueue *parent,
+ cl_uint num_events_in_wait_list, const Event **event_wait_list,
+ cl_int *errcode_ret)
+: WaitForEventsEvent(parent,
+ num_events_in_wait_list, event_wait_list,
+ errcode_ret)
+{
+ // A marker is built exactly like a wait-for-events; only type() differs.
+}
+
+Event::Type MarkerEvent::type() const
+{
+ // Identifies this command as a marker.
+ return Marker;
+}
diff --git a/src/core/events.h b/src/core/events.h
new file mode 100644
index 0000000..2311d92
--- /dev/null
+++ b/src/core/events.h
@@ -0,0 +1,718 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file events.h
+ * \brief All the event-related classes
+ */
+
+#ifndef __EVENTS_H__
+#define __EVENTS_H__
+
+#include "commandqueue.h"
+#include <core/config.h>
+
+#include <vector>
+
+namespace Coal
+{
+
+class MemObject; /* Forward declarations: this header only uses pointers to these types. */
+class Image2D;
+class Kernel;
+class DeviceKernel;
+class DeviceInterface;
+
+/**
+ * \brief Buffer-related event
+ *
+ * Base class of the read, write, map, unmap and copy events declared in this
+ * file. It holds the memory object the command operates on, available
+ * through \c buffer().
+ */
+class BufferEvent : public Event
+{
+ public:
+ BufferEvent(CommandQueue *parent,
+ MemObject *buffer,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+ virtual ~BufferEvent();
+
+ MemObject *buffer() const; /*!< \brief Buffer on which to operate */
+
+ /**
+ * \brief Check that a buffer is correctly aligned for a device
+ *
+ * OpenCL supports sub-buffers of buffers (\c Coal::SubBuffer). They
+ * have to be aligned on a certain device-dependent boundary.
+ *
+ * This function checks that \p buffer is correctly aligned for
+ * \p device. If \p buffer is not a \c Coal::SubBuffer, this function
+ * returns true.
+ *
+ * \param buffer memory object to check
+ * \param device device whose alignment requirement applies
+ * \return true if the buffer is aligned or not a \c Coal::SubBuffer
+ */
+ static bool isSubBufferAligned(const MemObject *buffer,
+ const DeviceInterface *device);
+
+ private:
+ MemObject *p_buffer;
+};
+
+/**
+ * \brief Reading or writing to a buffer
+ *
+ * Common base of \c Coal::ReadBufferEvent and \c Coal::WriteBufferEvent. It
+ * declares no \c type() of its own; the two derived classes provide it.
+ */
+class ReadWriteBufferEvent : public BufferEvent
+{
+ public:
+ ReadWriteBufferEvent(CommandQueue *parent,
+ MemObject *buffer,
+ size_t offset,
+ size_t cb,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ size_t offset() const; /*!< \brief Offset in the buffer of the operation, in bytes */
+ size_t cb() const; /*!< \brief Number of bytes to read or write */
+ void *ptr() const; /*!< \brief Pointer in host memory at which to put the data */
+
+ private:
+ size_t p_offset, p_cb;
+ void *p_ptr;
+};
+
+/**
+ * \brief Reading a buffer
+ *
+ * Adds only a constructor and \c type() to \c Coal::ReadWriteBufferEvent.
+ */
+class ReadBufferEvent : public ReadWriteBufferEvent
+{
+ public:
+ ReadBufferEvent(CommandQueue *parent,
+ MemObject *buffer,
+ size_t offset,
+ size_t cb,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::ReadBuffer one */
+};
+
+/**
+ * \brief Writing a buffer
+ *
+ * Adds only a constructor and \c type() to \c Coal::ReadWriteBufferEvent.
+ */
+class WriteBufferEvent : public ReadWriteBufferEvent
+{
+ public:
+ WriteBufferEvent(CommandQueue *parent,
+ MemObject *buffer,
+ size_t offset,
+ size_t cb,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::WriteBuffer one */
+};
+
+/**
+ * \brief Mapping a buffer
+ *
+ * The constructor takes no host pointer: the mapped address is supplied
+ * afterwards through \c setPtr() and read back with \c ptr().
+ */
+class MapBufferEvent : public BufferEvent
+{
+ public:
+ MapBufferEvent(CommandQueue *parent,
+ MemObject *buffer,
+ size_t offset,
+ size_t cb,
+ cl_map_flags map_flags,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::MapBuffer one */
+
+ size_t offset() const; /*!< \brief Offset in the buffer at which the mapping begins, in bytes */
+ size_t cb() const; /*!< \brief Number of bytes to map */
+ cl_map_flags flags() const; /*!< \brief Flags of the mapping */
+ void *ptr() const; /*!< \brief Pointer at which the data has been mapped */
+
+ /**
+ * \brief Set the memory location at which the data has been mapped
+ *
+ * This function is called by the device when it has successfully mapped
+ * the buffer. It must be called inside
+ * \c Coal::DeviceInterface::initEventDeviceData().
+ *
+ * \param ptr the address at which the buffer has been mapped
+ */
+ void setPtr(void *ptr);
+
+ private:
+ size_t p_offset, p_cb;
+ cl_map_flags p_map_flags;
+ void *p_ptr;
+};
+
+/**
+ * \brief Mapping an image
+ *
+ * The constructor takes neither a host pointer nor pitches: the mapped
+ * address, row pitch and slice pitch are supplied afterwards through
+ * \c setPtr(), \c setRowPitch() and \c setSlicePitch().
+ */
+class MapImageEvent : public BufferEvent
+{
+ public:
+ MapImageEvent(CommandQueue *parent,
+ Image2D *image,
+ cl_map_flags map_flags,
+ const size_t origin[3],
+ const size_t region[3],
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::MapImage one */
+
+ /**
+ * \brief Origin of the mapping, in pixels, for the given dimension
+ * \param index dimension for which the origin is retrieved
+ * \return origin of the mapping for the given dimension
+ */
+ size_t origin(unsigned int index) const;
+
+ /**
+ * \brief Region of the mapping, in pixels, for the given dimension
+ * \param index dimension for which the region is retrieved
+ * \return region of the mapping for the given dimension
+ */
+ size_t region(unsigned int index) const;
+ cl_map_flags flags() const; /*!< \brief Flags of the mapping */
+
+ void *ptr() const; /*!< \brief Pointer at which the data is mapped */
+ size_t row_pitch() const; /*!< \brief Row pitch of the mapped data */
+ size_t slice_pitch() const; /*!< \brief Slice pitch of the mapped data */
+
+ /**
+ * \brief Set the memory location at which the image is mapped
+ *
+ * This function must be called by
+ * \c Coal::DeviceInterface::initEventDeviceData(). Row and slice pitches
+ * must also be set by this function by calling \c setRowPitch() and
+ * \c setSlicePitch().
+ *
+ * \param ptr pointer at which the data is available
+ */
+ void setPtr(void *ptr);
+ void setRowPitch(size_t row_pitch); /*!< \brief Set row pitch */
+ void setSlicePitch(size_t slice_pitch); /*!< \brief Set slice pitch */
+
+ private:
+ cl_map_flags p_map_flags;
+ size_t p_origin[3], p_region[3];
+ void *p_ptr;
+ size_t p_slice_pitch, p_row_pitch;
+};
+
+/**
+ * \brief Unmapping a memory object
+ *
+ * Carries the mapped address that has to be released; see \c mapping().
+ */
+class UnmapBufferEvent : public BufferEvent
+{
+ public:
+ UnmapBufferEvent(CommandQueue *parent,
+ MemObject *buffer,
+ void *mapped_addr,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::UnmapBuffer one */
+
+ void *mapping() const; /*!< \brief Mapped address to unmap */
+
+ private:
+ void *p_mapping;
+};
+
+/**
+ * \brief Copying between two buffers
+ *
+ * Linear (offset and byte count) copy. The source is the buffer held by the
+ * \c Coal::BufferEvent base; only the destination is stored in this class.
+ */
+class CopyBufferEvent : public BufferEvent
+{
+ public:
+ CopyBufferEvent(CommandQueue *parent,
+ MemObject *source,
+ MemObject *destination,
+ size_t src_offset,
+ size_t dst_offset,
+ size_t cb,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+ ~CopyBufferEvent();
+
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::CopyBuffer one */
+
+ MemObject *source() const; /*!< \brief Source buffer, equivalent to \c Coal::BufferEvent::buffer() */
+ MemObject *destination() const; /*!< \brief Destination buffer */
+ size_t src_offset() const; /*!< \brief Offset in the source buffer, in bytes */
+ size_t dst_offset() const; /*!< \brief Offset in the destination buffer, in bytes */
+ size_t cb() const; /*!< \brief Number of bytes to copy */
+
+ private:
+ MemObject *p_destination;
+ size_t p_src_offset, p_dst_offset, p_cb;
+};
+
+/**
+ * \brief Events related to rectangular (or cubic) memory regions
+ *
+ * This event is the base for all the *BufferRect events, and the Image ones.
+ *
+ * The origins, region and pitches are protected rather than private:
+ * derived-class constructors read them directly for their bounds checks.
+ */
+class ReadWriteCopyBufferRectEvent : public BufferEvent
+{
+ public:
+ ReadWriteCopyBufferRectEvent(CommandQueue *parent,
+ MemObject *source,
+ const size_t src_origin[3],
+ const size_t dst_origin[3],
+ const size_t region[3],
+ size_t src_row_pitch,
+ size_t src_slice_pitch,
+ size_t dst_row_pitch,
+ size_t dst_slice_pitch,
+ unsigned int bytes_per_element,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ size_t src_origin(unsigned int index) const; /*!< \brief Source origin for the \p index dimension */
+ size_t dst_origin(unsigned int index) const; /*!< \brief Destination origin for the \p index dimension */
+ size_t region(unsigned int index) const; /*!< \brief Region to copy for the \p index dimension */
+ size_t src_row_pitch() const; /*!< \brief Source row pitch */
+ size_t src_slice_pitch() const; /*!< \brief Source slice pitch */
+ size_t dst_row_pitch() const; /*!< \brief Destination row pitch */
+ size_t dst_slice_pitch() const; /*!< \brief Destination slice pitch */
+ MemObject *source() const; /*!< \brief Source of the copy, for readability. Calls \c Coal::BufferEvent::buffer(). */
+
+ protected:
+ size_t p_src_origin[3], p_dst_origin[3], p_region[3];
+ size_t p_src_row_pitch, p_src_slice_pitch;
+ size_t p_dst_row_pitch, p_dst_slice_pitch;
+};
+
+/**
+ * \brief Copying a rectangular region between two buffers
+ *
+ * Also the base class of the image copy events \c Coal::CopyImageEvent,
+ * \c Coal::CopyImageToBufferEvent and \c Coal::CopyBufferToImageEvent, which
+ * is why \c type() is virtual here.
+ */
+class CopyBufferRectEvent : public ReadWriteCopyBufferRectEvent
+{
+ public:
+ CopyBufferRectEvent(CommandQueue *parent,
+ MemObject *source,
+ MemObject *destination,
+ const size_t src_origin[3],
+ const size_t dst_origin[3],
+ const size_t region[3],
+ size_t src_row_pitch,
+ size_t src_slice_pitch,
+ size_t dst_row_pitch,
+ size_t dst_slice_pitch,
+ unsigned int bytes_per_element,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ virtual Type type() const; /*!< \brief Say the event is a \c Coal::Event::CopyBufferRect one */
+ MemObject *destination() const; /*!< \brief Destination buffer */
+
+ private:
+ MemObject *p_destination;
+};
+
+/**
+ * \brief Reading or writing a rectangular region of a buffer
+ *
+ * Common base of \c Coal::ReadBufferRectEvent, \c Coal::WriteBufferRectEvent
+ * and \c Coal::ReadWriteImageEvent. It adds the host pointer to the
+ * geometry kept by \c Coal::ReadWriteCopyBufferRectEvent.
+ */
+class ReadWriteBufferRectEvent : public ReadWriteCopyBufferRectEvent
+{
+ public:
+ ReadWriteBufferRectEvent(CommandQueue *parent,
+ MemObject *buffer,
+ const size_t buffer_origin[3],
+ const size_t host_origin[3],
+ const size_t region[3],
+ size_t buffer_row_pitch,
+ size_t buffer_slice_pitch,
+ size_t host_row_pitch,
+ size_t host_slice_pitch,
+ void *ptr,
+ unsigned int bytes_per_element,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ void *ptr() const; /*!< \brief Pointer in host memory in which to put the data */
+
+ private:
+ void *p_ptr;
+};
+
+/**
+ * \brief Reading a rectangular region of a buffer
+ *
+ * Takes the same arguments as \c Coal::ReadWriteBufferRectEvent except
+ * \c bytes_per_element, which is not a parameter of this constructor.
+ */
+class ReadBufferRectEvent : public ReadWriteBufferRectEvent
+{
+ public:
+ ReadBufferRectEvent(CommandQueue *parent,
+ MemObject *buffer,
+ const size_t buffer_origin[3],
+ const size_t host_origin[3],
+ const size_t region[3],
+ size_t buffer_row_pitch,
+ size_t buffer_slice_pitch,
+ size_t host_row_pitch,
+ size_t host_slice_pitch,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::ReadBufferRect one */
+};
+
+/**
+ * \brief Writing a rectangular region of a buffer
+ *
+ * Takes the same arguments as \c Coal::ReadWriteBufferRectEvent except
+ * \c bytes_per_element, which is not a parameter of this constructor.
+ */
+class WriteBufferRectEvent : public ReadWriteBufferRectEvent
+{
+ public:
+ WriteBufferRectEvent(CommandQueue *parent,
+ MemObject *buffer,
+ const size_t buffer_origin[3],
+ const size_t host_origin[3],
+ const size_t region[3],
+ size_t buffer_row_pitch,
+ size_t buffer_slice_pitch,
+ size_t host_row_pitch,
+ size_t host_slice_pitch,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::WriteBufferRect one */
+};
+
+/**
+ * \brief Reading or writing images
+ *
+ * This class only converts some of the arguments given to its constructor
+ * to the one of \c Coal::ReadWriteBufferRectEvent. For example, the source row
+ * and slice pitches are read from the \c Coal::Image2D object.
+ *
+ * No \c type() is declared here; \c Coal::ReadImageEvent and
+ * \c Coal::WriteImageEvent provide it.
+ */
+class ReadWriteImageEvent : public ReadWriteBufferRectEvent
+{
+ public:
+ ReadWriteImageEvent(CommandQueue *parent,
+ Image2D *image,
+ const size_t origin[3],
+ const size_t region[3],
+ size_t row_pitch,
+ size_t slice_pitch,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+};
+
+/**
+ * \brief Reading an image
+ *
+ * Adds only a constructor and \c type() to \c Coal::ReadWriteImageEvent.
+ */
+class ReadImageEvent : public ReadWriteImageEvent
+{
+ public:
+ ReadImageEvent(CommandQueue *parent,
+ Image2D *image,
+ const size_t origin[3],
+ const size_t region[3],
+ size_t row_pitch,
+ size_t slice_pitch,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::ReadImage one */
+};
+
+/**
+ * \brief Writing to an image
+ *
+ * Adds only a constructor and \c type() to \c Coal::ReadWriteImageEvent.
+ */
+class WriteImageEvent : public ReadWriteImageEvent
+{
+ public:
+ WriteImageEvent(CommandQueue *parent,
+ Image2D *image,
+ const size_t origin[3],
+ const size_t region[3],
+ size_t row_pitch,
+ size_t slice_pitch,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::WriteImage one */
+};
+
+/**
+ * \brief Copying between two images
+ *
+ * The constructor takes no pitches: they are read from the two images. It
+ * reports \c CL_IMAGE_FORMAT_MISMATCH when the image formats differ.
+ */
+class CopyImageEvent : public CopyBufferRectEvent
+{
+ public:
+ CopyImageEvent(CommandQueue *parent,
+ Image2D *source,
+ Image2D *destination,
+ const size_t src_origin[3],
+ const size_t dst_origin[3],
+ const size_t region[3],
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::CopyImage one */
+};
+
+/**
+ * \brief Copying an image to a buffer
+ *
+ * The destination is addressed by a single byte offset (\c offset()) rather
+ * than by an origin and pitches.
+ */
+class CopyImageToBufferEvent : public CopyBufferRectEvent
+{
+ public:
+ CopyImageToBufferEvent(CommandQueue *parent,
+ Image2D *source,
+ MemObject *destination,
+ const size_t src_origin[3],
+ const size_t region[3],
+ size_t dst_offset,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ size_t offset() const; /*!< \brief Offset in the buffer at which the image is written */
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::CopyImageToBuffer one */
+
+ private:
+ size_t p_offset;
+};
+
+/**
+ * \brief Copying a buffer to an image
+ *
+ * The source is addressed by a single byte offset (\c offset()) rather than
+ * by an origin and pitches.
+ */
+class CopyBufferToImageEvent : public CopyBufferRectEvent
+{
+ public:
+ CopyBufferToImageEvent(CommandQueue *parent,
+ MemObject *source,
+ Image2D *destination,
+ size_t src_offset,
+ const size_t dst_origin[3],
+ const size_t region[3],
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ size_t offset() const; /*!< \brief Offset in the buffer at which the copy starts */
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::CopyBufferToImage one */
+
+ private:
+ size_t p_offset;
+};
+
+/**
+ * \brief Executing a native function as a kernel
+ *
+ * This event builds an argument list to give to the native function. It needs
+ * for example to replace all occurrences of a \c Coal::MemObject by a pointer
+ * to data the host CPU can actually access, using
+ * \c Coal::DeviceBuffer::nativeGlobalPointer().
+ */
+class NativeKernelEvent : public Event
+{
+ public:
+ NativeKernelEvent(CommandQueue *parent,
+ void (*user_func)(void *),
+ void *args,
+ size_t cb_args,
+ cl_uint num_mem_objects,
+ const MemObject **mem_list,
+ const void **args_mem_loc,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+ ~NativeKernelEvent();
+
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::NativeKernel one */
+
+ void *function() const; /*!< \brief Host function to call */
+ void *args() const; /*!< \brief Args to give to the host function */
+
+ private:
+ void *p_user_func; /*!< Kept as \c void* although the constructor receives a \c void(*)(void*) */
+ void *p_args;
+};
+
+/**
+ * \brief Executing a compiled kernel
+ *
+ * Base class of \c Coal::TaskEvent, which is why \c type() is virtual here.
+ * The per-dimension arrays are sized with \c MAX_WORK_DIMS, which is not
+ * defined in this header (presumably it comes from \c core/config.h).
+ */
+class KernelEvent : public Event
+{
+ public:
+ KernelEvent(CommandQueue *parent,
+ Kernel *kernel,
+ cl_uint work_dim,
+ const size_t *global_work_offset,
+ const size_t *global_work_size,
+ const size_t *local_work_size,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+ ~KernelEvent();
+
+ cl_uint work_dim() const; /*!< \brief Number of working dimensions */
+ size_t global_work_offset(cl_uint dim) const; /*!< \brief Global work offset for the \p dim dimension */
+ size_t global_work_size(cl_uint dim) const; /*!< \brief Global work size for the \p dim dimension */
+ size_t local_work_size(cl_uint dim) const; /*!< \brief Number of work-items per work-group for the \p dim dimension */
+ Kernel *kernel() const; /*!< \brief \c Coal::Kernel object to run */
+ DeviceKernel *deviceKernel() const; /*!< \brief \c Coal::DeviceKernel for the kernel and device of this event */
+
+ virtual Type type() const; /*!< \brief Say the event is a \c Coal::Event::NDRangeKernel one */
+
+ private:
+ cl_uint p_work_dim;
+ size_t p_global_work_offset[MAX_WORK_DIMS],
+ p_global_work_size[MAX_WORK_DIMS],
+ p_local_work_size[MAX_WORK_DIMS],
+ p_max_work_item_sizes[MAX_WORK_DIMS]; /*!< No public accessor is declared for this array */
+ Kernel *p_kernel;
+ DeviceKernel *p_dev_kernel;
+};
+
+/**
+ * \brief Executing a task kernel
+ *
+ * This event is simply a \c Coal::KernelEvent with:
+ *
+ * - \c work_dim() set to 1
+ * - \c global_work_offset() set to {0}
+ * - \c global_work_size() set to {1}
+ * - \c local_work_size() set to {1}
+ *
+ * It's in fact a \c Coal::KernelEvent containing only one single work-item.
+ */
+class TaskEvent : public KernelEvent
+{
+ public:
+ TaskEvent(CommandQueue *parent,
+ Kernel *kernel,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::TaskKernel one */
+};
+
+/**
+ * \brief User event
+ *
+ * This event is a bit special as it is created by a call to
+ * \c clCreateUserEvent() and doesn't belong to an event queue. Thus, a means
+ * had to be found for everything to work.
+ *
+ * The solution is the \c addDependentCommandQueue() function, called every time
+ * the user event is added to a command queue. When this event becomes completed,
+ * \c flushQueues() is called to allow all the \c Coal::CommandQueue objects
+ * containing this event to push more events on their device.
+ *
+ * This way, command queues are not blocked by user events.
+ *
+ * NOTE(review): neither \c addDependentCommandQueue() nor \c flushQueues() is
+ * declared in this class; presumably they live in a base class or elsewhere.
+ * Confirm, or update this description.
+ */
+class UserEvent : public Event
+{
+ public:
+ UserEvent(Context *context, cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::User one */
+ Context *context() const; /*!< \brief Context of this event */
+
+ private:
+ Context *p_context;
+};
+
+/**
+ * \brief Barrier event
+ *
+ * Its constructor takes no event wait list, only the queue and the error
+ * code pointer.
+ */
+class BarrierEvent : public Event
+{
+ public:
+ BarrierEvent(CommandQueue *parent,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::Barrier one */
+};
+
+/**
+ * \brief Event waiting for others to complete before being completed
+ *
+ * Base class of \c Coal::MarkerEvent, which is why \c type() is virtual here.
+ */
+class WaitForEventsEvent : public Event
+{
+ public:
+ WaitForEventsEvent(CommandQueue *parent,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ virtual Type type() const; /*!< \brief Say the event is a \c Coal::Event::WaitForEvents one */
+};
+
+/**
+ * \brief Marker event
+ *
+ * Adds only a constructor and a \c type() override to
+ * \c Coal::WaitForEventsEvent.
+ */
+class MarkerEvent : public WaitForEventsEvent
+{
+ public:
+ MarkerEvent(CommandQueue *parent,
+ cl_uint num_events_in_wait_list,
+ const Event **event_wait_list,
+ cl_int *errcode_ret);
+
+ Type type() const; /*!< \brief Say the event is a \c Coal::Event::Marker one */
+};
+
+}
+
+#endif
diff --git a/src/core/icd.cpp b/src/core/icd.cpp
new file mode 100644
index 0000000..2c62035
--- /dev/null
+++ b/src/core/icd.cpp
@@ -0,0 +1,145 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include "CL/cl.h"
+#include "platform.h"
+#include "icd.h"
+
+/*-----------------------------------------------------------------------------
+* Table of OpenCL API entry points, stored as untyped pointers.  Slots whose
+* function name appears only in a trailing comment are not provided and hold 0.
+* NOTE(review): the slot order presumably has to match the vendor dispatch
+* layout expected by the Khronos ICD loader - confirm against the loader's
+* headers before adding, removing or reordering entries.
+*----------------------------------------------------------------------------*/
+void * dispatch_table[] =
+{
+ (void*) clGetPlatformIDs,
+ (void*) clGetPlatformInfo,
+ (void*) clGetDeviceIDs,
+ (void*) clGetDeviceInfo,
+ (void*) clCreateContext,
+ (void*) clCreateContextFromType,
+ (void*) clRetainContext,
+ (void*) clReleaseContext,
+ (void*) clGetContextInfo,
+ (void*) clCreateCommandQueue,
+ (void*) clRetainCommandQueue,
+ (void*) clReleaseCommandQueue,
+ (void*) clGetCommandQueueInfo,
+ (void*) 0, //clSetCommandQueueProperty,
+ (void*) clCreateBuffer,
+ (void*) clCreateImage2D,
+ (void*) clCreateImage3D,
+ (void*) clRetainMemObject,
+ (void*) clReleaseMemObject,
+ (void*) clGetSupportedImageFormats,
+ (void*) clGetMemObjectInfo,
+ (void*) clGetImageInfo,
+ (void*) clCreateSampler,
+ (void*) clRetainSampler,
+ (void*) clReleaseSampler,
+ (void*) clGetSamplerInfo,
+ (void*) clCreateProgramWithSource,
+ (void*) clCreateProgramWithBinary,
+ (void*) clRetainProgram,
+ (void*) clReleaseProgram,
+ (void*) clBuildProgram,
+ (void*) clUnloadCompiler,
+ (void*) clGetProgramInfo,
+ (void*) clGetProgramBuildInfo,
+ (void*) clCreateKernel,
+ (void*) clCreateKernelsInProgram,
+ (void*) clRetainKernel,
+ (void*) clReleaseKernel,
+ (void*) clSetKernelArg,
+ (void*) clGetKernelInfo,
+ (void*) clGetKernelWorkGroupInfo,
+ (void*) clWaitForEvents,
+ (void*) clGetEventInfo,
+ (void*) clRetainEvent,
+ (void*) clReleaseEvent,
+ (void*) clGetEventProfilingInfo,
+ (void*) clFlush,
+ (void*) clFinish,
+ (void*) clEnqueueReadBuffer,
+ (void*) clEnqueueWriteBuffer,
+ (void*) clEnqueueCopyBuffer,
+ (void*) clEnqueueReadImage,
+ (void*) clEnqueueWriteImage,
+ (void*) clEnqueueCopyImage,
+ (void*) clEnqueueCopyImageToBuffer,
+ (void*) clEnqueueCopyBufferToImage,
+ (void*) clEnqueueMapBuffer,
+ (void*) clEnqueueMapImage,
+ (void*) clEnqueueUnmapMemObject,
+ (void*) clEnqueueNDRangeKernel,
+ (void*) clEnqueueTask,
+ (void*) clEnqueueNativeKernel,
+ (void*) clEnqueueMarker,
+ (void*) clEnqueueWaitForEvents,
+ (void*) clEnqueueBarrier,
+ (void*) clGetExtensionFunctionAddress,
+ (void*) 0, //clCreateFromGLBuffer,
+ (void*) 0, //clCreateFromGLTexture2D,
+ (void*) 0, //clCreateFromGLTexture3D,
+ (void*) 0, //clCreateFromGLRenderbuffer,
+ (void*) 0, //clGetGLObjectInfo,
+ (void*) 0, //clGetGLTextureInfo,
+ (void*) 0, //clEnqueueAcquireGLObjects,
+ (void*) 0, //clEnqueueReleaseGLObjects,
+ (void*) 0, //clGetGLContextInfoKHR,
+ (void*) 0, //clGetDeviceIDsFromD3D10KHR,
+ (void*) 0, //clCreateFromD3D10BufferKHR,
+ (void*) 0, //clCreateFromD3D10Texture2DKHR,
+ (void*) 0, //clCreateFromD3D10Texture3DKHR,
+ (void*) 0, //clEnqueueAcquireD3D10ObjectsKHR,
+ (void*) 0, //clEnqueueReleaseD3D10ObjectsKHR,
+ (void*) clSetEventCallback,
+ (void*) clCreateSubBuffer,
+ (void*) clSetMemObjectDestructorCallback,
+ (void*) clCreateUserEvent,
+ (void*) clSetUserEventStatus,
+ (void*) clEnqueueReadBufferRect,
+ (void*) clEnqueueWriteBufferRect,
+ (void*) clEnqueueCopyBufferRect,
+ (void*) 0, //clCreateSubDevicesEXT,
+ (void*) 0, //clRetainDeviceEXT,
+ (void*) 0, //clReleaseDeviceEXT
+};
+
+
+/*-----------------------------------------------------------------------------
+* Report the single platform exposed by this implementation.
+*
+* num_entries   : capacity of the platforms array
+* platforms     : optional output, receives the address of the_platform
+* num_platforms : optional output, receives 1
+*
+* Returns CL_INVALID_VALUE when both outputs are null, or when platforms is
+* given with num_entries == 0; CL_SUCCESS otherwise.
+*----------------------------------------------------------------------------*/
+cl_int CL_API_CALL
+clIcdGetPlatformIDsKHR(cl_uint          num_entries,
+                       cl_platform_id * platforms,
+                       cl_uint *        num_platforms)
+{
+    /* The count is stored before num_entries is validated. */
+    if (num_platforms != 0)
+        *num_platforms = 1;
+    else if (platforms == 0)
+        return CL_INVALID_VALUE;
+
+    /* Count-only query: nothing left to do. */
+    if (platforms == 0)
+        return CL_SUCCESS;
+
+    if (num_entries == 0)
+        return CL_INVALID_VALUE;
+
+    /* Only one "default" platform */
+    *platforms = &the_platform;
+    return CL_SUCCESS;
+}
diff --git a/src/core/icd.h b/src/core/icd.h
new file mode 100644
index 0000000..591aed6
--- /dev/null
+++ b/src/core/icd.h
@@ -0,0 +1,44 @@
+/******************************************************************************
+ * Copyright (c) 2011-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef _ICD_H
+#define _ICD_H
+#include "CL/cl.h"
+
+/* Array-of-untyped-pointers type used for the API entry-point table. */
+typedef void *(KHRicdVendorDispatch)[];
+/* The table itself is defined in icd.cpp. */
+extern KHRicdVendorDispatch dispatch_table;
+
+/*
+ * Holds a single pointer to dispatch_table, set by the constructor.
+ * NOTE(review): presumably meant to sit at the start of API objects so that an
+ * ICD loader can find the table through the object handle - confirm where this
+ * class is used.
+ */
+class Dispatch
+{
+ public:
+ Dispatch() : dispatch(&dispatch_table) {}
+ private:
+ KHRicdVendorDispatch *dispatch;
+};
+
+#endif // _ICD_H
+
diff --git a/src/core/kernel.cpp b/src/core/kernel.cpp
new file mode 100644
index 0000000..4c53576
--- /dev/null
+++ b/src/core/kernel.cpp
@@ -0,0 +1,637 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file core/kernel.cpp
+ * \brief Kernel
+ */
+
+#include "kernel.h"
+#include "propertylist.h"
+#include "program.h"
+#include "memobject.h"
+#include "sampler.h"
+#include "deviceinterface.h"
+
+#include <string>
+#include <iostream>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <boost/tuple/tuple.hpp>
+
+#include <llvm/Support/Casting.h>
+#include <llvm/IR/Module.h>
+#include <llvm/IR/Type.h>
+#include <llvm/IR/DerivedTypes.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/Metadata.h>
+#include <llvm/IR/DataLayout.h>
+
+
+using namespace Coal;
+/**
+ * Create an empty kernel whose parent object is \p program.  No device
+ * function is attached yet and the name is empty.
+ */
+Kernel::Kernel(Program *program)
+    : Object(Object::T_Kernel, program),
+      p_has_locals(false),
+      wi_alloca_size(0)
+{
+    // TODO: Say a kernel is attached to the program (that becomes unalterable)
+
+    // null_dep is the all-zero entry returned by deviceDependent() when no
+    // registered device matches.
+    null_dep.module   = 0;
+    null_dep.function = 0;
+    null_dep.kernel   = 0;
+    null_dep.device   = 0;
+
+    p_name = "";
+}
+
+/**
+ * Destroy the device kernels created in addFunction(), last registered first.
+ *
+ * NOTE(review): the Arg objects held in p_args are allocated with new in
+ * addFunction() and are not released here - confirm whether they are freed
+ * elsewhere.
+ */
+Kernel::~Kernel()
+{
+    for (size_t remaining = p_device_dependent.size(); remaining > 0; --remaining)
+    {
+        delete p_device_dependent.back().kernel;
+        p_device_dependent.pop_back();
+    }
+}
+
+/**
+ * Look up the per-device data registered for \p device (read-only variant).
+ * A null \p device matches when exactly one entry is registered.  Returns
+ * null_dep when nothing matches.
+ */
+const Kernel::DeviceDependent &Kernel::deviceDependent(DeviceInterface *device) const
+{
+    const size_t count = p_device_dependent.size();
+    const bool sole_entry_wanted = (device == 0) && (count == 1);
+
+    for (size_t idx = 0; idx != count; ++idx)
+    {
+        if (sole_entry_wanted || p_device_dependent[idx].device == device)
+            return p_device_dependent[idx];
+    }
+
+    return null_dep;
+}
+
+/**
+ * Look up the per-device data registered for \p device (mutable variant).
+ * A null \p device matches when exactly one entry is registered.  Returns
+ * null_dep when nothing matches.
+ */
+Kernel::DeviceDependent &Kernel::deviceDependent(DeviceInterface *device)
+{
+    size_t pos = 0;
+
+    while (pos < p_device_dependent.size())
+    {
+        DeviceDependent &candidate = p_device_dependent[pos];
+
+        if (candidate.device == device)
+            return candidate;
+        if (!device && p_device_dependent.size() == 1)
+            return candidate;
+
+        ++pos;
+    }
+
+    return null_dep;
+}
+
+/******************************************************************************
+* cl_int Kernel::addFunction
+******************************************************************************/
+/**
+ * \brief Register the LLVM function that implements this kernel on \p device
+ *
+ * Stores the function name and the optional \c _wi_alloca_size function
+ * attribute, derives an \c Arg description for every parameter of
+ * \p function, and finally asks the device to create its device kernel.
+ * The first call builds the argument list; later calls only verify that the
+ * new function has the same signature as the one already registered.
+ *
+ * \param device device for which the function is added
+ * \param function function to add
+ * \param module LLVM module of this function
+ * \return \c CL_SUCCESS, or \c CL_INVALID_KERNEL_DEFINITION when a parameter
+ *         type is not recognized or the signature differs from the
+ *         registered one
+ */
+cl_int Kernel::addFunction(DeviceInterface *device, llvm::Function *function,
+                           llvm::Module *module)
+{
+    llvm::DataLayout TD(module);
+
+#if 0 // Uncomment to see the Function IR being generated:
+    function->dump();
+#endif
+
+    p_name = function->getName().str();
+
+    // Get wi_alloca_size, to be used for computing wg_alloca_size
+    static const char wi_alloca_key[] = "_wi_alloca_size=";
+    std::string fattrs = function->getAttributes().getAsString(
+                                        llvm::AttributeSet::FunctionIndex);
+    std::size_t found = fattrs.find(wi_alloca_key);
+    if (found != std::string::npos)
+        wi_alloca_size = atoi(fattrs.c_str() + found + (sizeof(wi_alloca_key) - 1));
+
+    /*-------------------------------------------------------------------------
+    * Add a device dependent
+    *------------------------------------------------------------------------*/
+    DeviceDependent dep;
+
+    dep.device   = device;
+    dep.function = function;
+    dep.module   = module;
+
+    /*-------------------------------------------------------------------------
+    * Build the arg list of the kernel (or verify it if a previous function
+    * was already registered)
+    *------------------------------------------------------------------------*/
+    llvm::FunctionType *f = function->getFunctionType();
+    bool append = (p_args.size() == 0);
+
+    if (!append && p_args.size() != f->getNumParams())
+        return CL_INVALID_KERNEL_DEFINITION;
+
+    int i = 0;
+    for (llvm::Function::arg_iterator I = function->arg_begin(),
+         E = function->arg_end(); I != E; ++I, i++)
+    {
+        llvm::Argument *arg = I;
+        Arg::Kind kind = Arg::Invalid;
+        Arg::File file = Arg::Private;
+        unsigned short vec_dim = 1;
+
+        llvm::Type *arg_type = arg->getType();
+        const unsigned arg_store_size = TD.getTypeStoreSize(arg_type);
+
+        // LLVM IR writes parameters passed by value as pointers:
+        if (llvm::isa<llvm::PointerType>(arg_type) && arg->hasByValAttr()) {
+            // isa<> was just checked, so cast<> cannot fail here
+            arg_type = llvm::cast<llvm::PointerType>(arg_type)->getElementType();
+        }
+
+        llvm::Type *itype = TD.getSmallestLegalIntType(module->getContext(), arg_store_size * 8);
+        llvm::Type *target_type = (itype != NULL && arg_type->isIntegerTy()) ? itype : arg_type;
+
+        unsigned target_size = TD.getTypeStoreSize(target_type);
+        unsigned target_align = TD.getABITypeAlignment(target_type);
+
+#if 0 // Uncomment to see arg info
+        arg_type->dump(); std::cout << " Size: " << target_size << " Align: " << target_align << std::endl ;
+#endif
+
+        if (arg_type->isPointerTy())
+        {
+            // It's a pointer, dereference it
+            llvm::PointerType *p_type = llvm::cast<llvm::PointerType>(arg_type);
+
+            file = (Arg::File)p_type->getAddressSpace();
+            arg_type = p_type->getElementType();
+
+            // If it's a __local argument, we'll have to allocate memory at run time
+            if (file == Arg::Local)
+                p_has_locals = true;
+
+            kind = Arg::Buffer;
+
+            // If it's a struct, get its name
+            if (arg_type->isStructTy())
+            {
+                llvm::StructType *struct_type =
+                    llvm::cast<llvm::StructType>(arg_type);
+                std::string struct_name = struct_type->getName().str();
+
+                if (struct_name.compare(0, 14, "struct.image2d") == 0)
+                {
+                    kind = Arg::Image2D;
+                    file = Arg::Global;
+                }
+                else if (struct_name.compare(0, 14, "struct.image3d") == 0)
+                {
+                    kind = Arg::Image3D;
+                    file = Arg::Global;
+                }
+            }
+        }
+        else
+        {
+            if (arg_type->isVectorTy())
+            {
+                // It's a vector, we need its element's type
+                llvm::VectorType *v_type = llvm::cast<llvm::VectorType>(arg_type);
+
+                vec_dim = v_type->getNumElements();
+                arg_type = v_type->getElementType();
+            }
+
+            // Get type kind
+            if (arg_type->isFloatTy())
+            {
+                kind = Arg::Float;
+            }
+            else if (arg_type->isDoubleTy())
+            {
+                kind = Arg::Double;
+            }
+            else if (arg_type->isIntegerTy())
+            {
+                llvm::IntegerType *i_type = llvm::cast<llvm::IntegerType>(arg_type);
+
+                if (i_type->getBitWidth() == 8)
+                {
+                    kind = Arg::Int8;
+                }
+                else if (i_type->getBitWidth() == 16)
+                {
+                    kind = Arg::Int16;
+                }
+                else if (i_type->getBitWidth() == 32)
+                {
+                    // NOTE: May also be a sampler, check done in setArg
+                    kind = Arg::Int32;
+                }
+                else if (i_type->getBitWidth() == 64)
+                {
+                    kind = Arg::Int64;
+                }
+            }
+        }
+
+        // Check if we recognized the type
+        if (kind == Arg::Invalid)
+            return CL_INVALID_KERNEL_DEFINITION;
+
+        if (append)
+        {
+            // First registration: the list takes the new argument
+            p_args.push_back(new Arg(vec_dim, file, kind, target_align));
+        }
+        else
+        {
+            // A function is already registered: compare the argument
+            // descriptions themselves (not the pointers).  The temporary
+            // lives on the stack, so nothing is leaked on either outcome.
+            Arg candidate(vec_dim, file, kind, target_align);
+
+            if (candidate != *p_args[i])
+                return CL_INVALID_KERNEL_DEFINITION;
+        }
+    }
+
+    dep.kernel = device->createDeviceKernel(this, dep.function);
+    p_device_dependent.push_back(dep);
+
+    return CL_SUCCESS;
+}
+
+/** LLVM function stored for \p device by addFunction() (0 when no entry matches). */
+llvm::Function *Kernel::function(DeviceInterface *device) const
+{
+    return deviceDependent(device).function;
+}
+
+/******************************************************************************
+* cl_int Kernel::setArg
+******************************************************************************/
+/**
+ * \brief Set the value of one kernel argument
+ *
+ * \param index zero-based position of the argument
+ * \param size number of bytes \p value points to (for \c __local arguments:
+ *             number of bytes to allocate at kernel run time)
+ * \param value pointer to the argument data; must be 0 for \c __local
+ *              arguments and may be 0 for buffer/image arguments
+ * \return \c CL_SUCCESS, \c CL_INVALID_ARG_INDEX, \c CL_INVALID_ARG_SIZE or
+ *         \c CL_INVALID_ARG_VALUE
+ */
+cl_int Kernel::setArg(cl_uint index, size_t size, const void *value)
+{
+    // Valid indices are 0 .. p_args.size() - 1
+    if (index >= p_args.size())
+        return CL_INVALID_ARG_INDEX;
+
+    Arg *arg = p_args[index];
+
+    /*-------------------------------------------------------------------------
+    * Special case for __local pointers
+    *------------------------------------------------------------------------*/
+    if (arg->file() == Arg::Local)
+    {
+        if (size == 0)  return CL_INVALID_ARG_SIZE;
+        if (value != 0) return CL_INVALID_ARG_VALUE;
+
+        arg->setAllocAtKernelRuntime(size);
+        return CL_SUCCESS;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Check that size corresponds to the arg type
+    *------------------------------------------------------------------------*/
+    size_t arg_size = arg->valueSize() * arg->vecDim();
+
+    /*-------------------------------------------------------------------------
+    * Special case for samplers (pointers in C++, uint32 in OpenCL).
+    * value and the handle it points to are checked before being dereferenced.
+    *------------------------------------------------------------------------*/
+    if (value != 0 && size == sizeof(cl_sampler) && arg_size == 4 &&
+        *(Object **)value != 0 && (*(Object **)value)->isA(T_Sampler))
+    {
+        unsigned int bitfield = (*(Sampler **)value)->bitfield();
+
+        arg->refineKind(Arg::Sampler);
+        arg->alloc();
+        // Copy the bitfield itself; sizeof(cl_sampler) may be larger than it
+        arg->loadData(&bitfield, sizeof(bitfield));
+
+        return CL_SUCCESS;
+    }
+
+    // LLVM IR redefines function parameter types to fit the smallest integer type width for the ABI
+    // eg: <2xi8> (2 bytes) may actually be pushed as an i32 (4 bytes!), but this knowledge is
+    // not known to shamrock.  But, we do know the parameter type alignment in addFunction().
+    // So allow sizes less than or equal to the target alignment to succeed the size test:
+    if ((size != arg_size) && (size > arg->targetAlignment())) return CL_INVALID_ARG_SIZE;
+
+    /*-------------------------------------------------------------------------
+    * Check for null values
+    *------------------------------------------------------------------------*/
+    cl_mem null_mem = 0;
+
+    if (!value)
+    {
+        switch (arg->kind())
+        {
+            /*-------------------------------------------------------------
+            * Special case buffers : value can be 0 (or point to 0)
+            *------------------------------------------------------------*/
+            case Arg::Buffer:
+            case Arg::Image2D:
+            case Arg::Image3D:
+                value = &null_mem;
+                break;
+
+            default:
+                return CL_INVALID_ARG_VALUE;
+        }
+    }
+
+    /*-------------------------------------------------------------------------
+    * Copy just the data actually passed, but never more than the storage
+    * allocated for the argument (size may exceed arg_size when it was accepted
+    * through the alignment rule above). Expect LLVM to do the signext/zeroext.
+    *------------------------------------------------------------------------*/
+    arg->alloc();
+    arg->loadData(value, size < arg_size ? size : arg_size);
+
+    return CL_SUCCESS;
+}
+
+/** Number of entries in the argument list. */
+unsigned int Kernel::numArgs() const
+{
+    const unsigned int count = p_args.size();
+    return count;
+}
+
+/** Argument at \p index; the lookup uses the bounds-checked \c at(). */
+const Kernel::Arg *Kernel::arg(unsigned int index) const
+{
+    const Arg *entry = p_args.at(index);
+    return entry;
+}
+
+/** True when every argument reports \c defined() (also true for an empty list). */
+bool Kernel::argsSpecified() const
+{
+    size_t idx = 0;
+
+    // Stop at the first argument that has not been defined yet
+    while (idx < p_args.size() && p_args[idx]->defined())
+        ++idx;
+
+    return idx == p_args.size();
+}
+
+/** True once addFunction() has seen a pointer argument in the \c Arg::Local address space. */
+bool Kernel::hasLocals() const
+{
+    const bool any_local = p_has_locals;
+    return any_local;
+}
+
+/** Device kernel stored for \p device by addFunction() (0 when no entry matches). */
+DeviceKernel *Kernel::deviceDependentKernel(DeviceInterface *device) const
+{
+    return deviceDependent(device).kernel;
+}
+
+/** LLVM module stored for \p device by addFunction() (0 when no entry matches). */
+llvm::Module *Kernel::deviceDependentModule(DeviceInterface *device) const
+{
+    return deviceDependent(device).module;
+}
+
+/**
+ * \brief Answer a kernel information query
+ *
+ * \param param_name which piece of information is requested
+ * \param param_value_size size in bytes of the buffer behind \p param_value
+ * \param param_value optional destination buffer
+ * \param param_value_size_ret optional, receives the size of the answer
+ * \return \c CL_SUCCESS, or \c CL_INVALID_VALUE for an unknown \p param_name
+ *         or a destination buffer that is too small
+ */
+cl_int Kernel::info(cl_kernel_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const
+{
+ void *value = 0;
+ size_t value_length = 0;
+
+ // Scratch storage for the answer.
+ // NOTE(review): MEM_ASSIGN / SIMPLE_ASSIGN are macros defined outside this
+ // file; presumably they fill these members and set value / value_length -
+ // confirm in their definition.
+ union {
+ cl_uint cl_uint_var;
+ cl_program cl_program_var;
+ cl_context cl_context_var;
+ };
+
+ switch (param_name)
+ {
+ case CL_KERNEL_FUNCTION_NAME:
+ // size() + 1 accounts for the terminating NUL of c_str()
+ MEM_ASSIGN(p_name.size() + 1, p_name.c_str());
+ break;
+
+ case CL_KERNEL_NUM_ARGS:
+ SIMPLE_ASSIGN(cl_uint, p_args.size());
+ break;
+
+ case CL_KERNEL_REFERENCE_COUNT:
+ SIMPLE_ASSIGN(cl_uint, references());
+ break;
+
+ case CL_KERNEL_CONTEXT:
+ // parent of the kernel's parent (the kernel is constructed with its program as parent)
+ SIMPLE_ASSIGN(cl_context, parent()->parent());
+ break;
+
+ case CL_KERNEL_PROGRAM:
+ SIMPLE_ASSIGN(cl_program, parent());
+ break;
+
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ // A destination buffer was given but cannot hold the answer
+ if (param_value && param_value_size < value_length)
+ return CL_INVALID_VALUE;
+
+ if (param_value_size_ret)
+ *param_value_size_ret = value_length;
+
+ if (param_value)
+ std::memcpy(param_value, value, value_length);
+
+ return CL_SUCCESS;
+}
+
+/**
+ * \brief Work-group size required by this kernel's \c reqd_work_group_size
+ *        metadata
+ *
+ * Searches the \c opencl.kernels named metadata of \p module for the node
+ * whose function has this kernel's name.
+ *
+ * \param module LLVM module to inspect
+ * \return (x, y, z) taken from the metadata, or (0, 0, 0) when the module has
+ *         no kernel metadata, no node matches this kernel, or the node
+ *         carries no \c reqd_work_group_size entry as its second operand
+ */
+boost::tuple<uint,uint,uint> Kernel::reqdWorkGroupSize(llvm::Module *module) const
+{
+    llvm::NamedMDNode *kernels = module->getNamedMetadata("opencl.kernels");
+
+    boost::tuple<uint,uint,uint> zeros(0,0,0);
+
+    if (!kernels) return zeros;
+
+    for (unsigned int i=0; i<kernels->getNumOperands(); ++i)
+    {
+        llvm::MDNode *node = kernels->getOperand(i);
+
+        /*---------------------------------------------------------------------
+        * Each node has only one operand : a llvm::Function
+        *--------------------------------------------------------------------*/
+        llvm::Value *value = node->getOperand(0);
+
+        /*---------------------------------------------------------------------
+        * Bug somewhere, don't crash
+        *--------------------------------------------------------------------*/
+        if (!llvm::isa<llvm::Function>(value)) continue;
+
+        llvm::Function *f = llvm::cast<llvm::Function>(value);
+        if(f->getName().str() != p_name) continue;
+
+        if (node->getNumOperands() <= 1) return zeros;
+
+        llvm::MDNode *meta = llvm::cast<llvm::MDNode>(node->getOperand(1));
+        if (meta->getNumOperands() == 4 &&
+            meta->getOperand(0)->getName().str() == std::string("reqd_work_group_size"))
+        {
+            uint x = llvm::cast<llvm::ConstantInt> (meta->getOperand(1))->getValue().getLimitedValue();
+            uint y = llvm::cast<llvm::ConstantInt> (meta->getOperand(2))->getValue().getLimitedValue();
+            uint z = llvm::cast<llvm::ConstantInt> (meta->getOperand(3))->getValue().getLimitedValue();
+
+            return boost::tuple<uint,uint,uint> (x,y,z);
+        }
+        return zeros;
+    }
+
+    // No metadata node belongs to this kernel: report "no required size"
+    // instead of running off the end of a non-void function.
+    return zeros;
+}
+
+
+/**
+ * \brief Answer a per-device work-group information query
+ *
+ * \param device device the query is about; may be 0 only when the kernel is
+ *               registered for a single device
+ * \param param_name which piece of information is requested
+ * \param param_value_size size in bytes of the buffer behind \p param_value
+ * \param param_value optional destination buffer
+ * \param param_value_size_ret optional, receives the size of the answer
+ * \return \c CL_SUCCESS, \c CL_INVALID_DEVICE when no device kernel is
+ *         registered for \p device, or \c CL_INVALID_VALUE for an unknown
+ *         \p param_name or a destination buffer that is too small
+ */
+cl_int Kernel::workGroupInfo(DeviceInterface *device,
+                             cl_kernel_work_group_info param_name,
+                             size_t param_value_size,
+                             void *param_value,
+                             size_t *param_value_size_ret) const
+{
+    void *value = 0;
+    size_t value_length = 0;
+
+    union {
+        size_t size_t_var;
+        size_t three_size_t[3];
+        cl_ulong cl_ulong_var;
+    };
+
+    const DeviceDependent &dep = deviceDependent(device);
+
+    // BUG? Shouldn't we check if the kernel is associated with
+    // the default device ?
+    if (!device && p_device_dependent.size() > 1)
+        return CL_INVALID_DEVICE;
+
+    // deviceDependent() falls back to null_dep (all members 0) when the
+    // device is unknown; every query below dereferences dep.kernel or
+    // dep.module, so refuse the request here.
+    if (!dep.kernel)
+        return CL_INVALID_DEVICE;
+
+    switch (param_name)
+    {
+        case CL_KERNEL_WORK_GROUP_SIZE:
+            SIMPLE_ASSIGN(size_t, dep.kernel->workGroupSize());
+            break;
+
+        case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
+            {
+                boost::tuple<uint,uint,uint> res(reqdWorkGroupSize(dep.module));
+                three_size_t[0] = res.get<0>();
+                three_size_t[1] = res.get<1>();
+                three_size_t[2] = res.get<2>();
+                value = &three_size_t;
+                value_length = sizeof(three_size_t);
+            }
+            break;
+
+        case CL_KERNEL_LOCAL_MEM_SIZE:
+            SIMPLE_ASSIGN(cl_ulong, dep.kernel->localMemSize());
+            break;
+
+        case CL_KERNEL_PRIVATE_MEM_SIZE:
+            SIMPLE_ASSIGN(cl_ulong, dep.kernel->privateMemSize());
+            break;
+
+        case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
+            SIMPLE_ASSIGN(size_t, dep.kernel->preferredWorkGroupSizeMultiple());
+            break;
+
+        default:
+            return CL_INVALID_VALUE;
+    }
+
+    // A destination buffer was given but cannot hold the answer
+    if (param_value && param_value_size < value_length)
+        return CL_INVALID_VALUE;
+
+    if (param_value_size_ret)
+        *param_value_size_ret = value_length;
+
+    if (param_value)
+        std::memcpy(param_value, value, value_length);
+
+    return CL_SUCCESS;
+}
+
+/*
+ * Kernel::Arg
+ */
+/**
+ * Describe one kernel argument.  No storage is allocated and the argument
+ * starts out undefined.  Initializers are listed in member declaration order.
+ */
+Kernel::Arg::Arg(unsigned short vec_dim, File file, Kind kind, size_t targ_align)
+    : p_vec_dim(vec_dim),
+      p_file(file),
+      p_kind(kind),
+      p_data(0),
+      p_defined(false),
+      p_runtime_alloc(0),
+      p_targ_align(targ_align)
+{
+}
+
+/** Release the value storage obtained in alloc(), if any. */
+Kernel::Arg::~Arg()
+{
+    // std::free() accepts a null pointer, so no guard is needed
+    std::free(p_data);
+}
+
+/** Allocate zero-filled storage for vecDim() values of valueSize() bytes, once. */
+void Kernel::Arg::alloc()
+{
+    if (p_data != 0)
+        return;     // already allocated, keep the existing storage
+
+    p_data = std::calloc(p_vec_dim, valueSize());
+}
+
+/**
+ * Copy \p size bytes from \p data into the storage set up by alloc() and mark
+ * the argument as defined.  \p size must not exceed the allocated storage.
+ */
+void Kernel::Arg::loadData(const void *data, size_t size)
+{
+    assert(p_vec_dim * valueSize() >= size);
+
+    p_defined = true;
+    std::memcpy(p_data, data, size);
+}
+
+/** Record the number of bytes to allocate at kernel run time and mark the argument as defined. */
+void Kernel::Arg::setAllocAtKernelRuntime(size_t size)
+{
+    p_defined       = true;
+    p_runtime_alloc = size;
+}
+
+/** Replace the kind of this argument with \p kind. */
+void Kernel::Arg::refineKind(Kernel::Arg::Kind kind)
+{
+    this->p_kind = kind;
+}
+
+/** True when vector dimension, address space or kind differ from those of \p b. */
+bool Kernel::Arg::operator!=(const Arg &b)
+{
+    return (p_vec_dim != b.p_vec_dim) ||
+           (p_file    != b.p_file)    ||
+           (p_kind    != b.p_kind);
+}
+
+/** Size in bytes of one element of this argument, by kind; 0 for Invalid or an unknown kind. */
+size_t Kernel::Arg::valueSize() const
+{
+    size_t bytes = 0;
+
+    switch (p_kind)
+    {
+        case Int8:    bytes = 1;                break;
+        case Int16:   bytes = 2;                break;
+        case Int32:
+        case Sampler: bytes = 4;                break;
+        case Int64:   bytes = 8;                break;
+        case Float:   bytes = sizeof(cl_float); break;
+        case Double:  bytes = sizeof(double);   break;
+        case Buffer:
+        case Image2D:
+        case Image3D: bytes = sizeof(cl_mem);   break;
+        case Invalid:                           break;
+    }
+
+    return bytes;
+}
+
+/* Read accessors: each one returns the corresponding Kernel::Arg member unchanged. */
+unsigned short Kernel::Arg::vecDim() const
+{
+    return p_vec_dim;
+}
+
+Kernel::Arg::File Kernel::Arg::file() const
+{
+    return p_file;
+}
+
+Kernel::Arg::Kind Kernel::Arg::kind() const
+{
+    return p_kind;
+}
+
+size_t Kernel::Arg::targetAlignment() const
+{
+    return p_targ_align;
+}
+
+bool Kernel::Arg::defined() const
+{
+    return p_defined;
+}
+
+const void *Kernel::Arg::data() const
+{
+    return p_data;
+}
+
+size_t Kernel::Arg::allocAtKernelRuntime() const
+{
+    return p_runtime_alloc;
+}
+
+/** Address of element \p index inside the value storage (index * valueSize() bytes from its start). */
+const void *Kernel::Arg::value(unsigned short index) const
+{
+    const unsigned int byte_offset = index * valueSize();
+    const char *base = (const char *)p_data;
+
+    return (const void *)(base + byte_offset);
+}
+
diff --git a/src/core/kernel.h b/src/core/kernel.h
new file mode 100644
index 0000000..80672ea
--- /dev/null
+++ b/src/core/kernel.h
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file core/kernel.h
+ * \brief Kernel
+ */
+
+#ifndef __KERNEL_H__
+#define __KERNEL_H__
+
+#include "object.h"
+
+#include <CL/cl.h>
+
+#include <vector>
+#include <string>
+#include <boost/tuple/tuple.hpp>
+
+namespace llvm
+{
+ class Function;
+ class Module;
+}
+
+namespace Coal
+{
+
+class Program;
+class DeviceInterface;
+class DeviceKernel;
+
+/**
+ * \brief Kernel
+ *
+ * A kernel represents a LLVM function that can be run on a device. As
+ * \c Coal::Kernel objects are device-independent, they in fact represent only
+ * the name of a kernel and the arguments the application wants to pass to it,
+ * but it also contains a list of LLVM functions for each device for which its
+ * parent \c Coal::Program has been built
+ */
+class Kernel : public Object
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param program Parent \c Coal::Program
+ */
+ Kernel(Program *program);
+ ~Kernel();
+
+ /**
+ * \brief Kernel argument
+ *
+ * This class holds OpenCL-related information about the arguments of
+ * a kernel. It is also used to check that a kernel takes the same
+ * arguments on every device on which it has been built.
+ */
+ class Arg
+ {
+ public:
+ /**
+ * \brief Memory address space qualifier
+ */
+ enum File
+ {
+ Private = 0, /*!< \brief __private */
+#if 1
+ Global = 1, /*!< \brief __global */
+ Constant = 2, /*!< \brief __constant */
+ Local = 3 /*!< \brief __local */
+#else
+ /* using clang defaults */
+ Global = 0xFFFF00, /*!< \brief __global */
+ Local = 0xFFFF01, /*!< \brief __local */
+ Constant = 0xFFFF02 /*!< \brief __constant */
+#endif
+ };
+
+ /**
+ * \brief Kind of argument (its datatype)
+ */
+ enum Kind
+ {
+ Invalid, /*!< \brief Invalid argument */
+ Int8, /*!< \brief \c uchar or \c char, \c i8 in LLVM */
+ Int16, /*!< \brief \c ushort or \c short, \c i16 in LLVM */
+ Int32, /*!< \brief \c uint or \c int, \c i32 in LLVM */
+ Int64, /*!< \brief \c ulong or \c long, \c i64 in LLVM */
+ Float, /*!< \brief \c float, \c float in LLVM */
+ Double, /*!< \brief \c double, \c double in LLVM */
+ Buffer, /*!< \brief \c Coal::Buffer or \c Coal::SubBuffer, <tt>type*</tt> in LLVM */
+ Image2D, /*!< \brief \c Coal::Image2D, <tt>\%struct.image2d*</tt> in LLVM */
+ Image3D, /*!< \brief \c Coal::Image3D, <tt>\%struct.image3d*</tt> in LLVM */
+ Sampler /*!< \brief \c Coal::Sampler::bitfield(), \c i32 in LLVM, see \c Coal::Kernel::setArg() */
+ };
+
+ /**
+ * \brief Constructor
+ * \param vec_dim vector dimension of the argument, 1 if not a vector
+ * \param file \c File of the argument
+ * \param kind \c Kind of the argument
+ * \param targ_align Argument type alignment (ABI specific)
+ */
+ Arg(unsigned short vec_dim, File file, Kind kind, size_t targ_align);
+ ~Arg();
+
+ /**
+ * \brief Allocate the argument
+ *
+ * This function must be called before \c loadData(). It
+ * allocates a buffer in which the argument value can be stored.
+ *
+ * \sa valueSize()
+ */
+ void alloc();
+
+ /**
+ * \brief Load a value into the argument
+ * \note \c alloc() must have been called before this function.
+ * \sa valueSize()
+ */
+ void loadData(const void *data, size_t size);
+
+ /**
+ * \brief Set the number of bytes that must be allocated at run-time
+ *
+ * \c __local arguments don't take a value given by the host
+ * application, but take pointers allocated on the device
+ * for each work-group.
+ *
+ * This function allows to set the size of the device-allocated
+ * memory buffer used by this argument.
+ *
+ * \param size size in byte of the buffer the device has to
+ * allocate for each work-group of this kernel
+ */
+ void setAllocAtKernelRuntime(size_t size);
+
+ /**
+ * \brief Changes the \c Kind of this argument
+ * \param kind new \c Kind
+ */
+ void refineKind(Kind kind);
+
+ /**
+ * \brief Compares this argument with another
+ *
+ * They are different if their \c vec_dim, \c file or \c kind are
+ * not the same.
+ *
+ * \param b other argument to compare
+ * \return true if this argument doesn't match \p b
+ */
+ bool operator !=(const Arg &b);
+
+ /**
+ * \brief Size of a field of this arg
+ *
+ * This function returns the size of this argument based on its
+ * \c Kind
+ *
+ * \note This size is not multiplied by \c vecDim(), you must do
+ * this by yourself to find the total space taken by this
+ * arg.
+ * \return the size of this argument, in bytes, without any padding
+ */
+ size_t valueSize() const;
+ unsigned short vecDim() const; /*!< \brief Vector dimension */
+ File file() const; /*!< \brief File */
+ Kind kind() const; /*!< \brief Kind */
+ bool defined() const; /*!< \brief Has the value of this argument already been loaded by the host application ? */
+ size_t targetAlignment() const; /*!< \brief Get alignment (bytes) of arg type */
+ size_t allocAtKernelRuntime() const; /*!< \brief Size of the \c __local buffer to allocate at kernel runtime */
+ const void *value(unsigned short index) const; /*!< \brief Pointer to the value of this argument, for the \p index vector element */
+ const void *data() const; /*!< \brief Pointer to the data of this arg, equivalent to <tt>value(0)</tt> */
+
+ private:
+ unsigned short p_vec_dim;
+ File p_file;
+ Kind p_kind;
+ void *p_data;
+ bool p_defined;
+ size_t p_runtime_alloc;
+ size_t p_targ_align;
+ };
+
+ /**
+ * \brief Add a \c llvm::Function to this kernel
+ *
+ * This function adds a \c llvm::Function to this kernel for the
+ * specified \p device. It also has the responsibility to find the
+ * \c Arg::Kind of each of the function's arguments.
+ *
+ * LLVM provides a \c llvm::Type for each argument:
+ *
+ * - If it is a pointer, the kind of the argument is \c Arg::Buffer and
+ * its field is a simple cast from a LLVM \c addrspace to \c Arg::File.
+ * - If it is a pointer to a struct whose name is either
+ * <tt>\%struct.image2d</tt> or <tt>\%struct.image3d</tt>, kind is set
+ * to \c Arg::Image2D or \c Arg::Image3D, respectively.
+ * - If it is a vector, \c vec_dim is set to the vector size, and the
+ * rest of the computations are done on the element type
+ * - Then we translate the LLVM type to an \c Arg::Kind. For instance,
+ * \c i32 becomes \c Arg::Int32
+ *
+ * Samplers aren't detected at this stage because they are plain \c i32
+ * types on the LLVM side. They are detected in \c setArg() when the
+ * value being set to the argument appears to be a \c Coal::Sampler.
+ *
+ * \param device device for which the function is added
+ * \param function function to add
+ * \param module LLVM module of this function
+ */
+ cl_int addFunction(DeviceInterface *device, llvm::Function *function,
+ llvm::Module *module);
+
+ /**
+ * \brief Get the LLVM function for a specified \p device
+ * \param device the device for which a LLVM function is needed
+ * \return the LLVM function for the given \p device
+ */
+ llvm::Function *function(DeviceInterface *device) const;
+
+ /**
+ * \brief Set the value of an argument
+ *
+ * See the constructor's documentation for a note on the
+ * \c Coal::Sampler objects
+ *
+ * \param index index of the argument
+ * \param size size of the value being stored in the argument, must match
+ * <tt>Arg::valueSize() * Arg::vecDim()</tt>
+ * \param value pointer to the data that will be copied in the argument
+ * \return \c CL_SUCCESS if success, an error code otherwise
+ */
+ cl_int setArg(cl_uint index, size_t size, const void *value);
+
+ unsigned int numArgs() const; /*!< \brief Number of arguments of this kernel */
+ const Arg *arg(unsigned int index) const; /*!< \brief \c Arg at the given \p index */
+
+ /*! \brief \c Coal::DeviceKernel for the specified \p device */
+ DeviceKernel *deviceDependentKernel(DeviceInterface *device) const;
+ llvm::Module *deviceDependentModule(DeviceInterface *device) const;
+
+ bool argsSpecified() const; /*!< \brief true if all the arguments have been set through \c setArg() */
+ bool hasLocals() const; /*!< \brief true if one or more argument is in file \c Arg::Local */
+
+ /**
+ * \brief Get information about this kernel
+ * \copydetails Coal::DeviceInterface::info
+ */
+ cl_int info(cl_kernel_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+
+ /**
+ * \brief Get performance hints and device-specific data about this kernel
+ * \copydetails Coal::DeviceInterface::info
+ * \param device \c Coal::DeviceInterface on which the kernel will be run
+ */
+ cl_int workGroupInfo(DeviceInterface *device,
+ cl_kernel_work_group_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ boost::tuple<uint,uint,uint> reqdWorkGroupSize(llvm::Module *module) const;
+
+ int get_wi_alloca_size() { return wi_alloca_size; }
+
+ std::string p_name;
+ private:
+ bool p_has_locals;
+ int wi_alloca_size;
+
+ struct DeviceDependent
+ {
+ DeviceInterface *device;
+ DeviceKernel *kernel;
+ llvm::Function *function;
+ llvm::Module *module;
+ };
+
+ std::vector<DeviceDependent> p_device_dependent;
+ std::vector<Arg *> p_args;
+ DeviceDependent null_dep;
+
+ const DeviceDependent &deviceDependent(DeviceInterface *device) const;
+ DeviceDependent &deviceDependent(DeviceInterface *device);
+
+};
+
+}
+
+/**
+ * \brief Definition of the \c _cl_kernel struct type
+ *
+ * Adds no member of its own: it only derives publicly from \c Coal::Kernel.
+ * NOTE(review): presumably this is what lets a \c cl_kernel handle be used as
+ * a \c Coal::Kernel pointer - confirm against the typedef in CL/cl.h.
+ */
+struct _cl_kernel : public Coal::Kernel
+{};
+
+#endif
diff --git a/src/core/memobject.cpp b/src/core/memobject.cpp
new file mode 100644
index 0000000..5501ac1
--- /dev/null
+++ b/src/core/memobject.cpp
@@ -0,0 +1,960 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file memobject.cpp
+ * \brief Memory objects
+ */
+
+#include "CL/cl_ext.h"
+#include "memobject.h"
+#include "context.h"
+#include "deviceinterface.h"
+#include "propertylist.h"
+#include "events.h"
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+using namespace Coal;
+
+/*
+ * MemObject
+ */
+
+/**
+ * Store the creation parameters and validate them.
+ *
+ * \p errcode_ret is only written when a check fails; it is left untouched
+ * on success:
+ *  - \c CL_INVALID_VALUE if more than one of READ_WRITE / READ_ONLY /
+ *    WRITE_ONLY is set, if an unknown flag bit is set, or if USE_HOST_PTR
+ *    is combined with ALLOC_HOST_PTR or COPY_HOST_PTR
+ *  - \c CL_INVALID_HOST_PTR if \p host_ptr is null although USE_HOST_PTR or
+ *    COPY_HOST_PTR is set, or non-null although neither is set
+ */
+MemObject::MemObject(Context *ctx, cl_mem_flags flags, void *host_ptr,
+                     cl_int *errcode_ret)
+: Object(Object::T_MemObject, ctx), p_num_devices(0), p_flags(flags),
+  p_host_ptr(host_ptr), p_devicebuffers(0), p_dtor_callback_stack()
+{
+    // Every flag bit accepted by this implementation
+    const cl_mem_flags known_flags = CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY |
+                                     CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR |
+                                     CL_MEM_ALLOC_HOST_PTR |
+                                     CL_MEM_COPY_HOST_PTR |
+                                     CL_MEM_USE_MSMC_TI;
+
+    // The three access qualifiers are mutually exclusive
+    const int access_qualifiers = ((flags & CL_MEM_READ_WRITE) ? 1 : 0) +
+                                  ((flags & CL_MEM_READ_ONLY)  ? 1 : 0) +
+                                  ((flags & CL_MEM_WRITE_ONLY) ? 1 : 0);
+
+    // USE_HOST_PTR excludes both ALLOC_HOST_PTR and COPY_HOST_PTR
+    const bool use_host_ptr_conflict =
+        (flags & CL_MEM_USE_HOST_PTR) &&
+        (flags & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR));
+
+    if (access_qualifiers > 1 ||
+        (flags & ~known_flags) != 0 ||
+        use_host_ptr_conflict)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // host_ptr must be given exactly when a flag asks for it
+    const bool host_ptr_expected =
+        (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) != 0;
+    const bool host_ptr_given = (host_ptr != 0);
+
+    if (host_ptr_expected != host_ptr_given)
+        *errcode_ret = CL_INVALID_HOST_PTR;
+}
+
+/**
+ * Run the registered destructor callbacks, then delete the per-device
+ * buffers and the table that holds them.
+ */
+MemObject::~MemObject()
+{
+    // Pop every registered callback and invoke it with its user data
+    dtor_callback_t entry;
+
+    while (!p_dtor_callback_stack.empty())
+    {
+        if (!p_dtor_callback_stack.pop(entry))
+            continue;
+
+        entry.first((cl_mem)this, entry.second);
+    }
+
+    // Nothing more to release if the table was never allocated
+    if (!p_devicebuffers)
+        return;
+
+    // Also delete our children in the device
+    for (unsigned int dev = 0; dev != p_num_devices; ++dev)
+        delete p_devicebuffers[dev];
+
+    std::free((void *)p_devicebuffers);
+}
+
+/**
+ * Create one \c DeviceBuffer per device of the parent context.
+ *
+ * Steps:
+ *  - query the device count and device list from the parent \c Context
+ *  - allocate the \c p_devicebuffers table (every entry starts as null, so
+ *    the destructor is safe after any early return)
+ *  - with several devices and \c CL_MEM_COPY_HOST_PTR (sub-buffers
+ *    excepted), take a private copy of the host data
+ *  - ask each device for its \c DeviceBuffer; a rejecting device gets a
+ *    null entry
+ *  - with exactly one device, allocate its buffer immediately
+ *
+ * \return \c CL_SUCCESS, \c CL_OUT_OF_HOST_MEMORY, the status of the
+ *         context query, the last device status if every device rejected
+ *         the buffer, or \c CL_MEM_OBJECT_ALLOCATION_FAILURE
+ */
+cl_int MemObject::init()
+{
+    // Get the device list of the context
+    DeviceInterface **devices = 0;
+    cl_int rs;
+
+    rs = ((Context *)parent())->info(CL_CONTEXT_NUM_DEVICES,
+                                     sizeof(unsigned int),
+                                     &p_num_devices, 0);
+
+    if (rs != CL_SUCCESS)
+        return rs;
+
+    p_devices_to_allocate = p_num_devices;
+    devices = (DeviceInterface **)std::malloc(p_num_devices *
+                                              sizeof(DeviceInterface *));
+
+    if (!devices)
+        return CL_OUT_OF_HOST_MEMORY;
+
+    rs = ((Context *)parent())->info(CL_CONTEXT_DEVICES,
+                                     p_num_devices * sizeof(DeviceInterface *),
+                                     devices, 0);
+
+    if (rs != CL_SUCCESS)
+    {
+        std::free((void *)devices);
+        return rs;
+    }
+
+    // Allocate a table of DeviceBuffers
+    p_devicebuffers = (DeviceBuffer **)std::malloc(p_num_devices *
+                                                   sizeof(DeviceBuffer *));
+
+    if (!p_devicebuffers)
+    {
+        std::free((void *)devices);
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+
+    // The destructor deletes every entry of the table, so no entry may be
+    // left uninitialized if we return before the creation loop below.
+    for (unsigned int i=0; i<p_num_devices; ++i)
+        p_devicebuffers[i] = 0;
+
+    // If we have more than one device, the allocation on the devices is
+    // deferred to first use, so host_ptr can become invalid. So, copy it in
+    // a RAM location and keep it. Also, set a flag telling CPU devices that
+    // they don't need to reallocate and re-copy host_ptr
+    // SubBuffer should simply reuse Buffer data
+    if (p_num_devices > 1 && (p_flags & CL_MEM_COPY_HOST_PTR)
+        && type() != SubBuffer)
+    {
+        void *tmp_hostptr = std::malloc(size());
+
+        if (!tmp_hostptr)
+        {
+            std::free((void *)devices);
+            return CL_OUT_OF_HOST_MEMORY;
+        }
+
+        std::memcpy(tmp_hostptr, p_host_ptr, size());
+
+        p_host_ptr = tmp_hostptr;
+        // Now, the client application can safely std::free() its host_ptr
+    }
+
+    // Create a DeviceBuffer for each device
+    unsigned int failed_devices = 0;
+
+    for (unsigned int i=0; i<p_num_devices; ++i)
+    {
+        DeviceInterface *device = devices[i];
+
+        rs = CL_SUCCESS;
+        p_devicebuffers[i] = device->createDeviceBuffer(this, &rs);
+
+        if (rs != CL_SUCCESS)
+        {
+            // This table owns what createDeviceBuffer() returned (see the
+            // destructor): release it instead of dropping the pointer.
+            delete p_devicebuffers[i];
+            p_devicebuffers[i] = 0;
+            failed_devices++;
+        }
+    }
+
+    std::free((void *)devices);
+    devices = 0;
+
+    if (failed_devices == p_num_devices)
+    {
+        // Each device found a reason to reject the buffer, so it's invalid
+        return rs;
+    }
+
+    // If we have only one device, already allocate the buffer
+    if (p_num_devices == 1)
+    {
+        if (!p_devicebuffers[0]->allocate())
+            return CL_MEM_OBJECT_ALLOCATION_FAILURE;
+    }
+
+    return CL_SUCCESS;
+}
+
+/**
+ * Make sure this memory object is allocated on \p device.
+ *
+ * \param device device on which the buffer must be available
+ * \return true if the buffer was already allocated or has just been
+ *         allocated; false if no device buffer exists for \p device or if
+ *         the allocation failed
+ */
+bool MemObject::allocate(DeviceInterface *device)
+{
+    DeviceBuffer *buffer = deviceBuffer(device);
+
+    // deviceBuffer() returns 0 when it finds no buffer for this device
+    if (!buffer)
+        return false;
+
+    if (!buffer->allocated())
+        return buffer->allocate();
+
+    return true;
+}
+
+/** Return the memory flags stored in \c p_flags. */
+cl_mem_flags MemObject::flags() const
+{
+ return p_flags;
+}
+
+/**
+ * Host pointer of this memory object.
+ *
+ * A sub-buffer has no pointer of its own: its value is the parent's host
+ * pointer advanced by the sub-buffer offset, or 0 if the parent has none.
+ */
+void *MemObject::host_ptr() const
+{
+    if (type() != SubBuffer)
+        return p_host_ptr;
+
+    const class SubBuffer *self = (const class SubBuffer *)this;
+    char *parent_base = (char *)self->parent()->host_ptr();
+
+    return parent_base ? (void *)(parent_base + self->offset()) : 0;
+}
+
+/**
+ * Find the \c DeviceBuffer created for \p device.
+ *
+ * \param device device whose buffer is wanted
+ * \return the matching buffer, or 0 if the table was never allocated, if
+ *         \p device is not one of this object's devices, or if that device
+ *         rejected the buffer during \c init()
+ */
+DeviceBuffer *MemObject::deviceBuffer(DeviceInterface *device) const
+{
+    // The table only exists once init() got far enough to allocate it
+    if (!p_devicebuffers)
+        return 0;
+
+    for (unsigned int i=0; i<p_num_devices; ++i)
+    {
+        DeviceBuffer *candidate = p_devicebuffers[i];
+
+        // init() stores 0 for devices that rejected the buffer: skip them
+        if (candidate && candidate->device() == device)
+            return candidate;
+    }
+
+    return 0;
+}
+
+/**
+ * Record that one more device has allocated its buffer.
+ *
+ * When the last pending device is done, the private copy of the host data
+ * that \c init() takes for multi-device \c CL_MEM_COPY_HOST_PTR objects is
+ * released.
+ */
+void MemObject::deviceAllocated(DeviceBuffer *buffer)
+{
+    (void) buffer;
+
+    // One device less to wait for; nothing to release until it was the last
+    if (--p_devices_to_allocate != 0)
+        return;
+
+    const bool multi_device = p_num_devices > 1;
+
+    if (multi_device && (p_flags & CL_MEM_COPY_HOST_PTR))
+    {
+        // The copied host_ptr is not needed any more
+        std::free(p_host_ptr);
+        p_host_ptr = 0;
+    }
+}
+
+/**
+ * Register a destructor callback.
+ *
+ * The (function, user data) pair is pushed on \c p_dtor_callback_stack; the
+ * destructor pops and invokes every pair that was pushed.
+ *
+ * \param pfn_notify function to call when the memory object is destroyed
+ * \param user_data pointer passed back to \p pfn_notify
+ */
+void MemObject::setDestructorCallback(void (CL_CALLBACK *pfn_notify)
+ (cl_mem memobj, void *user_data),
+ void *user_data)
+{
+ p_dtor_callback_stack.push(dtor_callback_t(pfn_notify, user_data));
+}
+
+// HACK for the union
+typedef void * void_p;
+
+/**
+ * Answer a \c clGetMemObjectInfo style query.
+ *
+ * The requested value is staged in an anonymous union, then copied to
+ * \p param_value. NOTE(review): \c SIMPLE_ASSIGN is defined outside this
+ * section; it is assumed to store its second argument in the union member
+ * named after its first argument and to set \c value / \c value_length.
+ *
+ * \param param_name information to get
+ * \param param_value_size size in bytes of \p param_value
+ * \param param_value destination of the value, may be null
+ * \param param_value_size_ret receives the size of the value, may be null
+ * \return \c CL_SUCCESS, or \c CL_INVALID_VALUE if \p param_name is unknown
+ *         or \p param_value is too small
+ */
+cl_int MemObject::info(cl_mem_info param_name,
+                       size_t param_value_size,
+                       void *param_value,
+                       size_t *param_value_size_ret) const
+{
+    void *value = 0;
+    size_t value_length = 0;
+    class SubBuffer *subbuf = (class SubBuffer *)this;
+
+    union {
+        cl_mem_object_type cl_mem_object_type_var;
+        cl_mem_flags cl_mem_flags_var;
+        size_t size_t_var;
+        void_p void_p_var;
+        cl_uint cl_uint_var;
+        cl_context cl_context_var;
+        cl_mem cl_mem_var;
+    };
+
+    switch (param_name)
+    {
+        case CL_MEM_TYPE:
+            switch (type())
+            {
+                case Buffer:
+                case SubBuffer:
+                    cl_mem_object_type_var = CL_MEM_OBJECT_BUFFER;
+                    break;
+
+                case Image2D:
+                    cl_mem_object_type_var = CL_MEM_OBJECT_IMAGE2D;
+                    break;
+
+                case Image3D:
+                    cl_mem_object_type_var = CL_MEM_OBJECT_IMAGE3D;
+                    break;
+            }
+            value = (void *)&cl_mem_object_type_var;
+            value_length = sizeof(cl_mem_object_type);
+            break;
+
+        case CL_MEM_FLAGS:
+            SIMPLE_ASSIGN(cl_mem_flags, p_flags);
+            break;
+
+        case CL_MEM_SIZE:
+            SIMPLE_ASSIGN(size_t, size());
+            break;
+
+        case CL_MEM_HOST_PTR:
+            SIMPLE_ASSIGN(void_p, host_ptr());
+            break;
+
+        case CL_MEM_MAP_COUNT:
+            SIMPLE_ASSIGN(cl_uint, 0); // TODO
+            break;
+
+        case CL_MEM_REFERENCE_COUNT:
+            SIMPLE_ASSIGN(cl_uint, references());
+            break;
+
+        case CL_MEM_CONTEXT:
+            SIMPLE_ASSIGN(cl_context, parent());
+            break;
+
+        case CL_MEM_ASSOCIATED_MEMOBJECT:
+            // Only a sub-buffer has an associated (parent) memory object
+            if (type() != SubBuffer)
+            {
+                SIMPLE_ASSIGN(cl_mem, 0);
+            }
+            else
+            {
+                SIMPLE_ASSIGN(cl_mem, subbuf->parent());
+            }
+            break;
+
+        case CL_MEM_OFFSET:
+            // The offset is a size_t, not a cl_mem handle: report it with
+            // the right type and length. It is 0 for anything that is not
+            // a sub-buffer.
+            if (type() != SubBuffer)
+            {
+                SIMPLE_ASSIGN(size_t, 0);
+            }
+            else
+            {
+                SIMPLE_ASSIGN(size_t, subbuf->offset());
+            }
+            break;
+
+        default:
+            return CL_INVALID_VALUE;
+    }
+
+    if (param_value && param_value_size < value_length)
+        return CL_INVALID_VALUE;
+
+    if (param_value_size_ret)
+        *param_value_size_ret = value_length;
+
+    if (param_value)
+        std::memcpy(param_value, value, value_length);
+
+    return CL_SUCCESS;
+}
+
+/*
+ * Buffer
+ */
+
+/**
+ * Build a buffer of \p size bytes.
+ *
+ * Sets \c CL_INVALID_BUFFER_SIZE in \p errcode_ret when \p size is 0 or
+ * above the build-dependent limit. When neither READ_ONLY nor WRITE_ONLY
+ * is requested, READ_WRITE is added to the stored flags.
+ */
+Buffer::Buffer(Context *ctx, size_t size, void *host_ptr, cl_mem_flags flags,
+               cl_int *errcode_ret)
+: MemObject(ctx, flags, host_ptr, errcode_ret), p_size(size)
+{
+    // Largest accepted size: 512 MiB on ARM builds, 1 GiB otherwise
+#if defined(__arm__)
+    const size_t size_limit = 512*1024*1024;
+#else
+    const size_t size_limit = 1*1024*1024*1024;
+#endif
+
+    if (size == 0 || size > size_limit)
+    {
+        *errcode_ret = CL_INVALID_BUFFER_SIZE;
+        return;
+    }
+
+    // CL_MEM_READ_WRITE is default if not specified {READ,WRITE}_ONLY
+    const cl_mem_flags one_way_access = CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY;
+
+    if ((flags & one_way_access) == 0)
+        p_flags |= CL_MEM_READ_WRITE;
+}
+
+/** Return the size given at construction (\c p_size). */
+size_t Buffer::size() const
+{
+ return p_size;
+}
+
+/** Identify this object as a \c MemObject::Buffer. */
+MemObject::Type Buffer::type() const
+{
+ return MemObject::Buffer;
+}
+
+/*----------------------------------------------------------------------------
+ * mapped_event: MapBufferEvent when the Map is on a Buffer
+ * RETURN: true if successful, false if fail
+ * Traverse the currently mapped event list. If the new mapping overlaps an
+ * existing one and either of them is a WRITE mapping, fail. Otherwise insert
+ * the event so the list stays in increasing order of offset (offsets are
+ * relative to this Buffer, i.e. a SubBuffer's own offset is added).
+ * TODO: do we need to lock the list for operation???
+ *---------------------------------------------------------------------------*/
+bool Buffer::addMapEvent(BufferEvent *mapped_event)
+{
+    MapBufferEvent *mbe = (MapBufferEvent *) mapped_event;
+    size_t mbe_offset = mbe->offset();
+    if (mbe->buffer()->type() == SubBuffer)
+        mbe_offset += ((class SubBuffer *) mbe->buffer())->offset();
+
+    std::list<BufferEvent *>::iterator it, it_insert = p_mapped_events.end();
+    for (it = p_mapped_events.begin(); it != p_mapped_events.end(); ++it)
+    {
+        MapBufferEvent *e = (MapBufferEvent *) (*it);
+        size_t e_offset = e->offset();
+        if (e->buffer()->type() == SubBuffer)
+            e_offset += ((class SubBuffer *) e->buffer())->offset();
+
+        // Keep only the FIRST element with a larger offset as insertion
+        // point. Overwriting it on every larger element would insert the
+        // event before the last one and break the ordering.
+        if (it_insert == p_mapped_events.end() && mbe_offset < e_offset)
+            it_insert = it;
+
+        // Overlapping ranges are only allowed when both are read mappings
+        if (   mbe_offset <= e_offset + e->cb() - 1
+            && e_offset <= mbe_offset + mbe->cb() - 1)
+            if ((mbe->flags() & CL_MAP_WRITE) ||
+                (e->flags() & CL_MAP_WRITE))
+                return false;
+    }
+
+    p_mapped_events.insert(it_insert, mapped_event);
+    return true;
+}
+
+/*----------------------------------------------------------------------------
+ * mapped_ptr: pointer handed out by an earlier map operation
+ * RETURN: the first event in the mapped list whose ptr() equals mapped_ptr,
+ *         after removing it from the list; NULL if there is none
+ * TODO: do we need to lock the list for operation???
+ *---------------------------------------------------------------------------*/
+BufferEvent* Buffer::removeMapEvent(void *mapped_ptr)
+{
+    std::list<BufferEvent *>::iterator pos = p_mapped_events.begin();
+
+    while (pos != p_mapped_events.end())
+    {
+        MapBufferEvent *candidate = (MapBufferEvent *) (*pos);
+
+        if (candidate->ptr() == mapped_ptr)
+        {
+            p_mapped_events.erase(pos);
+            return candidate;
+        }
+
+        ++pos;
+    }
+
+    return NULL;
+}
+
+/*
+ * SubBuffer
+ */
+
+/**
+ * Build a sub-buffer covering <tt>[offset, offset + size)</tt> of \p parent.
+ *
+ * The parent is retained here and released by the destructor, whatever the
+ * outcome of the checks. On failure \p errcode_ret receives:
+ *  - \c CL_INVALID_BUFFER_SIZE if \p size is 0 or the range does not fit in
+ *    the parent
+ *  - \c CL_INVALID_VALUE if a host-pointer flag is given, or if the access
+ *    flags conflict with a READ_ONLY / WRITE_ONLY parent
+ *
+ * On success, access flags are inherited from the parent when none was
+ * given, and the parent's host-pointer flags are always inherited.
+ */
+SubBuffer::SubBuffer(class Buffer *parent, size_t offset, size_t size,
+                     cl_mem_flags flags, cl_int *errcode_ret)
+: MemObject((Context *)parent->parent(), flags, 0, errcode_ret), p_offset(offset),
+  p_size(size), p_parent(parent)
+{
+    clRetainMemObject((cl_mem) p_parent);
+
+    if (size == 0)
+    {
+        *errcode_ret = CL_INVALID_BUFFER_SIZE;
+        return;
+    }
+
+    // Written so that "offset + size" cannot wrap around and slip through
+    // the range check with a huge offset or size.
+    const size_t parent_size = parent->size();
+
+    if (offset > parent_size || size > parent_size - offset)
+    {
+        *errcode_ret = CL_INVALID_BUFFER_SIZE;
+        return;
+    }
+
+    // Check the compatibility of flags and parent->flags()
+    const cl_mem_flags wrong_flags =
+        CL_MEM_ALLOC_HOST_PTR |
+        CL_MEM_USE_HOST_PTR |
+        CL_MEM_COPY_HOST_PTR;
+
+    if (flags & wrong_flags)
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    if ((parent->flags() & CL_MEM_WRITE_ONLY) &&
+        (flags & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY)))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    if ((parent->flags() & CL_MEM_READ_ONLY) &&
+        (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY)))
+    {
+        *errcode_ret = CL_INVALID_VALUE;
+        return;
+    }
+
+    // OpenCL 1.2: SubBuffer should inherit some of parent Buffer flags
+    cl_mem_flags parent_rw_flags = parent->flags()
+        & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY);
+    cl_mem_flags my_rw_flags = p_flags
+        & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY);
+    // parent be READ_WRITE, subBuffer be READ_ONLY/WRITE_ONLY (Spec allows)
+    if (! my_rw_flags) p_flags |= parent_rw_flags;
+    cl_mem_flags parent_hostptr_flags = parent->flags()
+        & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR);
+    if (parent_hostptr_flags) p_flags |= parent_hostptr_flags;
+}
+
+/** Release the reference on the parent buffer taken by the constructor. */
+SubBuffer::~SubBuffer()
+{
+ clReleaseMemObject((cl_mem) p_parent);
+}
+
+/** Return the size of the sub-buffer given at construction (\c p_size). */
+size_t SubBuffer::size() const
+{
+ return p_size;
+}
+
+/** Identify this object as a \c MemObject::SubBuffer. */
+MemObject::Type SubBuffer::type() const
+{
+ return MemObject::SubBuffer;
+}
+
+/**
+ * Allocate on \p device by delegating to the parent buffer.
+ * \return the result of the parent's \c allocate()
+ */
+bool SubBuffer::allocate(DeviceInterface *device)
+{
+ // SubBuffer always use Buffer's data
+ return p_parent->allocate(device);
+}
+
+/** Return the offset given at construction (\c p_offset). */
+size_t SubBuffer::offset() const
+{
+ return p_offset;
+}
+
+/** Return the parent buffer given at construction (\c p_parent). */
+Buffer *SubBuffer::parent() const
+{
+ return p_parent;
+}
+
+/** Forward the map event to the parent buffer and return its result. */
+bool SubBuffer::addMapEvent(BufferEvent *mapped_event)
+{
+ return p_parent->addMapEvent(mapped_event);
+}
+
+/** Forward the removal to the parent buffer and return its result. */
+BufferEvent* SubBuffer::removeMapEvent(void *mapped_ptr)
+{
+ return p_parent->removeMapEvent(mapped_ptr);
+}
+
+/*
+ * Image2D
+ */
+
+/**
+ * Build a 2D image and validate its size, format and row pitch.
+ *
+ * On failure \p errcode_ret receives:
+ *  - \c CL_INVALID_IMAGE_SIZE if \p width or \p height is 0, or if
+ *    \p row_pitch is non-zero while \p host_ptr is null, smaller than one
+ *    row of pixels, or not a multiple of the pixel size
+ *  - \c CL_INVALID_IMAGE_FORMAT_DESCRIPTOR if \p format is null, if the
+ *    channel order / data type combination is not accepted, or if the
+ *    format has a pixel size of 0
+ *
+ * The stored row pitch is \p row_pitch when it is non-zero and valid,
+ * <tt>width * pixel_size()</tt> otherwise.
+ */
+Image2D::Image2D(Context *ctx, size_t width, size_t height, size_t row_pitch,
+                 const cl_image_format *format, void *host_ptr,
+                 cl_mem_flags flags, cl_int *errcode_ret)
+: MemObject(ctx, flags, host_ptr, errcode_ret),
+  p_width(width), p_height(height), p_row_pitch(row_pitch)
+{
+    if (!width || !height)
+    {
+        *errcode_ret = CL_INVALID_IMAGE_SIZE;
+        return;
+    }
+
+    if (!format)
+    {
+        *errcode_ret = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+        return;
+    }
+
+    p_format = *format;
+
+    // Check format descriptor: packed data types require an RGB or RGBx
+    // channel order. The test must be "neither RGB nor RGBx": with "||" it
+    // is always true and every packed format would be rejected.
+    switch (p_format.image_channel_data_type)
+    {
+        case CL_UNORM_INT_101010:
+        case CL_UNORM_SHORT_555:
+        case CL_UNORM_SHORT_565:
+            if (p_format.image_channel_order != CL_RGB &&
+                p_format.image_channel_order != CL_RGBx)
+            {
+                *errcode_ret = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+                return;
+            }
+    }
+
+    // Conversely, some channel orders only accept some data types
+    switch (p_format.image_channel_order)
+    {
+        case CL_LUMINANCE:
+        case CL_INTENSITY:
+            switch (p_format.image_channel_data_type)
+            {
+                case CL_UNORM_INT8:
+                case CL_UNORM_INT16:
+                case CL_SNORM_INT8:
+                case CL_SNORM_INT16:
+                case CL_HALF_FLOAT:
+                case CL_FLOAT:
+                    break;
+                default:
+                    *errcode_ret = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+                    return;
+            }
+            break;
+
+        case CL_RGB:
+        case CL_RGBx:
+            switch (p_format.image_channel_data_type)
+            {
+                case CL_UNORM_SHORT_555:
+                case CL_UNORM_SHORT_565:
+                case CL_UNORM_INT_101010:
+                    break;
+                default:
+                    *errcode_ret = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+                    return;
+            }
+            break;
+
+        case CL_ARGB:
+        case CL_BGRA:
+            switch (p_format.image_channel_data_type)
+            {
+                case CL_UNORM_INT8:
+                case CL_SNORM_INT8:
+                case CL_SIGNED_INT8:
+                case CL_UNSIGNED_INT8:
+                    break;
+                default:
+                    *errcode_ret = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+                    return;
+            }
+            break;
+    }
+
+    // pixel_size() is 0 when the channel order or the data type is not a
+    // known value. Reject such a format here: it would give a zero-sized
+    // image and a division by zero in the row pitch check below.
+    const size_t bytes_per_pixel = pixel_size(p_format);
+
+    if (bytes_per_pixel == 0)
+    {
+        *errcode_ret = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+        return;
+    }
+
+    // Row pitch
+    p_row_pitch = width * bytes_per_pixel;
+
+    if (row_pitch)
+    {
+        if (!host_ptr)
+        {
+            // row_pitch must be 0 if host_ptr is null
+            *errcode_ret = CL_INVALID_IMAGE_SIZE;
+            return;
+        }
+        if (row_pitch < p_row_pitch)
+        {
+            *errcode_ret = CL_INVALID_IMAGE_SIZE;
+            return;
+        }
+        if (row_pitch % bytes_per_pixel != 0)
+        {
+            *errcode_ret = CL_INVALID_IMAGE_SIZE;
+            return;
+        }
+
+        p_row_pitch = row_pitch;
+    }
+}
+
+/** Size of the image in bytes: <tt>height() * row_pitch()</tt>. */
+size_t Image2D::size() const
+{
+ return height() * row_pitch();
+}
+
+/** Identify this object as a \c MemObject::Image2D. */
+MemObject::Type Image2D::type() const
+{
+ return MemObject::Image2D;
+}
+
+/** Return the width given at construction (\c p_width). */
+size_t Image2D::width() const
+{
+ return p_width;
+}
+
+/** Return the height given at construction (\c p_height). */
+size_t Image2D::height() const
+{
+ return p_height;
+}
+
+/** Return the row pitch in bytes stored in \c p_row_pitch. */
+size_t Image2D::row_pitch() const
+{
+ return p_row_pitch;
+}
+
+/** Slice pitch of a 2D image: the size of the whole image. */
+size_t Image2D::slice_pitch() const
+{
+ // An Image2D is made of only one slice
+ return size();
+}
+
+/** Return a reference to the stored image format (\c p_format). */
+const cl_image_format &Image2D::format() const
+{
+ return p_format;
+}
+
+/**
+ * Answer a \c clGetImageInfo style query.
+ *
+ * The requested value is staged in an anonymous union, then copied to
+ * \p param_value. NOTE(review): \c SIMPLE_ASSIGN is defined outside this
+ * section; it is assumed to fill the union member named after its first
+ * argument and to set \c value / \c value_length.
+ *
+ * \param param_name information to get
+ * \param param_value_size size in bytes of \p param_value
+ * \param param_value destination of the value, may be null
+ * \param param_value_size_ret receives the size of the value, may be null
+ * \return \c CL_SUCCESS, or \c CL_INVALID_VALUE if \p param_name is unknown
+ *         or \p param_value is too small
+ */
+cl_int Image2D::imageInfo(cl_image_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const
+{
+ void *value = 0;
+ size_t value_length = 0;
+ // Only dereferenced below after checking that type() is Image3D
+ class Image3D *image3D = (class Image3D *)this;
+
+ union {
+ cl_image_format cl_image_format_var;
+ size_t size_t_var;
+ };
+
+ switch (param_name)
+ {
+ case CL_IMAGE_FORMAT:
+ SIMPLE_ASSIGN(cl_image_format, format());
+ break;
+
+ case CL_IMAGE_ELEMENT_SIZE:
+ SIMPLE_ASSIGN(size_t, element_size(p_format));
+ break;
+
+ case CL_IMAGE_ROW_PITCH:
+ // TODO: What was given when the image was created or width*size ?
+ SIMPLE_ASSIGN(size_t, row_pitch());
+ break;
+
+ case CL_IMAGE_SLICE_PITCH:
+ // Reported as 0 for anything that is not a 3D image
+ if (type() == Image3D)
+ SIMPLE_ASSIGN(size_t, image3D->slice_pitch())
+ else
+ SIMPLE_ASSIGN(size_t, 0);
+ break;
+
+ case CL_IMAGE_WIDTH:
+ SIMPLE_ASSIGN(size_t, width());
+ break;
+
+ case CL_IMAGE_HEIGHT:
+ SIMPLE_ASSIGN(size_t, height());
+ break;
+
+ case CL_IMAGE_DEPTH:
+ // Reported as 0 for anything that is not a 3D image
+ if (type() == Image3D)
+ SIMPLE_ASSIGN(size_t, image3D->depth())
+ else
+ SIMPLE_ASSIGN(size_t, 0);
+ break;
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ // A destination buffer was given but it is too small for the value
+ if (param_value && param_value_size < value_length)
+ return CL_INVALID_VALUE;
+
+ if (param_value_size_ret)
+ *param_value_size_ret = value_length;
+
+ if (param_value)
+ std::memcpy(param_value, value, value_length);
+
+ return CL_SUCCESS;
+}
+
+/**
+ * Size in bytes of one element of the given channel data type.
+ *
+ * \param format image format to inspect
+ * \return 1, 2 or 4 for the integer, normalized, half-float and packed
+ *         types, <tt>sizeof(float)</tt> for \c CL_FLOAT, 0 for any other
+ *         value
+ */
+size_t Image2D::element_size(const cl_image_format &format)
+{
+    switch (format.image_channel_data_type)
+    {
+        // One byte per element
+        case CL_SNORM_INT8:
+        case CL_UNORM_INT8:
+        case CL_SIGNED_INT8:
+        case CL_UNSIGNED_INT8:
+            return 1;
+
+        // Two bytes per element (16-bit, half float, packed 565 / 555)
+        case CL_SNORM_INT16:
+        case CL_UNORM_INT16:
+        case CL_SIGNED_INT16:
+        case CL_UNSIGNED_INT16:
+        case CL_HALF_FLOAT:
+        case CL_UNORM_SHORT_565:
+        case CL_UNORM_SHORT_555:
+            return 2;
+
+        // Four bytes per element (32-bit integers, packed 101010)
+        case CL_SIGNED_INT32:
+        case CL_UNSIGNED_INT32:
+        case CL_UNORM_INT_101010:
+            return 4;
+
+        case CL_FLOAT:
+            return sizeof(float);
+
+        default:
+            return 0;
+    }
+}
+
+/**
+ * Number of stored elements per pixel for the given channel order.
+ *
+ * \param format image format to inspect
+ * \return 1, 2 or 4 depending on the channel order, 0 for any other value
+ */
+unsigned int Image2D::channels(const cl_image_format &format)
+{
+    switch (format.image_channel_order)
+    {
+        case CL_R:
+        case CL_Rx:
+        case CL_A:
+        case CL_INTENSITY:
+        case CL_LUMINANCE:
+            return 1;
+
+        case CL_RG:
+        case CL_RGx:
+        case CL_RA:
+            return 2;
+
+        case CL_RGBA:
+        case CL_ARGB:
+        case CL_BGRA:
+            return 4;
+
+        // Only special data types allowed (565, 555, etc): the whole pixel
+        // is a single packed element
+        case CL_RGBx:
+        case CL_RGB:
+            return 1;
+
+        default:
+            return 0;
+    }
+}
+
+/**
+ * Size in bytes of one pixel of the given format.
+ *
+ * Packed data types have a fixed pixel size whatever the channel order;
+ * every other format takes <tt>channels() * element_size()</tt>.
+ */
+size_t Image2D::pixel_size(const cl_image_format &format)
+{
+    // 16-bit packed pixels
+    if (format.image_channel_data_type == CL_UNORM_SHORT_565 ||
+        format.image_channel_data_type == CL_UNORM_SHORT_555)
+        return 2;
+
+    // 32-bit packed pixels
+    if (format.image_channel_data_type == CL_UNORM_INT_101010)
+        return 4;
+
+    return channels(format) * element_size(format);
+}
+
+/** Element size of this image: \c element_size() applied to \c p_format. */
+size_t Image2D::element_size() const
+{
+ return element_size(p_format);
+}
+
+/** Pixel size of this image: \c pixel_size() applied to \c p_format. */
+size_t Image2D::pixel_size() const
+{
+ return pixel_size(p_format);
+}
+
+/** Channel count of this image: \c channels() applied to \c p_format. */
+unsigned int Image2D::channels() const
+{
+ return channels(p_format);
+}
+
+/*
+ * Image3D
+ */
+
+/**
+ * Build a 3D image on top of the \c Image2D validation.
+ *
+ * On failure \p errcode_ret receives \c CL_INVALID_IMAGE_SIZE: if \p depth
+ * is 0 or 1, or if \p slice_pitch is non-zero while \p host_ptr is null,
+ * smaller than one slice, or not a multiple of the row pitch.
+ *
+ * The stored slice pitch is \p slice_pitch when it is non-zero and valid,
+ * <tt>height * row_pitch()</tt> otherwise.
+ */
+Image3D::Image3D(Context *ctx, size_t width, size_t height, size_t depth,
+                 size_t row_pitch, size_t slice_pitch,
+                 const cl_image_format *format, void *host_ptr,
+                 cl_mem_flags flags, cl_int *errcode_ret)
+: Image2D(ctx, width, height, row_pitch, format, host_ptr, flags, errcode_ret),
+  p_depth(depth)
+{
+    if (depth <= 1)
+    {
+        *errcode_ret = CL_INVALID_IMAGE_SIZE;
+        return;
+    }
+
+    // Slice pitch
+    const size_t row_bytes = this->row_pitch();
+    p_slice_pitch = height * row_bytes;
+
+    if (slice_pitch)
+    {
+        if (!host_ptr)
+        {
+            // slice_pitch must be 0 if host_ptr is null
+            *errcode_ret = CL_INVALID_IMAGE_SIZE;
+            return;
+        }
+        if (slice_pitch < p_slice_pitch)
+        {
+            *errcode_ret = CL_INVALID_IMAGE_SIZE;
+            return;
+        }
+        // The row pitch can be 0 here (the Image2D constructor returned
+        // early on an error, or the format has no pixel size): never use
+        // it as a divisor.
+        if (row_bytes == 0 || slice_pitch % row_bytes != 0)
+        {
+            *errcode_ret = CL_INVALID_IMAGE_SIZE;
+            return;
+        }
+
+        p_slice_pitch = slice_pitch;
+    }
+}
+
+/** Size of the image in bytes: <tt>depth() * slice_pitch()</tt>. */
+size_t Image3D::size() const
+{
+ return depth() * slice_pitch();
+}
+
+/** Identify this object as a \c MemObject::Image3D. */
+MemObject::Type Image3D::type() const
+{
+ return MemObject::Image3D;
+}
+
+/** Return the depth given at construction (\c p_depth). */
+size_t Image3D::depth() const
+{
+ return p_depth;
+}
+
+/** Return the slice pitch in bytes stored in \c p_slice_pitch. */
+size_t Image3D::slice_pitch() const
+{
+ return p_slice_pitch;
+}
diff --git a/src/core/memobject.h b/src/core/memobject.h
new file mode 100644
index 0000000..82cbfab
--- /dev/null
+++ b/src/core/memobject.h
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file memobject.h
+ * \brief Memory objects
+ */
+
+#ifndef __MEMOBJECT_H__
+#define __MEMOBJECT_H__
+
+#include "object.h"
+#include "dsp/u_concurrent_stack.h"
+
+#include <CL/cl.h>
+
+namespace Coal
+{
+
+class DeviceBuffer;
+class Context;
+class DeviceInterface;
+class BufferEvent;
+
+/**
+ * \brief Base class for all the memory objects
+ */
+class MemObject : public Object
+{
+ public:
+ /**
+ * \brief Type of memory object
+ */
+ enum Type
+ {
+ Buffer,
+ SubBuffer,
+ Image2D,
+ Image3D
+ };
+
+ /**
+ * \brief Constructor
+ * \param ctx parent \c Coal::Context
+ * \param flags memory object flags
+ * \param host_ptr host pointer used by some flags (see the OpenCL spec)
+ * \param errcode_ret return value
+ * \note Don't do any initialization here, but in \c init(). We only fill
+ * the private variables and check the values passed in argument.
+ * \sa init
+ */
+ MemObject(Context *ctx, cl_mem_flags flags, void *host_ptr,
+ cl_int *errcode_ret);
+ virtual ~MemObject();
+
+ /**
+ * \brief Initialize the memory object
+ *
+ * Memory objects are device-independent classes. This function creates
+ * one \c Coal::DeviceBuffer per device present in the context by
+ * calling \c Coal::DeviceInterface::createDeviceBuffer().
+ *
+ * If there is only one device, its \c Coal::DeviceBuffer is directly
+ * allocated. If there are more than one device, the allocation is
+ * deferred until a \c Coal::Event is pushed for this device.
+ *
+ * \return \c CL_SUCCESS if success, an error code otherwise
+ */
+ virtual cl_int init();
+ virtual bool allocate(DeviceInterface *device); /*!< \brief Allocate this memory object on the given \p device */
+ virtual size_t size() const = 0; /*!< \brief Device-independent size of the memory object */
+ virtual Type type() const = 0; /*!< \brief Type of the memory object */
+
+ cl_mem_flags flags() const; /*!< \brief Flags */
+ void *host_ptr() const; /*!< \brief Host pointer */
+ DeviceBuffer *deviceBuffer(DeviceInterface *device) const; /*!< \brief \c Coal::DeviceBuffer for the given \p device */
+
+ void deviceAllocated(DeviceBuffer *buffer); /*!< \brief Record that one more device has allocated its \c Coal::DeviceBuffer (\p buffer) */
+
+ /**
+ * \brief Set a destructor callback for this memory object
+ *
+ * This callback is called when this memory object is deleted. It is
+ * currently called from the destructor, so the memory object is already
+ * invalid, but as OpenCL objects are immutable, the callback cannot
+ * use its \c memobj parameter except in a pointer comparison, and there
+ * is no problem.
+ *
+ * \param pfn_notify function to call when the memory object is deleted
+ * \param user_data user data to pass to this function
+ */
+ void setDestructorCallback(void (CL_CALLBACK *pfn_notify)(cl_mem memobj,
+ void *user_data),
+ void *user_data);
+
+ /**
+ * \brief Get information about this memory object
+ * \copydetails Coal::DeviceInterface::info
+ */
+ cl_int info(cl_mem_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ /*! \brief Register a map event; this base version registers nothing and returns false */
+ virtual bool addMapEvent(BufferEvent *mapped_event) { return false; }
+ /*! \brief Remove the map event of \p mapped_ptr; this base version removes nothing and returns NULL */
+ virtual BufferEvent* removeMapEvent(void *mapped_ptr) { return NULL; }
+
+ protected:
+ cl_mem_flags p_flags; /*!< \brief Memory flags */
+ std::list<BufferEvent *> p_mapped_events; /*!< \brief List of map events */
+
+ private:
+ /* Device count of the context, and counter decremented by deviceAllocated() */
+ unsigned int p_num_devices, p_devices_to_allocate;
+ void *p_host_ptr;
+ DeviceBuffer **p_devicebuffers; /* Table of p_num_devices entries */
+
+ /* A destructor callback together with its user data */
+ typedef std::pair<void (CL_CALLBACK *)(cl_mem memobj, void *user_data), void*> dtor_callback_t;
+ concurrent_stack<dtor_callback_t> p_dtor_callback_stack;
+
+ //void (CL_CALLBACK *p_dtor_callback)(cl_mem memobj, void *user_data);
+ //void *p_dtor_userdata;
+};
+
+/**
+ * \brief Simple buffer object
+ */
+class Buffer : public MemObject
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param ctx parent \c Coal::Context
+ * \param size size of the buffer, in bytes
+ * \param host_ptr host pointer
+ * \param flags memory flags
+ * \param errcode_ret return code
+ */
+ Buffer(Context *ctx, size_t size, void *host_ptr, cl_mem_flags flags,
+ cl_int *errcode_ret);
+
+ size_t size() const; /*!< \brief Size of the buffer, in bytes */
+ Type type() const; /*!< \brief Return that we are a \c Coal::MemObject::Buffer */
+
+ bool addMapEvent(BufferEvent *mapped_event); /*!< \brief Add \p mapped_event to the list of map events; false if it is refused */
+ BufferEvent* removeMapEvent(void *mapped_ptr); /*!< \brief Remove and return the map event whose pointer is \p mapped_ptr, NULL if none */
+ private:
+ size_t p_size; /* Size in bytes given to the constructor */
+
+};
+
+/**
+ * \brief Sub-buffer
+ *
+ * A \c Coal::MemObject describing a window of \c size() bytes that starts
+ * \c offset() bytes into a parent \c Coal::Buffer.
+ */
+class SubBuffer : public MemObject
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param parent parent \c Coal::Buffer
+ * \param offset offset in \p parent of the start of this sub-buffer
+ * \param size size of the sub-buffer
+ * \param flags memory flags (must be compatible with the \p parent's ones)
+ * \param errcode_ret return code
+ */
+ SubBuffer(class Buffer *parent, size_t offset, size_t size,
+ cl_mem_flags flags, cl_int *errcode_ret);
+ ~SubBuffer();
+
+ size_t size() const; /*!< \brief Size */
+ Type type() const; /*!< \brief Return that we are a \c Coal::MemObject::SubBuffer */
+ bool allocate(DeviceInterface *device); /*!< \brief Allocate the \b parent \c Coal::Buffer */
+
+ size_t offset() const; /*!< \brief Offset in bytes */
+ class Buffer *parent() const; /*!< \brief Parent \c Coal::Buffer */
+
+ // Overrides of the virtual map-event hooks declared in Coal::MemObject
+ // (the base versions return false / NULL).
+ bool addMapEvent(BufferEvent *mapped_event);
+ BufferEvent* removeMapEvent(void *mapped_ptr);
+ private:
+ size_t p_offset, p_size; /*!< \brief Start offset in the parent and length of the window */
+ class Buffer *p_parent; /*!< \brief Buffer this sub-buffer is a window of */
+};
+
+/**
+ * \brief 2D image
+ *
+ * Memory object described by a width, a height, a row pitch and a
+ * \c cl_image_format. Also serves as the base class of \c Coal::Image3D,
+ * which is why \c size(), \c type() and \c slice_pitch() are virtual.
+ */
+class Image2D : public MemObject
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param ctx parent \c Coal::Context
+ * \param width width of the image
+ * \param height height of the image
+ * \param row_pitch number of bytes in a row of pixels. If 0, defaults to <tt>width * pixel_size()</tt>
+ * \param format image format
+ * \param host_ptr host pointer
+ * \param flags memory flags
+ * \param errcode_ret return code
+ */
+ Image2D(Context *ctx, size_t width, size_t height, size_t row_pitch,
+ const cl_image_format *format, void *host_ptr,
+ cl_mem_flags flags, cl_int *errcode_ret);
+
+ virtual size_t size() const; /*!< \brief Size in bytes */
+ virtual Type type() const; /*!< \brief Return that we are a \c Coal::MemObject::Image2D */
+
+ size_t width() const; /*!< \brief Width */
+ size_t height() const; /*!< \brief Height */
+ size_t row_pitch() const; /*!< \brief Size in bytes of a row of pixels */
+ virtual size_t slice_pitch() const; /*!< \brief Size in bytes of the image */
+ const cl_image_format &format() const; /*!< \brief Image format descriptor */
+
+ /**
+ * \brief Information about this image object
+ *
+ * This function is also usable for \c Coal::Image3D objects as it does
+ * casting when necessary in order to give information when needed.
+ *
+ * \copydetails Coal::DeviceInterface::info
+ */
+ cl_int imageInfo(cl_image_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ // Format helpers: the static versions work on any format descriptor,
+ // the member versions report the same quantities for this image.
+ static size_t element_size(const cl_image_format &format); /*!< \brief Size in bytes of each channel of \p format */
+ static unsigned int channels(const cl_image_format &format);/*!< \brief Number of channels of \p format */
+ static size_t pixel_size(const cl_image_format &format); /*!< \brief Size in bytes of a pixel in \p format */
+ size_t pixel_size() const; /*!< \brief Pixel size of this image */
+ size_t element_size() const; /*!< \brief Channel size of this image */
+ unsigned int channels() const; /*!< \brief Number of channels of this image */
+
+ private:
+ size_t p_width, p_height, p_row_pitch; /*!< \brief Image geometry; the row pitch is in bytes */
+ cl_image_format p_format; /*!< \brief Descriptor returned by \c format() */
+};
+
+/**
+ * \brief 3D image
+ *
+ * Extends \c Coal::Image2D with a depth and a slice pitch, and overrides
+ * the virtual \c size(), \c type() and \c slice_pitch() accessors.
+ */
+class Image3D : public Image2D
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param ctx parent \c Coal::Context
+ * \param width width of the image
+ * \param height height of the image
+ * \param depth depth of the image
+ * \param row_pitch number of bytes in a row of pixels. If 0, defaults to <tt>width * pixel_size()</tt>
+ * \param slice_pitch number of bytes in a 2D slice. If 0, defaults to <tt>height * row_pitch()</tt>
+ * \param format image format
+ * \param host_ptr host pointer
+ * \param flags memory flags
+ * \param errcode_ret return code
+ */
+ Image3D(Context *ctx, size_t width, size_t height, size_t depth,
+ size_t row_pitch, size_t slice_pitch,
+ const cl_image_format *format, void *host_ptr,
+ cl_mem_flags flags, cl_int *errcode_ret);
+
+ size_t size() const; /*!< \brief Size in bytes of this image */
+ Type type() const; /*!< \brief Return that we are a \c Coal::MemObject::Image3D */
+
+ size_t depth() const; /*!< \brief Depth of the image */
+ size_t slice_pitch() const; /*!< \brief Size in bytes of a 2D slice */
+
+ private:
+ size_t p_depth, p_slice_pitch; /*!< \brief Number of slices and size in bytes of one slice */
+};
+
+}
+
+// Gives the global-namespace struct _cl_mem (the type the destructor-callback
+// parameter "cl_mem memobj" above refers to) the layout and interface of
+// Coal::MemObject. Adds no members of its own.
+struct _cl_mem : public Coal::MemObject
+{};
+
+#endif
diff --git a/src/core/object.cpp b/src/core/object.cpp
new file mode 100644
index 0000000..be44279
--- /dev/null
+++ b/src/core/object.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file object.cpp
+ * \brief Reference-counted object tree
+ */
+
+#include "object.h"
+
+using namespace Coal;
+
+/**
+ * \brief Access the list of all currently existing \c Coal::Object instances
+ *
+ * The list is a function-local static, so it is created the first time
+ * this accessor is called.
+ *
+ * \return reference to the shared list
+ */
+static std::list<Object *>& getKnownObjects()
+{
+    static std::list<Object *> registry;
+
+    return registry;
+}
+
+
+/**
+ * Creates an object holding one reference, takes a reference on \p parent
+ * (when there is one) and records the object in the known-objects list.
+ */
+Object::Object(Type type, Object *parent)
+: p_references(1), p_parent(parent), p_type(type), p_release_parent(true)
+{
+    // A child keeps its parent alive for as long as it exists
+    if (parent != 0)
+        parent->reference();
+
+    // Register at the head of the list and remember where, so that the
+    // destructor can unregister without searching
+    std::list<Object *> &registry = getKnownObjects();
+    p_it = registry.insert(registry.begin(), this);
+}
+
+/**
+ * Drops the reference held on the parent, deleting the parent when that
+ * was its last reference and parent release is enabled, then removes this
+ * object from the known-objects list.
+ */
+Object::~Object()
+{
+    if (p_parent)
+    {
+        // The parent is always dereferenced; p_release_parent only decides
+        // whether we are the ones calling delete on it
+        const bool last_reference = p_parent->dereference();
+
+        if (last_reference && p_release_parent)
+            delete p_parent;
+    }
+
+    // Unregister using the position saved by the constructor
+    getKnownObjects().erase(p_it);
+}
+
+/** Adds one to the reference counter. */
+void Object::reference()
+{
+    ++p_references;
+}
+
+/**
+ * Removes one from the reference counter.
+ * \return true when the counter is 0 after the decrement
+ */
+bool Object::dereference()
+{
+    return --p_references == 0;
+}
+
+/**
+ * Chooses whether the destructor may \b delete the parent once the
+ * parent's reference count reaches 0.
+ */
+void Object::setReleaseParent(bool release)
+{
+    this->p_release_parent = release;
+}
+
+/** \return current value of the reference counter */
+unsigned int Object::references() const
+{
+    return this->p_references;
+}
+
+/** \return parent given at construction, or 0 when there is none */
+Object *Object::parent() const
+{
+    return this->p_parent;
+}
+
+/** \return type tag given at construction */
+Object::Type Object::type() const
+{
+    return this->p_type;
+}
+
+/**
+ * Checks that \c this is a live, registered object of type \p type.
+ *
+ * The pointer is only dereferenced after it has been found in the list of
+ * known objects, so a NULL, garbage or already freed pointer yields false.
+ */
+bool Object::isA(Object::Type type) const
+{
+    // Callers may reach this through a NULL handle
+    if (this == 0)
+        return false;
+
+    const std::list<Object *> &registry = getKnownObjects();
+
+    for (std::list<Object *>::const_iterator cur = registry.begin();
+         cur != registry.end(); ++cur)
+    {
+        if (*cur != this)
+            continue;
+
+        // Registered, hence alive: reading the type tag is now safe
+        return this->type() == type;
+    }
+
+    // Not registered: the pointer does not designate a live object
+    return false;
+}
diff --git a/src/core/object.h b/src/core/object.h
new file mode 100644
index 0000000..d83e326
--- /dev/null
+++ b/src/core/object.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file object.h
+ * \brief Object tree
+ */
+
+#ifndef __REFCOUNTED_H__
+#define __REFCOUNTED_H__
+
+#include <list>
+
+namespace Coal
+{
+
+/**
+ * \brief Base class of all the Clover objects
+ *
+ * This class implements functions needed by all the Clover objects, like
+ * reference counting, the object tree (parents/children), etc.
+ *
+ * It also uses a special list of known objects, used to check that a pointer
+ * passed by the user to an OpenCL function actually is an object of the correct
+ * type. See \c isA().
+ *
+ * \note The reference counter is a plain \c unsigned \c int and the list of
+ * known objects is accessed without any locking in object.cpp.
+ */
+class Object
+{
+ public:
+ /**
+ * \brief Type of object the inherited class actually is
+ */
+ enum Type
+ {
+ T_Device, /*!< \brief \c Coal::DeviceInterface */
+ T_CommandQueue, /*!< \brief \c Coal::CommandQueue */
+ T_Event, /*!< \brief \c Coal::Event */
+ T_Context, /*!< \brief \c Coal::Context */
+ T_Kernel, /*!< \brief \c Coal::Kernel */
+ T_MemObject, /*!< \brief \c Coal::MemObject */
+ T_Program, /*!< \brief \c Coal::Program */
+ T_Sampler /*!< \brief \c Coal::Sampler */
+ };
+
+ /**
+ * \brief Constructor
+ *
+ * The new object starts with a reference count of 1 and takes a
+ * reference on \p parent when one is given.
+ *
+ * \param type type of the child class calling this constructor
+ * \param parent parent object
+ */
+ Object(Type type, Object *parent = 0);
+ virtual ~Object();
+
+ /**
+ * \brief Increments the reference counter
+ */
+ void reference();
+
+ /**
+ * \brief Decrements the reference counter
+ * \note The object is not deleted by this call; the caller decides what
+ * to do when true is returned.
+ * \return true if the reference counter has reached 0
+ */
+ bool dereference();
+
+ /**
+ * \brief Reference counter
+ * \return the number of references of this class currently in use
+ */
+ unsigned int references() const;
+
+ /**
+ * \brief Set if the parent object has to be deleted if its reference count reaches 0
+ *
+ * The destructor of \c Coal::Object dereferences its parent object.
+ * This is done in order to correctly free objects when no object has
+ * a reference to it anymore.
+ *
+ * Some objects such as \c Coal::CommandQueue need to do some operations
+ * before being deleted. This function tells \c Coal::Object to
+ * dereference its parent object, but not to call \b delete on it.
+ *
+ * \param release true to have \b delete called on the parent object
+ * when its reference count reaches 0, false to keep it
+ */
+ void setReleaseParent(bool release);
+
+ Object *parent() const; /*!< \brief Parent object */
+ Type type() const; /*!< \brief Type */
+
+ /**
+ * \brief Returns whether this object is an instance of \p type
+ * \note This function begins with a NULL-check on the \c this pointer,
+ * so it's safe to use even when \c this is not guaranteed not to
+ * be NULL.
+ * \note NOTE(review): calling a member function through a null pointer
+ * is undefined behaviour in standard C++, and optimizing compilers
+ * may remove the \c this NULL-check - confirm the build flags or
+ * check the handle at the call sites.
+ * \param type type this object must have for the check to pass
+ * \return true if this object exists and has the correct type
+ */
+ bool isA(Type type) const;
+
+ private:
+ unsigned int p_references; /*!< \brief Reference counter, 1 after construction */
+ Object *p_parent; /*!< \brief Parent object, may be 0 */
+ Type p_type; /*!< \brief Type tag given by the child class */
+ std::list<Object *>::iterator p_it; /*!< \brief Position of this object in the list of known objects */
+ bool p_release_parent; /*!< \brief See \c setReleaseParent(), true after construction */
+};
+
+}
+
+#endif
diff --git a/src/core/platform.cpp b/src/core/platform.cpp
new file mode 100644
index 0000000..1af6153
--- /dev/null
+++ b/src/core/platform.cpp
@@ -0,0 +1,227 @@
+/******************************************************************************
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <list>
+#include <iostream>
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+#include "platform.h"
+#include "propertylist.h"
+#include "object.h"
+#include "cpu/device.h"
+#ifndef SHAMROCK_BUILD
+#include "dsp/device.h"
+#include "dsp/driver.h"
+#endif
+
+/*-----------------------------------------------------------------------------
+* For the lock file
+*----------------------------------------------------------------------------*/
+#include <sys/file.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+
+using namespace Coal;
+
+/******************************************************************************
+* begin_file_lock_crit_section
+*
+* Opens (creating it if needed) the lock file fname and takes an exclusive
+* flock() on it, so only 1 OpenCL program can progress at a time. If another
+* process already holds the lock, a message is printed and the call blocks
+* until the lock becomes available.
+*
+* fname   : path of the lock file
+* returns : file descriptor holding the lock; the caller releases the lock
+*           with flock(fd, LOCK_UN) and close(fd)
+*
+* Any failure to open or lock the file terminates the process with exit(-1).
+*
+* I'm not sure about the appropriateness of putting this in the ctor.
+* We may look at delayed ctor of platform with this in it.
+******************************************************************************/
+static int begin_file_lock_crit_section(char* fname)
+{
+    const mode_t lock_mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH;
+
+    // Read-only access is all flock() needs; O_RDONLY is spelled out rather
+    // than relying on it being the zero access mode.
+    int lock_fd = open(fname, O_RDONLY|O_CREAT, lock_mode);
+
+    if (lock_fd < 0)
+    {
+        std::cout << "Can not open lock file " << fname << ", Aborting !" << std::endl;
+        exit(-1);
+    }
+
+    /*---------------------------------------------------------------------
+    * Fast path: nobody else holds the lock
+    *--------------------------------------------------------------------*/
+    if (flock(lock_fd, LOCK_EX|LOCK_NB) == 0)
+        return lock_fd;
+
+    if (errno != EWOULDBLOCK)
+    {
+        std::cout << "Error Locking file " << fname << ", Aborting !" << std::endl;
+        exit(-1);
+    }
+
+    /*---------------------------------------------------------------------
+    * Somebody else holds the lock: wait for it. A blocking flock() returns
+    * -1/EINTR when a signal handler runs while it waits; that is not a
+    * locking error, so simply wait again.
+    *--------------------------------------------------------------------*/
+    std::cout << "Waiting on lock " << fname << " ..." << std::endl;
+
+    int res;
+    do
+    {
+        res = flock(lock_fd, LOCK_EX);
+    } while (res == -1 && errno == EINTR);
+
+    if (res == -1)
+    {
+        std::cout << "Error Locking file " << fname << ", Aborting !" << std::endl;
+        exit(-1);
+    }
+
+    std::cout << "Acquired lock " << fname << ", Proceeding!" << std::endl;
+
+    return lock_fd;
+}
+
+namespace Coal
+{
+ /**
+  * Takes the system-wide OpenCL lock file, then builds the device list:
+  * one CPU device, followed (outside of Shamrock builds) by one DSP device
+  * per DSP reported by the driver.
+  */
+ Platform::Platform() : dispatch(&dispatch_table)
+ {
+     // The lock is taken before any device object is created
+     char lock_path[] = "/var/lock/opencl";
+     p_lock_fd = begin_file_lock_crit_section(lock_path);
+
+     // The CPU device is always the first entry
+     cl_device_id cpu_device = (_cl_device_id*)new Coal::CPUDevice;
+     p_devices.push_back(cpu_device);
+
+     // Driver class only exists for the DSPDevice, so need this guard:
+#ifndef SHAMROCK_BUILD
+     int dsp_id = 0;
+     while (dsp_id < Driver::instance()->num_dsps())
+     {
+         p_devices.push_back((_cl_device_id*)new Coal::DSPDevice(dsp_id));
+         dsp_id++;
+     }
+#endif
+ }
+
+ /**
+  * Destroys every device created by the constructor, then leaves the
+  * critical section by unlocking and closing the lock file.
+  *
+  * The devices are torn down while the lock is still held, mirroring the
+  * constructor (which takes the lock before creating them), so that
+  * another OpenCL process cannot get past the lock while this one is
+  * still shutting its devices down.
+  */
+ Platform::~Platform()
+ {
+     for (size_t i = 0; i < p_devices.size(); i++)
+         delete p_devices[i];
+     p_devices.clear();
+
+     flock(p_lock_fd, LOCK_UN);
+     close(p_lock_fd);
+ }
+
+ /**
+  * Counts the platform devices whose CL_DEVICE_TYPE matches \p device_type
+  * and, when \p devices is given, stores the first \p num_entries of them.
+  *
+  * CL_DEVICE_TYPE_DEFAULT is resolved to the CPU type in Shamrock builds
+  * and to the accelerator type otherwise.
+  *
+  * \param device_type bit-field of device types to select
+  * \param num_entries capacity of \p devices
+  * \param devices output array, may be 0 to only count
+  * \return total number of matching devices (may exceed \p num_entries)
+  */
+ cl_uint Platform::getDevices(cl_device_type device_type,
+                              cl_uint num_entries, cl_device_id * devices)
+ {
+     if (device_type == CL_DEVICE_TYPE_DEFAULT)
+     {
+#ifdef SHAMROCK_BUILD
+         device_type = CL_DEVICE_TYPE_CPU;
+#else
+         device_type = CL_DEVICE_TYPE_ACCELERATOR;
+#endif
+     }
+
+     cl_uint matches = 0;
+
+     for (int idx = 0; idx < p_devices.size(); ++idx)
+     {
+         cl_device_type current_type;
+         p_devices[idx]->info(CL_DEVICE_TYPE, sizeof(cl_device_type),
+                              &current_type, 0);
+
+         if (!(current_type & device_type))
+             continue;
+
+         // Store only while there is room, but keep counting afterwards
+         if (devices && matches < num_entries)
+             devices[matches] = p_devices[idx];
+
+         ++matches;
+     }
+
+     return matches;
+ }
+
+ /**
+  * \brief Get information about this platform
+  *
+  * \param param_name platform parameter to query (CL_PLATFORM_*)
+  * \param param_value_size size in bytes of the buffer \p param_value
+  * \param param_value buffer receiving the value, may be 0
+  * \param param_value_size_ret receives the size of the value, may be 0
+  * \return CL_SUCCESS, or CL_INVALID_VALUE when \p param_name is unknown
+  *         or \p param_value is given but too small
+  *
+  * \note \p param_name is typed \c cl_platform_info to agree with the
+  *       declaration in platform.h.
+  * \note STRING_ASSIGN comes from propertylist.h; it is assumed to fill
+  *       the locals \c value and \c value_length - confirm there.
+  */
+ cl_int Platform::info(cl_platform_info param_name,
+                       size_t param_value_size,
+                       void *param_value,
+                       size_t *param_value_size_ret) const
+ {
+     void *value = 0;
+     size_t value_length = 0;
+
+     switch (param_name)
+     {
+         case CL_PLATFORM_PROFILE:
+             STRING_ASSIGN("FULL_PROFILE");
+             break;
+
+         case CL_PLATFORM_VERSION:
+#ifdef SHAMROCK_BUILD
+             STRING_ASSIGN("OpenCL 1.1 Shamrock ");
+#else
+             STRING_ASSIGN("OpenCL 1.1 TI ");
+#endif
+             break;
+
+         case CL_PLATFORM_NAME:
+#ifdef SHAMROCK_BUILD
+             STRING_ASSIGN("Shamrock OpenCL for Arm");
+#else
+#if defined(__arm__)
+             STRING_ASSIGN("TI OpenCL for Arm + Dsp");
+#else
+             STRING_ASSIGN("TI OpenCL for Advantech DSPC868x");
+#endif
+#endif
+             break;
+
+         case CL_PLATFORM_VENDOR:
+#ifdef SHAMROCK_BUILD
+             STRING_ASSIGN("Open Source Software");
+#else
+             STRING_ASSIGN("Texas Instruments, Inc.");
+#endif
+             break;
+
+         case CL_PLATFORM_EXTENSIONS:
+             // TODO add cl_khr_icd when it works
+#ifdef SHAMROCK_BUILD
+             STRING_ASSIGN("cl_khr_byte_addressable_store cl_khr_fp64");
+#else
+             STRING_ASSIGN("cl_khr_byte_addressable_store cl_khr_fp64 cl_ti_msmc_buffers");
+#endif
+             break;
+
+         case CL_PLATFORM_ICD_SUFFIX_KHR:
+             // In Shamrock builds nothing is assigned: the query succeeds
+             // and reports a value of size 0.
+#ifndef SHAMROCK_BUILD
+             STRING_ASSIGN("TI");
+#endif
+             break;
+
+         default:
+             return CL_INVALID_VALUE;
+     }
+
+     if (param_value && param_value_size < value_length)
+         return CL_INVALID_VALUE;
+
+     if (param_value_size_ret)
+         *param_value_size_ret = value_length;
+
+     // Skip the copy when no value was assigned: memcpy() must not be
+     // given a null source pointer, even for a length of 0.
+     if (param_value && value && value_length)
+         std::memcpy(param_value, value, value_length);
+
+     return CL_SUCCESS;
+ }
+};
+
+// The platform object declared extern in platform.h. As a namespace-scope
+// object its constructor (Platform::Platform, which takes the lock file and
+// creates the devices) runs during static initialization and its destructor
+// at program exit.
+_cl_platform_id the_platform;
diff --git a/src/core/platform.h b/src/core/platform.h
new file mode 100644
index 0000000..809d12c
--- /dev/null
+++ b/src/core/platform.h
@@ -0,0 +1,65 @@
+/******************************************************************************
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef __PLATFORM_H__
+#define __PLATFORM_H__
+
+#include <CL/cl.h>
+#include <vector>
+#include <cstring>
+#include "icd.h"
+
+namespace Coal
+{
+
+/**
+ * \brief The OpenCL platform: owner of the device list and of the lock file
+ *
+ * See platform.cpp: the constructor takes an exclusive lock on a lock file
+ * and creates the devices; the destructor releases the lock and deletes them.
+ */
+class Platform
+{
+ public:
+ Platform();
+ ~Platform();
+
+ /**
+ * \brief List the devices matching \p device_type
+ * \param device_type bit-field of device types to select
+ * \param num_entries number of entries available in \p devices
+ * \param devices array receiving the matching devices, may be 0
+ * \return number of devices matching \p device_type, which can be larger
+ * than \p num_entries
+ */
+ cl_uint getDevices(cl_device_type device_type,
+ cl_uint num_entries, cl_device_id * devices);
+
+ /**
+ * \brief Get information about this platform
+ * \param param_name parameter to query
+ * \param param_value_size size in bytes of \p param_value
+ * \param param_value buffer receiving the value, may be 0
+ * \param param_value_size_ret receives the size of the value, may be 0
+ * \return CL_SUCCESS or CL_INVALID_VALUE
+ */
+ cl_int info(cl_platform_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ private:
+ // Set to &dispatch_table by the constructor.
+ // NOTE(review): presumably has to remain the first data member so that
+ // the ICD loader finds the dispatch table at offset 0 - confirm.
+ KHRicdVendorDispatch *dispatch;
+ std::vector <cl_device_id> p_devices; /*!< \brief Devices created, and owned, by this platform */
+ int p_lock_fd; /*!< \brief Descriptor of the lock file held for the lifetime of the platform */
+};
+
+}
+
+// Gives the global-namespace struct _cl_platform_id the layout and
+// interface of Coal::Platform. Adds no members of its own.
+struct _cl_platform_id : public Coal::Platform
+{};
+
+// The platform instance; defined in platform.cpp.
+extern _cl_platform_id the_platform;
+#endif
diff --git a/src/core/program.cpp b/src/core/program.cpp
new file mode 100644
index 0000000..5f6e99f
--- /dev/null
+++ b/src/core/program.cpp
@@ -0,0 +1,846 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file core/program.cpp
+ * \brief Program
+ */
+
+#include "program.h"
+#include "context.h"
+#include "compiler.h"
+#include "kernel.h"
+#include "propertylist.h"
+#include "deviceinterface.h"
+
+#include <string>
+#include <cstring>
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+#include <set>
+#include <algorithm>
+
+#include <llvm/ADT/StringRef.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Support/Casting.h>
+#include <llvm/Support/ErrorOr.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+#include <llvm/Transforms/IPO.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Linker/Linker.h>
+#include <llvm/PassManager.h>
+#include <llvm/IR/Metadata.h>
+#include <llvm/IR/Function.h>
+#include <llvm/Analysis/Passes.h>
+#include <llvm/Transforms/IPO.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/InstIterator.h>
+
+#include <runtime/stdlib.c.bc.embed.h>
+
+
+/*-----------------------------------------------------------------------------
+* temporary for source file cacheing, remove from product releases
+*----------------------------------------------------------------------------*/
+//#include "dsp/source_cache.h"
+//source_cache * source_cache::pInstance = 0;
+
+using namespace Coal;
+using namespace llvm;
+
+/**
+ * Creates an empty program attached to \p ctx.
+ *
+ * Also prepares \c p_null_device_dependent, the placeholder that
+ * \c deviceDependent() returns when asked about an unknown device: every
+ * field is cleared, including \c is_native_binary, so that reading the
+ * placeholder never yields an indeterminate value.
+ */
+Program::Program(Context *ctx)
+: Object(Object::T_Program, ctx), p_type(Invalid), p_state(Empty)
+{
+    p_null_device_dependent.compiler = 0;
+    p_null_device_dependent.device = 0;
+    p_null_device_dependent.linked_module = 0;
+    p_null_device_dependent.program = 0;
+    p_null_device_dependent.is_native_binary = false;
+}
+
+/** Frees everything that was created per device. */
+Program::~Program()
+{
+    this->resetDeviceDependent();
+}
+
+/**
+ * Empties the per-device table from the back, deleting the compiler, the
+ * device program and the linked module of each entry before removing it.
+ */
+void Program::resetDeviceDependent()
+{
+    for (; !p_device_dependent.empty(); p_device_dependent.pop_back())
+    {
+        DeviceDependent &entry = p_device_dependent.back();
+
+        delete entry.compiler;
+        delete entry.program;
+        delete entry.linked_module;
+    }
+}
+
+/**
+ * Sizes the per-device table to \p num_devices entries and initialises one
+ * entry per device: a fresh device program and compiler, no linked module,
+ * and not flagged as a native binary.
+ *
+ * \param num_devices number of entries in \p devices
+ * \param devices devices this program is associated with
+ */
+void Program::setDevices(cl_uint num_devices, DeviceInterface * const*devices)
+{
+    p_device_dependent.resize(num_devices);
+
+    for (cl_uint idx = 0; idx != num_devices; ++idx)
+    {
+        DeviceInterface *current = devices[idx];
+        DeviceDependent &slot = p_device_dependent[idx];
+
+        slot.device = current;
+        slot.program = current->createDeviceProgram(this);
+        slot.is_native_binary = false;
+        slot.linked_module = 0;
+        slot.compiler = new Compiler(current);
+    }
+}
+
+/**
+ * Finds the per-device entry of \p device.
+ *
+ * A null \p device designates the only entry when the table holds exactly
+ * one. When nothing matches, the placeholder \c p_null_device_dependent
+ * (all pointers 0) is returned.
+ */
+Program::DeviceDependent &Program::deviceDependent(DeviceInterface *device)
+{
+    // Independent of the entry being examined, so evaluated once
+    const bool sole_entry_wanted = !device && p_device_dependent.size() == 1;
+
+    for (size_t idx = 0; idx < p_device_dependent.size(); ++idx)
+    {
+        if (sole_entry_wanted || p_device_dependent[idx].device == device)
+            return p_device_dependent[idx];
+    }
+
+    return p_null_device_dependent;
+}
+
+/**
+ * Const flavour of the per-device lookup: same matching rules, read-only
+ * result. Returns \c p_null_device_dependent when nothing matches.
+ */
+const Program::DeviceDependent &Program::deviceDependent(DeviceInterface *device) const
+{
+    // Independent of the entry being examined, so evaluated once
+    const bool sole_entry_wanted = !device && p_device_dependent.size() == 1;
+
+    for (size_t idx = 0; idx < p_device_dependent.size(); ++idx)
+    {
+        if (sole_entry_wanted || p_device_dependent[idx].device == device)
+            return p_device_dependent[idx];
+    }
+
+    return p_null_device_dependent;
+}
+
+/**
+ * \return the device program stored for \p device; 0 when the lookup falls
+ *         back to the placeholder entry
+ */
+DeviceProgram *Program::deviceDependentProgram(DeviceInterface *device) const
+{
+    return deviceDependent(device).program;
+}
+
+/**
+ * \brief Options held by the compiler associated with \p device
+ *
+ * \param device device whose compiler is queried
+ * \return the compiler's options, or an empty string when no compiler is
+ *         associated with \p device
+ */
+std::string Program::deviceDependentCompilerOptions(DeviceInterface *device) const
+{
+    const DeviceDependent &dep = deviceDependent(device);
+
+    // For an unknown device the lookup returns the placeholder entry, whose
+    // compiler pointer is 0: do not dereference it.
+    if (!dep.compiler)
+        return std::string();
+
+    return dep.compiler->options();
+}
+
+/**
+ * \brief List the kernel functions of the module linked for \p dep
+ *
+ * Kernels are read from the "opencl.kernels" named metadata of the linked
+ * module; the first operand of each node is expected to be the kernel's
+ * \c llvm::Function.
+ *
+ * \param dep per-device entry to inspect
+ * \return the kernel functions; empty when \p dep has no linked module or
+ *         the module has no kernel metadata
+ */
+std::vector<llvm::Function *> Program::kernelFunctions(DeviceDependent &dep)
+{
+    std::vector<llvm::Function *> rs;
+
+    /*-------------------------------------------------------------------------
+    * Nothing linked for this device (entry not built yet, or the placeholder
+    * entry): there is no module to query
+    *------------------------------------------------------------------------*/
+    if (!dep.linked_module) return rs;
+
+    llvm::NamedMDNode *kernels =
+        dep.linked_module->getNamedMetadata("opencl.kernels");
+
+    if (!kernels) return rs;
+
+    for (unsigned int i=0; i<kernels->getNumOperands(); ++i)
+    {
+        llvm::MDNode *node = kernels->getOperand(i);
+
+        /*---------------------------------------------------------------------
+        * Malformed node without any operand, don't crash
+        *--------------------------------------------------------------------*/
+        if (!node || node->getNumOperands() == 0) continue;
+
+        /*---------------------------------------------------------------------
+        * Each node has only one operand : a llvm::Function.
+        * dyn_cast_or_null also copes with a null operand, which isa<> does
+        * not accept.
+        *--------------------------------------------------------------------*/
+        llvm::Function *f =
+            llvm::dyn_cast_or_null<llvm::Function>(node->getOperand(0));
+
+        /*---------------------------------------------------------------------
+        * Bug somewhere, don't crash
+        *--------------------------------------------------------------------*/
+        if (!f) continue;
+
+        rs.push_back(f);
+    }
+
+    return rs;
+}
+
+/******************************************************************************
+* Kernel *Program::createKernel(const std::string &name, cl_int *errcode_ret)
+*
+* Returns the kernel called name, creating it if necessary:
+*   - a kernel already present in kernelList is returned as is;
+*   - a kernel found in kernelReleasedList is moved back to kernelList and
+*     returned;
+*   - otherwise a new Kernel is created and the function called name is added
+*     to it for every device of the program.
+*
+* name        : name of the kernel function
+* errcode_ret : receives CL_SUCCESS, CL_INVALID_KERNEL_NAME when a device has
+*               no kernel of that name, or the error of Kernel::addFunction
+* returns     : the kernel. On error the (incomplete) new kernel is still
+*               returned so that the caller can delete it; it has NOT been
+*               recorded in kernelList.
+*
+* A new kernel is recorded in kernelList exactly once, and only after it has
+* been set up for all the devices. This avoids duplicate entries when the
+* program has several devices, and avoids leaving a pointer in kernelList to a
+* kernel the caller is about to delete.
+******************************************************************************/
+Kernel *Program::createKernel(const std::string &name, cl_int *errcode_ret)
+{
+    for (size_t i=0; i < kernelList.size(); i++)
+    {
+        if (kernelList[i]->p_name.compare(name) == 0)
+        {
+            *errcode_ret = CL_SUCCESS;
+            return kernelList[i];
+        }
+    }
+
+    /* Now check the previously released list */
+    for (size_t i=0; i < kernelReleasedList.size(); i++)
+    {
+        if (kernelReleasedList[i]->p_name.compare(name) == 0)
+        {
+            Kernel *revived = kernelReleasedList[i];
+
+            kernelReleasedList.erase(kernelReleasedList.begin() + i);
+            kernelList.push_back(revived);
+
+            *errcode_ret = CL_SUCCESS;
+            return revived;
+        }
+    }
+
+    Kernel *rs = new Kernel(this);
+
+    /*-------------------------------------------------------------------------
+    * Add a function definition for each device
+    *------------------------------------------------------------------------*/
+    for (size_t i=0; i < p_device_dependent.size(); ++i)
+    {
+        DeviceDependent &dep = p_device_dependent[i];
+        const std::vector<llvm::Function *> &kernels = kernelFunctions(dep);
+
+        /*---------------------------------------------------------------------
+        * Find the one with the good name
+        *--------------------------------------------------------------------*/
+        llvm::Function *match = NULL;
+
+        for (size_t j=0; j < kernels.size(); ++j)
+        {
+            if (kernels[j]->getName().str().compare(name) == 0)
+            {
+                match = kernels[j];
+                break;
+            }
+        }
+
+        /*---------------------------------------------------------------------
+        * Kernel unavailable for this device
+        *--------------------------------------------------------------------*/
+        if (!match)
+        {
+            *errcode_ret = CL_INVALID_KERNEL_NAME;
+            return rs;
+        }
+
+        *errcode_ret = rs->addFunction(dep.device, match, dep.linked_module);
+        if (*errcode_ret != CL_SUCCESS) return rs;
+    }
+
+    /*-------------------------------------------------------------------------
+    * Set up for every device: record it, once. As before, nothing is recorded
+    * when the program has no device at all.
+    *------------------------------------------------------------------------*/
+    if (!p_device_dependent.empty())
+        kernelList.push_back(rs);
+
+    return rs;
+}
+
+/******************************************************************************
+* Kernel *Program::createKernelsAndReturnKernel(name, errcode_ret)
+*
+* Creates the kernels of all the kernel functions found for the first device
+* of the program, and returns the one called name.
+*
+* name        : name of the kernel wanted by the caller
+* errcode_ret : receives CL_SUCCESS when the wanted kernel is returned,
+*               CL_INVALID_KERNEL_NAME when no kernel has that name, the
+*               error reported by createKernel when a kernel could not be
+*               created, or CL_INVALID_PROGRAM_EXECUTABLE when the program
+*               has no device
+* returns     : the wanted kernel, or NULL
+******************************************************************************/
+Kernel * Program::createKernelsAndReturnKernel(const std::string &name, cl_int *errcode_ret)
+{
+    Kernel *rs = NULL;
+
+    /*-------------------------------------------------------------------------
+    * We should never go here. Report an error anyway, so that the caller
+    * does not see a NULL kernel together with a stale error code.
+    *------------------------------------------------------------------------*/
+    if (p_device_dependent.size() == 0)
+    {
+        if (errcode_ret) *errcode_ret = CL_INVALID_PROGRAM_EXECUTABLE;
+        return rs;
+    }
+
+    for (size_t i=0; i < kernelList.size(); i++)
+    {
+        if (kernelList[i]->p_name.compare(name) == 0)
+        {
+            *errcode_ret = CL_SUCCESS;
+            return kernelList[i];
+        }
+    }
+
+    /* Now check the previously released list */
+    for (size_t i=0; i < kernelReleasedList.size(); i++)
+    {
+        if (kernelReleasedList[i]->p_name.compare(name) == 0)
+        {
+            *errcode_ret = CL_SUCCESS;
+            rs = kernelReleasedList[i];
+            kernelReleasedList.erase(kernelReleasedList.begin() + i);
+            kernelList.push_back(rs);
+
+            return rs;
+        }
+    }
+
+    /*-------------------------------------------------------------------------
+    * Take the list of kernels for the first device dependent
+    *------------------------------------------------------------------------*/
+    DeviceDependent &dep = p_device_dependent[0];
+    const std::vector<llvm::Function *> &kernels = kernelFunctions(dep);
+
+    /*-------------------------------------------------------------------------
+    * Create the kernel for each function name
+    * It returns an error if the signature is not the same for every device
+    * or if the kernel isn't found on all the devices.
+    *------------------------------------------------------------------------*/
+    *errcode_ret = CL_SUCCESS;
+
+    for (size_t i=0; i < kernels.size(); ++i)
+    {
+        cl_int result = CL_SUCCESS;
+        Kernel *kernel = createKernel(kernels[i]->getName().str(), &result);
+
+        if (result != CL_SUCCESS)
+        {
+            *errcode_ret = result;
+
+            /*-----------------------------------------------------------------
+            * Make sure kernelList keeps no pointer to the kernel being
+            * destroyed, then move on: the kernel must not be touched after
+            * the delete.
+            *----------------------------------------------------------------*/
+            kernelList.erase(std::remove(kernelList.begin(), kernelList.end(),
+                                         kernel),
+                             kernelList.end());
+            delete kernel;
+            continue;
+        }
+
+        if (kernel->p_name.compare(name) == 0)
+        {
+            rs = kernel;
+            *errcode_ret = result;
+        }
+    }
+
+    if (!rs && (*errcode_ret == CL_SUCCESS))
+        *errcode_ret = CL_INVALID_KERNEL_NAME;
+
+    return rs;
+}
+
+/*
+ * Return every kernel of the program, creating them on first use.
+ *
+ * Previously released kernels are first moved back into kernelList. If
+ * kernelList is then non-empty it is returned as is; otherwise createKernel()
+ * is called for each kernel function of the first device-dependent entry.
+ *
+ * errcode_ret  written only when a createKernel() call fails (it receives
+ *              that error); it is left untouched on the other paths.
+ * returns      a copy of kernelList (empty if the program has no
+ *              device-dependent data)
+ */
+std::vector<Kernel *> Program::createKernels(cl_int *errcode_ret)
+{
+    /*-------------------------------------------------------------------------
+    * We should never go here
+    *------------------------------------------------------------------------*/
+    if (p_device_dependent.size() == 0) return std::vector<Kernel *>();
+
+    /*
+    * Resurrect any released kernels back to the kernel list. This handles the
+    * case where clCreateKernelsInProgram() is asking only for a count of kernels in
+    * the currently built program. In that case, kernelList.size() must be the actual
+    * number of kernels compiled into the program (even if they were previously released).
+    *
+    * All entries are moved in one step: erasing element i while also
+    * incrementing i skipped every second released kernel.
+    */
+    kernelList.insert(kernelList.end(),
+                      kernelReleasedList.begin(), kernelReleasedList.end());
+    kernelReleasedList.clear();
+
+    if (kernelList.size()) return kernelList;
+
+    /*-------------------------------------------------------------------------
+    * Take the list of kernels for the first device dependent
+    *------------------------------------------------------------------------*/
+    DeviceDependent &dep = p_device_dependent[0];
+    const std::vector<llvm::Function *> &kernels = kernelFunctions(dep);
+
+    /*-------------------------------------------------------------------------
+    * Create the kernel for each function name
+    * It returns an error if the signature is not the same for every device
+    * or if the kernel isn't found on all the devices.
+    *------------------------------------------------------------------------*/
+    for (size_t i=0; i < kernels.size(); ++i)
+    {
+        cl_int result = CL_SUCCESS;
+        Kernel *kernel = createKernel(kernels[i]->getName().str(), &result);
+
+        if (result == CL_SUCCESS)
+        {
+            // NOTE(review): the visible tail of createKernel() also appends
+            // to kernelList on success - confirm this does not add the same
+            // kernel twice.
+            kernelList.push_back(kernel);
+        }
+        else
+        {
+            *errcode_ret = result;
+            delete kernel;
+        }
+    }
+
+    return kernelList;
+}
+
+/*
+ * Concatenate `count` source strings into p_source and mark the program as
+ * a loaded, source-based program.
+ *
+ * A string's length comes from lengths[i] when `lengths` is given and the
+ * entry is non-zero, otherwise from strlen(). Trailing NUL bytes are
+ * stripped from each piece before it is appended.
+ *
+ * returns CL_INVALID_VALUE as soon as a NULL string pointer is met (pieces
+ *         appended before it stay in p_source), CL_SUCCESS otherwise.
+ */
+cl_int Program::loadSources(cl_uint count, const char **strings,
+                            const size_t *lengths)
+{
+    // Start from an empty source
+    p_source.clear();
+
+    for (cl_uint idx = 0; idx < count; ++idx)
+    {
+        const char *text = strings[idx];
+
+        if (text == NULL)
+            return CL_INVALID_VALUE;
+
+        // An explicit non-zero length wins; otherwise the string is
+        // NUL-terminated
+        const bool has_length = (lengths != NULL) && (lengths[idx] != 0);
+        size_t n = has_length ? lengths[idx] : std::strlen(text);
+
+        // Applications sometimes include the terminator(s) in the length;
+        // embedded NULs are not wanted in the merged source
+        while (n > 0 && text[n - 1] == 0)
+            --n;
+
+        p_source.append(text, n);
+    }
+
+    /*-------------------------------------------------------------------------
+    * temporary for source file cacheing, remove from product releases
+    *------------------------------------------------------------------------*/
+    //source_cache::instance()->remember(p_source);
+
+    p_type = Source;
+    p_state = Loaded;
+
+    return CL_SUCCESS;
+}
+
+/*
+ * Load one binary per device and parse it into an LLVM module.
+ *
+ * Each binary is either native code with embedded LLVM bitcode (detected by
+ * DeviceProgram::ExtractMixedBinary) or plain LLVM bitcode; the per-device
+ * is_native_binary flag records which one it was.
+ *
+ * binary_status  optional; entry i receives CL_SUCCESS, CL_INVALID_VALUE
+ *                (NULL or zero-length binary) or CL_INVALID_BINARY
+ *                (bitcode could not be parsed)
+ * returns        CL_SUCCESS, or the error of the first failing binary
+ *                (processing stops there)
+ */
+cl_int Program::loadBinaries(const unsigned char **data, const size_t *lengths,
+                             cl_int *binary_status, cl_uint num_devices,
+                             DeviceInterface * const*device_list)
+{
+    // Set device infos
+    setDevices(num_devices, device_list);
+
+    // Load the data
+    for (cl_uint i=0; i<num_devices; ++i)
+    {
+        // A NULL or empty binary cannot be copied or parsed; building a
+        // std::string from a NULL pointer is undefined behaviour.
+        if (!data[i] || lengths[i] == 0)
+        {
+            if (binary_status) binary_status[i] = CL_INVALID_VALUE;
+            return CL_INVALID_VALUE;
+        }
+
+        DeviceDependent &dep = deviceDependent(device_list[i]);
+        dep.unlinked_binary = std::string((const char *)data[i], lengths[i]);
+        dep.is_native_binary = true;
+
+        /*--------------------------------------------------------------------
+        * Loaded binary is either native code with LLVM bitcode embedded,
+        * or LLVM bitcode itself
+        *--------------------------------------------------------------------*/
+        std::string bitcode;
+        if (! dep.program->ExtractMixedBinary(&dep.unlinked_binary, &bitcode,
+                                              NULL))
+        {
+            bitcode = dep.unlinked_binary;
+            dep.is_native_binary = false;
+        }
+
+        const llvm::StringRef s_data(bitcode);
+        const llvm::StringRef s_name("<binary>");
+
+        llvm::MemoryBuffer *buffer = llvm::MemoryBuffer::getMemBuffer(
+                                         s_data, s_name, false);
+
+        if (!buffer)
+            return CL_OUT_OF_HOST_MEMORY;
+
+        // Make a module of it
+        ErrorOr<Module *> ModuleOrErr = parseBitcodeFile(buffer,
+                                            llvm::getGlobalContext());
+        if (ModuleOrErr) {
+            dep.linked_module = ModuleOrErr.get();
+        }
+        else {
+            dep.linked_module = NULL;
+            // Report the same code per binary as the function returns
+            if (binary_status) binary_status[i] = CL_INVALID_BINARY;
+            return CL_INVALID_BINARY;
+        }
+
+        if (binary_status) binary_status[i] = CL_SUCCESS;
+    }
+
+    p_type = Binary;
+    p_state = Loaded;
+
+    return CL_SUCCESS;
+}
+
+/*
+ * Build the program for its devices.
+ *
+ * For each device-dependent entry this:
+ *   - compiles p_source when the program type is Source,
+ *   - links the embedded stdlib bitcode when the binary is not native and
+ *     the device program asks for it (linkStdLib()),
+ *   - for non-native binaries, internalizes everything but the kernels and
+ *     runs the optimization passes,
+ *   - hands the module to the device program's build().
+ *
+ * p_state is set to Failed on entry and to Built only on full success.
+ * pfn_notify, when given, is called once: before returning a compile, link
+ * or device-build failure, or at the end on success. It is not called on
+ * the CL_OUT_OF_HOST_MEMORY path.
+ *
+ * returns CL_SUCCESS, CL_INVALID_BUILD_OPTIONS, CL_BUILD_PROGRAM_FAILURE or
+ *         CL_OUT_OF_HOST_MEMORY
+ */
+cl_int Program::build(const char *options,
+                      void (CL_CALLBACK *pfn_notify)(cl_program program,
+                                                     void *user_data),
+                      void *user_data, cl_uint num_devices,
+                      DeviceInterface * const*device_list)
+{
+    // If we've already built this program and are re-building
+    // (for example, with different user options) then clear out the
+    // device dependent information in preparation for building again.
+    if( p_state == Built) resetDeviceDependent();
+
+    p_state = Failed;
+
+    // Set device infos
+    if (!p_device_dependent.size())
+    {
+        setDevices(num_devices, device_list);
+    }
+
+    // ASW TODO - optimize to compile for each device type only once.
+    // NOTE(review): device_list is indexed up to p_device_dependent.size(),
+    // not num_devices - confirm callers always pass a list at least that long.
+    for (cl_uint i=0; i<p_device_dependent.size(); ++i)
+    {
+        DeviceDependent &dep = deviceDependent(device_list[i]);
+
+        // Do we need to compile the source for each device ?
+        if (p_type == Source)
+        {
+            // Load source
+            const llvm::StringRef s_data(p_source);
+            const llvm::StringRef s_name("<source>");
+
+            llvm::MemoryBuffer *buffer = llvm::MemoryBuffer::getMemBuffer(
+                                             s_data, s_name);
+
+            // Compile
+            int compile_result = dep.compiler->compile(options ? options : std::string(), buffer);
+            if (compile_result)
+            {
+                if (pfn_notify)
+                    pfn_notify((cl_program)this, user_data);
+                if (compile_result == CL_INVALID_BUILD_OPTIONS)
+                    return CL_INVALID_BUILD_OPTIONS;
+                else
+                    return CL_BUILD_PROGRAM_FAILURE;
+            }
+
+            // Get module and its bitcode
+            dep.linked_module = dep.compiler->module();
+
+            llvm::raw_string_ostream ostream(dep.unlinked_binary);
+            llvm::WriteBitcodeToFile(dep.linked_module, ostream);
+            ostream.flush();
+        }
+
+        // Link p_linked_module with the stdlib if the device needs that
+        if (! dep.is_native_binary && dep.program->linkStdLib())
+        {
+            // Load the stdlib bitcode
+            const llvm::StringRef s_data(embed_stdlib_c_bc,
+                                         sizeof(embed_stdlib_c_bc) - 1);
+            const llvm::StringRef s_name("stdlib.bc");
+            std::string errMsg;
+
+            llvm::MemoryBuffer *buffer = llvm::MemoryBuffer::getMemBuffer(
+                                             s_data, s_name, false);
+
+            if (!buffer)
+                return CL_OUT_OF_HOST_MEMORY;
+
+            ErrorOr<Module *> ModuleOrErr =
+                parseBitcodeFile(buffer, llvm::getGlobalContext());
+            Module *stdlib = NULL;
+            if (ModuleOrErr) {
+                stdlib = ModuleOrErr.get();
+            }
+            else {
+                std::error_code EC = ModuleOrErr.getError();
+                errMsg = EC.message();
+            }
+
+            // Link
+            if (!stdlib ||
+                llvm::Linker::LinkModules(dep.linked_module, stdlib,
+                                          llvm::Linker::DestroySource, &errMsg))
+            {
+                dep.compiler->appendLog("link error: ");
+                dep.compiler->appendLog(errMsg);
+                dep.compiler->appendLog("\n");
+
+                // DEBUG
+                std::cout << dep.compiler->log() << std::endl;
+
+                if (pfn_notify)
+                    pfn_notify((cl_program)this, user_data);
+
+                return CL_BUILD_PROGRAM_FAILURE;
+            }
+        }
+
+        if (! dep.is_native_binary)
+        {
+            // Get list of kernels to strip other unused functions
+            std::vector<const char *> api;
+            std::vector<std::string> api_s; // Owns the characters api points to
+            const std::vector<llvm::Function *> &kernels = kernelFunctions(dep);
+
+            // Collect all names first ...
+            for (size_t j=0; j<kernels.size(); ++j)
+                api_s.push_back(kernels[j]->getName().str());
+
+            // ... and take the C-string pointers only once api_s no longer
+            // grows. Pointing at a loop-local temporary, or at an element
+            // of a vector that later reallocates, leaves api dangling by
+            // the time createInternalizePass() reads it.
+            for (size_t j=0; j<api_s.size(); ++j)
+                api.push_back(api_s[j].c_str());
+
+            // determine if module has barrier() function calls
+            bool hasBarrier = false;
+            llvm::CallInst* call;
+            for (llvm::Module::iterator F = dep.linked_module->begin(),
+                 EF = dep.linked_module->end(); !hasBarrier && F != EF; ++F)
+                for (llvm::inst_iterator I = inst_begin(*F),
+                     E = inst_end(*F); I != E; ++I)
+                {
+                    if (!(call = llvm::dyn_cast<llvm::CallInst>(&*I))) continue;
+                    if (!call->getCalledFunction()) continue;
+                    std::string name(call->getCalledFunction()->getName());
+                    if (name == "barrier")
+                    {
+                        hasBarrier = true;
+                        break;
+                    }
+                }
+
+            // Optimize code
+            llvm::PassManager *manager = new llvm::PassManager();
+
+            // Common passes (primary goal : remove unused stdlib functions)
+            manager->add(llvm::createTypeBasedAliasAnalysisPass());
+            manager->add(llvm::createBasicAliasAnalysisPass());
+            manager->add(llvm::createInternalizePass(api));
+            manager->add(llvm::createIPSCCPPass());
+            manager->add(llvm::createGlobalOptimizerPass());
+            manager->add(llvm::createConstantMergePass());
+            manager->add(llvm::createAlwaysInlinerPass());
+
+            dep.program->createOptimizationPasses(manager,
+                                                  dep.compiler->optimize(), hasBarrier);
+
+            manager->add(llvm::createGlobalDCEPass());
+
+            manager->run(*dep.linked_module);
+            delete manager;
+        }
+
+        // Now that the LLVM module is built, build the device-specific
+        // representation
+        if (!dep.program->build(dep.linked_module, &dep.unlinked_binary))
+        {
+            if (pfn_notify)
+                pfn_notify((cl_program)this, user_data);
+
+            return CL_BUILD_PROGRAM_FAILURE;
+        }
+    }
+
+    // TODO: Asynchronous compile
+    if (pfn_notify)
+        pfn_notify((cl_program)this, user_data);
+
+    p_state = Built;
+
+    return CL_SUCCESS;
+}
+
+// Returns the stored program type (p_type); loadSources() sets it to Source
+// and loadBinaries() sets it to Binary.
+Program::Type Program::type() const
+{
+ return p_type;
+}
+
+// Returns the stored program state (p_state): Loaded after a successful
+// load, Failed while or after a failing build(), Built after a successful one.
+Program::State Program::state() const
+{
+ return p_state;
+}
+
+/*
+ * Answer a clGetProgramInfo()-style query.
+ *
+ * param_name            which property to return
+ * param_value_size      size in bytes of the buffer behind param_value
+ * param_value           destination buffer, may be NULL to query the size
+ * param_value_size_ret  optional, receives the size of the answer
+ *
+ * CL_PROGRAM_NUM_DEVICES and CL_PROGRAM_DEVICES are forwarded to the parent
+ * context while the program has no device-dependent data.
+ * CL_PROGRAM_BINARIES is special: param_value is an array of caller-allocated
+ * buffers (one per device) that receive the binaries; NULL entries are
+ * skipped.
+ *
+ * returns CL_SUCCESS, or CL_INVALID_VALUE for an unknown param_name or a
+ *         destination buffer that is too small
+ */
+cl_int Program::info(cl_program_info param_name,
+                     size_t param_value_size,
+                     void *param_value,
+                     size_t *param_value_size_ret) const
+{
+    void *value = 0;
+    size_t value_length = 0;
+    llvm::SmallVector<size_t, 4> binary_sizes;
+    llvm::SmallVector<DeviceInterface *, 4> devices;
+
+    // Storage used by SIMPLE_ASSIGN (it writes <type>_var and points value at it)
+    union {
+        cl_uint cl_uint_var;
+        cl_context cl_context_var;
+    };
+
+    switch (param_name)
+    {
+        case CL_PROGRAM_REFERENCE_COUNT:
+            SIMPLE_ASSIGN(cl_uint, references());
+            break;
+
+        case CL_PROGRAM_NUM_DEVICES:
+            // Use devices associated with any built kernels, otherwise use
+            // the devices associated with the program context
+            if (p_device_dependent.size() != 0)
+            { SIMPLE_ASSIGN(cl_uint, p_device_dependent.size()); }
+            else
+                return ((Context *)parent())->info(CL_CONTEXT_NUM_DEVICES,
+                           param_value_size, param_value, param_value_size_ret);
+            break;
+
+        case CL_PROGRAM_DEVICES:
+            // Use devices associated with any built kernels, otherwise use
+            // the devices associated with the program context
+            if (p_device_dependent.size() != 0)
+            {
+                for (size_t i=0; i<p_device_dependent.size(); ++i)
+                {
+                    const DeviceDependent &dep = p_device_dependent[i];
+
+                    devices.push_back(dep.device);
+                }
+
+                value = devices.data();
+                value_length = devices.size() * sizeof(DeviceInterface *);
+            }
+            else
+                return ((Context *)parent())->info(CL_CONTEXT_DEVICES,
+                           param_value_size, param_value, param_value_size_ret);
+            break;
+
+        case CL_PROGRAM_CONTEXT:
+            SIMPLE_ASSIGN(cl_context, parent());
+            break;
+
+        case CL_PROGRAM_SOURCE:
+            // +1: the terminating NUL is part of the answer
+            MEM_ASSIGN(p_source.size() + 1, p_source.c_str());
+            break;
+
+        case CL_PROGRAM_BINARY_SIZES:
+            for (size_t i=0; i<p_device_dependent.size(); ++i)
+            {
+                const DeviceDependent &dep = p_device_dependent[i];
+
+                binary_sizes.push_back(dep.unlinked_binary.size());
+            }
+
+            value = binary_sizes.data();
+            value_length = binary_sizes.size() * sizeof(size_t);
+            break;
+
+        case CL_PROGRAM_BINARIES:
+        {
+            // Special case : param_value points to an array of p_num_devices
+            // application-allocated unsigned char* pointers. Check it's good
+            // and std::memcpy the data
+
+            unsigned char **binaries = (unsigned char **)param_value;
+            value_length = p_device_dependent.size() * sizeof(unsigned char *);
+
+            // A pointer array that is too small is an error, like for every
+            // other query; it used to be reported as success without any
+            // data being copied.
+            if (param_value && param_value_size < value_length)
+                return CL_INVALID_VALUE;
+
+            if (param_value)
+                for (size_t i=0; i<p_device_dependent.size(); ++i)
+                {
+                    const DeviceDependent &dep = p_device_dependent[i];
+                    unsigned char *dest = binaries[i];
+
+                    if (!dest)
+                        continue;
+
+                    std::memcpy(dest, dep.unlinked_binary.data(),
+                                dep.unlinked_binary.size());
+                }
+
+            if (param_value_size_ret)
+                *param_value_size_ret = value_length;
+
+            return CL_SUCCESS;
+        }
+
+        default:
+            return CL_INVALID_VALUE;
+    }
+
+    if (param_value && param_value_size < value_length)
+        return CL_INVALID_VALUE;
+
+    if (param_value_size_ret)
+        *param_value_size_ret = value_length;
+
+    if (param_value)
+        std::memcpy(param_value, value, value_length);
+
+    return CL_SUCCESS;
+}
+
+/*
+ * Answer a per-device build query (build status, build options, build log).
+ *
+ * device                device whose build information is requested
+ * param_name            which property to return
+ * param_value_size      size in bytes of the buffer behind param_value
+ * param_value           destination buffer, may be NULL to query the size
+ * param_value_size_ret  optional, receives the size of the answer
+ *
+ * returns CL_SUCCESS, or CL_INVALID_VALUE for an unknown param_name or a
+ *         destination buffer that is too small
+ */
+cl_int Program::buildInfo(DeviceInterface *device,
+ cl_program_build_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const
+{
+ const void *value = 0;
+ size_t value_length = 0;
+ // NOTE(review): dep.compiler is dereferenced below without a check -
+ // confirm deviceDependent() never yields an entry with a null compiler
+ // for a device the program does not know.
+ const DeviceDependent &dep = deviceDependent(device);
+
+ // Storage used by SIMPLE_ASSIGN (it writes <type>_var and points value at it)
+ union {
+ cl_build_status cl_build_status_var;
+ };
+
+ switch (param_name)
+ {
+ case CL_PROGRAM_BUILD_STATUS:
+ // The status is derived from the program-wide state, not from dep
+ switch (p_state)
+ {
+ case Empty:
+ case Loaded:
+ SIMPLE_ASSIGN(cl_build_status, CL_BUILD_NONE);
+ break;
+ case Built:
+ SIMPLE_ASSIGN(cl_build_status, CL_BUILD_SUCCESS);
+ break;
+ case Failed:
+ SIMPLE_ASSIGN(cl_build_status, CL_BUILD_ERROR);
+ break;
+ // TODO: CL_BUILD_IN_PROGRESS
+ }
+ break;
+
+ case CL_PROGRAM_BUILD_OPTIONS:
+ // +1: the terminating NUL is part of the answer
+ value = dep.compiler->options().c_str();
+ value_length = dep.compiler->options().size() + 1;
+ break;
+
+ case CL_PROGRAM_BUILD_LOG:
+ // +1: the terminating NUL is part of the answer
+ value = dep.compiler->log().c_str();
+ value_length = dep.compiler->log().size() + 1;
+ break;
+
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ // Common tail: size check, size report, copy
+ if (param_value && param_value_size < value_length)
+ return CL_INVALID_VALUE;
+
+ if (param_value_size_ret)
+ *param_value_size_ret = value_length;
+
+ if (param_value)
+ std::memcpy(param_value, value, value_length);
+
+ return CL_SUCCESS;
+}
diff --git a/src/core/program.h b/src/core/program.h
new file mode 100644
index 0000000..a06b452
--- /dev/null
+++ b/src/core/program.h
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file core/program.h
+ * \brief Program
+ */
+
+#ifndef __PROGRAM_H__
+#define __PROGRAM_H__
+
+#include "object.h"
+
+#include <CL/cl.h>
+#include <string>
+#include <vector>
+
+namespace llvm
+{
+ class MemoryBuffer;
+ class Module;
+ class Function;
+}
+
+namespace Coal
+{
+
+class Context;
+class Compiler;
+class DeviceInterface;
+class DeviceProgram;
+class Kernel;
+
+/**
+ * \brief Program object
+ *
+ * This class compiles and links a source or binaries into LLVM modules for each
+ * \c Coal::DeviceInterface for which the program is built.
+ *
+ * It then contains functions to get the list of kernels available in the
+ * program, using \c Coal::Kernel objects.
+ */
+class Program : public Object
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param ctx parent \c Coal::Context
+ */
+ Program(Context *ctx);
+ ~Program();
+
+ /**
+ * \brief Program type
+ */
+ enum Type
+ {
+ Invalid, /*!< Invalid or unknown, type of a program not already loaded */
+ Source, /*!< Program made of sources that must be compiled and linked */
+ Binary /*!< Program made of pre-built binaries that only need to be (transformed)/linked */
+ };
+
+ /**
+ * \brief Program state
+ */
+ enum State
+ {
+ Empty, /*!< Just created */
+ Loaded, /*!< Source or binary loaded */
+ Built, /*!< Built */
+ Failed, /*!< Build failed */
+ };
+
+ /**
+ * \brief Load sources into the program
+ *
+ * This function loads the source-code given in \p strings into the
+ * program and sets its type to \c Source.
+ *
+ * \param count number of strings in \p strings
+ * \param strings array of pointers to strings, either null-terminated
+ * or of length given in \p lengths
+ * \param lengths lengths of the strings. If a field is 0, the
+ * corresponding string is null-terminated. If \p lengths is
+ * 0, all the strings are null-terminated
+ * \return \c CL_SUCCESS if success, an error code otherwise
+ */
+ cl_int loadSources(cl_uint count, const char **strings,
+ const size_t *lengths);
+
+ /**
+ * \brief Load binaries into the program
+ *
+ * This function allows client application to load a source, retrieve
+ * binaries using \c info() (\c CL_PROGRAM_BINARIES), and then re-create
+ * the same program (after a restart for example) by giving it a
+ * precompiled binary.
+ *
+ * This function loads the binaries for each device and parses them into
+ * LLVM modules, then sets the program type to \c Binary. Whether a
+ * device's binary was native code with embedded bitcode or plain LLVM
+ * bitcode is recorded per device (\c DeviceDependent::is_native_binary).
+ *
+ * \param data array of pointers to binaries, one for each device
+ * \param lengths lengths of the binaries pointed to by \p data
+ * \param binary_status array that will be filled by this function with
+ * the status of each loaded binary (\c CL_SUCCESS if success)
+ * \param num_devices number of devices for which a binary is loaded
+ * \param device_list list of devices for which the binaries are loaded
+ * \return \c CL_SUCCESS if success, an error code otherwise
+ */
+ cl_int loadBinaries(const unsigned char **data, const size_t *lengths,
+ cl_int *binary_status, cl_uint num_devices,
+ DeviceInterface * const*device_list);
+
+ /**
+ * \brief Build the program
+ *
+ * This function compiles the sources, if any, and then links the
+ * resulting binaries if the devices for which they are compiled ask
+ * \c Coal::Program to do so, using \c Coal::DeviceProgram::linkStdLib().
+ *
+ * \param options options to pass to the compiler, see the OpenCL
+ * specification.
+ * \param pfn_notify callback function called at the end of the build
+ * \param user_data user data given to \p pfn_notify
+ * \param num_devices number of devices for which binaries are being
+ * built. If it's a source-based program, this can be 0.
+ * \param device_list list of devices for which the program will be built.
+ * \return \c CL_SUCCESS if success, an error code otherwise
+ */
+ cl_int build(const char *options,
+ void (CL_CALLBACK *pfn_notify)(cl_program program,
+ void *user_data),
+ void *user_data, cl_uint num_devices,
+ DeviceInterface * const*device_list);
+
+ Type type() const; /*!< \brief Type of the program */
+ State state() const; /*!< \brief State of the program */
+
+ /**
+ * \brief Create a kernel given a \p name
+ * \param name name of the kernel to be created
+ * \param errcode_ret return code (\c CL_SUCCESS if success)
+ * \return a \c Coal::Kernel object corresponding to the given \p name
+ */
+ Kernel *createKernel(const std::string &name, cl_int *errcode_ret);
+
+ /**
+ * \brief Create kernels of the program and return the one named \p name
+ *
+ * Kernels already present in \c kernelList or \c kernelReleasedList are
+ * reused before any new kernel is created.
+ *
+ * \param name name of the kernel to be returned
+ * \param errcode_ret return code (\c CL_SUCCESS if success)
+ * \return a \c Coal::Kernel object corresponding to the given \p name
+ */
+ Kernel *createKernelsAndReturnKernel(const std::string &name, cl_int *errcode_ret);
+
+ /**
+ * \brief Create all the kernels of the program
+ * \param errcode_ret return code (\c CL_SUCCESS if success)
+ * \return the list of \c Coal::Kernel objects of this program
+ */
+ std::vector<Kernel *> createKernels(cl_int *errcode_ret);
+
+ /**
+ * \brief Device-specific program
+ * \param device device for which the device-specific program is needed
+ * \return the device-specific program requested, 0 if not found
+ */
+ DeviceProgram *deviceDependentProgram(DeviceInterface *device) const;
+ /**
+ * \brief Compiler options associated with \p device
+ * \note presumably the options of the device's \c Compiler; the
+ * definition is not in this header - verify against program.cpp
+ */
+ std::string deviceDependentCompilerOptions(DeviceInterface *device) const;
+
+ /**
+ * \brief Get information about this program
+ * \copydetails Coal::DeviceInterface::info
+ */
+ cl_int info(cl_program_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ /**
+ * \brief Get build info about this program (build status, options, log)
+ * \copydetails Coal::DeviceInterface::info
+ * \param device \c Coal::DeviceInterface for which info is needed
+ */
+ cl_int buildInfo(DeviceInterface *device,
+ cl_program_build_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ /** \brief Copy of the source text loaded by \c loadSources() */
+ std::string source() { return p_source; }
+
+ std::vector<Kernel *> kernelList; /*!< Kernels created for this program and currently in use */
+ std::vector<Kernel *> kernelReleasedList; /*!< Kernels kept for reuse; the createKernels*() functions move them back to \c kernelList */
+
+ private:
+ Type p_type; /*!< Set by \c loadSources() / \c loadBinaries() */
+ State p_state; /*!< Updated by the load functions and \c build() */
+ std::string p_source; /*!< Concatenated source strings */
+
+ /** \brief Per-device data of the program */
+ struct DeviceDependent
+ {
+ DeviceInterface * device; // device this entry belongs to
+ DeviceProgram * program; // device-specific program object
+ std::string unlinked_binary; // binary reported by CL_PROGRAM_BINARIES
+ bool is_native_binary; // llvm kernel bitcode vs final native binary
+ llvm::Module * linked_module; // module parsed from a binary or produced by the compiler
+ Compiler * compiler; // compiler used for this device (options, log)
+ };
+
+ std::vector<DeviceDependent> p_device_dependent; /*!< One entry per device */
+ DeviceDependent p_null_device_dependent; /*!< NOTE(review): presumably returned by \c deviceDependent() for an unknown device - confirm in program.cpp */
+
+ void setDevices(cl_uint num_devices, DeviceInterface * const*devices);
+ void resetDeviceDependent();
+ DeviceDependent &deviceDependent(DeviceInterface *device);
+ const DeviceDependent &deviceDependent(DeviceInterface *device) const;
+ std::vector<llvm::Function *> kernelFunctions(DeviceDependent &dep);
+};
+
+}
+
+// Global-namespace struct deriving from Coal::Program with no extra members;
+// gives the name _cl_program (presumably the target of the cl_program handle
+// typedef in CL/cl.h) a definition.
+struct _cl_program : public Coal::Program
+{};
+
+#endif
diff --git a/src/core/propertylist.h b/src/core/propertylist.h
new file mode 100644
index 0000000..8d32397
--- /dev/null
+++ b/src/core/propertylist.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file propertylist.h
+ * \brief Helper macros for \c info() functions
+ *
+ * The OpenCL API is full of functions like \c clGetXXXInfo(). They all take
+ * the same arguments and are handled the same way. This file contains macros
+ * easing the implementation of these info functions.
+ *
+ * One info function, using these macros, looks like this:
+ *
+ * \code
+ * cl_int Foo::info(cl_foo_info param_name,
+ * size_t param_value_size,
+ * void *param_value,
+ * size_t *param_value_size_ret) const
+ * {
+ * void *value = 0;
+ * size_t value_length = 0;
+ *
+ * union {
+ * cl_uint cl_uint_var;
+ * cl_context cl_context_var;
+ * };
+ *
+ * switch (param_name)
+ * {
+ * case CL_UINT_PARAM:
+ * SIMPLE_ASSIGN(cl_uint, the_value);
+ * break;
+ * case CL_CONTEXT_PARAM:
+ * SIMPLE_ASSIGN(cl_context, a_call());
+ * break;
+ * case CL_STRING_PARAM:
+ * STRING_ASSIGN("This is a string");
+ * break;
+ * case CL_BINARY_PARAM:
+ * MEM_ASSIGN(sizeof(something), something);
+ * break;
+ * default:
+ * return CL_INVALID_VALUE;
+ * }
+ *
+ * if (param_value && param_value_size < value_length)
+ * return CL_INVALID_VALUE;
+ *
+ * if (param_value_size_ret)
+ * *param_value_size_ret = value_length;
+ *
+ * if (param_value)
+ * std::memcpy(param_value, value, value_length);
+ *
+ * return CL_SUCCESS;
+ * }
+ * \endcode
+ */
+
+#ifndef __PROPERTYLIST_H__
+#define __PROPERTYLIST_H__
+
+/**
+ * \brief Assign a value of a given type to the return value
+ *
+ * Expects the locals \c value, \c value_length and a variable named
+ * <tt>type##_var</tt> (usually a member of an anonymous union) in scope.
+ *
+ * \note The expansion ends with its own semicolon; existing call sites such
+ *       as <tt>if (c) SIMPLE_ASSIGN(t, a) else SIMPLE_ASSIGN(t, b);</tt>
+ *       rely on it, so it is kept.
+ * \param type type of the argument
+ * \param _value value to assign (parenthesized in the expansion so that any
+ *        expression is cast as a whole)
+ */
+#define SIMPLE_ASSIGN(type, _value) do {    \
+    value_length = sizeof(type);            \
+    type##_var = (type)(_value);            \
+    value = & type##_var;                   \
+} while (0);
+
+/**
+ * \brief Assign a string to the return value
+ *
+ * The string is copied into a function-local <tt>static const char[]</tt>,
+ * so \c value stays valid after the enclosing block ends and
+ * \c value_length includes the terminating NUL (it is \c sizeof the array).
+ *
+ * \note The expansion ends with its own semicolon.
+ * \param string the string to assign, as a constant (must be a string
+ *        literal: it initializes a char array)
+ */
+#define STRING_ASSIGN(string) do { \
+ static const char str[] = string; \
+ value_length = sizeof(str); \
+ value = (void *)str; \
+} while (0);
+
+/**
+ * \brief Assign a memory buffer to the return value
+ *
+ * Expects the locals \c value and \c value_length in scope.
+ *
+ * \note the buffer must remain valid after the end of the \c info() call
+ * \note The expansion ends with its own semicolon.
+ * \param size size of the buffer (parenthesized in the expansion)
+ * \param buf buffer (of type <tt>void *</tt> for instance; parenthesized in
+ *        the expansion so the cast applies to the whole expression)
+ */
+#define MEM_ASSIGN(size, buf) do {  \
+    value_length = (size);          \
+    value = (void *)(buf);          \
+} while (0);
+
+#endif
diff --git a/src/core/sampler.cpp b/src/core/sampler.cpp
new file mode 100644
index 0000000..71fca86
--- /dev/null
+++ b/src/core/sampler.cpp
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file core/sampler.cpp
+ * \brief Sampler
+ */
+
+#include "sampler.h"
+#include "context.h"
+#include "deviceinterface.h"
+#include "propertylist.h"
+
+#include <cstring>
+#include <cstdlib>
+
+using namespace Coal;
+
+/*
+ * Build a sampler from the host-side OpenCL enums.
+ *
+ * The three settings are translated to their CLK_* bits and OR-ed into
+ * p_bitfield. An unknown addressing or filter mode stores CL_INVALID_VALUE
+ * in *errcode_ret and stops the construction there; otherwise *errcode_ret
+ * receives the result of checkImageAvailability().
+ */
+Sampler::Sampler(Context *ctx,
+                 cl_bool normalized_coords,
+                 cl_addressing_mode addressing_mode,
+                 cl_filter_mode filter_mode,
+                 cl_int *errcode_ret)
+: Object(Object::T_Sampler, ctx), p_bitfield(0)
+{
+    // Coordinate normalization
+    p_bitfield |= normalized_coords ? CLK_NORMALIZED_COORDS_TRUE
+                                    : CLK_NORMALIZED_COORDS_FALSE;
+
+    // Addressing mode
+    unsigned int address_bits = 0;
+
+    switch (addressing_mode)
+    {
+        case CL_ADDRESS_NONE:            address_bits = CLK_ADDRESS_NONE;            break;
+        case CL_ADDRESS_MIRRORED_REPEAT: address_bits = CLK_ADDRESS_MIRRORED_REPEAT; break;
+        case CL_ADDRESS_REPEAT:          address_bits = CLK_ADDRESS_REPEAT;          break;
+        case CL_ADDRESS_CLAMP_TO_EDGE:   address_bits = CLK_ADDRESS_CLAMP_TO_EDGE;   break;
+        case CL_ADDRESS_CLAMP:           address_bits = CLK_ADDRESS_CLAMP;           break;
+
+        default:
+            *errcode_ret = CL_INVALID_VALUE;
+            return;
+    }
+
+    p_bitfield |= address_bits;
+
+    // Filter mode
+    unsigned int filter_bits = 0;
+
+    switch (filter_mode)
+    {
+        case CL_FILTER_NEAREST: filter_bits = CLK_FILTER_NEAREST; break;
+        case CL_FILTER_LINEAR:  filter_bits = CLK_FILTER_LINEAR;  break;
+
+        default:
+            *errcode_ret = CL_INVALID_VALUE;
+            return;
+    }
+
+    p_bitfield |= filter_bits;
+
+    // Samplers only make sense if every device of the context has images
+    *errcode_ret = checkImageAvailability();
+}
+
+// Build a sampler directly from an already-encoded CLK_* bitfield.
+// The bitfield is stored as given (no validation here), and the result of
+// checkImageAvailability() is discarded: this constructor has no way to
+// report an error.
+Sampler::Sampler(Context *ctx, unsigned int bitfield)
+: Object(Object::T_Sampler, ctx), p_bitfield(bitfield)
+{
+ checkImageAvailability();
+}
+
+/*
+ * Check that every device of the parent context supports images.
+ *
+ * returns CL_SUCCESS when all devices report CL_DEVICE_IMAGE_SUPPORT,
+ *         CL_INVALID_OPERATION when one does not,
+ *         CL_OUT_OF_HOST_MEMORY when the temporary device list cannot be
+ *         allocated, or the error of the first failing info() query.
+ */
+cl_int Sampler::checkImageAvailability() const
+{
+    Context *context = (Context *)parent();
+    cl_uint device_count;
+
+    // How many devices does the context have ?
+    cl_int status = context->info(CL_CONTEXT_NUM_DEVICES,
+                                  sizeof(unsigned int),
+                                  &device_count, 0);
+
+    if (status != CL_SUCCESS)
+        return status;
+
+    // Temporary array receiving the device pointers
+    const size_t list_bytes = device_count * sizeof(DeviceInterface *);
+    DeviceInterface **device_list = (DeviceInterface **)std::malloc(list_bytes);
+
+    if (!device_list)
+        return CL_OUT_OF_HOST_MEMORY;
+
+    status = context->info(CL_CONTEXT_DEVICES, list_bytes, device_list, 0);
+
+    // The loop stops at the first query error or at the first device
+    // without image support; status then carries the code to return.
+    for (unsigned int i = 0; status == CL_SUCCESS && i < device_count; ++i)
+    {
+        cl_bool has_images;
+
+        status = device_list[i]->info(CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool),
+                                      &has_images, 0);
+
+        if (status == CL_SUCCESS && !has_images)
+            status = CL_INVALID_OPERATION;
+    }
+
+    // Single release point for the temporary list
+    std::free((void *)device_list);
+
+    return status;
+}
+
+// Returns the sampler settings encoded as CLK_* bits (p_bitfield).
+unsigned int Sampler::bitfield() const
+{
+ return p_bitfield;
+}
+
+/*
+ * Answer a clGetSamplerInfo()-style query by decoding p_bitfield.
+ *
+ * param_name            which property to return
+ * param_value_size      size in bytes of the buffer behind param_value
+ * param_value           destination buffer, may be NULL to query the size
+ * param_value_size_ret  optional, receives the size of the answer
+ *
+ * returns CL_SUCCESS, or CL_INVALID_VALUE for an unknown param_name or a
+ *         destination buffer that is too small
+ */
+cl_int Sampler::info(cl_sampler_info param_name,
+                     size_t param_value_size,
+                     void *param_value,
+                     size_t *param_value_size_ret) const
+{
+    void *value = 0;
+    size_t value_length = 0;
+
+    // Storage used by SIMPLE_ASSIGN (it writes <type>_var and points value at it)
+    union {
+        cl_uint cl_uint_var;
+        cl_context cl_context_var;
+        cl_bool cl_bool_var;
+        cl_addressing_mode cl_addressing_mode_var;
+        cl_filter_mode cl_filter_mode_var;
+    };
+
+    switch (param_name)
+    {
+        case CL_SAMPLER_REFERENCE_COUNT:
+            SIMPLE_ASSIGN(cl_uint, references());
+            break;
+
+        case CL_SAMPLER_CONTEXT:
+            SIMPLE_ASSIGN(cl_context, parent());
+            break;
+
+        case CL_SAMPLER_NORMALIZED_COORDS:
+            // Braced so the statement is valid whether or not the macro
+            // expansion carries its own trailing semicolon
+            if (p_bitfield & CLK_NORMALIZED_COORDS_MASK)
+            {
+                SIMPLE_ASSIGN(cl_bool, true);
+            }
+            else
+            {
+                SIMPLE_ASSIGN(cl_bool, false);
+            }
+            break;
+
+        case CL_SAMPLER_ADDRESSING_MODE:
+            switch (p_bitfield & CLK_ADDRESS_MODE_MASK)
+            {
+                case CLK_ADDRESS_CLAMP:
+                    SIMPLE_ASSIGN(cl_addressing_mode, CL_ADDRESS_CLAMP);
+                    break;
+                case CLK_ADDRESS_CLAMP_TO_EDGE:
+                    SIMPLE_ASSIGN(cl_addressing_mode, CL_ADDRESS_CLAMP_TO_EDGE);
+                    break;
+                case CLK_ADDRESS_MIRRORED_REPEAT:
+                    SIMPLE_ASSIGN(cl_addressing_mode, CL_ADDRESS_MIRRORED_REPEAT);
+                    break;
+                case CLK_ADDRESS_REPEAT:
+                    SIMPLE_ASSIGN(cl_addressing_mode, CL_ADDRESS_REPEAT);
+                    break;
+                case CLK_ADDRESS_NONE:
+                    SIMPLE_ASSIGN(cl_addressing_mode, CL_ADDRESS_NONE);
+                    break;
+            }
+            break;
+
+        case CL_SAMPLER_FILTER_MODE:
+            switch (p_bitfield & CLK_FILTER_MASK)
+            {
+                case CLK_FILTER_LINEAR:
+                    SIMPLE_ASSIGN(cl_filter_mode, CL_FILTER_LINEAR);
+                    break;
+                case CLK_FILTER_NEAREST:
+                    SIMPLE_ASSIGN(cl_filter_mode, CL_FILTER_NEAREST);
+                    break;
+            }
+            // Without this break the case fell through to default and the
+            // filter mode query always failed with CL_INVALID_VALUE.
+            break;
+
+        default:
+            return CL_INVALID_VALUE;
+    }
+
+    if (param_value && param_value_size < value_length)
+        return CL_INVALID_VALUE;
+
+    if (param_value_size_ret)
+        *param_value_size_ret = value_length;
+
+    if (param_value)
+        std::memcpy(param_value, value, value_length);
+
+    return CL_SUCCESS;
+}
diff --git a/src/core/sampler.h b/src/core/sampler.h
new file mode 100644
index 0000000..1ff1f1f
--- /dev/null
+++ b/src/core/sampler.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file sampler.h
+ * \brief Sampler object
+ */
+
+#ifndef __SAMPLER_H__
+#define __SAMPLER_H__
+
+#include <CL/cl.h>
+#include "object.h"
+
+// WARNING: Keep in sync with stdlib.h
+//
+// Layout of the sampler bitfield: each property occupies its own hex digit,
+// selected by the corresponding *_MASK defined at the end of this list.
+
+// Coordinate normalization (digit selected by CLK_NORMALIZED_COORDS_MASK).
+#define CLK_NORMALIZED_COORDS_FALSE 0x00000000
+#define CLK_NORMALIZED_COORDS_TRUE 0x00000001
+// Addressing mode (digit selected by CLK_ADDRESS_MODE_MASK).
+#define CLK_ADDRESS_NONE 0x00000000
+#define CLK_ADDRESS_MIRRORED_REPEAT 0x00000010
+#define CLK_ADDRESS_REPEAT 0x00000020
+#define CLK_ADDRESS_CLAMP_TO_EDGE 0x00000030
+#define CLK_ADDRESS_CLAMP 0x00000040
+// Filter mode (digit selected by CLK_FILTER_MASK).
+#define CLK_FILTER_NEAREST 0x00000000
+#define CLK_FILTER_LINEAR 0x00000100
+
+// Masks isolating one property each from the bitfield.
+#define CLK_NORMALIZED_COORDS_MASK 0x0000000f
+#define CLK_ADDRESS_MODE_MASK 0x000000f0
+#define CLK_FILTER_MASK 0x00000f00
+
+namespace Coal
+{
+
+class Context;
+
+/**
+ * \brief Sampler
+ *
+ * This object doesn't do anything interesting, it only converts a set of
+ * host OpenCL constants to constants that will be used by the kernels and
+ * the image reading and writing built-in functions.
+ */
+class Sampler : public Object
+{
+ public:
+ /**
+ * \brief Constructor
+ * \param ctx parent \c Coal::Context
+ * \param normalized_coords true if the coords given to the built-in
+ * image functions are normalized, false otherwise
+ * \param addressing_mode addressing mode used to read images
+ * \param filter_mode filter mode used to read images
+ * \param errcode_ret return code (\c CL_SUCCESS if all is good)
+ */
+ Sampler(Context *ctx,
+ cl_bool normalized_coords,
+ cl_addressing_mode addressing_mode,
+ cl_filter_mode filter_mode,
+ cl_int *errcode_ret);
+
+ /**
+ * \brief Simpler constructor
+ * \param ctx parent \c Coal::Context
+ * \param bitfield bitfield already calculated
+ */
+ Sampler(Context *ctx,
+ unsigned int bitfield);
+
+ unsigned int bitfield() const; /*!< \brief Bitfield value usable by the kernels */
+
+ /**
+ * \brief Get information about the sampler
+ * \copydetails Coal::DeviceInterface::info
+ */
+ cl_int info(cl_sampler_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret) const;
+
+ private:
+ unsigned int p_bitfield; /*!< \brief Combination of the CLK_* constants, decoded by info() through the CLK_*_MASK values */
+
+ // NOTE(review): definition is not in this header; judging by the name it
+ // checks whether images are supported before a sampler is used - confirm.
+ cl_int checkImageAvailability() const;
+};
+
+}
+
+// Defines _cl_sampler as a Coal::Sampler with no additional members.
+// NOTE(review): presumably the type behind the cl_sampler handle declared in
+// <CL/cl.h>, so handles can be used as Coal::Sampler objects - confirm.
+struct _cl_sampler : public Coal::Sampler
+{};
+
+#endif
diff --git a/src/core/util.cpp b/src/core/util.cpp
new file mode 100644
index 0000000..afeb564
--- /dev/null
+++ b/src/core/util.cpp
@@ -0,0 +1,68 @@
+/******************************************************************************
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+/**
+ * \file core/util.cpp
+ * \brief misc utils
+ */
+
+#include <stdint.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "util.h"
+
+/******************************************************************************
+* Parse first line in a file, read integer following a string
+*
+* fname        path of the file to read; only its first line is examined
+* sname        marker string to look for in that line
+* default_val  value returned when nothing usable is found
+*
+* Returns the first run of decimal digits that appears after sname on the
+* first line.  default_val is returned when the file cannot be opened, the
+* line cannot be read, sname is absent, no digit follows it, or the number
+* does not fit in 32 bits.
+******************************************************************************/
+uint32_t parse_file_line_value(const char *fname, const char *sname,
+                               uint32_t default_val)
+{
+    uint32_t  val  = default_val;
+    char     *line = NULL;   // allocated by getline(), released below
+    size_t    len  = 0;
+
+    FILE *fp = fopen(fname, "r");
+    if (fp == NULL) return val;
+
+    if (getline(&line, &len, fp) != -1)
+    {
+        char *str = strstr(line, sname);
+        if (str != NULL)
+        {
+            str += strlen(sname);
+
+            // <ctype.h> functions require a value representable as
+            // unsigned char; a plain (possibly negative) char is undefined.
+            while (*str != '\0' && !isdigit((unsigned char)*str)) str++;
+
+            if (*str != '\0')
+            {
+                // strtoull has defined behaviour on overflow (atoi does
+                // not); out-of-range values keep the default.
+                unsigned long long parsed = strtoull(str, NULL, 10);
+                if (parsed <= 0xFFFFFFFFull) val = (uint32_t)parsed;
+            }
+        }
+    }
+
+    fclose(fp);
+    free(line);   // free(NULL) is a no-op, no guard needed
+    return val;
+}
+
diff --git a/src/core/util.h b/src/core/util.h
new file mode 100644
index 0000000..f2c1609
--- /dev/null
+++ b/src/core/util.h
@@ -0,0 +1,41 @@
+/******************************************************************************
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+/**
+ * \file core/util.h
+ * \brief misc utils
+ */
+
+#ifndef _UTIL_H
+#define _UTIL_H
+
+// Parse first line in a file, read integer immediately following a string
+uint32_t parse_file_line_value(const char *fname, const char *sname,
+ uint32_t default_val);
+
+#endif // _UTIL_H
+
diff --git a/src/llvmopencl/AllocasToEntry.cc b/src/llvmopencl/AllocasToEntry.cc
new file mode 100644
index 0000000..79bbe63
--- /dev/null
+++ b/src/llvmopencl/AllocasToEntry.cc
@@ -0,0 +1,74 @@
+// Implementation of AllocasToEntry, an LLVM pass to move allocas to the
+// function entry node.
+//
+// Copyright (c) 2013 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#include <sstream>
+#include <iostream>
+
+#ifdef LLVM_3_2
+# include <llvm/Instructions.h>
+#else
+# include <llvm/IR/Instructions.h>
+#endif
+
+#include "AllocasToEntry.h"
+
+namespace pocl {
+
+using namespace llvm;
+
+namespace {
+  // Registers the pass under the command-line name "allocastoentry".
+  static
+  RegisterPass<pocl::AllocasToEntry> X("allocastoentry",
+                                       "Move allocas to the function entry node.");
+}
+
+char AllocasToEntry::ID = 0;
+
+
+AllocasToEntry::AllocasToEntry() : FunctionPass(ID)
+{
+}
+
+// Moves every alloca with a constant array size that lives outside the first
+// basic block to the first insertion point of the first block.  Returns true
+// when at least one instruction was moved.
+bool
+AllocasToEntry::runOnFunction(Function &F)
+{
+  // Dynamic stack objects are not supported by some targets (TCE); hoisting
+  // the allocas into the entry block avoids them.
+  Function::iterator blockIt = F.begin();
+  Instruction *entryInsertPt = blockIt->getFirstInsertionPt();
+  ++blockIt;   // the entry block itself is not scanned
+
+  bool movedAny = false;
+  Function::iterator blockEnd = F.end();
+  while (blockIt != blockEnd) {
+    BasicBlock::iterator instIt = blockIt->begin();
+    BasicBlock::iterator instEnd = blockIt->end();
+    while (instIt != instEnd) {
+      // Step the iterator first: the instruction may be unlinked from this
+      // block by moveBefore() below.
+      AllocaInst *candidate = dyn_cast<AllocaInst>(instIt++);
+      if (!candidate)
+        continue;
+      if (!isa<ConstantInt>(candidate->getArraySize()))
+        continue;
+      candidate->moveBefore(entryInsertPt);
+      movedAny = true;
+    }
+    ++blockIt;
+  }
+  return movedAny;
+}
+
+}
diff --git a/src/llvmopencl/AllocasToEntry.h b/src/llvmopencl/AllocasToEntry.h
new file mode 100644
index 0000000..a92fa14
--- /dev/null
+++ b/src/llvmopencl/AllocasToEntry.h
@@ -0,0 +1,49 @@
+// Header for AllocasToEntry, an LLVM pass to move allocas to the function
+// entry node.
+//
+// Copyright (c) 2013 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_ALLOCAS_TO_ENTRY_H
+#define _POCL_ALLOCAS_TO_ENTRY_H
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Function.h"
+#else
+#include "llvm/IR/Function.h"
+#endif
+
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+
+namespace pocl {
+ // Function pass that moves allocas to the function entry node.
+ class AllocasToEntry : public llvm::FunctionPass {
+ public:
+ // Pass identification member handed to the FunctionPass constructor.
+ static char ID;
+
+ AllocasToEntry();
+ virtual ~AllocasToEntry() {};
+
+ // Runs the pass on F; returns true when F was modified.
+ virtual bool runOnFunction(llvm::Function &F);
+ };
+}
+
+#endif
diff --git a/src/llvmopencl/Barrier.h b/src/llvmopencl/Barrier.h
new file mode 100644
index 0000000..e1b612f
--- /dev/null
+++ b/src/llvmopencl/Barrier.h
@@ -0,0 +1,121 @@
+// Class for barrier instructions, modelled as a CallInstr.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include <cstdio>
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#endif
+
+#include "llvm/Support/Casting.h"
+
+#define BARRIER_FUNCTION_NAME "barrier"
+
+namespace pocl {
+
+ // View of a call instruction whose callee is the function named
+ // BARRIER_FUNCTION_NAME.  The class adds no data members; it only provides
+ // the classof() overloads used by llvm::isa/cast/dyn_cast plus static helpers.
+ class Barrier : public llvm::CallInst {
+
+ public:
+ // Appends to B every use of the barrier function found in module M.
+ // Nothing is appended when M does not contain that function.  Each use is
+ // converted with llvm::cast, so all uses are expected to be barrier calls.
+ static void GetBarriers(llvm::SmallVectorImpl<Barrier *> &B,
+ llvm::Module &M) {
+ llvm::Function *F = M.getFunction(BARRIER_FUNCTION_NAME);
+ if (F != NULL) {
+ for (llvm::Function::use_iterator i = F->use_begin(), e = F->use_end();
+ i != e; ++i)
+ B.push_back(llvm::cast<Barrier>(*i));
+ }
+ }
+ /**
+ * Creates a new barrier before the given instruction.
+ *
+ * If there was already a barrier there, returns the old one.
+ */
+ static Barrier *Create(llvm::Instruction *InsertBefore) {
+ llvm::Module *M = InsertBefore->getParent()->getParent()->getParent();
+
+ // Reuse the barrier when the instruction just before InsertBefore in the
+ // same block already is one.
+ if (InsertBefore != &InsertBefore->getParent()->front() &&
+ llvm::isa<Barrier>(InsertBefore->getPrevNode()))
+ return llvm::cast<Barrier>(InsertBefore->getPrevNode());
+
+ // Get (or declare) the barrier function: void return, one i32 parameter.
+ llvm::Type *Int32Type = llvm::Type::getInt32Ty(M->getContext());
+ llvm::Function *F = llvm::cast<llvm::Function>
+ (M->getOrInsertFunction(BARRIER_FUNCTION_NAME,
+ llvm::Type::getVoidTy(M->getContext()),
+ Int32Type,
+ NULL));
+ // The inserted call passes the constant 0 as its only argument.
+ llvm::SmallVector<llvm::Value *, 4> argsarray;
+ argsarray.push_back(llvm::ConstantInt::get(Int32Type, 0));
+ llvm::ArrayRef<llvm::Value *> args(argsarray);
+ return llvm::cast<pocl::Barrier>
+ (llvm::CallInst::Create(F, args, "", InsertBefore));
+ }
+ // classof() overloads: a value is a Barrier when it is a call instruction
+ // with a known callee whose name equals BARRIER_FUNCTION_NAME.
+ static bool classof(const Barrier *) { return true; };
+ static bool classof(const llvm::CallInst *C) {
+ return C->getCalledFunction() != NULL &&
+ C->getCalledFunction()->getName() == BARRIER_FUNCTION_NAME;
+ }
+ static bool classof(const Instruction *I) {
+ return (llvm::isa<llvm::CallInst>(I) &&
+ classof(llvm::cast<llvm::CallInst>(I)));
+ }
+ static bool classof(const User *U) {
+ return (llvm::isa<Instruction>(U) &&
+ classof(llvm::cast<llvm::Instruction>(U)));
+ }
+
+
+ // True when bb ends with a barrier and holds exactly two instructions.
+ static bool hasOnlyBarrier(const llvm::BasicBlock *bb)
+ {
+ return endsWithBarrier(bb) && bb->size() == 2;
+ }
+
+ // True when any instruction of bb is a barrier.
+ static bool hasBarrier(const llvm::BasicBlock *bb)
+ {
+ for (llvm::BasicBlock::const_iterator i = bb->begin(), e = bb->end();
+ i != e; ++i)
+ {
+ if (llvm::isa<Barrier>(i)) return true;
+ }
+ return false;
+ }
+
+ // returns true in case the given basic block ends with a barrier,
+ // that is, the instruction right before the terminator is a barrier call.
+ // Blocks without a terminator yield false.
+ static bool endsWithBarrier(const llvm::BasicBlock *bb)
+ {
+ const llvm::TerminatorInst *t = bb->getTerminator();
+ if (t == NULL) return false;
+ return bb->size() > 1 && t->getPrevNode() != NULL &&
+ llvm::isa<Barrier>(t->getPrevNode());
+ }
+ };
+
+}
+
diff --git a/src/llvmopencl/BarrierBlock.cc b/src/llvmopencl/BarrierBlock.cc
new file mode 100644
index 0000000..d254fa6
--- /dev/null
+++ b/src/llvmopencl/BarrierBlock.cc
@@ -0,0 +1,73 @@
+// Class for a basic block that just contains a barrier.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "BarrierBlock.h"
+#include "Barrier.h"
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Instructions.h"
+#else
+#include "llvm/IR/Instructions.h"
+#endif
+#include <cassert>
+
+using namespace llvm;
+using namespace pocl;
+
+// Assert-based consistency check, defined further down in this file.
+static bool
+verify(const BasicBlock *B);
+
+// A basic block counts as a BarrierBlock when it holds exactly two
+// instructions and the first one is a barrier.  Matching blocks are
+// additionally run through verify() when asserts are enabled.
+bool
+BarrierBlock::classof(const BasicBlock *B)
+{
+  const bool twoInstructions = (B->size() == 2);
+  if (!(twoInstructions && isa<Barrier> (&B->front())))
+    return false;
+
+  assert(verify(B));
+  return true;
+}
+
+// Sanity check called (inside an assert) from BarrierBlock::classof().
+// Every check here is itself an assert; the function always returns true
+// when it returns at all.
+static bool
+verify(const BasicBlock *B)
+{
+ assert((B->size() == 2) && "Barriers blocks should have no functionality!");
+ // const Instruction *barrier = B->getFirstNonPHI();
+ // assert(isa<Barrier>(barrier) && "Barriers blocks should have no functionality!");
+ // assert(B->getTerminator()->getPrevNode() == barrier &&
+ // "Barriers blocks should have no functionality!");
+ // NOTE(review): the comment on the #if below talks about allowing several
+ // predecessors, yet with "#if 1" the assert requiring a single predecessor
+ // (or the function's entry block) is compiled in - confirm the intent.
+#if 1 // We want to allow barriers with more than one predecessors (?)
+ // (for loop header barriers).
+ assert(((B->getSinglePredecessor() != NULL) ||
+ (B == &(B->getParent()->front()))) &&
+ "Barrier blocks should have exactly one predecessor (except entry barrier)!");
+#endif
+ // Disabled check: the successor count is deliberately not restricted.
+#if 0 // We want to allow barriers with more than one successor (for latch barriers).
+ assert((B->getTerminator()->getNumSuccessors() <= 1) &&
+ "Barrier blocks should have one successor, or zero for exit barriers!");
+#endif
+ assert(isa<Barrier>(B->front()));
+
+ return true;
+}
+
diff --git a/src/llvmopencl/BarrierBlock.h b/src/llvmopencl/BarrierBlock.h
new file mode 100644
index 0000000..6246751
--- /dev/null
+++ b/src/llvmopencl/BarrierBlock.h
@@ -0,0 +1,44 @@
+// Class for a basic block that just contains a barrier.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/BasicBlock.h"
+#else
+#include "llvm/IR/BasicBlock.h"
+#endif
+
+#ifndef _POCL_BARRIER_BLOCK_H
+#define _POCL_BARRIER_BLOCK_H
+
+namespace pocl {
+
+ // View of a basic block that just contains a barrier.  The class adds no
+ // data members; it only provides classof() for llvm::isa/cast/dyn_cast.
+ class BarrierBlock : public llvm::BasicBlock {
+
+ public:
+ static bool classof(const BarrierBlock *) { return true; };
+ // Defined in BarrierBlock.cc.
+ static bool classof(const llvm::BasicBlock *B);
+ };
+
+}
+
+#endif
diff --git a/src/llvmopencl/BarrierTailReplication.cc b/src/llvmopencl/BarrierTailReplication.cc
new file mode 100644
index 0000000..12bac74
--- /dev/null
+++ b/src/llvmopencl/BarrierTailReplication.cc
@@ -0,0 +1,421 @@
+// LLVM function pass to replicate barrier tails (successors to barriers).
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos and
+// 2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#include "BarrierTailReplication.h"
+#include "Barrier.h"
+#include "Workgroup.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#else
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#endif
+
+#include <iostream>
+#include <algorithm>
+
+using namespace llvm;
+using namespace pocl;
+
+//#define DEBUG_BARRIER_REPL
+
+// Returns true when bb contains a barrier; defined at the end of this file.
+static bool block_has_barrier(const BasicBlock *bb);
+
+namespace {
+ // Registers the pass under the command-line name "barriertails".
+ static
+ RegisterPass<BarrierTailReplication> X("barriertails",
+ "Barrier tail replication pass");
+}
+
+// Pass identification member handed to the FunctionPass constructor.
+char BarrierTailReplication::ID = 0;
+
+// Declares the analyses this pass depends on: the dominator tree and the
+// loop info are both required and both reported as preserved.
+void
+BarrierTailReplication::getAnalysisUsage(AnalysisUsage &AU) const
+{
+  // Inputs used while replicating tails.
+  AU.addRequired<DominatorTree>();
+  AU.addRequired<LoopInfo>();
+
+  // The pass re-runs both analyses itself after changing the function.
+  AU.addPreserved<DominatorTree>();
+  AU.addPreserved<LoopInfo>();
+}
+
+// Pass entry point: replicates barrier tails in F, then cleans up the PHI
+// nodes of every basic block.  Returns true when F was modified.
+bool
+BarrierTailReplication::runOnFunction(Function &F)
+{
+  // Functions rejected by Workgroup::isKernelToProcess() are left untouched.
+  if (!Workgroup::isKernelToProcess(F))
+    return false;
+
+#ifdef DEBUG_BARRIER_REPL
+  std::cerr << "### BTR on " << F.getName().str() << std::endl;
+#endif
+
+  DT = &getAnalysis<DominatorTree>();
+  LI = &getAnalysis<LoopInfo>();
+
+  bool modified = ProcessFunction(F);
+
+  DT->verifyAnalysis();
+  LI->verifyAnalysis();
+
+  // The created tails might contain PHI nodes with operands referring to
+  // the non-predecessor (split point) BB.  These must be cleaned to avoid
+  // breakage later on.
+  Function::iterator blockIt = F.begin();
+  Function::iterator blockEnd = F.end();
+  while (blockIt != blockEnd)
+    {
+      llvm::BasicBlock *block = blockIt;
+      if (CleanupPHIs(block))
+        modified = true;
+      ++blockIt;
+    }
+
+  return modified;
+}
+
+// Starts the depth-first barrier search at the entry block of F with an
+// empty visited set.  Returns true when F was modified.
+bool
+BarrierTailReplication::ProcessFunction(Function &F)
+{
+  BasicBlockSet visited;
+  BasicBlock *entryBlock = &F.getEntryBlock();
+
+  return FindBarriersDFS(entryBlock, visited);
+}
+
+
+// Recursively (depth-first) look for barriers in all possible
+// execution paths starting on entry, replicating the barrier
+// successors to ensure there is a separate function exit BB
+// for each combination of traversed barriers. The set
+// processed_bbs stores the basic blocks that were already visited.
+bool
+BarrierTailReplication::FindBarriersDFS(BasicBlock *bb,
+                                        BasicBlockSet &processed_bbs)
+{
+  // insert() reports through .second whether bb was newly added; a block
+  // seen before is skipped (avoids infinite recursion in case of
+  // unbarriered loops).
+  if (!processed_bbs.insert(bb).second)
+    return false;
+
+  bool modified = false;
+
+  if (block_has_barrier(bb)) {
+#ifdef DEBUG_BARRIER_REPL
+    std::cerr << "### block " << bb->getName().str() << " has barrier, RJS" << std::endl;
+#endif
+    BasicBlockSet processed_bbs_rjs;
+    modified = ReplicateJoinedSubgraphs(bb, bb, processed_bbs_rjs);
+  }
+
+  // Continue the search in every successor (depth first).
+  TerminatorInst *term = bb->getTerminator();
+  const unsigned successorCount = term->getNumSuccessors();
+  for (unsigned idx = 0; idx < successorCount; ++idx) {
+    if (FindBarriersDFS(term->getSuccessor(idx), processed_bbs))
+      modified = true;
+  }
+
+  return modified;
+}
+
+
+// Only replicate those parts of the subgraph that are not
+// dominated by a (barrier) basic block, to avoid excessive
+// (and confusing) code replication.
+//
+// dominator       block that must dominate subgraph_entry (asserted)
+// subgraph_entry  block whose successors are examined
+// processed_bbs   blocks whose successors were already handled; updated here
+//
+// Returns true when the function was modified.
+bool
+BarrierTailReplication::ReplicateJoinedSubgraphs(BasicBlock *dominator,
+ BasicBlock *subgraph_entry,
+ BasicBlockSet &processed_bbs)
+{
+ bool changed = false;
+
+ assert(DT->dominates(dominator, subgraph_entry));
+
+ Function *f = dominator->getParent();
+
+ TerminatorInst *t = subgraph_entry->getTerminator();
+ for (int i = 0, e = t->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *b = t->getSuccessor(i);
+#ifdef DEBUG_BARRIER_REPL
+ std::cerr << "### traversing from " << subgraph_entry->getName().str()
+ << " to " << b->getName().str() << std::endl;
+#endif
+
+ // Check if we already handled this BB and all its branches.
+ if (processed_bbs.count(b) != 0)
+ {
+#ifdef DEBUG_BARRIER_REPL
+ std::cerr << "### already processed " << std::endl;
+#endif
+ continue;
+ }
+
+ // An edge to a block that dominates its source is treated as a backedge.
+ const bool isBackedge = DT->dominates(b, subgraph_entry);
+ if (isBackedge) {
+ // This is a loop backedge. Do not find subgraphs across
+ // those.
+#ifdef DEBUG_BARRIER_REPL
+ std::cerr << "### a loop backedge, skipping" << std::endl;
+#endif
+ continue;
+ }
+ // Successors still dominated by `dominator` are only traversed further;
+ // the others are replicated and this edge is redirected to the copy.
+ if (DT->dominates(dominator, b))
+ {
+#ifdef DEBUG_BARRIER_REPL
+ std::cerr << "### " << dominator->getName().str() << " dominates "
+ << b->getName().str() << std::endl;
+#endif
+ changed |= ReplicateJoinedSubgraphs(dominator, b, processed_bbs);
+ }
+ else
+ {
+#ifdef DEBUG_BARRIER_REPL
+ std::cerr << "### " << dominator->getName().str() << " does not dominate "
+ << b->getName().str() << " replicating " << std::endl;
+#endif
+ BasicBlock *replicated_subgraph_entry =
+ ReplicateSubgraph(b, f);
+ t->setSuccessor(i, replicated_subgraph_entry);
+ changed = true;
+ }
+
+ // `changed` is never reset, so once any successor caused a modification
+ // the analyses are recomputed after each remaining successor as well.
+ if (changed)
+ {
+ // We have modified the function. Possibly created new loops.
+ // Update analysis passes.
+ DT->runOnFunction(*f);
+ #ifdef LLVM_3_1
+ LI->getBase().Calculate(DT->getBase());
+ #else
+ LI->runOnFunction(*f);
+ #endif
+ }
+ }
+ processed_bbs.insert(subgraph_entry);
+ return changed;
+}
+
+// Removes PHI incoming entries whose incoming block does not branch to BB
+// (anymore).  Only the leading run of PHI nodes in BB is examined.  A PHI
+// that loses all of its entries is deleted by removeIncomingValue().
+//
+// Returns true when at least one incoming entry was removed.
+bool
+BarrierTailReplication::CleanupPHIs(llvm::BasicBlock *BB)
+{
+
+  bool changed = false;
+#ifdef DEBUG_BARRIER_REPL
+  std::cerr << "### CleanupPHIs for BB:" << std::endl;
+  BB->dump();
+#endif
+
+  for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; )
+    {
+      PHINode *PN = dyn_cast<PHINode>(BI);
+      if (PN == NULL) break;   // past the PHI nodes of this block
+
+      bool PHIRemoved = false;
+      unsigned i = 0;
+      unsigned e = PN->getNumIncomingValues();
+      while (i < e)
+        {
+          // find if the predecessor branches to this one (anymore)
+          TerminatorInst *term = PN->getIncomingBlock(i)->getTerminator();
+          bool isSuccessor = false;
+          for (unsigned s = 0, se = term->getNumSuccessors(); s < se; ++s) {
+            if (term->getSuccessor(s) == BB)
+              {
+                isSuccessor = true;
+                break;
+              }
+          }
+          if (isSuccessor)
+            {
+              ++i;
+              continue;
+            }
+
+#ifdef DEBUG_BARRIER_REPL
+          std::cerr << "removing incoming value " << i << " from PHINode:" << std::endl;
+          PN->dump();
+#endif
+          PN->removeIncomingValue(i, true);
+#ifdef DEBUG_BARRIER_REPL
+          std::cerr << "now:" << std::endl;
+          PN->dump();
+#endif
+          changed = true;
+          e--;
+          if (e == 0)
+            {
+              // The PHI itself was deleted together with its last entry.
+              PHIRemoved = true;
+              break;
+            }
+          // Deliberately do not advance i: another entry now occupies slot i
+          // and has not been examined yet.  The previous "i = 0; continue;"
+          // inside a for loop resumed at index 1 and therefore skipped the
+          // entry that moved into slot 0 after a removal at index 0.
+        }
+      if (PHIRemoved)
+        BI = BB->begin();   // BI pointed at the deleted PHI; start over
+      else
+        BI++;
+    }
+  return changed;
+}
+
+// Clones the subgraph reachable from `entry` (as collected by FindSubgraph)
+// into function f and returns the clone of `entry`.
+BasicBlock *
+BarrierTailReplication::ReplicateSubgraph(BasicBlock *entry,
+                                          Function *f)
+{
+  // Collect the blocks that have to be copied.
+  BasicBlockVector originals;
+  FindSubgraph(originals, entry);
+
+  // Copy them, recording old -> new values, then rewrite the operands of
+  // the copies so that control flow stays inside the copied subgraph.
+  BasicBlockVector clones;
+  ValueToValueMapTy oldToNew;
+  ReplicateBasicBlocks(clones, oldToNew, originals, f);
+  UpdateReferences(clones, oldToNew);
+
+  // The map entry for `entry` is the entry block of the replicated subgraph.
+  return cast<BasicBlock>(oldToNew[entry]);
+}
+
+
+// Appends to `subgraph` every block reachable from `entry` without taking a
+// back edge, i.e. an edge to a successor that dominates its source.  Blocks
+// already present in `subgraph` are not added again.
+void
+BarrierTailReplication::FindSubgraph(BasicBlockVector &subgraph,
+                                     BasicBlock *entry)
+{
+  // The subgraph can have internal branches (join points)
+  // avoid replicating these parts multiple times within the
+  // same tail.
+  if (std::find(subgraph.begin(), subgraph.end(), entry) != subgraph.end())
+    return;
+
+  subgraph.push_back(entry);
+
+  // The unused LI->getLoopFor(entry) lookup was dropped.
+  const TerminatorInst *t = entry->getTerminator();
+  for (unsigned i = 0, e = t->getNumSuccessors(); i != e; ++i) {
+    BasicBlock *successor = t->getSuccessor(i);
+    const bool isBackedge = DT->dominates(successor, entry);
+    if (isBackedge) continue;
+    FindSubgraph(subgraph, successor);
+  }
+}
+
+
+// Clones every block of `graph` into function f.
+//
+// new_graph      receives the clones, in the same order as `graph`
+// reference_map  receives original -> clone entries for every block and
+//                every instruction that was copied
+// graph          blocks to copy
+// f              function the new blocks are created in
+//
+// The cloned instructions still refer to the original values; see
+// UpdateReferences() for the remapping step.
+void
+BarrierTailReplication::ReplicateBasicBlocks(BasicBlockVector &new_graph,
+ ValueToValueMapTy &reference_map,
+ BasicBlockVector &graph,
+ Function *f)
+{
+#ifdef DEBUG_BARRIER_REPL
+ std::cerr << "### ReplicateBasicBlocks: " << std::endl;
+#endif
+ for (BasicBlockVector::const_iterator i = graph.begin(),
+ e = graph.end();
+ i != e; ++i) {
+ BasicBlock *b = *i;
+ // The clone is named after the original with a ".btr" suffix.
+ BasicBlock *new_b = BasicBlock::Create(b->getContext(),
+ b->getName() + ".btr",
+ f);
+ reference_map.insert(std::make_pair(b, new_b));
+ new_graph.push_back(new_b);
+
+#ifdef DEBUG_BARRIER_REPL
+ std::cerr << "Replicated BB: " << new_b->getName().str() << std::endl;
+#endif
+
+ // Copy the instructions one by one.  Note that the local `i` below
+ // shadows the outer loop iterator of the same name.
+ for (BasicBlock::iterator i2 = b->begin(), e2 = b->end();
+ i2 != e2; ++i2) {
+ Instruction *i = i2->clone();
+ reference_map.insert(std::make_pair(i2, i));
+ new_b->getInstList().push_back(i);
+ }
+
+ // Add incoming entries for the new block to the PHINodes of basic
+ // blocks the replicated block jumps to (backedges).
+ // The loop variables i/e here again shadow the outer ones.
+ TerminatorInst *t = new_b->getTerminator();
+ for (unsigned i = 0, e = t->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *successor = t->getSuccessor(i);
+ if (std::count(graph.begin(), graph.end(), successor) == 0) {
+ // Successor is not in the graph, possible backedge.
+ for (BasicBlock::iterator i = successor->begin(), e = successor->end();
+ i != e; ++i) {
+ PHINode *phi = dyn_cast<PHINode>(i);
+ if (phi == NULL)
+ break; // All PHINodes already checked.
+
+ // Get value for original incoming edge and add an entry for the clone.
+ Value *v = phi->getIncomingValueForBlock(b);
+ Value *new_v = reference_map[v];
+ if (new_v == NULL) {
+ /* This case can happen at least when replicating a latch
+ block in a b-loop. The value produced might be from a common
+ path before the replicated part. Then just use the original value.*/
+ new_v = v;
+#if 0
+ std::cerr << "### could not find a replacement block for phi node ("
+ << b->getName().str() << ")" << std::endl;
+ phi->dump();
+ v->dump();
+ f->viewCFG();
+ assert (0);
+#endif
+ }
+ phi->addIncoming(new_v, new_b);
+ }
+ }
+ }
+ }
+
+#ifdef DEBUG_BARRIER_REPL
+ std::cerr << std::endl;
+#endif
+}
+
+
+// Remaps the operands of every instruction in `graph` through
+// reference_map.  Values without a map entry are left as they are
+// (RF_IgnoreMissingEntries).
+void
+BarrierTailReplication::UpdateReferences(const BasicBlockVector &graph,
+                                         ValueToValueMapTy &reference_map)
+{
+  BasicBlockVector::const_iterator blockIt = graph.begin();
+  BasicBlockVector::const_iterator blockEnd = graph.end();
+  while (blockIt != blockEnd) {
+    BasicBlock *block = *blockIt;
+    BasicBlock::iterator instIt = block->begin();
+    BasicBlock::iterator instEnd = block->end();
+    for (; instIt != instEnd; ++instIt) {
+      Instruction *inst = instIt;
+      RemapInstruction(inst, reference_map,
+                       RF_IgnoreMissingEntries | RF_NoModuleLevelChanges);
+    }
+    ++blockIt;
+  }
+}
+
+
+// Returns true when at least one instruction of bb is a barrier.
+static bool
+block_has_barrier(const BasicBlock *bb)
+{
+  BasicBlock::const_iterator inst = bb->begin();
+  BasicBlock::const_iterator last = bb->end();
+  while (inst != last) {
+    if (isa<Barrier>(inst))
+      return true;
+    ++inst;
+  }
+
+  return false;
+}
diff --git a/src/llvmopencl/BarrierTailReplication.h b/src/llvmopencl/BarrierTailReplication.h
new file mode 100644
index 0000000..7e3beb0
--- /dev/null
+++ b/src/llvmopencl/BarrierTailReplication.h
@@ -0,0 +1,85 @@
+// Header for BarrierTailReplication.cc function pass.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef POCL_BARRIER_TAIL_REPLICATION
+#define POCL_BARRIER_TAIL_REPLICATION
+
+ #include "config.h"
+ #if (defined LLVM_3_1 or defined LLVM_3_2)
+ #include "llvm/Function.h"
+ #else
+ #include "llvm/IR/Function.h"
+ #endif
+
+ #include "llvm/Analysis/Dominators.h"
+ #include "llvm/Analysis/LoopInfo.h"
+ #include "llvm/Pass.h"
+ #include "llvm/Transforms/Utils/Cloning.h"
+ #include <map>
+ #include <set>
+ // std::vector is used by the BasicBlockVector typedef below; include it
+ // directly instead of relying on a transitive include.
+ #include <vector>
+
+ namespace pocl {
+   class Workgroup;
+
+   // Function pass implemented in BarrierTailReplication.cc.
+   class BarrierTailReplication : public llvm::FunctionPass {
+
+   public:
+     // Pass identifier; handed to the FunctionPass constructor below.
+     static char ID;
+
+     BarrierTailReplication(): FunctionPass(ID) {}
+
+     virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+     virtual bool runOnFunction(llvm::Function &F);
+
+   private:
+     typedef std::set<llvm::BasicBlock *> BasicBlockSet;
+     typedef std::vector<llvm::BasicBlock *> BasicBlockVector;
+     typedef std::map<llvm::Value *, llvm::Value *> ValueValueMap;
+
+     // Analysis results used by the pass.
+     // NOTE(review): assigned in the .cc file, outside this header.
+     llvm::DominatorTree *DT;
+     llvm::LoopInfo *LI;
+
+     bool ProcessFunction(llvm::Function &F);
+     bool FindBarriersDFS(llvm::BasicBlock *bb,
+                          BasicBlockSet &processed_bbs);
+     bool ReplicateJoinedSubgraphs(llvm::BasicBlock *dominator,
+                                   llvm::BasicBlock *subgraph_entry,
+                                   BasicBlockSet &processed_bbs);
+
+     llvm::BasicBlock* ReplicateSubgraph(llvm::BasicBlock *entry,
+                                         llvm::Function *f);
+     void FindSubgraph(BasicBlockVector &subgraph,
+                       llvm::BasicBlock *entry);
+     void ReplicateBasicBlocks(BasicBlockVector &new_graph,
+                               llvm::ValueToValueMapTy &reference_map,
+                               BasicBlockVector &graph,
+                               llvm::Function *f);
+     // Remaps every instruction of the blocks in graph through
+     // reference_map (see the definition in the .cc file).
+     void UpdateReferences(const BasicBlockVector &graph,
+                           llvm::ValueToValueMapTy &reference_map);
+
+     bool CleanupPHIs(llvm::BasicBlock *BB);
+
+     // Workgroup is granted access to the private members above.
+     friend class pocl::Workgroup;
+   };
+ }
+
+#endif
diff --git a/src/llvmopencl/BreakConstantGEPs.cpp b/src/llvmopencl/BreakConstantGEPs.cpp
new file mode 100644
index 0000000..a12aaaa
--- /dev/null
+++ b/src/llvmopencl/BreakConstantGEPs.cpp
@@ -0,0 +1,326 @@
+//===- BreakConstantGEPs.cpp - Change constant GEPs into GEP instructions - --//
+//
+// pocl note: This pass is taken from The SAFECode project with trivial modifications.
+// Automatic locals might cause constant GEPs which cause problems during
+// converting the locals to kernel function arguments for thread safety.
+//
+// The SAFECode Compiler
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass changes all GEP constant expressions into GEP instructions. This
+// permits the rest of SAFECode to put run-time checks on them if necessary.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "break-constgeps"
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Constants.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instruction.h"
+#include "llvm/Instructions.h"
+#include "llvm/LLVMContext.h"
+#else
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#endif
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/InstIterator.h"
+
+#include "BreakConstantGEPs.h"
+#include "Workgroup.h"
+
+#include <iostream>
+#include <map>
+#include <utility>
+
+ // Identifier variable for the pass; BreakConstantGEPs.h hands it to the
+ // FunctionPass constructor.
+ char BreakConstantGEPs::ID = 0;
+
+ // Statistics
+ // GEPChanges is incremented by convertGEP(), TotalChanges by
+ // convertExpression().
+ STATISTIC (GEPChanges, "Number of Converted GEP Constant Expressions");
+ STATISTIC (TotalChanges, "Number of Converted Constant Expressions");
+
+ // Register the pass under the name "break-constgeps"
+ static RegisterPass<BreakConstantGEPs> P ("break-constgeps",
+ "Remove GEP Constant Expressions");
+
+//
+// Function: hasConstantGEP()
+//
+// Description:
+// This function determines whether the given value is a constant expression
+// that has a constant GEP expression embedded within it.
+//
+// Inputs:
+// V - The value to check.
+//
+// Return value:
+// NULL - This value is not a constant expression with a constant expression
+// GEP within it.
+// ~NULL - A pointer to the value casted into a ConstantExpr is returned.
+//
+static ConstantExpr *
+hasConstantGEP (Value * V) {
+ if (ConstantExpr * CE = dyn_cast<ConstantExpr>(V)) {
+ if (CE->getOpcode() == Instruction::GetElementPtr ||
+ CE->getOpcode() == Instruction::BitCast)
+ {
+ return CE;
+ } else {
+ for (unsigned index = 0; index < CE->getNumOperands(); ++index) {
+ if (hasConstantGEP (CE->getOperand(index)))
+ return CE;
+ }
+ }
+ }
+
+ return 0;
+}
+
+//
+// Function: convertGEP()
+//
+// Description:
+// Convert a GEP constant expression into a GEP instruction.
+//
+// Inputs:
+// CE - The GEP constant expression.
+// InsertPt - The instruction before which to insert the new GEP instruction.
+//
+// Return value:
+// A pointer to the new GEP instruction is returned.
+//
+static Instruction *
+convertGEP (ConstantExpr * CE, Instruction * InsertPt) {
+ //
+ // Create iterators to the indices of the constant expression.
+ //
+ std::vector<Value *> Indices;
+ for (unsigned index = 1; index < CE->getNumOperands(); ++index) {
+ Indices.push_back (CE->getOperand (index));
+ }
+
+ //
+ // Update the statistics.
+ //
+ ++GEPChanges;
+
+ //
+ // Make the new GEP instruction.
+ //
+ return (GetElementPtrInst::Create (CE->getOperand(0),
+ Indices,
+ CE->getName(),
+ InsertPt));
+}
+
+//
+// Function: convertExpression()
+//
+// Description:
+// Convert a constant expression into an instruction. This routine does *not*
+// perform any recursion, so the resulting instruction may have constant
+// expression operands.
+//
+static Instruction *
+convertExpression (ConstantExpr * CE, Instruction * InsertPt) {
+ //
+ // Convert this constant expression into a regular instruction.
+ //
+ Instruction * NewInst = 0;
+ switch (CE->getOpcode()) {
+ case Instruction::GetElementPtr: {
+ NewInst = convertGEP (CE, InsertPt);
+ break;
+ }
+
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ Instruction::BinaryOps Op = (Instruction::BinaryOps)(CE->getOpcode());
+ NewInst = BinaryOperator::Create (Op,
+ CE->getOperand(0),
+ CE->getOperand(1),
+ CE->getName(),
+ InsertPt);
+ break;
+ }
+
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast: {
+ Instruction::CastOps Op = (Instruction::CastOps)(CE->getOpcode());
+ NewInst = CastInst::Create (Op,
+ CE->getOperand(0),
+ CE->getType(),
+ CE->getName(),
+ InsertPt);
+ break;
+ }
+
+ case Instruction:: FCmp:
+ case Instruction:: ICmp: {
+ Instruction::OtherOps Op = (Instruction::OtherOps)(CE->getOpcode());
+ NewInst = CmpInst::Create (Op,
+ CE->getPredicate(),
+ CE->getOperand(0),
+ CE->getOperand(1),
+ CE->getName(),
+ InsertPt);
+ break;
+ }
+
+ case Instruction:: Select:
+ NewInst = SelectInst::Create (CE->getOperand(0),
+ CE->getOperand(1),
+ CE->getOperand(2),
+ CE->getName(),
+ InsertPt);
+ break;
+
+ case Instruction:: ExtractElement:
+ case Instruction:: InsertElement:
+ case Instruction:: ShuffleVector:
+ case Instruction:: InsertValue:
+ default:
+ assert (0 && "Unhandled constant expression!\n");
+ break;
+ }
+
+ //
+ // Update the statistics.
+ //
+ ++TotalChanges;
+
+ return NewInst;
+}
+
+//
+// Method: runOnFunction()
+//
+// Description:
+// Entry point for this LLVM pass.
+//
+// Return value:
+// true - The function was modified.
+// false - The function was not modified.
+//
+bool
+BreakConstantGEPs::runOnFunction (Function & F) {
+
+ if (!pocl::Workgroup::isKernelToProcess(F)) return false;
+
+ bool modified = false;
+
+ // Worklist of values to check for constant GEP expressions
+ std::vector<Instruction *> Worklist;
+
+ //
+ // Initialize the worklist by finding all instructions that have one or more
+ // operands containing a constant GEP expression.
+ //
+ for (Function::iterator BB = F.begin(); BB != F.end(); ++BB) {
+ for (BasicBlock::iterator i = BB->begin(); i != BB->end(); ++i) {
+ //
+ // Scan through the operands of this instruction. If it is a constant
+ // expression GEP, insert an instruction GEP before the instruction.
+ //
+ Instruction * I = i;
+ for (unsigned index = 0; index < I->getNumOperands(); ++index) {
+ if (hasConstantGEP (I->getOperand(index))) {
+ Worklist.push_back (I);
+ }
+ }
+ }
+ }
+
+ //
+ // Determine whether we will modify anything.
+ //
+ if (Worklist.size()) modified = true;
+
+ //
+ // While the worklist is not empty, take an item from it, convert the
+ // operands into instructions if necessary, and determine if the newly
+ // added instructions need to be processed as well.
+ //
+ while (Worklist.size()) {
+ Instruction * I = Worklist.back();
+ Worklist.pop_back();
+
+ //
+ // Scan through the operands of this instruction and convert each into an
+ // instruction. Note that this works a little differently for phi
+ // instructions because the new instruction must be added to the
+ // appropriate predecessor block.
+ //
+ if (PHINode * PHI = dyn_cast<PHINode>(I)) {
+ for (unsigned index = 0; index < PHI->getNumIncomingValues(); ++index) {
+ //
+ // For PHI Nodes, if an operand is a constant expression with a GEP, we
+ // want to insert the new instructions in the predecessor basic block.
+ //
+ // Note: It seems that it's possible for a phi to have the same
+ // incoming basic block listed multiple times; this seems okay as long
+ // the same value is listed for the incoming block.
+ //
+ Instruction * InsertPt = PHI->getIncomingBlock(index)->getTerminator();
+ if (ConstantExpr * CE = hasConstantGEP (PHI->getIncomingValue(index))) {
+ Instruction * NewInst = convertExpression (CE, InsertPt);
+ for (unsigned i2 = index; i2 < PHI->getNumIncomingValues(); ++i2) {
+ if ((PHI->getIncomingBlock (i2)) == PHI->getIncomingBlock (index))
+ PHI->setIncomingValue (i2, NewInst);
+ }
+ Worklist.push_back (NewInst);
+ }
+ }
+ } else {
+ for (unsigned index = 0; index < I->getNumOperands(); ++index) {
+ //
+ // For other instructions, we want to insert instructions replacing
+ // constant expressions immediently before the instruction using the
+ // constant expression.
+ //
+ if (ConstantExpr * CE = hasConstantGEP (I->getOperand(index))) {
+ Instruction * NewInst = convertExpression (CE, I);
+ I->replaceUsesOfWith (CE, NewInst);
+ Worklist.push_back (NewInst);
+ }
+ }
+ }
+ }
+
+ return modified;
+}
+
+
diff --git a/src/llvmopencl/BreakConstantGEPs.h b/src/llvmopencl/BreakConstantGEPs.h
new file mode 100644
index 0000000..4cd86b2
--- /dev/null
+++ b/src/llvmopencl/BreakConstantGEPs.h
@@ -0,0 +1,57 @@
+//===- BreakConstantGEPs.h - Change constant GEPs into GEP instructions --- --//
+//
+// pocl note: This pass is taken from The SAFECode project with trivial modifications.
+// Automatic locals might cause constant GEPs which cause problems during
+// converting the locals to kernel function arguments for thread safety.
+//
+// The SAFECode Compiler
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass changes all GEP constant expressions into GEP instructions. This
+// permits the rest of SAFECode to put run-time checks on them if necessary.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BREAKCONSTANTGEPS_H
+#define BREAKCONSTANTGEPS_H
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Module.h"
+#endif
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+//
+// Pass: BreakConstantGEPs
+//
+// Description:
+// This pass modifies a function so that it uses GEP instructions instead of
+// GEP constant expressions.
+//
+struct BreakConstantGEPs : public FunctionPass {
+ private:
+ // Private methods
+
+ // Private variables
+
+ public:
+ static char ID;
+ BreakConstantGEPs() : FunctionPass(ID) {}
+ const char *getPassName() const {return "Remove Constant GEP Expressions";}
+ virtual bool runOnFunction (Function & F);
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ // This pass does not modify the control-flow graph of the function
+ AU.setPreservesCFG();
+ }
+};
+
+#endif
diff --git a/src/llvmopencl/CanonicalizeBarriers.cc b/src/llvmopencl/CanonicalizeBarriers.cc
new file mode 100644
index 0000000..409e264
--- /dev/null
+++ b/src/llvmopencl/CanonicalizeBarriers.cc
@@ -0,0 +1,214 @@
+// LLVM function pass to canonicalize barriers.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+// 2012 Pekka Jääskeläinen / Tampere University of Technology
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#include "CanonicalizeBarriers.h"
+#include "BarrierBlock.h"
+#include "Barrier.h"
+#include "Workgroup.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <iostream>
+
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#endif
+
+using namespace llvm;
+using namespace pocl;
+
+ // Registers the pass under the name "barriers".
+ namespace {
+   static RegisterPass<CanonicalizeBarriers>
+   X("barriers", "Barrier canonicalization pass");
+ }
+
+ // Pass identifier; CanonicalizeBarriers.h hands it to the FunctionPass
+ // constructor.
+ char CanonicalizeBarriers::ID = 0;
+
+ // Intentionally empty: no analysis is requested and none is declared as
+ // preserved. runOnFunction() only picks up DominatorTree and LoopInfo when
+ // they happen to be available.
+ void
+ CanonicalizeBarriers::getAnalysisUsage(AnalysisUsage &AU) const
+ {
+ }
+
+ // Pass entry point.
+ //
+ // Functions for which Workgroup::isKernelToProcess() returns false are
+ // skipped. Otherwise the entry block and every block without successors
+ // are split off and given a barrier, and ProcessFunction() is run.
+ //
+ // Returns the value of ProcessFunction(); the entry/exit splitting done
+ // here is not reflected in the return value.
+ bool
+ CanonicalizeBarriers::runOnFunction(Function &F)
+ {
+ if (!Workgroup::isKernelToProcess(F))
+ return false;
+
+ // Unless the entry block already is a barrier block, split it at its first
+ // instruction: the original (now nearly empty) block becomes
+ // "entry.barrier" and the split-off block takes over the old name.
+ BasicBlock *entry = &F.getEntryBlock();
+ if (!isa<BarrierBlock>(entry)) {
+ BasicBlock *effective_entry = SplitBlock(entry,
+ &(entry->front()),
+ this);
+ effective_entry->takeName(entry);
+ entry->setName("entry.barrier");
+ Barrier::Create(entry->getTerminator());
+ }
+
+ // Blocks are split while iterating over the function here.
+ for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
+ BasicBlock *b = i;
+ TerminatorInst *t = b->getTerminator();
+ if ((t->getNumSuccessors() == 0) && (!isa<BarrierBlock>(b))) {
+ /* In case the bb is already terminated with a barrier,
+ split before the barrier so we do not create an empty
+ parallel region.
+
+ This is because the assumptions of the other passes in the
+ compilation that are
+ a) exit node is a barrier block
+ b) there are no empty parallel regions (which would be formed
+ between the explicit barrier and the added one). */
+ BasicBlock *exit;
+ if (Barrier::endsWithBarrier(b))
+ exit = SplitBlock(b, t->getPrevNode(), this);
+ else
+ exit = SplitBlock(b, t, this);
+ exit->setName("exit.barrier");
+ // NOTE(review): presumably Barrier::Create() reuses a barrier that already
+ // precedes t instead of adding a second one -- confirm in Barrier.h.
+ Barrier::Create(t);
+ }
+ }
+
+ // Both analyses are optional; the pointers are NULL when unavailable.
+ DT = getAnalysisIfAvailable<DominatorTree>();
+ LI = getAnalysisIfAvailable<LoopInfo>();
+
+ bool changed = ProcessFunction(F);
+
+ if (DT)
+ DT->verifyAnalysis();
+ if (LI)
+ LI->verifyAnalysis();
+
+ return changed;
+ }
+
+
+ // Canonicalize barriers: ensure all barriers are in a separate BB
+ // containing only the barrier and the terminator, with just one
+ // predecessor and one successor. This allows us to use
+ // those BBs as markers only, they will not be replicated.
+ //
+ // Works in three steps: collect all barriers, split the blocks around
+ // each of them, then remove barrier-only blocks that directly follow
+ // another barrier.
+ //
+ // Returns true when a block was split or removed.
+ bool
+ CanonicalizeBarriers::ProcessFunction(Function &F)
+ {
+ bool changed = false;
+
+ InstructionSet Barriers;
+
+ // Step 1: collect the barriers first, since splitting blocks while
+ // walking them would disturb the iteration.
+ for (Function::iterator i = F.begin(), e = F.end();
+ i != e; ++i)
+ {
+ BasicBlock *b = i;
+ for (BasicBlock::iterator i = b->begin(), e = b->end();
+ i != e; ++i)
+ {
+ if (isa<Barrier>(i))
+ {
+ Barriers.insert(i);
+ }
+ }
+ }
+
+ // Finally add all the split points, now that we are done with the
+ // iterators.
+ for (InstructionSet::iterator i = Barriers.begin(), e = Barriers.end();
+ i != e; ++i) {
+ BasicBlock *b = (*i)->getParent();
+
+ // Split post barrier first cause it does not make the barrier
+ // to belong to another basic block.
+ TerminatorInst *t = b->getTerminator();
+ // if ((t->getNumSuccessors() > 1) ||
+ // (t->getPrevNode() != *i)) {
+ // Change: barriers with several successors are all right
+ // they just start several parallel regions. Simplifies
+ // loop handling.
+
+ // True when something other than the terminator follows the barrier.
+ const bool HAS_NON_BRANCH_INSTRUCTIONS_AFTER_BARRIER =
+ t->getPrevNode() != *i;
+
+ if (HAS_NON_BRANCH_INSTRUCTIONS_AFTER_BARRIER) {
+ BasicBlock *new_b = SplitBlock(b, (*i)->getNextNode(), this);
+ new_b->setName(b->getName() + ".postbarrier");
+ changed = true;
+ }
+
+ BasicBlock *predecessor = b->getSinglePredecessor();
+ if (predecessor != NULL) {
+ TerminatorInst *pt = predecessor->getTerminator();
+ if ((pt->getNumSuccessors() == 1) &&
+ (&b->front() == (*i))) {
+ // Barrier is at the beginning of the BB,
+ // which has a single predecessor with just
+ // one successor (the barrier itself), thus
+ // no need to split before barrier.
+ continue;
+ }
+ }
+ // A barrier that is the first instruction of the entry block needs no
+ // split before it either.
+ if ((b == &(b->getParent()->getEntryBlock())) &&
+ (&b->front() == (*i)))
+ continue;
+
+ // If no instructions before barrier, do not split
+ // (allow multiple predecessors, eases loop handling).
+ // if (&b->front() == (*i))
+ // continue;
+ // Split before the barrier: the block holding the barrier keeps the
+ // original name, the leading part is renamed "<name>.prebarrier".
+ BasicBlock *new_b = SplitBlock(b, *i, this);
+ new_b->takeName(b);
+ b->setName(new_b->getName() + ".prebarrier");
+ changed = true;
+ }
+
+ /* Prune empty regions. That is, if there are two successive
+ barriers, remove the other one. */
+ bool emptyRegionDeleted = false;
+ do {
+ emptyRegionDeleted = false;
+ for (Function::iterator i = F.begin(), e = F.end();
+ i != e; ++i)
+ {
+ BasicBlock *b = i;
+ llvm::TerminatorInst *t = b->getTerminator();
+ if (!Barrier::endsWithBarrier(b) || t->getNumSuccessors() != 1) continue;
+
+ BasicBlock *successor = t->getSuccessor(0);
+
+ if (Barrier::hasOnlyBarrier(successor) &&
+ successor->getSinglePredecessor() == b &&
+ successor->getTerminator()->getNumSuccessors() == 1)
+ {
+ // Bypass the barrier-only successor, redirect its remaining uses to b
+ // and delete it.
+ b->getTerminator()->setSuccessor(0, successor->getTerminator()->getSuccessor(0));
+ successor->replaceAllUsesWith(b);
+ successor->eraseFromParent();
+ emptyRegionDeleted = true;
+ changed = true;
+ // A block was erased, so restart the scan from the beginning.
+ break;
+ }
+ }
+ } while (emptyRegionDeleted);
+
+
+ return changed;
+ }
diff --git a/src/llvmopencl/CanonicalizeBarriers.h b/src/llvmopencl/CanonicalizeBarriers.h
new file mode 100644
index 0000000..047db1d
--- /dev/null
+++ b/src/llvmopencl/CanonicalizeBarriers.h
@@ -0,0 +1,56 @@
+// Header for CanonicalizeBarriers.cc function pass.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+ // Include guard: without it a second inclusion redefines the class.
+ #ifndef POCL_CANONICALIZE_BARRIERS_H
+ #define POCL_CANONICALIZE_BARRIERS_H
+
+ #include "config.h"
+ #if (defined LLVM_3_1 or defined LLVM_3_2)
+ #include "llvm/Function.h"
+ #else
+ #include "llvm/IR/Function.h"
+ #endif
+ // llvm::DominatorTree is used below; include its header directly, as
+ // BarrierTailReplication.h does.
+ #include "llvm/Analysis/Dominators.h"
+ #include "llvm/Analysis/LoopInfo.h"
+ #include "llvm/Pass.h"
+ #include <set>
+
+ namespace pocl {
+   class Workgroup;
+
+   // Function pass implemented in CanonicalizeBarriers.cc.
+   class CanonicalizeBarriers : public llvm::FunctionPass {
+
+   public:
+     // Pass identifier; handed to the FunctionPass constructor below.
+     static char ID;
+
+     CanonicalizeBarriers() : FunctionPass(ID) {}
+
+     virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+     virtual bool runOnFunction(llvm::Function &F);
+
+   private:
+     typedef std::set<llvm::Instruction *> InstructionSet;
+
+     // Set in runOnFunction() from getAnalysisIfAvailable(); either may be
+     // NULL when the analysis is not available.
+     llvm::LoopInfo *LI;
+     llvm::DominatorTree *DT;
+
+     // Splits blocks around barriers and prunes empty regions; returns true
+     // when the function was changed.
+     bool ProcessFunction(llvm::Function &F);
+
+     // Workgroup is granted access to the private members above.
+     friend class pocl::Workgroup;
+   };
+ }
+
+ #endif
diff --git a/src/llvmopencl/Flatten.cc b/src/llvmopencl/Flatten.cc
new file mode 100644
index 0000000..2e01f2a
--- /dev/null
+++ b/src/llvmopencl/Flatten.cc
@@ -0,0 +1,158 @@
+// LLVM module pass to inline required functions (those accessing
+// per-workgroup variables) into the kernel.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "Flatten.h"
+using namespace pocl;
+
+ // Command-line option defined in another translation unit. runOnModule()
+ // compares it against function names; an empty value makes it fall back to
+ // pocl::Workgroup::isKernelToProcess().
+ extern cl::opt<std::string> KernelName;
+
+ // Pass identifier; Flatten.h hands it to the ModulePass constructor.
+ char Flatten::ID = 0;
+ // Registers the pass under the name "flatten".
+ static RegisterPass<Flatten> X("flatten", "Kernel function flattening pass");
+
+ // NULL-terminated list of per-workgroup global variable names. Only read by
+ // the runOnModule() variant that is compiled when INLINE_ALL_NON_KERNEL is
+ // not defined.
+ static const char *workgroup_variables[] = {
+ "_local_id_x", "_local_id_y", "_local_id_z",
+ "_local_size_x", "_local_size_y", "_local_size_z",
+ "_work_dim",
+ "_num_groups_x", "_num_groups_y", "_num_groups_z",
+ "_group_id_x", "_group_id_y", "_group_id_z",
+ "_global_offset_x", "_global_offset_y", "_global_offset_z",
+ NULL};
+
+ // Uncomment to print the inlining decision for every function to std::cerr.
+ //#define DEBUG_FLATTEN
+
+ // Selects which of the two runOnModule() implementations below is compiled.
+ #define INLINE_ALL_NON_KERNEL
+
+#ifdef INLINE_ALL_NON_KERNEL
+
+ // Variant compiled when INLINE_ALL_NON_KERNEL is defined.
+ //
+ // Every function with a body is put into one of two groups:
+ //  - kernels (name equals KernelName, or KernelName is empty and
+ //    pocl::Workgroup::isKernelToProcess() accepts the function):
+ //    AlwaysInline is removed, NoInline is added, linkage becomes external;
+ //  - everything else: NoInline is removed, AlwaysInline is added, linkage
+ //    becomes internal.
+ // The three preprocessor branches in each group do the same attribute
+ // change through the API of the respective LLVM version.
+ //
+ // Returns true when the module contains at least one function definition.
+ bool
+ Flatten::runOnModule(Module &M)
+ {
+ bool changed = false;
+ for (llvm::Module::iterator i = M.begin(), e = M.end(); i != e; ++i)
+ {
+ llvm::Function *f = i;
+ // Declarations have no body to inline or to inline into.
+ if (f->isDeclaration()) continue;
+ if (KernelName == f->getName() ||
+ (KernelName == "" && pocl::Workgroup::isKernelToProcess(*f)))
+ {
+ #ifdef LLVM_3_1
+ f->removeFnAttr(Attribute::AlwaysInline);
+ f->addFnAttr(Attribute::NoInline);
+ #elif defined LLVM_3_2
+ AttrBuilder b;
+ f->removeFnAttr(Attributes::get(M.getContext(), b.addAttribute(Attributes::AlwaysInline)));
+ f->addFnAttr(Attributes::NoInline);
+ #else
+ AttributeSet attrs;
+ f->removeAttributes(
+ AttributeSet::FunctionIndex,
+ attrs.addAttribute(M.getContext(), AttributeSet::FunctionIndex, Attribute::AlwaysInline));
+
+ f->addFnAttr(Attribute::NoInline);
+ #endif
+
+ f->setLinkage(llvm::GlobalValue::ExternalLinkage);
+ changed = true;
+ #ifdef DEBUG_FLATTEN
+ std::cerr << "### NoInline for " << f->getName().str() << std::endl;
+ #endif
+ }
+ else
+ {
+ #ifdef LLVM_3_1
+ f->removeFnAttr(Attribute::NoInline);
+ f->addFnAttr(Attribute::AlwaysInline);
+ #elif defined LLVM_3_2
+ AttrBuilder b;
+ f->removeFnAttr(Attributes::get(M.getContext(), b.addAttribute(Attributes::NoInline)));
+ f->addFnAttr(Attributes::AlwaysInline);
+ #else
+ AttributeSet attrs;
+ f->removeAttributes(
+ AttributeSet::FunctionIndex,
+ attrs.addAttribute(M.getContext(), AttributeSet::FunctionIndex, Attribute::NoInline));
+ f->addFnAttr(Attribute::AlwaysInline);
+ #endif
+
+ f->setLinkage(llvm::GlobalValue::InternalLinkage);
+ changed = true;
+ #ifdef DEBUG_FLATTEN
+ std::cerr << "### AlwaysInline for " << f->getName().str() << std::endl;
+ #endif
+ }
+ }
+ return changed;
+ }
+
+#else
+
+ // Variant compiled only when INLINE_ALL_NON_KERNEL is not defined (it is
+ // defined above, so this code is currently compiled out).
+ //
+ // Starting from the globals named in workgroup_variables, follows the use
+ // chains: every function containing an instruction that uses a pending
+ // value is recorded and then itself queued, so that its users are visited
+ // as well. All recorded functions get NoInline removed and AlwaysInline
+ // added.
+ //
+ // Always returns true.
+ bool
+ Flatten::runOnModule(Module &M)
+ {
+ SmallPtrSet<Function *, 8> functions_to_inline;
+ SmallVector<Value *, 8> pending;
+
+ // Seed the worklist with the per-workgroup globals present in the module.
+ const char **s = workgroup_variables;
+ while (*s != NULL) {
+ GlobalVariable *gv = M.getGlobalVariable(*s);
+ if (gv != NULL)
+ pending.push_back(gv);
+
+ ++s;
+ }
+
+ while (!pending.empty()) {
+ Value *v = pending.back();
+ pending.pop_back();
+
+ for (Value::use_iterator i = v->use_begin(), e = v->use_end();
+ i != e; ++i) {
+ // Users that are not instructions are ignored.
+ if (Instruction *ci = dyn_cast<Instruction>(*i)) {
+ // Prevent infinite looping on recursive functions
+ // (though OpenCL does not allow this?)
+ Function *f = ci->getParent()->getParent();;
+ assert((f != NULL) &&
+ "Per-workgroup global variable used on function with no parent!");
+ if (functions_to_inline.count(f))
+ continue;
+
+ functions_to_inline.insert(f);
+ pending.push_back(f);
+ }
+ }
+ }
+
+ // NOTE(review): this uses the plain Attribute:: enumerators without the
+ // per-LLVM-version branches of the other variant -- confirm it still
+ // builds before enabling it.
+ for (SmallPtrSet<Function *, 8>::iterator i = functions_to_inline.begin(),
+ e = functions_to_inline.end();
+ i != e; ++i) {
+ (*i)->removeFnAttr(Attribute::NoInline);
+ (*i)->addFnAttr(Attribute::AlwaysInline);
+ }
+
+ return true;
+ }
+
+#endif
+
+
diff --git a/src/llvmopencl/Flatten.h b/src/llvmopencl/Flatten.h
new file mode 100644
index 0000000..df3a174
--- /dev/null
+++ b/src/llvmopencl/Flatten.h
@@ -0,0 +1,51 @@
+// LLVM module pass to inline required functions (those accessing
+// per-workgroup variables) into the kernel.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+ // Include guard: without it a second inclusion redefines the class.
+ #ifndef POCL_FLATTEN_H
+ #define POCL_FLATTEN_H
+
+ #include "config.h"
+ #include <iostream>
+ #include <string>
+ #include "Workgroup.h"
+ #include "llvm/Support/CommandLine.h"
+ #include "llvm/ADT/SmallPtrSet.h"
+ #include "llvm/Pass.h"
+ #if (defined LLVM_3_1 or defined LLVM_3_2)
+ #include "llvm/Module.h"
+ #else
+ #include "llvm/IR/Module.h"
+ #endif
+
+ // Flatten.cc relies on this directive for the unqualified LLVM names it
+ // uses, so it is kept here.
+ using namespace llvm;
+
+ namespace pocl {
+   // Module pass implemented in Flatten.cc: marks functions as AlwaysInline
+   // or NoInline so that a later inlining step can flatten the kernel.
+   class Flatten : public ModulePass {
+
+   public:
+     // Pass identifier; handed to the ModulePass constructor below.
+     static char ID;
+     Flatten() : ModulePass(ID) {}
+
+     virtual bool runOnModule(Module &M);
+   };
+
+ }
+
+ #endif
+
diff --git a/src/llvmopencl/GenerateHeader.cc b/src/llvmopencl/GenerateHeader.cc
new file mode 100644
index 0000000..55a5bbe
--- /dev/null
+++ b/src/llvmopencl/GenerateHeader.cc
@@ -0,0 +1,336 @@
+// LLVM module pass to get information from kernel functions.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#include "pocl.h"
+#include "Workgroup.h"
+#include "llvm/Pass.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#ifdef LLVM_3_1
+#include "llvm/Target/TargetData.h"
+#elif defined LLVM_3_2
+#include "llvm/DataLayout.h"
+#else
+#include "llvm/IR/DataLayout.h"
+#endif
+
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Argument.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#endif
+
+#include "LLVMUtils.h"
+
+using namespace std;
+using namespace llvm;
+using namespace pocl;
+
+
+// Command-line option "header": path of the file that
+// GenerateHeader::runOnModule() opens in append mode for its output.
+cl::opt<string>
+Header("header",
+ cl::desc("Output header file with kernel description macros"),
+ cl::value_desc("header"));
+
+namespace {
+ // Module pass that writes "#define _<kernel>_..." description macros for
+ // each kernel to process into the file named by the "header" option.
+ class GenerateHeader : public ModulePass {
+
+ public:
+ // Pass identification object handed to the ModulePass constructor.
+ static char ID;
+ GenerateHeader() : ModulePass(ID) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+ virtual bool runOnModule(Module &M);
+
+ private:
+ // Writes _NUM_ARGS and the _ARG_IS_POINTER / _ARG_IS_LOCAL /
+ // _ARG_IS_IMAGE / _ARG_IS_SAMPLER arrays for F.
+ void ProcessPointers(Function *F,
+ raw_fd_ostream &out);
+ // Writes _REQD_WG_SIZE for F.
+ void ProcessReqdWGSize(Function *F,
+ raw_fd_ostream &out);
+ // Writes _NUM_LOCALS and _LOCAL_SIZE for F. Returns F itself when it has
+ // no automatic locals, otherwise a new function that receives them as
+ // additional trailing arguments.
+ Function *ProcessAutomaticLocals(Function *F,
+ raw_fd_ostream &out);
+ };
+}
+
+char GenerateHeader::ID = 0;
+static RegisterPass<GenerateHeader> X("generate-header",
+ "Kernel information header creation pass");
+
+// Declares the analysis this pass depends on: DataLayout, which
+// ProcessAutomaticLocals() queries for the allocation size of each local.
+void
+GenerateHeader::getAnalysisUsage(AnalysisUsage &AU) const
+{
+ AU.addRequired<DataLayout>();
+}
+
+// Appends the kernel description macros of every kernel to process to the
+// file named by the "header" option, and replaces each kernel that uses
+// automatic locals by a clone taking those locals as extra arguments.
+//
+// Returns true if at least one kernel was replaced.
+bool
+GenerateHeader::runOnModule(Module &M)
+{
+  bool changed = false;
+
+  // store the new and old kernel pairs in order to regenerate
+  // all the metadata that used to point to the unmodified
+  // kernels
+  FunctionMapping kernels;
+
+  string ErrorInfo;
+  raw_fd_ostream out(Header.c_str(), ErrorInfo, raw_fd_ostream::F_Append);
+
+  for (Module::iterator mi = M.begin(), me = M.end(); mi != me; ++mi) {
+    if (!Workgroup::isKernelToProcess(*mi))
+      continue;
+
+    // The stream is only needed once there is something to write, so a
+    // failed open is reported here (with the file name and the reason)
+    // instead of writing into a stream that was never opened.
+    if (!ErrorInfo.empty()) {
+      string message = "cannot open header file '";
+      message += Header.c_str();
+      message += "': ";
+      message += ErrorInfo;
+      report_fatal_error(message);
+    }
+
+    Function *F = mi;
+
+    ProcessPointers(F, out);
+    ProcessReqdWGSize(F, out);
+
+    Function *new_kernel = ProcessAutomaticLocals(F, out);
+    if (new_kernel != F)
+      changed = true;
+    kernels[F] = new_kernel;
+  }
+
+  if (changed)
+    {
+      regenerate_kernel_metadata(M, kernels);
+
+      /* Delete the old kernels. */
+      for (FunctionMapping::const_iterator i = kernels.begin(),
+             e = kernels.end(); i != e; ++i)
+        {
+          Function *old_kernel = (*i).first;
+          Function *new_kernel = (*i).second;
+          // Kernels without automatic locals map to themselves and stay.
+          if (old_kernel == new_kernel) continue;
+          old_kernel->eraseFromParent();
+        }
+    }
+  return changed;
+}
+
+#include <iostream>
+
+// Writes the "_<kernel>_REQD_WG_SIZE {x, y, z}" macro for F. The values are
+// read from the module's "opencl.kernel_wg_size_info" named metadata and
+// stay {0, 0, 0} when no entry refers to F.
+void
+GenerateHeader::ProcessReqdWGSize(Function *F,
+                                  raw_fd_ostream &out)
+{
+  unsigned dims[3] = {0, 0, 0};
+
+  llvm::NamedMDNode *wg_sizes =
+    F->getParent()->getNamedMetadata("opencl.kernel_wg_size_info");
+  const unsigned num_entries = wg_sizes ? wg_sizes->getNumOperands() : 0;
+
+  for (unsigned n = 0; n != num_entries; ++n) {
+    llvm::MDNode *entry = wg_sizes->getOperand(n);
+    if (entry->getOperand(0) != F)
+      continue;
+    // Operands 1..3 hold the X, Y and Z sizes. The scan does not stop at
+    // the first hit, so a later entry for F overrides an earlier one.
+    for (unsigned d = 0; d != 3; ++d)
+      dims[d] =
+        (llvm::cast<ConstantInt>(entry->getOperand(d + 1)))->getLimitedValue();
+  }
+
+  out << "#define _" << F->getName() << "_REQD_WG_SIZE {"
+      << dims[0] << ", "
+      << dims[1] << ", "
+      << dims[2] << "}\n";
+}
+
+
+// Writes one "#define _<kernel><suffix> {f0, f1, ...}" line with the flags
+// printed as 0/1. The braces are empty for a kernel without arguments.
+static void
+WriteArgFlagMacro(raw_fd_ostream &out, Function *F, const char *suffix,
+                  const SmallVectorImpl<bool> &flags)
+{
+  out << "#define _" << F->getName() << suffix << " {";
+  for (unsigned i = 0, e = flags.size(); i != e; ++i) {
+    if (i != 0)
+      out << ", ";
+    out << (flags[i] ? 1 : 0);
+  }
+  out << "}\n";
+}
+
+// Writes the argument description macros of kernel F:
+//   _<kernel>_NUM_ARGS        number of parameters
+//   _<kernel>_ARG_IS_POINTER  non-byval pointer arguments
+//   _<kernel>_ARG_IS_LOCAL    pointer arguments outside the global and
+//                             constant address spaces
+//   _<kernel>_ARG_IS_IMAGE    pointers to "struct.image2d_t_"
+//   _<kernel>_ARG_IS_SAMPLER  pointers to "struct.sampler_t_"
+// Image and sampler arguments are reported as neither pointer nor local.
+void
+GenerateHeader::ProcessPointers(Function *F,
+                                raw_fd_ostream &out)
+{
+  int num_args = F->getFunctionType()->getNumParams();
+
+  out << "#define _" << F->getName() << "_NUM_ARGS " << num_args << '\n';
+
+  // Sized containers instead of variable-length arrays: these are standard
+  // C++ and remain valid for a kernel with zero arguments.
+  SmallVector<bool, 8> is_pointer(num_args, false);
+  SmallVector<bool, 8> is_local(num_args, false);
+  SmallVector<bool, 8> is_image(num_args, false);
+  SmallVector<bool, 8> is_sampler(num_args, false);
+
+  int i = 0;
+  for (Function::const_arg_iterator ii = F->arg_begin(),
+         ee = F->arg_end();
+       ii != ee; ++ii, ++i) {
+    Type *t = ii->getType();
+
+    const PointerType *p = dyn_cast<PointerType>(t);
+    if (p && !ii->hasByValAttr()) {
+      is_pointer[i] = true;
+      is_local[i] =
+        !(p->getAddressSpace() == POCL_ADDRESS_SPACE_GLOBAL ||
+          p->getAddressSpace() == POCL_ADDRESS_SPACE_CONSTANT);
+    }
+
+    // Opaque image/sampler handles are passed as struct pointers and must
+    // not be treated as ordinary buffers.
+    if (t->isPointerTy() && t->getPointerElementType()->isStructTy()) {
+      string name = t->getPointerElementType()->getStructName().str();
+      if (name == "struct.image2d_t_") { // TODO image3d?
+        is_image[i] = true;
+        is_pointer[i] = false;
+        is_local[i] = false;
+      }
+      if (name == "struct.sampler_t_") {
+        is_sampler[i] = true;
+        is_pointer[i] = false;
+        is_local[i] = false;
+      }
+    }
+  }
+
+  WriteArgFlagMacro(out, F, "_ARG_IS_POINTER", is_pointer);
+  WriteArgFlagMacro(out, F, "_ARG_IS_LOCAL", is_local);
+  WriteArgFlagMacro(out, F, "_ARG_IS_IMAGE", is_image);
+  WriteArgFlagMacro(out, F, "_ARG_IS_SAMPLER", is_sampler);
+}
+
+
+// Writes the _<kernel>_NUM_LOCALS and _<kernel>_LOCAL_SIZE macros for F and,
+// if F has automatic locals, builds a replacement kernel that receives each
+// of them as an additional trailing argument.
+//
+// A module global counts as an automatic local of F when its name starts
+// with "<kernel name>.". Returns F itself when there are none, otherwise
+// the newly created function (which has taken over F's name).
+Function *
+GenerateHeader::ProcessAutomaticLocals(Function *F,
+                                       raw_fd_ostream &out)
+{
+  Module *M = F->getParent();
+  DataLayout &TD = getAnalysis<DataLayout>();
+
+  // Parameter types of the replacement kernel: the original ones first.
+  SmallVector<Type *, 8> parameters;
+  for (Function::const_arg_iterator arg = F->arg_begin(),
+         arg_end = F->arg_end();
+       arg != arg_end; ++arg)
+    parameters.push_back(arg->getType());
+
+  // Additional checks might be needed here. For now we assume any global
+  // starting with the kernel name is declaring a local variable.
+  SmallVector<GlobalVariable *, 8> locals;
+  const std::string prefix = F->getName().str() + ".";
+  for (Module::global_iterator gv = M->global_begin(),
+         gv_end = M->global_end();
+       gv != gv_end; ++gv) {
+    if (!gv->getName().startswith(prefix))
+      continue;
+    locals.push_back(gv);
+    // Each local is appended to the end of the function parameter list.
+    parameters.push_back(gv->getType());
+  }
+
+  out << "#define _" << F->getName() << "_NUM_LOCALS "<< locals.size() << "\n";
+  out << "#define _" << F->getName() << "_LOCAL_SIZE {";
+  for (unsigned n = 0; n < locals.size(); ++n) {
+    if (n != 0)
+      out << ", ";
+    out << TD.getTypeAllocSize(locals[n]->getInitializer()->getType());
+  }
+  out << "}\n";
+
+  // Without locals the kernel fingerprint has not changed.
+  if (locals.empty())
+    return F;
+
+  // Create the new function.
+  FunctionType *ft = FunctionType::get(F->getReturnType(),
+                                       parameters,
+                                       F->isVarArg());
+  Function *new_kernel = Function::Create(ft,
+                                          F->getLinkage(),
+                                          "",
+                                          M);
+  new_kernel->takeName(F);
+
+  // Map every original argument to its counterpart in the new kernel ...
+  ValueToValueMapTy vv;
+  Function::arg_iterator dst = new_kernel->arg_begin();
+  for (Function::const_arg_iterator src = F->arg_begin(),
+         src_end = F->arg_end();
+       src != src_end; ++src, ++dst) {
+    dst->setName(src->getName());
+    vv[src] = dst;
+  }
+
+  // ... and every local global to one of the trailing arguments.
+  for (int n = 0; dst != new_kernel->arg_end(); ++n, ++dst) {
+    dst->setName("_local" + Twine(n));
+    vv[locals[n]] = dst;
+  }
+
+  SmallVector<ReturnInst *, 1> ri;
+  CloneFunctionInto(new_kernel, F, vv, false, ri);
+
+  return new_kernel;
+}
+
diff --git a/src/llvmopencl/ImplicitLoopBarriers.cc b/src/llvmopencl/ImplicitLoopBarriers.cc
new file mode 100644
index 0000000..66dcdb3
--- /dev/null
+++ b/src/llvmopencl/ImplicitLoopBarriers.cc
@@ -0,0 +1,178 @@
+// LLVM function pass that adds implicit barriers to loops if it sees
+// beneficial.
+//
+// Copyright (c) 2012-2014 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#include "ImplicitLoopBarriers.h"
+#include "Barrier.h"
+#include "Workgroup.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#endif
+
+#include "VariableUniformityAnalysis.h"
+
+#include <iostream>
+
+//#define DEBUG_ILOOP_BARRIERS
+
+using namespace llvm;
+using namespace pocl;
+
+namespace {
+ static
+ RegisterPass<ImplicitLoopBarriers> X("implicit-loop-barriers",
+ "Adds implicit barriers to loops");
+}
+
+char ImplicitLoopBarriers::ID = 0;
+
+// Requires the dominator tree and the variable uniformity analysis and
+// declares both as preserved by this pass.
+void
+ImplicitLoopBarriers::getAnalysisUsage(AnalysisUsage &AU) const
+{
+ AU.addRequired<DominatorTree>();
+ AU.addPreserved<DominatorTree>();
+ AU.addRequired<VariableUniformityAnalysis>();
+ AU.addPreserved<VariableUniformityAnalysis>();
+}
+
+// Loop pass entry point. Loops in functions that are not kernels selected
+// for processing are left untouched; the others go through ProcessLoop().
+bool
+ImplicitLoopBarriers::runOnLoop(Loop *L, LPPassManager &LPM)
+{
+  const bool in_kernel =
+    Workgroup::isKernelToProcess(*L->getHeader()->getParent());
+  return in_kernel ? ProcessLoop(L, LPM) : false;
+}
+
+
+/**
+ * Tries to add an implicit barrier to a loop that has none.
+ *
+ * A loop that already contains a barrier in any of its blocks is left
+ * alone; every other loop is handed to AddInnerLoopBarrier().
+ *
+ * Note: it's not safe to do this in case the loop is not executed
+ * by all work items. Therefore this is not enabled by default.
+ *
+ * Returns the result of AddInnerLoopBarrier(), or false when the loop
+ * already had a barrier.
+ */
+bool
+ImplicitLoopBarriers::ProcessLoop(Loop *L, LPPassManager &LPM)
+{
+  for (Loop::block_iterator bb = L->block_begin(), bb_end = L->block_end();
+       bb != bb_end; ++bb) {
+    for (BasicBlock::iterator inst = (*bb)->begin(), inst_end = (*bb)->end();
+         inst != inst_end; ++inst) {
+      if (isa<Barrier>(inst))
+        return false;
+    }
+  }
+
+  return AddInnerLoopBarrier(L, LPM);
+}
+
+/**
+ * Adds a barrier to the beginning of the loop body to force its treatment
+ * similarly to a loop with work-group barriers.
+ *
+ * This allows parallelizing work-items across the work-group per kernel
+ * for-loop iteration, potentially leading to easier horizontal vectorization.
+ * The idea is similar to loop switching where the work-item loop is
+ * switched with the kernel for-loop.
+ *
+ * We need to make sure it is legal to add the barrier, though. The
+ * OpenCL barrier semantics require either all or none of the WIs to
+ * reach the barrier at each iteration. This is satisfied at least when
+ *
+ * a) loop exit condition does not depend on the WI and
+ * b) all or none of the WIs always enter the loop
+ *
+ * Returns true if a barrier was added, false if the loop was rejected
+ * (it has sub-loops, has no single exiting block, its header is not
+ * uniform, or its exit branch is not a conditional branch on a uniform
+ * condition).
+ *
+ * NOTE(review): the barrier is created at the terminator of the single
+ * exiting block, not at the loop header; confirm against Barrier::Create()
+ * that this matches the "beginning of the loop body" wording above.
+ */
+bool
+ImplicitLoopBarriers::AddInnerLoopBarrier(llvm::Loop *L, llvm::LPPassManager &LPM) {
+
+ /* Only add barriers to the innermost loops. */
+
+ if (L->getSubLoops().size() > 0)
+ return false;
+
+#ifdef DEBUG_ILOOP_BARRIERS
+ std::cerr << "### trying to add a loop barrier to force horizontal parallelization"
+ << std::endl;
+#endif
+
+ BasicBlock *brexit = L->getExitingBlock();
+ if (brexit == NULL) return false; /* Multiple exit points */
+
+ llvm::BasicBlock *loopEntry = L->getHeader();
+ if (loopEntry == NULL) return false; /* Multiple entries blocks? */
+
+ llvm::Function *f = brexit->getParent();
+
+ VariableUniformityAnalysis &VUA =
+ getAnalysis<VariableUniformityAnalysis>();
+
+ /* Check if the whole loop construct is executed by all or none of the
+ work-items. */
+ if (!VUA.isUniform(f, loopEntry)) {
+#ifdef DEBUG_ILOOP_BARRIERS
+ std::cerr << "### the loop is not uniform because loop entry '"
+ << loopEntry->getName().str() << "' is not uniform" << std::endl;
+
+#endif
+ return false;
+ }
+
+ /* Check the branch condition predicate. If it is uniform, we know the loop
+ is executed the same number of times for all WIs. */
+ llvm::BranchInst *br = dyn_cast<llvm::BranchInst>(brexit->getTerminator());
+ if (br && br->isConditional() &&
+ VUA.isUniform(f, br->getCondition())) {
+
+ /* Both legality conditions hold: create the barrier at the exiting
+ block's terminator. */
+ Barrier::Create(brexit->getTerminator());
+#ifdef DEBUG_ILOOP_BARRIERS
+ std::cerr << "### added an inner-loop barrier to the loop" << std::endl << std::endl;
+#endif
+ return true;
+ } else {
+ /* Nothing is changed on this path; it only reports the reason in
+ debug builds. */
+#ifdef DEBUG_ILOOP_BARRIERS
+ if (br && br->isConditional() && !VUA.isUniform(f, br->getCondition())) {
+ std::cerr << "### loop condition not uniform" << std::endl;
+ br->getCondition()->dump();
+ }
+#endif
+
+ }
+
+#ifdef DEBUG_ILOOP_BARRIERS
+ std::cerr << "### cannot add an inner-loop barrier to the loop" << std::endl << std::endl;
+#endif
+
+ return false;
+}
diff --git a/src/llvmopencl/ImplicitLoopBarriers.h b/src/llvmopencl/ImplicitLoopBarriers.h
new file mode 100644
index 0000000..e31a134
--- /dev/null
+++ b/src/llvmopencl/ImplicitLoopBarriers.h
@@ -0,0 +1,44 @@
+// Header for ImplicitLoopBarriers loop pass.
+//
+// Copyright (c) 2012-2014 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef POCL_IMPLICIT_LOOP_BARRIERS_H
+#define POCL_IMPLICIT_LOOP_BARRIERS_H
+
+#include "llvm/Analysis/LoopPass.h"
+#include <set>
+
+namespace pocl {
+  // Loop pass that adds an implicit barrier to innermost loops without
+  // barriers when the loop is entered uniformly and its exit condition is
+  // uniform across work-items.
+  class ImplicitLoopBarriers : public llvm::LoopPass {
+
+  public:
+    // Pass identification object handed to the LoopPass constructor.
+    static char ID;
+
+    ImplicitLoopBarriers() : LoopPass(ID) {}
+
+    virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+    virtual bool runOnLoop(llvm::Loop *L, llvm::LPPassManager &LPM);
+
+  private:
+    // NOTE(review): never assigned or read by the pass implementation in
+    // ImplicitLoopBarriers.cc; candidate for removal.
+    llvm::DominatorTree *DT;
+
+    // Skips loops that already contain a barrier, otherwise forwards to
+    // AddInnerLoopBarrier().
+    bool ProcessLoop(llvm::Loop *L, llvm::LPPassManager &LPM);
+    // Adds the barrier if it is legal; returns true if the loop changed.
+    bool AddInnerLoopBarrier(llvm::Loop *L, llvm::LPPassManager &LPM);
+
+  };
+}
+
+#endif
diff --git a/src/llvmopencl/IsolateRegions.cc b/src/llvmopencl/IsolateRegions.cc
new file mode 100644
index 0000000..b370aa4
--- /dev/null
+++ b/src/llvmopencl/IsolateRegions.cc
@@ -0,0 +1,175 @@
+// Implementation of the IsolateRegions RegionPass.
+//
+// Copyright (c) 2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "IsolateRegions.h"
+#include "Barrier.h"
+#include "Workgroup.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "config.h"
+
+#include <iostream>
+
+//#define DEBUG_ISOLATE_REGIONS
+using namespace llvm;
+using namespace pocl;
+
+namespace {
+ static
+ RegisterPass<IsolateRegions> X("isolate-regions",
+ "Single-Entry Single-Exit region isolation pass.");
+}
+
+char IsolateRegions::ID = 0;
+
+// Intentionally empty: no analysis is declared as required or preserved.
+void
+IsolateRegions::getAnalysisUsage(AnalysisUsage &AU) const
+{
+}
+
+/* Ensure Single-Entry Single-Exit Regions are isolated from the
+ exit node so they won't get split illegally with tail replication.
+
+ This might happen in case an if .. else .. structure is just
+ before an exit from kernel. Both branches are split even though
+ we would like to replicate the structure as a whole to retain
+ semantics. This adds dummy basic blocks to all Regions just for
+ clarity. Cleanup with -simplifycfg.
+
+ TODO: Also add a dummy BB in case the Region starts with a
+ barrier. Such a Region might not get optimally replicated and
+ can lead to problematic cases. E.g.:
+
+ digraph G {
+ BAR1 -> A;
+ A -> X;
+ BAR1 -> X;
+ X -> BAR2;
+ }
+
+ (draw with "dot -Tpng -o graph.png" + copy paste the above)
+
+ Here you have a structure which should be replicated fully but
+ it won't as the Region starts with a barrier at a split point
+ BB, thus it tries to replicate both of the branches which lead
+ to interesting errors and is not supported. Another option would
+ be to tail replicate both of the branches, but currently tail
+ replication is done only starting from the exit nodes.
+
+ IsolateRegions "normalizes" the graph to:
+
+ digraph G {
+ BAR1 -> r_entry;
+ r_entry -> A;
+ A -> X;
+ r_entry -> X;
+ X -> BAR2;
+ }
+
+
+*/
+// Isolates region R with dummy blocks: one in front of its exit block when
+// that block has a barrier or no successors, and one behind its entry
+// block when that block has a barrier or is the function's entry block.
+// Returns true if a dummy block was added.
+bool
+IsolateRegions::runOnRegion(Region *R, llvm::RGPassManager&)
+{
+ llvm::BasicBlock *exit = R->getExit();
+ if (exit == NULL) return false;
+
+#ifdef DEBUG_ISOLATE_REGIONS
+ std::cerr << "### processing region:" << std::endl;
+ R->dump();
+ std::cerr << "### exit block:" << std::endl;
+ exit->dump();
+#endif
+ // A terminator without successors means the block leaves the function.
+ bool isFunctionExit = exit->getTerminator()->getNumSuccessors() == 0;
+
+ bool changed = false;
+
+ if (Barrier::hasBarrier(exit) || isFunctionExit)
+ {
+ addDummyBefore(R, exit);
+ changed = true;
+ }
+
+ llvm::BasicBlock *entry = R->getEntry();
+ if (entry == NULL) return changed;
+
+ bool isFunctionEntry = &entry->getParent()->getEntryBlock() == entry;
+
+ if (Barrier::hasBarrier(entry) || isFunctionEntry)
+ {
+ addDummyAfter(R, entry);
+ changed = true;
+ }
+
+ return changed;
+}
+
+
+/**
+ * Adds a dummy node after the given basic block.
+ *
+ * bb is split at its terminator; the block returned by SplitBlock() is
+ * named "<bb name>.r_entry" and becomes the new entry of region R.
+ */
+void
+IsolateRegions::addDummyAfter(llvm::Region *R, llvm::BasicBlock *bb)
+{
+  llvm::BasicBlock* newEntry =
+    SplitBlock(bb, bb->getTerminator(), this);
+  newEntry->setName(bb->getName() + ".r_entry");
+  R->replaceEntry(newEntry);
+}
+
+/**
+ * Adds a dummy node before the given basic block.
+ *
+ * The edges going in to the original BB are moved to go
+ * in to the dummy BB in case the source BB is inside the
+ * same region.
+ *
+ * The dummy block (name suffix ".r_exit") becomes the new exit of R.
+ */
+void
+IsolateRegions::addDummyBefore(llvm::Region *R, llvm::BasicBlock *bb)
+{
+ std::vector< llvm::BasicBlock* > regionPreds;
+
+ // Only predecessors that belong to R are redirected to the dummy block.
+ for (pred_iterator i = pred_begin(bb), e = pred_end(bb);
+ i != e; ++i) {
+ llvm::BasicBlock* pred = *i;
+ if (R->contains(pred))
+ regionPreds.push_back(pred);
+ }
+#ifdef LLVM_3_0
+ // NOTE(review): &regionPreds[0] is taken even if no predecessor is in
+ // the region -- confirm that case cannot occur.
+ llvm::BasicBlock* newExit =
+ SplitBlockPredecessors
+ (bb, &regionPreds[0], regionPreds.size(), ".r_exit", this);
+#else
+ llvm::BasicBlock* newExit =
+ SplitBlockPredecessors(bb, regionPreds, ".r_exit", this);
+#endif
+ R->replaceExit(newExit);
+}
diff --git a/src/llvmopencl/IsolateRegions.h b/src/llvmopencl/IsolateRegions.h
new file mode 100644
index 0000000..62f6a29
--- /dev/null
+++ b/src/llvmopencl/IsolateRegions.h
@@ -0,0 +1,44 @@
+// Header for IsolateRegions RegionPass.
+//
+// Copyright (c) 2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef POCL_ISOLATE_REGIONS_H
+#define POCL_ISOLATE_REGIONS_H
+
+#include "llvm/Analysis/RegionPass.h"
+
+namespace pocl {
+
+ // Region pass that isolates regions from barrier blocks and from the
+ // function entry/exit by inserting dummy basic blocks.
+ class IsolateRegions : public llvm::RegionPass {
+ public:
+ // Pass identification object handed to the RegionPass constructor.
+ static char ID;
+
+ IsolateRegions() : RegionPass(ID) {}
+
+ virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+ virtual bool runOnRegion(llvm::Region *R, llvm::RGPassManager&);
+ // Splits bb at its terminator and makes the new block the entry of R.
+ void addDummyAfter(llvm::Region *R, llvm::BasicBlock *bb);
+ // Inserts a block in front of bb for the predecessors inside R and makes
+ // it the exit of R.
+ void addDummyBefore(llvm::Region *R, llvm::BasicBlock *bb);
+
+ };
+}
+
+#endif
diff --git a/src/llvmopencl/Kernel.cc b/src/llvmopencl/Kernel.cc
new file mode 100644
index 0000000..03e08b8
--- /dev/null
+++ b/src/llvmopencl/Kernel.cc
@@ -0,0 +1,297 @@
+// Class for kernels, llvm::Functions that represent OpenCL C kernels.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos and
+// 2012 Pekka Jääskeläinen / TUT
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "Kernel.h"
+#include "Barrier.h"
+#include <iostream>
+
+#include "config.h"
+#ifdef LLVM_3_1
+#include "llvm/Support/IRBuilder.h"
+#elif defined LLVM_3_2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif
+
+//#define DEBUG_PR_CREATION
+
+using namespace llvm;
+using namespace pocl;
+
+static void add_predecessors(SmallVectorImpl<BasicBlock *> &v,
+ BasicBlock *b);
+static bool verify_no_barriers(const BasicBlock *B);
+
+// Appends to B every block of the kernel whose terminator has no
+// successors. Each of them is cast<> to BarrierBlock.
+void
+Kernel::getExitBlocks(SmallVectorImpl<BarrierBlock *> &B)
+{
+  for (iterator bb = begin(), last = end(); bb != last; ++bb) {
+    if (bb->getTerminator()->getNumSuccessors() != 0)
+      continue;
+    // All exits must be barrier blocks.
+    B.push_back(cast<BarrierBlock>(bb));
+  }
+}
+
+// Collects the blocks that lie between barrier block B and the barrier(s)
+// reached by walking predecessor edges upwards from B, and wraps them in a
+// ParallelRegion.
+//
+// Returns NULL when no non-barrier block precedes B. The region's exit is
+// B's single predecessor; its entry is the first successor of the first
+// barrier reached that belongs to the region.
+ParallelRegion *
+Kernel::createParallelRegionBefore(BarrierBlock *B)
+{
+  SmallVector<BasicBlock *, 4> pending_blocks;
+  SmallPtrSet<BasicBlock *, 8> blocks_in_region;
+  BarrierBlock *region_entry_barrier = NULL;
+  llvm::BasicBlock *entry = NULL;
+  llvm::BasicBlock *exit = B->getSinglePredecessor();
+  add_predecessors(pending_blocks, B);
+
+#ifdef DEBUG_PR_CREATION
+  std::cerr << "createParallelRegionBefore " << B->getName().str() << std::endl;
+#endif
+
+  // Backwards worklist traversal that stops at barrier blocks.
+  while (!pending_blocks.empty()) {
+    BasicBlock *current = pending_blocks.back();
+    pending_blocks.pop_back();
+
+#ifdef DEBUG_PR_CREATION
+    std::cerr << "considering " << current->getName().str() << std::endl;
+#endif
+
+    // avoid infinite recursion of loops
+    if (blocks_in_region.count(current) != 0)
+      {
+#ifdef DEBUG_PR_CREATION
+        std::cerr << "already in the region!" << std::endl;
+#endif
+        continue;
+      }
+
+    // If we reach another barrier this must be the
+    // parallel region entry.
+    if (isa<BarrierBlock>(current)) {
+      if (region_entry_barrier == NULL)
+        region_entry_barrier = cast<BarrierBlock>(current);
+#ifdef DEBUG_PR_CREATION
+      std::cerr << "### it's a barrier!" << std::endl;
+#endif
+      continue;
+    }
+
+    assert(verify_no_barriers(current) &&
+           "Barrier found in a non-barrier block! (forgot barrier canonicalization?)");
+
+#ifdef DEBUG_PR_CREATION
+    std::cerr << "added it to the region" << std::endl;
+#endif
+    // Non-barrier block, this must be on the region.
+    blocks_in_region.insert(current);
+
+    // Add predecessors to pending queue.
+    add_predecessors(pending_blocks, current);
+  }
+
+  if (blocks_in_region.empty())
+    return NULL;
+
+  // Find the entry node.
+  assert (region_entry_barrier != NULL);
+  for (unsigned suc = 0, num = region_entry_barrier->getTerminator()->getNumSuccessors();
+       suc < num; ++suc)
+    {
+      llvm::BasicBlock *entryCandidate =
+        region_entry_barrier->getTerminator()->getSuccessor(suc);
+      if (blocks_in_region.count(entryCandidate) == 0)
+        continue;
+      entry = entryCandidate;
+      break;
+    }
+  assert (blocks_in_region.count(entry) != 0);
+
+  // We got all the blocks in a region, create it.
+  return ParallelRegion::Create(blocks_in_region, entry, exit);
+}
+
+// Appends the predecessors of b to v. When b is a barrier block, a
+// predecessor that is itself a barrier block is not appended; its own
+// predecessors are added recursively instead.
+// NOTE(review): the recursion has no visited set -- presumably barrier
+// blocks never form a cycle of barrier-to-barrier edges; confirm.
+static void
+add_predecessors(SmallVectorImpl<BasicBlock *> &v, BasicBlock *b)
+{
+ for (pred_iterator i = pred_begin(b), e = pred_end(b);
+ i != e; ++i) {
+ if ((isa<BarrierBlock> (*i)) && isa<BarrierBlock> (b)) {
+ // Ignore barrier-to-barrier edges * Why? --Pekka
+ add_predecessors(v, *i);
+ continue;
+ }
+ v.push_back(*i);
+ }
+}
+
+// Returns true if no instruction of B is a Barrier.
+static bool
+verify_no_barriers(const BasicBlock *B)
+{
+  BasicBlock::const_iterator inst = B->begin();
+  BasicBlock::const_iterator last = B->end();
+
+  // Advance to the first barrier, or to the end if there is none.
+  while (inst != last && !isa<Barrier>(inst))
+    ++inst;
+
+  return inst == last;
+}
+
+// Finds all parallel regions of the kernel by starting at each exit block
+// and repeatedly creating the region in front of the current barrier.
+//
+// LI is used to recognise barriers that are in the same loop as a region's
+// entry block; those are processed before barriers outside the loop.
+// Returns a vector allocated with new.
+ParallelRegion::ParallelRegionVector *
+Kernel::getParallelRegions(llvm::LoopInfo *LI) {
+  ParallelRegion::ParallelRegionVector *parallel_regions =
+    new ParallelRegion::ParallelRegionVector;
+
+  SmallVector<BarrierBlock *, 4> exit_blocks;
+  getExitBlocks(exit_blocks);
+
+  // We need to keep track of traversed barriers to detect back edges.
+  SmallPtrSet<BarrierBlock *, 8> found_barriers;
+
+  // First find all the ParallelRegions in the Function.
+  while (!exit_blocks.empty()) {
+
+    // We start on an exit block and process the parallel regions upwards
+    // (finding an execution trace).
+    BarrierBlock *exit = exit_blocks.back();
+    exit_blocks.pop_back();
+
+    while (ParallelRegion *PR = createParallelRegionBefore(exit)) {
+      assert(PR != NULL && !PR->empty() &&
+             "Empty parallel region in kernel (contiguous barriers)!");
+
+      found_barriers.insert(exit);
+      exit = NULL;
+      parallel_regions->push_back(PR);
+      BasicBlock *entry = PR->entryBB();
+      int found_predecessors = 0;
+      BarrierBlock *loop_barrier = NULL;
+      for (pred_iterator i = pred_begin(entry), e = pred_end(entry);
+           i != e; ++i) {
+        BarrierBlock *barrier = cast<BarrierBlock> (*i);
+        if (!found_barriers.count(barrier)) {
+          /* If this is a loop header block we might have edges from two
+             unprocessed barriers. The one inside the loop (coming from a
+             computation block after a branch block) should be processed
+             first. */
+          const bool IS_IN_THE_SAME_LOOP =
+            LI->getLoopFor(barrier) != NULL &&
+            LI->getLoopFor(entry) != NULL &&
+            LI->getLoopFor(entry) == LI->getLoopFor(barrier);
+
+          if (IS_IN_THE_SAME_LOOP)
+            {
+#ifdef DEBUG_PR_CREATION
+              std::cout << "### found a barrier inside the loop:" << std::endl;
+              std::cout << barrier->getName().str() << std::endl;
+#endif
+              if (loop_barrier != NULL) {
+                // there can be multiple latches and each have their barrier,
+                // save the previously found inner loop barrier
+                exit_blocks.push_back(loop_barrier);
+              }
+              loop_barrier = barrier;
+            }
+          else
+            {
+#ifdef DEBUG_PR_CREATION
+              std::cout << "### found a barrier:" << std::endl;
+              std::cout << barrier->getName().str() << std::endl;
+#endif
+              exit = barrier;
+            }
+          ++found_predecessors;
+        }
+      }
+
+      if (loop_barrier != NULL)
+        {
+          /* The secondary barrier to process in case it was a loop
+             header. Push it for later processing. */
+          if (exit != NULL)
+            exit_blocks.push_back(exit);
+          /* always process the inner loop regions first */
+          if (!found_barriers.count(loop_barrier))
+            exit = loop_barrier;
+        }
+
+#ifdef DEBUG_PR_CREATION
+      std::cout << "### created a ParallelRegion:" << std::endl;
+      PR->dumpNames();
+      std::cout << std::endl;
+#endif
+
+      if (found_predecessors == 0)
+        {
+          /* This path has been traversed and we encountered no more
+             unprocessed regions. It means we have either traversed all
+             paths from the exit or have transformed a loop and thus
+             encountered only a barrier that was seen (and thus
+             processed) before. */
+          break;
+        }
+      assert ((exit != NULL) && "Parallel region without entry barrier!");
+    }
+  }
+  return parallel_regions;
+
+}
+
+// Inserts, at the first non-PHI instruction of the kernel's entry block,
+// code that calls get_local_size(0..2) and stores the results into the
+// module globals _local_size_x, _local_size_y and _local_size_z.
+//
+// The LocalSizeX/Y/Z parameters are not used: the sizes are obtained at
+// run time from get_local_size() instead.
+//
+// get_local_size is declared as "size_t get_local_size(uint)": the result
+// has the target's pointer width, the dimension index is always 32 bits.
+// On 32-bit targets this is the prototype used before; on 64-bit targets
+// it keeps the call well-typed.
+void
+Kernel::addLocalSizeInitCode(size_t LocalSizeX, size_t LocalSizeY, size_t LocalSizeZ) {
+
+  IRBuilder<> builder(getEntryBlock().getFirstNonPHI());
+
+  llvm::Module* M = getParent();
+
+  int size_t_width = 32;
+  if (M->getPointerSize() == llvm::Module::Pointer64)
+    size_t_width = 64;
+
+  Type *uint_type = IntegerType::get(M->getContext(), 32);
+  Type *size_t_type = IntegerType::get(M->getContext(), size_t_width);
+
+  FunctionType *ft = FunctionType::get
+    (/*Result=*/ size_t_type,
+     /*Params=*/ uint_type,
+     /*isVarArg=*/ false);
+
+  // The callee is used as returned: if the module already declares
+  // get_local_size with another prototype this is a cast of that function
+  // rather than a Function, which must not be turned into a NULL callee.
+  Constant *localsize = M->getOrInsertFunction("get_local_size", ft);
+
+  static const char *const size_globals[3] = {
+    "_local_size_x", "_local_size_y", "_local_size_z"
+  };
+
+  for (unsigned dim = 0; dim != 3; ++dim) {
+    GlobalVariable *gv = M->getGlobalVariable(size_globals[dim]);
+    // A module that does not define the global has nothing to initialise.
+    if (gv == NULL)
+      continue;
+    builder.CreateStore(builder.CreateCall(localsize,
+                                           ConstantInt::get(uint_type, dim)),
+                        gv);
+  }
+}
+
diff --git a/src/llvmopencl/Kernel.h b/src/llvmopencl/Kernel.h
new file mode 100644
index 0000000..5337b54
--- /dev/null
+++ b/src/llvmopencl/Kernel.h
@@ -0,0 +1,54 @@
+// Class for kernels, a special kind of function.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_KERNEL_H
+#define _POCL_KERNEL_H
+
+#include "ParallelRegion.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+
+namespace pocl {
+
+ class Kernel : public llvm::Function { // a kernel viewed as an llvm::Function; declares no data members
+ public:
+ void getExitBlocks(llvm::SmallVectorImpl<BarrierBlock *> &B); // presumably collects the exit barrier blocks into B -- definition not visible here, confirm
+ ParallelRegion *createParallelRegionBefore(BarrierBlock *B);
+
+ ParallelRegion::ParallelRegionVector*
+ getParallelRegions(llvm::LoopInfo *LI); // returns the vector of parallel regions it created
+
+ void addLocalSizeInitCode(size_t LocalSizeX, size_t LocalSizeY, size_t LocalSizeZ); // emits get_local_size() calls at kernel entry and stores them to _local_size_{x,y,z}; the arguments are not read
+
+ static bool isKernel(const llvm::Function &F);
+
+ static bool classof(const Kernel *) { return true; }
+ // We assume any function can be a kernel, so this always returns true.
+ // It could instead check for metadata, but would then need to be
+ // overridable to honor the forced kernel name(s) command line parameter.
+ static bool classof(const llvm::Function *) { return true; }
+ };
+
+}
+
+#endif
diff --git a/src/llvmopencl/LLVMUtils.cc b/src/llvmopencl/LLVMUtils.cc
new file mode 100644
index 0000000..aeb02d7
--- /dev/null
+++ b/src/llvmopencl/LLVMUtils.cc
@@ -0,0 +1,90 @@
+// Implementation of LLVMUtils, useful common LLVM-related functionality.
+//
+// Copyright (c) 2013 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "LLVMUtils.h"
+
+#include "config.h"
+
+#ifdef LLVM_3_2
+#include <llvm/Module.h>
+#include <llvm/Metadata.h>
+#else
+#include <llvm/IR/Module.h>
+#include <llvm/IR/Metadata.h>
+#endif
+
+using namespace llvm;
+
+/**
+ * Regenerates the metadata that points to the original kernels (whose
+ * signatures were modified) so that it points to the new kernels.
+ *
+ * @param M        Module whose named metadata is updated in place.
+ * @param kernels  Mapping from each original kernel to its replacement.
+ *
+ * For "opencl.kernel_wg_size_info", every node whose first operand is an
+ * original kernel gets a copy appended that names the replacement kernel
+ * instead; the remaining operands are copied unchanged.  Only the first
+ * operand of a node is inspected, and entries mapping a kernel to itself
+ * are ignored.
+ *
+ * "opencl.kernels" is erased and rebuilt with one node per replacement
+ * kernel in the mapping.
+ */
+void
+regenerate_kernel_metadata(llvm::Module &M, FunctionMapping &kernels)
+{
+  // reproduce the opencl.kernel_wg_size_info metadata
+  NamedMDNode *wg_sizes = M.getNamedMetadata("opencl.kernel_wg_size_info");
+  if (wg_sizes != NULL)
+    {
+      // Operands are appended inside the loop, so fix the bound up front:
+      // only the nodes that existed on entry are examined.
+      const unsigned original_count = wg_sizes->getNumOperands();
+      for (unsigned mni = 0; mni < original_count; ++mni)
+        {
+          MDNode *wgsizeMD =
+            dyn_cast_or_null<MDNode>(wg_sizes->getOperand(mni));
+          if (wgsizeMD == NULL || wgsizeMD->getNumOperands() == 0)
+            continue;
+
+          // The operand may be NULL or may not be a function at all;
+          // dyn_cast_or_null tolerates both.
+          Function *referenced =
+            dyn_cast_or_null<Function>(wgsizeMD->getOperand(0));
+          if (referenced == NULL)
+            continue;
+
+          FunctionMapping::const_iterator hit = kernels.find(referenced);
+          if (hit == kernels.end() || hit->second == referenced)
+            continue;
+
+          // found a wg size metadata that points to the old kernel, copy its
+          // operands except the first one to a new MDNode
+          SmallVector<Value*, 8> operands;
+          operands.push_back(hit->second);
+          for (unsigned opr = 1; opr < wgsizeMD->getNumOperands(); ++opr)
+            {
+              operands.push_back(wgsizeMD->getOperand(opr));
+            }
+          MDNode *new_wg_md = MDNode::get(M.getContext(), operands);
+          wg_sizes->addOperand(new_wg_md);
+        }
+    }
+
+  // reproduce the opencl.kernels metadata
+  NamedMDNode *nmd = M.getNamedMetadata("opencl.kernels");
+  if (nmd)
+    M.eraseNamedMetadata(nmd);
+
+  nmd = M.getOrInsertNamedMetadata("opencl.kernels");
+  for (FunctionMapping::const_iterator i = kernels.begin(),
+         e = kernels.end();
+       i != e; ++i) {
+    MDNode *md = MDNode::get(M.getContext(), ArrayRef<Value *>((*i).second));
+    nmd->addOperand(md);
+  }
+}
+
+
diff --git a/src/llvmopencl/LLVMUtils.h b/src/llvmopencl/LLVMUtils.h
new file mode 100644
index 0000000..e6a89db
--- /dev/null
+++ b/src/llvmopencl/LLVMUtils.h
@@ -0,0 +1,38 @@
+// Header for LLVMUtils, useful common LLVM-related functionality.
+//
+// Copyright (c) 2013 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_LLVM_UTILS_H
+#define _POCL_LLVM_UTILS_H
+
+#include <map>
+
+namespace llvm {
+ class Module;
+ class Function;
+}
+
+typedef std::map<llvm::Function*, llvm::Function*> FunctionMapping; // key: original kernel, value: the kernel that replaces it
+
+void
+regenerate_kernel_metadata(llvm::Module &M, FunctionMapping &kernels); // updates opencl.kernel_wg_size_info and rebuilds opencl.kernels in M for the replacement kernels
+
+#endif
diff --git a/src/llvmopencl/LoopBarriers.cc b/src/llvmopencl/LoopBarriers.cc
new file mode 100644
index 0000000..5e4965f
--- /dev/null
+++ b/src/llvmopencl/LoopBarriers.cc
@@ -0,0 +1,194 @@
+// LLVM loop pass that adds required barriers to loops.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+// 2012-2014 Pekka Jääskeläinen / Tampere University of Technology
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#endif
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <iostream>
+
+#include "LoopBarriers.h"
+#include "Barrier.h"
+#include "Workgroup.h"
+
+//#define DEBUG_LOOP_BARRIERS
+
+using namespace llvm;
+using namespace pocl;
+
+namespace { // file-local registration object for the pass
+ static
+ RegisterPass<LoopBarriers> X("loop-barriers", // name the pass is registered under
+ "Add needed barriers to loops"); // description string given to RegisterPass
+}
+
+char LoopBarriers::ID = 0; // passed to the LoopPass base class by the LoopBarriers constructor
+
+// Declares the analysis dependencies of the pass: the dominator tree is
+// requested as an input and reported as preserved.
+void LoopBarriers::getAnalysisUsage(AnalysisUsage &AU) const
+{
+  AU.addRequired<DominatorTree>();
+  AU.addPreserved<DominatorTree>();
+}
+
+// Pass entry point.  Loops that live in a function which is not a kernel
+// to process are left alone.  Otherwise the loop is handed to
+// ProcessLoop() and the dominator tree is verified afterwards.
+// Returns whatever ProcessLoop() reported.
+bool LoopBarriers::runOnLoop(Loop *L, LPPassManager &LPM)
+{
+  const bool eligible =
+    Workgroup::isKernelToProcess(*L->getHeader()->getParent());
+  if (!eligible)
+    return false;
+
+  DT = &getAnalysis<DominatorTree>();
+  const bool modified = ProcessLoop(L, LPM);
+  DT->verifyAnalysis();
+  return modified;
+}
+
+
+/**
+ * Adds the barriers a loop needs.
+ *
+ * If the loop contains a barrier, barriers are added:
+ *   - at the end of the preheader, so that all work-items reach the loop
+ *     header with all the previous code already executed,
+ *   - after the PHI nodes of the header (only if the header starts with
+ *     PHI nodes),
+ *   - at the end of the exiting block, if the loop has a single one,
+ *   - at the end of the latch(es).
+ * The affected blocks are renamed to record which barrier they received.
+ *
+ * If the loop contains no barrier, the preheader is split when it ends in
+ * a barrier so that the loop gets a non-barrier preheader and can be
+ * replicated as a whole.
+ *
+ * @return true if the function was modified, false otherwise.  Every path
+ *         that inserts barriers reports true, including the single-latch
+ *         path.
+ */
+bool
+LoopBarriers::ProcessLoop(Loop *L, LPPassManager &LPM)
+{
+  // Locate the first barrier of the loop, in loop block order.
+  Instruction *barrier = NULL;
+  for (Loop::block_iterator bi = L->block_begin(), be = L->block_end();
+       bi != be && barrier == NULL; ++bi) {
+    for (BasicBlock::iterator ii = (*bi)->begin(), ie = (*bi)->end();
+         ii != ie; ++ii) {
+      if (isa<Barrier>(ii)) {
+        barrier = &*ii;
+        break;
+      }
+    }
+  }
+
+  if (barrier != NULL) {
+    // Found a barrier in this loop:
+    // 1) add a barrier in the loop header.
+    // 2) add a barrier in the latches
+
+    // Add a barrier on the preheader to ensure all WIs reach
+    // the loop header with all the previous code already
+    // executed.
+    BasicBlock *preheader = L->getLoopPreheader();
+    assert((preheader != NULL) && "Non-canonicalized loop found!\n");
+#ifdef DEBUG_LOOP_BARRIERS
+    std::cerr << "### adding to preheader BB" << std::endl;
+    preheader->dump();
+    std::cerr << "### before instr" << std::endl;
+    preheader->getTerminator()->dump();
+#endif
+    Barrier::Create(preheader->getTerminator());
+    preheader->setName(preheader->getName() + ".loopbarrier");
+
+    // Add a barrier after the PHI nodes on the header (the replicated
+    // headers will be merged afterwards).  Splitting the header here to
+    // create a replicable region of the loop contents is intentionally
+    // not done.
+    BasicBlock *header = L->getHeader();
+    if (header->getFirstNonPHI() != &header->front()) {
+      Barrier::Create(header->getFirstNonPHI());
+      header->setName(header->getName() + ".phibarrier");
+    }
+
+    // Add the barriers on the exiting block and the latches,
+    // which might not always be the same if there is computation
+    // after the exit decision.
+    BasicBlock *brexit = L->getExitingBlock();
+    if (brexit != NULL) {
+      Barrier::Create(brexit->getTerminator());
+      brexit->setName(brexit->getName() + ".brexitbarrier");
+    }
+
+    BasicBlock *latch = L->getLoopLatch();
+    if (latch != NULL && brexit != latch) {
+      // This loop has only one latch. Do not check for dominance, we
+      // are probably running before BTR.
+      Barrier::Create(latch->getTerminator());
+      latch->setName(latch->getName() + ".latchbarrier");
+      // Barriers were inserted above, so the IR has changed.
+      return true;
+    }
+
+    // Modified code from llvm::LoopBase::getLoopLatch to
+    // go through all the latches.
+    typedef GraphTraits<Inverse<BasicBlock *> > InvBlockTraits;
+    InvBlockTraits::ChildIteratorType PI = InvBlockTraits::child_begin(header);
+    InvBlockTraits::ChildIteratorType PE = InvBlockTraits::child_end(header);
+    for (; PI != PE; ++PI) {
+      InvBlockTraits::NodeType *pred = *PI;
+      if (!L->contains(pred))
+        continue;
+      // Latch found in the loop, see if the barrier dominates it
+      // (otherwise it might not even belong to this "tail", see
+      // forifbarrier1 graph test).
+      if (DT->dominates(barrier->getParent(), pred)) {
+        Barrier::Create(pred->getTerminator());
+        pred->setName(pred->getName() + ".latchbarrier");
+      }
+    }
+    return true;
+  }
+
+  /* This is a loop without a barrier. Ensure we have a non-barrier
+     block as a preheader so we can replicate the loop as a whole.
+
+     If the block has proper instructions after the barrier, it
+     will be split in CanonicalizeBarriers. */
+  BasicBlock *preheader = L->getLoopPreheader();
+  assert((preheader != NULL) && "Non-canonicalized loop found!\n");
+  TerminatorInst *t = preheader->getTerminator();
+  Instruction *prev = NULL;
+  if (&preheader->front() != t)
+    prev = t->getPrevNode();
+  if (prev && isa<Barrier>(prev))
+    {
+      BasicBlock *new_b = SplitBlock(preheader, t, this);
+      new_b->setName(preheader->getName() + ".postbarrier_dummy");
+      return true;
+    }
+
+  return false;
+}
+
+
diff --git a/src/llvmopencl/LoopBarriers.h b/src/llvmopencl/LoopBarriers.h
new file mode 100644
index 0000000..6d80de6
--- /dev/null
+++ b/src/llvmopencl/LoopBarriers.h
@@ -0,0 +1,47 @@
+// Header for LoopBarriers.cc loop pass.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef POCL_LOOP_BARRIERS_H
+#define POCL_LOOP_BARRIERS_H
+
+#include "llvm/Analysis/LoopPass.h"
+#include <set>
+
+namespace pocl {
+ class LoopBarriers : public llvm::LoopPass { // loop pass that adds the barriers needed by loops
+
+ public:
+ static char ID; // passed to the LoopPass constructor below
+
+ LoopBarriers() : LoopPass(ID) {}
+
+ virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const; // declares the analyses the pass requires and preserves
+ virtual bool runOnLoop(llvm::Loop *L, llvm::LPPassManager &LPM); // pass entry point, called with one loop L
+
+ private:
+ llvm::DominatorTree *DT; // not initialized by the constructor; must be assigned before use
+
+ bool ProcessLoop(llvm::Loop *L, llvm::LPPassManager &LPM); // does the barrier insertion for one loop
+ };
+}
+
+#endif
diff --git a/src/llvmopencl/Makefile.am b/src/llvmopencl/Makefile.am
new file mode 100644
index 0000000..881a35c
--- /dev/null
+++ b/src/llvmopencl/Makefile.am
@@ -0,0 +1,53 @@
+# Process this file with automake to produce Makefile.in (in this,
+# and all subdirectories).
+# Makefile.am for pocl/lib/llvmopencl.
+#
+# Copyright (c) 2011 Universidad Rey Juan Carlos
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+pkglib_LTLIBRARIES = llvmopencl.la
+
+AM_CXXFLAGS = -I@top_srcdir@/fix-include -I@top_srcdir@/include `@LLVM_CONFIG@ --cxxflags`
+AM_LDFLAGS = -module -export-dynamic -version-info ${KERNEL_COMPILER_LIB_VERSION} `@LLVM_CONFIG@ --ldflags`
+llvmopencl_la_LIBADD = -lLLVM-${LLVM_VERSION}
+
+llvmopencl_la_SOURCES = Barrier.h \
+ BarrierBlock.h BarrierBlock.cc \
+ Kernel.h Kernel.cc \
+ ParallelRegion.h ParallelRegion.cc \
+ CanonicalizeBarriers.h CanonicalizeBarriers.cc \
+ LoopBarriers.h LoopBarriers.cc \
+ GenerateHeader.cc Workgroup.h Workgroup.cc \
+ BarrierTailReplication.h BarrierTailReplication.cc \
+ Flatten.cc IsolateRegions.h IsolateRegions.cc \
+ WorkitemReplication.h WorkitemReplication.cc \
+ ImplicitLoopBarriers.h ImplicitLoopBarriers.cc \
+ WorkItemAliasAnalysis.cc WIVectorize.cc \
+ WorkitemHandler.h WorkitemHandler.cc \
+ WorkitemLoops.h WorkitemLoops.cc \
+ PHIsToAllocas.h PHIsToAllocas.cc \
+ BreakConstantGEPs.h BreakConstantGEPs.cpp \
+ WorkitemHandlerChooser.h WorkitemHandlerChooser.cc \
+ AllocasToEntry.h AllocasToEntry.cc \
+ TargetAddressSpaces.h TargetAddressSpaces.cc \
+ LLVMUtils.cc LLVMUtils.h \
+ VariableUniformityAnalysis.h VariableUniformityAnalysis.cc
+
+#llvmopencl_la_LIBADD += @LIBS_LLVMTRANSFORMUTILS@
diff --git a/src/llvmopencl/Makefile.in b/src/llvmopencl/Makefile.in
new file mode 100644
index 0000000..e4dd24b
--- /dev/null
+++ b/src/llvmopencl/Makefile.in
@@ -0,0 +1,822 @@
+# Makefile.in generated by automake 1.14 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2014 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# Process this file with automake to produce Makefile.in (in this,
+# and all subdirectories).
+# Makefile.am for pocl/lib/llvmopencl.
+#
+# Copyright (c) 2011 Universidad Rey Juan Carlos
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+VPATH = @srcdir@
+am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
+am__make_running_with_option = \
+ case $${target_option-} in \
+ ?) ;; \
+ *) echo "am__make_running_with_option: internal error: invalid" \
+ "target option '$${target_option-}' specified" >&2; \
+ exit 1;; \
+ esac; \
+ has_opt=no; \
+ sane_makeflags=$$MAKEFLAGS; \
+ if $(am__is_gnu_make); then \
+ sane_makeflags=$$MFLAGS; \
+ else \
+ case $$MAKEFLAGS in \
+ *\\[\ \ ]*) \
+ bs=\\; \
+ sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+ | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
+ esac; \
+ fi; \
+ skip_next=no; \
+ strip_trailopt () \
+ { \
+ flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+ }; \
+ for flg in $$sane_makeflags; do \
+ test $$skip_next = yes && { skip_next=no; continue; }; \
+ case $$flg in \
+ *=*|--*) continue;; \
+ -*I) strip_trailopt 'I'; skip_next=yes;; \
+ -*I?*) strip_trailopt 'I';; \
+ -*O) strip_trailopt 'O'; skip_next=yes;; \
+ -*O?*) strip_trailopt 'O';; \
+ -*l) strip_trailopt 'l'; skip_next=yes;; \
+ -*l?*) strip_trailopt 'l';; \
+ -[dEDm]) skip_next=yes;; \
+ -[JT]) skip_next=yes;; \
+ esac; \
+ case $$flg in \
+ *$$target_option*) has_opt=yes; break;; \
+ esac; \
+ done; \
+ test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+target_triplet = @target@
+subdir = lib/llvmopencl
+DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
+ $(top_srcdir)/config/depcomp
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_boost_base.m4 \
+ $(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
+ $(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
+ $(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/acinclude.m4 \
+ $(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+ $(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
+am__vpath_adj = case $$p in \
+ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
+ *) f=$$p;; \
+ esac;
+am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
+am__install_max = 40
+am__nobase_strip_setup = \
+ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
+am__nobase_strip = \
+ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
+am__nobase_list = $(am__nobase_strip_setup); \
+ for p in $$list; do echo "$$p $$p"; done | \
+ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
+ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
+ if (++n[$$2] == $(am__install_max)) \
+ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
+ END { for (dir in files) print dir, files[dir] }'
+am__base_list = \
+ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
+ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+ test -z "$$files" \
+ || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+ $(am__cd) "$$dir" && rm -f $$files; }; \
+ }
+am__installdirs = "$(DESTDIR)$(pkglibdir)"
+LTLIBRARIES = $(pkglib_LTLIBRARIES)
+llvmopencl_la_DEPENDENCIES =
+am_llvmopencl_la_OBJECTS = BarrierBlock.lo Kernel.lo ParallelRegion.lo \
+ CanonicalizeBarriers.lo LoopBarriers.lo GenerateHeader.lo \
+ Workgroup.lo BarrierTailReplication.lo Flatten.lo \
+ IsolateRegions.lo WorkitemReplication.lo \
+ ImplicitLoopBarriers.lo WorkItemAliasAnalysis.lo \
+ WIVectorize.lo WorkitemHandler.lo WorkitemLoops.lo \
+ PHIsToAllocas.lo BreakConstantGEPs.lo \
+ WorkitemHandlerChooser.lo AllocasToEntry.lo \
+ TargetAddressSpaces.lo LLVMUtils.lo \
+ VariableUniformityAnalysis.lo
+llvmopencl_la_OBJECTS = $(am_llvmopencl_la_OBJECTS)
+AM_V_lt = $(am__v_lt_@AM_V@)
+am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
+am__v_lt_0 = --silent
+am__v_lt_1 =
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_@AM_V@)
+am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
+am__v_GEN_0 = @echo " GEN " $@;
+am__v_GEN_1 =
+AM_V_at = $(am__v_at_@AM_V@)
+am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 =
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/config/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS)
+LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
+ $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \
+ $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+ $(AM_CXXFLAGS) $(CXXFLAGS)
+AM_V_CXX = $(am__v_CXX_@AM_V@)
+am__v_CXX_ = $(am__v_CXX_@AM_DEFAULT_V@)
+am__v_CXX_0 = @echo " CXX " $@;
+am__v_CXX_1 =
+CXXLD = $(CXX)
+CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
+ $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \
+ $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+AM_V_CXXLD = $(am__v_CXXLD_@AM_V@)
+am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@)
+am__v_CXXLD_0 = @echo " CXXLD " $@;
+am__v_CXXLD_1 =
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+ $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
+ $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+ $(AM_CFLAGS) $(CFLAGS)
+AM_V_CC = $(am__v_CC_@AM_V@)
+am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
+am__v_CC_0 = @echo " CC " $@;
+am__v_CC_1 =
+CCLD = $(CC)
+LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+ $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+ $(AM_LDFLAGS) $(LDFLAGS) -o $@
+AM_V_CCLD = $(am__v_CCLD_@AM_V@)
+am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
+am__v_CCLD_0 = @echo " CCLD " $@;
+am__v_CCLD_1 =
+SOURCES = $(llvmopencl_la_SOURCES)
+DIST_SOURCES = $(llvmopencl_la_SOURCES)
+am__can_run_installinfo = \
+ case $$AM_UPDATE_INFO_DIR in \
+ n|no|NO) false;; \
+ *) (install-info --version) >/dev/null 2>&1;; \
+ esac
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+# Read a list of newline-separated strings from the standard input,
+# and print each of them once, without duplicates. Input order is
+# *not* preserved.
+am__uniquify_input = $(AWK) '\
+ BEGIN { nonempty = 0; } \
+ { items[$$0] = 1; nonempty = 1; } \
+ END { if (nonempty) { for (i in items) print i; }; } \
+'
+# Make sure the list of sources is unique. This is necessary because,
+# e.g., the same source file might be shared among _SOURCES variables
+# for different programs/libraries.
+am__define_uniq_tagged_files = \
+ list='$(am__tagged_files)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | $(am__uniquify_input)`
+ETAGS = etags
+CTAGS = ctags
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+AMTAR = @AMTAR@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AR = @AR@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+BOOST_CPPFLAGS = @BOOST_CPPFLAGS@
+BOOST_LDFLAGS = @BOOST_LDFLAGS@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CLANG = @CLANG@
+CLANGXX = @CLANGXX@
+CLFLAGS = @CLFLAGS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+CXXDEPMODE = @CXXDEPMODE@
+CXXFLAGS = @CXXFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+FGREP = @FGREP@
+GLEW_CFLAGS = @GLEW_CFLAGS@
+GLEW_LIBS = @GLEW_LIBS@
+GREP = @GREP@
+HOST = @HOST@
+HOST_CLANG_FLAGS = @HOST_CLANG_FLAGS@
+HOST_CPU = @HOST_CPU@
+HOST_LD_FLAGS = @HOST_LD_FLAGS@
+HOST_LLC_FLAGS = @HOST_LLC_FLAGS@
+HOST_SIZEOF_DOUBLE = @HOST_SIZEOF_DOUBLE@
+HOST_SIZEOF_HALF = @HOST_SIZEOF_HALF@
+HOST_SIZEOF_LONG = @HOST_SIZEOF_LONG@
+HOST_SIZEOF_VOID_P = @HOST_SIZEOF_VOID_P@
+HWLOC_CFLAGS = @HWLOC_CFLAGS@
+HWLOC_LIBS = @HWLOC_LIBS@
+ICD_LD_FLAGS = @ICD_LD_FLAGS@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+KERNEL_COMPILER_LIB_VERSION = @KERNEL_COMPILER_LIB_VERSION@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LD_FLAGS_BIN = @LD_FLAGS_BIN@
+LIBOBJS = @LIBOBJS@
+LIBRARY_SUFFIX = @LIBRARY_SUFFIX@
+LIBS = @LIBS@
+LIBSPE_CFLAGS = @LIBSPE_CFLAGS@
+LIBSPE_LIBS = @LIBSPE_LIBS@
+LIBTOOL = @LIBTOOL@
+LIB_VERSION = @LIB_VERSION@
+LIPO = @LIPO@
+LLC = @LLC@
+LLVM_AR = @LLVM_AR@
+LLVM_AS = @LLVM_AS@
+LLVM_CONFIG = @LLVM_CONFIG@
+LLVM_LINK = @LLVM_LINK@
+LLVM_OPT = @LLVM_OPT@
+LLVM_RANLIB = @LLVM_RANLIB@
+LLVM_VERSION = @LLVM_VERSION@
+LN_S = @LN_S@
+LTDL_LIBS = @LTDL_LIBS@
+LTLIBOBJS = @LTLIBOBJS@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OCL_ICD_CFLAGS = @OCL_ICD_CFLAGS@
+OCL_ICD_LIBS = @OCL_ICD_LIBS@
+OCL_KERNEL_TARGET = @OCL_KERNEL_TARGET@
+OCL_TARGETS = @OCL_TARGETS@
+OPENCL_CFLAGS = @OPENCL_CFLAGS@
+OPENCL_CMAKE = @OPENCL_CMAKE@
+OPENCL_EXTLIBS = @OPENCL_EXTLIBS@
+OPENCL_LIBS = @OPENCL_LIBS@
+OPT = @OPT@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+PKG_CONFIG = @PKG_CONFIG@
+PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
+PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
+POAT_TESTSUITES = @POAT_TESTSUITES@
+POCL_DEVICE_ADDRESS_BITS = @POCL_DEVICE_ADDRESS_BITS@
+PTHREAD_CC = @PTHREAD_CC@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
+PTHREAD_LIBS = @PTHREAD_LIBS@
+RANLIB = @RANLIB@
+SDL_CFLAGS = @SDL_CFLAGS@
+SDL_LIBS = @SDL_LIBS@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+STRIP = @STRIP@
+TARGET = @TARGET@
+TARGET_CLANG_FLAGS = @TARGET_CLANG_FLAGS@
+TARGET_CPU = @TARGET_CPU@
+TARGET_LLC_FLAGS = @TARGET_LLC_FLAGS@
+TARGET_SIZEOF_DOUBLE = @TARGET_SIZEOF_DOUBLE@
+TARGET_SIZEOF_HALF = @TARGET_SIZEOF_HALF@
+TARGET_SIZEOF_LONG = @TARGET_SIZEOF_LONG@
+TARGET_SIZEOF_VOID_P = @TARGET_SIZEOF_VOID_P@
+TCECC = @TCECC@
+TCE_AVAILABLE = @TCE_AVAILABLE@
+TCE_CONFIG = @TCE_CONFIG@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_CXX = @ac_ct_CXX@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+acx_pthread_config = @acx_pthread_config@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target = @target@
+target_alias = @target_alias@
+target_cpu = @target_cpu@
+target_os = @target_os@
+target_vendor = @target_vendor@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+pkglib_LTLIBRARIES = llvmopencl.la
+AM_CXXFLAGS = -I@top_srcdir@/fix-include -I@top_srcdir@/include `@LLVM_CONFIG@ --cxxflags`
+AM_LDFLAGS = -module -export-dynamic -version-info ${KERNEL_COMPILER_LIB_VERSION} `@LLVM_CONFIG@ --ldflags`
+llvmopencl_la_LIBADD = -lLLVM-${LLVM_VERSION}
+llvmopencl_la_SOURCES = Barrier.h \
+ BarrierBlock.h BarrierBlock.cc \
+ Kernel.h Kernel.cc \
+ ParallelRegion.h ParallelRegion.cc \
+ CanonicalizeBarriers.h CanonicalizeBarriers.cc \
+ LoopBarriers.h LoopBarriers.cc \
+ GenerateHeader.cc Workgroup.h Workgroup.cc \
+ BarrierTailReplication.h BarrierTailReplication.cc \
+ Flatten.cc IsolateRegions.h IsolateRegions.cc \
+ WorkitemReplication.h WorkitemReplication.cc \
+ ImplicitLoopBarriers.h ImplicitLoopBarriers.cc \
+ WorkItemAliasAnalysis.cc WIVectorize.cc \
+ WorkitemHandler.h WorkitemHandler.cc \
+ WorkitemLoops.h WorkitemLoops.cc \
+ PHIsToAllocas.h PHIsToAllocas.cc \
+ BreakConstantGEPs.h BreakConstantGEPs.cpp \
+ WorkitemHandlerChooser.h WorkitemHandlerChooser.cc \
+ AllocasToEntry.h AllocasToEntry.cc \
+ TargetAddressSpaces.h TargetAddressSpaces.cc \
+ LLVMUtils.cc LLVMUtils.h \
+ VariableUniformityAnalysis.h VariableUniformityAnalysis.cc
+
+all: all-am
+
+.SUFFIXES:
+.SUFFIXES: .cc .cpp .lo .o .obj
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
+ @for dep in $?; do \
+ case '$(am__configure_deps)' in \
+ *$$dep*) \
+ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+ && { if test -f $@; then exit 0; else break; fi; }; \
+ exit 1;; \
+ esac; \
+ done; \
+ echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign lib/llvmopencl/Makefile'; \
+ $(am__cd) $(top_srcdir) && \
+ $(AUTOMAKE) --foreign lib/llvmopencl/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+ @case '$?' in \
+ *config.status*) \
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+ *) \
+ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+ esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: $(am__configure_deps)
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+install-pkglibLTLIBRARIES: $(pkglib_LTLIBRARIES)
+ @$(NORMAL_INSTALL)
+ @list='$(pkglib_LTLIBRARIES)'; test -n "$(pkglibdir)" || list=; \
+ list2=; for p in $$list; do \
+ if test -f $$p; then \
+ list2="$$list2 $$p"; \
+ else :; fi; \
+ done; \
+ test -z "$$list2" || { \
+ echo " $(MKDIR_P) '$(DESTDIR)$(pkglibdir)'"; \
+ $(MKDIR_P) "$(DESTDIR)$(pkglibdir)" || exit 1; \
+ echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(pkglibdir)'"; \
+ $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(pkglibdir)"; \
+ }
+
+uninstall-pkglibLTLIBRARIES:
+ @$(NORMAL_UNINSTALL)
+ @list='$(pkglib_LTLIBRARIES)'; test -n "$(pkglibdir)" || list=; \
+ for p in $$list; do \
+ $(am__strip_dir) \
+ echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(pkglibdir)/$$f'"; \
+ $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(pkglibdir)/$$f"; \
+ done
+
+clean-pkglibLTLIBRARIES:
+ -test -z "$(pkglib_LTLIBRARIES)" || rm -f $(pkglib_LTLIBRARIES)
+ @list='$(pkglib_LTLIBRARIES)'; \
+ locs=`for p in $$list; do echo $$p; done | \
+ sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
+ sort -u`; \
+ test -z "$$locs" || { \
+ echo rm -f $${locs}; \
+ rm -f $${locs}; \
+ }
+
+llvmopencl.la: $(llvmopencl_la_OBJECTS) $(llvmopencl_la_DEPENDENCIES) $(EXTRA_llvmopencl_la_DEPENDENCIES)
+ $(AM_V_CXXLD)$(CXXLINK) -rpath $(pkglibdir) $(llvmopencl_la_OBJECTS) $(llvmopencl_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+ -rm -f *.$(OBJEXT)
+
+distclean-compile:
+ -rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/AllocasToEntry.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/BarrierBlock.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/BarrierTailReplication.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/BreakConstantGEPs.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/CanonicalizeBarriers.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Flatten.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/GenerateHeader.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ImplicitLoopBarriers.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/IsolateRegions.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Kernel.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/LLVMUtils.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/LoopBarriers.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/PHIsToAllocas.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ParallelRegion.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/TargetAddressSpaces.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/VariableUniformityAnalysis.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/WIVectorize.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/WorkItemAliasAnalysis.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Workgroup.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/WorkitemHandler.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/WorkitemHandlerChooser.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/WorkitemLoops.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/WorkitemReplication.Plo@am__quote@
+
+.cc.o:
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $<
+
+.cc.obj:
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+.cc.lo:
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $<
+
+.cpp.o:
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $<
+
+.cpp.obj:
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+.cpp.lo:
+@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+ -rm -f *.lo
+
+clean-libtool:
+ -rm -rf .libs _libs
+
+ID: $(am__tagged_files)
+ $(am__define_uniq_tagged_files); mkid -fID $$unique
+tags: tags-am
+TAGS: tags
+
+tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+ set x; \
+ here=`pwd`; \
+ $(am__define_uniq_tagged_files); \
+ shift; \
+ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+ test -n "$$unique" || unique=$$empty_fix; \
+ if test $$# -gt 0; then \
+ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+ "$$@" $$unique; \
+ else \
+ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+ $$unique; \
+ fi; \
+ fi
+ctags: ctags-am
+
+CTAGS: ctags
+ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+ $(am__define_uniq_tagged_files); \
+ test -z "$(CTAGS_ARGS)$$unique" \
+ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+ $$unique
+
+GTAGS:
+ here=`$(am__cd) $(top_builddir) && pwd` \
+ && $(am__cd) $(top_srcdir) \
+ && gtags -i $(GTAGS_ARGS) "$$here"
+cscopelist: cscopelist-am
+
+cscopelist-am: $(am__tagged_files)
+ list='$(am__tagged_files)'; \
+ case "$(srcdir)" in \
+ [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
+ *) sdir=$(subdir)/$(srcdir) ;; \
+ esac; \
+ for i in $$list; do \
+ if test -f "$$i"; then \
+ echo "$(subdir)/$$i"; \
+ else \
+ echo "$$sdir/$$i"; \
+ fi; \
+ done >> $(top_builddir)/cscope.files
+
+distclean-tags:
+ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+ list='$(DISTFILES)'; \
+ dist_files=`for file in $$list; do echo $$file; done | \
+ sed -e "s|^$$srcdirstrip/||;t" \
+ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+ case $$dist_files in \
+ */*) $(MKDIR_P) `echo "$$dist_files" | \
+ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+ sort -u` ;; \
+ esac; \
+ for file in $$dist_files; do \
+ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+ if test -d $$d/$$file; then \
+ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+ if test -d "$(distdir)/$$file"; then \
+ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+ fi; \
+ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+ fi; \
+ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+ else \
+ test -f "$(distdir)/$$file" \
+ || cp -p $$d/$$file "$(distdir)/$$file" \
+ || exit 1; \
+ fi; \
+ done
+check-am: all-am
+check: check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+ for dir in "$(DESTDIR)$(pkglibdir)"; do \
+ test -z "$$dir" || $(MKDIR_P) "$$dir"; \
+ done
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+ if test -z '$(STRIP)'; then \
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+ install; \
+ else \
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+ fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+ @echo "This command is intended for maintainers to use"
+ @echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-pkglibLTLIBRARIES \
+ mostlyclean-am
+
+distclean: distclean-am
+ -rm -rf ./$(DEPDIR)
+ -rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+ distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am: install-pkglibLTLIBRARIES
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+ -rm -rf ./$(DEPDIR)
+ -rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+ mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am: uninstall-pkglibLTLIBRARIES
+
+.MAKE: install-am install-strip
+
+.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \
+ clean-libtool clean-pkglibLTLIBRARIES cscopelist-am ctags \
+ ctags-am distclean distclean-compile distclean-generic \
+ distclean-libtool distclean-tags distdir dvi dvi-am html \
+ html-am info info-am install install-am install-data \
+ install-data-am install-dvi install-dvi-am install-exec \
+ install-exec-am install-html install-html-am install-info \
+ install-info-am install-man install-pdf install-pdf-am \
+ install-pkglibLTLIBRARIES install-ps install-ps-am \
+ install-strip installcheck installcheck-am installdirs \
+ maintainer-clean maintainer-clean-generic mostlyclean \
+ mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
+ pdf pdf-am ps ps-am tags tags-am uninstall uninstall-am \
+ uninstall-pkglibLTLIBRARIES
+
+
+#llvmopencl_la_LIBADD += @LIBS_LLVMTRANSFORMUTILS@
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/src/llvmopencl/PHIsToAllocas.cc b/src/llvmopencl/PHIsToAllocas.cc
new file mode 100644
index 0000000..a414412
--- /dev/null
+++ b/src/llvmopencl/PHIsToAllocas.cc
@@ -0,0 +1,144 @@
+// LLVM function pass to convert all PHIs to allocas.
+//
+// Copyright (c) 2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "PHIsToAllocas.h"
+#include "Workgroup.h"
+#include "WorkitemHandlerChooser.h"
+#include "WorkitemLoops.h"
+
+#include "config.h"
+
+#ifdef LLVM_3_1
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/TypeBuilder.h"
+#elif defined LLVM_3_2
+#include "llvm/IRBuilder.h"
+#include "llvm/TypeBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/TypeBuilder.h"
+#endif
+
+namespace {
+ // Static registration object: makes the pass known to LLVM under the
+ // argument name "phistoallocas" with the description given below.
+ static
+ llvm::RegisterPass<pocl::PHIsToAllocas> X(
+ "phistoallocas", "Convert all PHI nodes to allocas");
+}
+
+namespace pocl {
+
+// Definition of the static pass identifier declared in PHIsToAllocas.h.
+char PHIsToAllocas::ID = 0;
+
+using namespace llvm;
+
+/**
+ * Declares the analysis dependencies of this pass.
+ *
+ * WorkitemHandlerChooser is registered both as required and as preserved.
+ *
+ * @param AU Analysis usage record to fill in.
+ */
+void
+PHIsToAllocas::getAnalysisUsage(AnalysisUsage &AU) const
+{
+ AU.addRequired<pocl::WorkitemHandlerChooser>();
+ AU.addPreserved<pocl::WorkitemHandlerChooser>();
+}
+
+/**
+ * Replaces every PHI node of a kernel function with an alloca-backed
+ * load/store sequence.
+ *
+ * The pass is a no-op for functions rejected by
+ * Workgroup::isKernelToProcess() and whenever the chosen work-item
+ * handler is not POCL_WIH_LOOPS.
+ *
+ * @param F Function to transform.
+ * @return true if at least one PHI node was converted, false otherwise.
+ */
+bool
+PHIsToAllocas::runOnFunction(Function &F)
+{
+  if (!Workgroup::isKernelToProcess(F))
+    return false;
+
+  /* The conversion only pays off together with the work-item loops; for
+     the full replication method it leads to worse code without benefits. */
+  const bool usingLoops =
+    getAnalysis<pocl::WorkitemHandlerChooser>().chosenHandler() ==
+    pocl::WorkitemHandlerChooser::POCL_WIH_LOOPS;
+  if (!usingLoops)
+    return false;
+
+  /* Collect the PHIs up front: converting one erases it, which must not
+     happen while the instruction lists are being walked. */
+  typedef std::vector<llvm::PHINode *> PHIVec;
+  PHIVec pending;
+
+  for (Function::iterator block = F.begin(); block != F.end(); ++block) {
+    for (BasicBlock::iterator inst = block->begin();
+         inst != block->end(); ++inst) {
+      if (PHINode *phi = dyn_cast<PHINode>(&*inst))
+        pending.push_back(phi);
+    }
+  }
+
+  for (PHIVec::iterator it = pending.begin(); it != pending.end(); ++it)
+    BreakPHIToAllocas(*it);
+
+  // Something changed exactly when there was at least one PHI to convert.
+  return !pending.empty();
+}
+
+/**
+ * Converts a PHI node into a load from a stack slot and turns each of its
+ * incoming values into a store to that same slot.
+ *
+ * This is used to fix context save/restore issues with regions that have
+ * PHI nodes in their entry node (usually caused by work group scope
+ * variables such as B-loop iteration variables). With PHI nodes at a region
+ * entry the context restore code cannot simply be inserted, because the
+ * restore code consists of non-PHI instructions and no such instructions
+ * may precede PHIs. In addition, when the PHI node is at a region entry
+ * (e.g. a B-loop), adding new basic blocks before it would break the
+ * assumption of single entry regions.
+ *
+ * @param phi The PHI node to replace. It is erased from its parent block.
+ * @return The load instruction that replaced all uses of the PHI.
+ */
+llvm::Instruction *
+PHIsToAllocas::BreakPHIToAllocas(PHINode* phi)
+{
+  llvm::Function *parentFunction = phi->getParent()->getParent();
+  const std::string slotName = std::string(phi->getName().str()) + ".ex_phi";
+
+  // The stack slot is created at the first insertion point of the
+  // function's entry block.
+  IRBuilder<> irb(parentFunction->getEntryBlock().getFirstInsertionPt());
+  llvm::Instruction *slot = irb.CreateAlloca(phi->getType(), 0, slotName);
+
+  // Each incoming edge writes its value to the slot just before the
+  // terminator of the incoming block.
+  const unsigned incomingCount = phi->getNumIncomingValues();
+  for (unsigned idx = 0; idx < incomingCount; ++idx) {
+    Value *incomingValue = phi->getIncomingValue(idx);
+    BasicBlock *incomingBlock = phi->getIncomingBlock(idx);
+    irb.SetInsertPoint(incomingBlock->getTerminator());
+    irb.CreateStore(incomingValue, slot);
+  }
+
+  // The PHI itself becomes a load placed where the PHI used to be.
+  irb.SetInsertPoint(phi);
+  llvm::Instruction *replacement = irb.CreateLoad(slot);
+  phi->replaceAllUsesWith(replacement);
+  phi->eraseFromParent();
+  return replacement;
+}
+
+
+}
diff --git a/src/llvmopencl/PHIsToAllocas.h b/src/llvmopencl/PHIsToAllocas.h
new file mode 100644
index 0000000..819dcfc
--- /dev/null
+++ b/src/llvmopencl/PHIsToAllocas.h
@@ -0,0 +1,56 @@
+// Header for PHIsToAllocas function pass.
+//
+// Copyright (c) 2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_PHIS_TO_ALLOCAS_H
+#define _POCL_PHIS_TO_ALLOCAS_H
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Function.h"
+#else
+#include "llvm/IR/Function.h"
+#endif
+#include "llvm/Pass.h"
+
+namespace llvm {
+ class Instruction;
+ class PHINode;
+}
+
+namespace pocl {
+ class Workgroup;
+
+ /**
+  * LLVM function pass that converts PHI nodes to allocas.
+  */
+ class PHIsToAllocas : public llvm::FunctionPass {
+ public:
+ // Pass identifier; handed to the llvm::FunctionPass constructor below.
+ static char ID;
+
+ PHIsToAllocas() : llvm::FunctionPass(ID) {}
+
+ // Standard FunctionPass hooks: analysis dependencies and the transform.
+ virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+ virtual bool runOnFunction(llvm::Function &F);
+
+ // Converts a single PHI node and returns the instruction that replaces it.
+ llvm::Instruction *BreakPHIToAllocas(llvm::PHINode* phi);
+
+ };
+}
+
+#endif
diff --git a/src/llvmopencl/ParallelRegion.cc b/src/llvmopencl/ParallelRegion.cc
new file mode 100644
index 0000000..72d89c1
--- /dev/null
+++ b/src/llvmopencl/ParallelRegion.cc
@@ -0,0 +1,809 @@
+// Class definition for parallel regions, a group of BasicBlocks that
+// each kernel should run in parallel.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos and
+// 2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "ParallelRegion.h"
+#include "Barrier.h"
+#include "Kernel.h"
+#include "config.h"
+#ifdef LLVM_3_1
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/ValueSymbolTable.h"
+#elif defined LLVM_3_2
+#include "llvm/IRBuilder.h"
+#include "llvm/ValueSymbolTable.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#endif
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+#include <set>
+#include <sstream>
+#include <map>
+#include <algorithm>
+
+using namespace std;
+using namespace llvm;
+using namespace pocl;
+
+//#define DEBUG_REMAP
+//#define DEBUG_REPLICATE
+//#define DEBUG_PURGE
+
+#include <iostream>
+
+// Class-wide counter used by the constructor to hand out region ids when
+// no id is forced by the caller.
+int ParallelRegion::idGen = 0;
+
+
+/**
+ * Creates an empty parallel region.
+ *
+ * @param forcedRegionId Id to assign to the region. Passing -1 draws the
+ *        next id from the class-wide idGen counter instead.
+ */
+ParallelRegion::ParallelRegion(int forcedRegionId) :
+  std::vector<llvm::BasicBlock *>(),
+  LocalIDXLoadInstr(NULL), LocalIDYLoadInstr(NULL), LocalIDZLoadInstr(NULL),
+  exitIndex_(0), entryIndex_(0),
+  pRegionId(forcedRegionId == -1 ? idGen++ : forcedRegionId)
+{
+}
+
+/**
+ * Gives a name to every unnamed instruction of the block whose value is
+ * used outside of the block, so that the values are replicated and renamed
+ * correctly.
+ *
+ * Names have the form ".pocl_temp.<n>", where <n> is the smallest
+ * non-negative number for which the name is not yet present in the value
+ * symbol table of the enclosing function.
+ *
+ * @param bb Basic block whose instructions are inspected.
+ */
+void
+ParallelRegion::GenerateTempNames(llvm::BasicBlock *bb)
+{
+  for (llvm::BasicBlock::iterator it = bb->begin(), last = bb->end();
+       it != last; ++it)
+    {
+      llvm::Instruction *inst = &*it;
+
+      // Named values and values local to the block need no treatment.
+      if (inst->hasName() || !inst->isUsedOutsideOfBlock(bb))
+        continue;
+
+      // Probe candidate names until one is free in the symbol table.
+      std::string candidate = "";
+      for (int n = 0; ; ++n) {
+        std::ostringstream os;
+        os << ".pocl_temp." << n;
+        candidate = os.str();
+        if (bb->getParent()->getValueSymbolTable().lookup(candidate) == NULL)
+          break;
+      }
+      inst->setName(candidate);
+    }
+}
+
+// BarrierBlock *
+// ParallelRegion::getEntryBarrier()
+// {
+// BasicBlock *entry = front();
+// BasicBlock *barrier = entry->getSinglePredecessor();
+
+// return cast<BarrierBlock> (barrier);
+// }
+
+/**
+ * Clones every basic block of this region into a newly allocated region.
+ *
+ * The new region gets the same region id and the same entry/exit indices
+ * as this one, and its instructions are remapped through `map` before
+ * returning.
+ *
+ * @param map    Value map filled by the cloning; each original block is
+ *               also mapped to its clone.
+ * @param suffix Suffix passed on to CloneBasicBlock for the clone names.
+ * @return The heap-allocated clone region (allocated with `new` here).
+ */
+ParallelRegion *
+ParallelRegion::replicate(ValueToValueMapTy &map,
+ const Twine &suffix = "")
+{
+ ParallelRegion *new_region = new ParallelRegion(pRegionId);
+
+ /* Because ParallelRegions are all replicated before they
+ are attached to the function, it can happen that
+ the same BB is replicated multiple times and it gets
+ the same name (only the BB name will be autorenamed
+ by LLVM). This causes the variable references to become
+ broken. This hack ensures the BB suffixes are unique
+ before cloning so each path gets their own value
+ names. Split points can be such paths.*/
+ // Function-local static: the counts persist across all calls.
+ static std::map<std::string, int> cloneCounts;
+
+ for (iterator i = begin(), e = end(); i != e; ++i) {
+ BasicBlock *block = *i;
+ GenerateTempNames(block);
+ std::ostringstream suf;
+ suf << suffix.str();
+ // Key for the clone counter: "<block name>.<suffix>".
+ std::string block_name = block->getName().str() + "." + suffix.str();
+ // From the second clone of the same key on, append ".pocl_<count>".
+ if (cloneCounts[block_name] > 0)
+ {
+ suf << ".pocl_" << cloneCounts[block_name];
+ }
+ BasicBlock *new_block = CloneBasicBlock(block, map, suf.str());
+ cloneCounts[block_name]++;
+ // Insert the block itself into the map.
+ map[block] = new_block;
+ new_region->push_back(new_block);
+
+#ifdef DEBUG_REPLICATE
+ std::cerr << "### clonee block:" << std::endl;
+ block->dump();
+ std::cerr << endl << "### cloned block: " << std::endl;
+ new_block->dump();
+#endif
+ }
+
+ new_region->exitIndex_ = exitIndex_;
+ new_region->entryIndex_ = entryIndex_;
+ /* Remap here to get local variables fixed before they
+ are (possibly) overwritten by another clone of the
+ same BB. */
+ new_region->remap(map);
+
+ // NOTE(review): Verify() and LocalizeIDLoads() below are invoked on this
+ // (the original) region, not on new_region -- confirm that is intended.
+#ifdef DEBUG_REPLICATE
+ Verify();
+#endif
+ LocalizeIDLoads();
+
+ return new_region;
+}
+
+/**
+ * Rewrites the operands of every instruction in the region according to
+ * the given value map.
+ *
+ * Values without an entry in the map are left untouched
+ * (RF_IgnoreMissingEntries) and no module-level changes are made
+ * (RF_NoModuleLevelChanges).
+ *
+ * @param map Mapping from original values to their replacements.
+ */
+void
+ParallelRegion::remap(ValueToValueMapTy &map)
+{
+  for (iterator bbIt = begin(), bbEnd = end(); bbIt != bbEnd; ++bbIt) {
+    BasicBlock *block = *bbIt;
+
+#ifdef DEBUG_REMAP
+    std::cerr << "### block before remap:" << std::endl;
+    block->dump();
+#endif
+
+    for (BasicBlock::iterator inst = block->begin(), instEnd = block->end();
+         inst != instEnd; ++inst) {
+      RemapInstruction(inst, map,
+                       RF_IgnoreMissingEntries | RF_NoModuleLevelChanges);
+    }
+
+#ifdef DEBUG_REMAP
+    std::cerr << endl << "### block after remap: " << std::endl;
+    block->dump();
+#endif
+  }
+}
+
+/**
+ * Links this region into the control flow directly after `region`.
+ *
+ * The blocks of this region are inserted into the block list of the
+ * function that holds the successor of `region`'s tail block. Successor 0
+ * of the tail's terminator is then redirected to this region's entry
+ * block, and successor 0 of this region's exit terminator is set to the
+ * tail's former successor.
+ *
+ * @param region Region after which this region is chained.
+ */
+void
+ParallelRegion::chainAfter(ParallelRegion *region)
+{
+ /* If we are replicating a conditional barrier
+ region, the last block can be an unreachable
+ block to mark the impossible path. Skip
+ it and choose the correct branch instead.
+
+ TODO: why have the unreachable block there the
+ first place? Could we just not add it and fix
+ the branch? */
+ BasicBlock *tail = region->exitBB();
+ TerminatorInst *t = tail->getTerminator();
+ if (isa<UnreachableInst>(t))
+ {
+ // Falls back to the second-to-last block of the region vector.
+ tail = region->at(region->size() - 2);
+ t = tail->getTerminator();
+ }
+ // Dump diagnostics before the assert fires so the failure is debuggable.
+ if (t->getNumSuccessors() != 1)
+ {
+ std::cout << "!!! trying to chain region" << std::endl;
+ this->dumpNames();
+ std::cout << "!!! after region" << std::endl;
+ region->dumpNames();
+ t->getParent()->dump();
+
+ assert (t->getNumSuccessors() == 1);
+ }
+
+ BasicBlock *successor = t->getSuccessor(0);
+ Function::BasicBlockListType &bb_list =
+ successor->getParent()->getBasicBlockList();
+
+ // NOTE(review): every block is inserted directly after `tail`, so the
+ // blocks presumably end up in reverse vector order in the function's
+ // block list -- confirm this is intended.
+ for (iterator i = begin(), e = end(); i != e; ++i)
+ bb_list.insertAfter(tail, *i);
+
+ t->setSuccessor(0, entryBB());
+
+ // Reconnect this region's exit to where the tail used to branch.
+ t = exitBB()->getTerminator();
+ assert (t->getNumSuccessors() == 1);
+ t->setSuccessor(0, successor);
+}
+
+/**
+ * Cuts every edge that leaves the region from a block other than the exit
+ * block.
+ *
+ * For each non-exit block, every terminator successor that is not a member
+ * of this region is redirected to a freshly created block that contains
+ * only an `unreachable` instruction. The new blocks are appended to the
+ * region once the scan is complete.
+ */
+void
+ParallelRegion::purge()
+{
+  SmallVector<BasicBlock *, 4> new_blocks;
+
+  for (iterator i = begin(), e = end(); i != e; ++i) {
+
+    // Exit block has a successor out of the region.
+    if (*i == exitBB())
+      continue;
+
+#ifdef DEBUG_PURGE
+    std::cerr << "### block before purge:" << std::endl;
+    (*i)->dump();
+#endif
+    TerminatorInst *t = (*i)->getTerminator();
+    for (unsigned ii = 0, ee = t->getNumSuccessors(); ii != ee; ++ii) {
+      BasicBlock *successor = t->getSuccessor(ii);
+      if (count(begin(), end(), successor) != 0)
+        continue;
+
+      // This successor is not on the parallel region, purge.
+      //
+      // The replacement block is placed before the next block of the
+      // region. When the current block is the last one there is no next
+      // block: dereferencing end() would be undefined behaviour, so NULL
+      // is passed instead, which makes BasicBlock::Create append the new
+      // block to the end of the function.
+      iterator next_block = i;
+      ++next_block;
+      BasicBlock *insert_before = NULL;
+      if (next_block != e)
+        insert_before = *next_block;
+      assert ((*i)->getParent() != NULL);
+      assert (next_block == e || insert_before != NULL);
+
+      BasicBlock *unreachable =
+        BasicBlock::Create((*i)->getContext(),
+                           (*i)->getName() + ".unreachable",
+                           (*i)->getParent(),
+                           insert_before);
+      new UnreachableInst(unreachable->getContext(),
+                          unreachable);
+      t->setSuccessor(ii, unreachable);
+      new_blocks.push_back(unreachable);
+    }
+#ifdef DEBUG_PURGE
+    std::cerr << std::endl << "### block after purge:" << std::endl;
+    (*i)->dump();
+#endif
+  }
+
+  // Add the new "unreachable" blocks to the
+  // region. We cannot do in the loop as it
+  // corrupts iterators.
+  insert(end(), new_blocks.begin(), new_blocks.end());
+}
+
+/**
+ * Emits stores that set the local id globals to the given values.
+ *
+ * The stores are created at the first insertion point of `entry`. The
+ * stored constants are 64 bits wide when the module's pointer size is
+ * Module::Pointer64 and 32 bits wide otherwise. A global that does not
+ * exist in the module is skipped.
+ *
+ * @param entry Block that receives the stores.
+ * @param x     Value stored to the POCL_LOCAL_ID_X_GLOBAL variable.
+ * @param y     Value stored to the POCL_LOCAL_ID_Y_GLOBAL variable.
+ * @param z     Value stored to the POCL_LOCAL_ID_Z_GLOBAL variable.
+ */
+void
+ParallelRegion::insertLocalIdInit(llvm::BasicBlock* entry,
+                                  unsigned x,
+                                  unsigned y,
+                                  unsigned z)
+{
+  IRBuilder<> irb(entry, entry->getFirstInsertionPt());
+
+  Module *module = entry->getParent()->getParent();
+
+  const int idBits =
+    (module->getPointerSize() == llvm::Module::Pointer64) ? 64 : 32;
+  IntegerType *idType = IntegerType::get(module->getContext(), idBits);
+
+  if (GlobalVariable *idX = module->getGlobalVariable(POCL_LOCAL_ID_X_GLOBAL))
+    irb.CreateStore(ConstantInt::get(idType, x), idX);
+
+  if (GlobalVariable *idY = module->getGlobalVariable(POCL_LOCAL_ID_Y_GLOBAL))
+    irb.CreateStore(ConstantInt::get(idType, y), idY);
+
+  if (GlobalVariable *idZ = module->getGlobalVariable(POCL_LOCAL_ID_Z_GLOBAL))
+    irb.CreateStore(ConstantInt::get(idType, z), idZ);
+}
+
+/**
+ * Inserts the local id initialization for (x, y, z) into the entry block
+ * of this region.
+ */
+void
+ParallelRegion::insertPrologue(unsigned x,
+                               unsigned y,
+                               unsigned z)
+{
+  ParallelRegion::insertLocalIdInit(entryBB(), x, y, z);
+}
+
+/**
+ * Dumps every basic block of the region, in region order.
+ */
+void
+ParallelRegion::dump()
+{
+  for (iterator it = begin(), stop = end(); it != stop; ++it) {
+    BasicBlock *block = *it;
+    block->dump();
+  }
+}
+
+/**
+ * Prints the names of the region's blocks on one line of standard output.
+ *
+ * The entry block is tagged with "(EN)" and the exit block with "(EX)".
+ */
+void
+ParallelRegion::dumpNames()
+{
+  for (iterator it = begin(), stop = end(); it != stop; ++it) {
+    BasicBlock *block = *it;
+    std::cout << block->getName().str();
+    if (block == entryBB())
+      std::cout << "(EN)";
+    if (block == exitBB())
+      std::cout << "(EX)";
+    std::cout << " ";
+  }
+  std::cout << std::endl;
+}
+
+/**
+ * Builds a new parallel region from a set of basic blocks.
+ *
+ * The blocks are stored in the order in which they appear in the function
+ * that contains `entry`. After the blocks are collected, the region's id
+ * loads are localized and (in builds with assertions) the region is
+ * verified.
+ *
+ * @param bbs   Blocks that make up the region.
+ * @param entry Entry block of the region; must not be NULL.
+ * @param exit  Exit block of the region; must not be NULL. May be the same
+ *              block as `entry`.
+ * @return The heap-allocated region (allocated with `new` here).
+ */
+ParallelRegion *
+ParallelRegion::Create(const SmallPtrSet<BasicBlock *, 8>& bbs, BasicBlock *entry, BasicBlock *exit)
+{
+  ParallelRegion *new_region = new ParallelRegion();
+
+  assert (entry != NULL);
+  assert (exit != NULL);
+
+  // Walk the function rather than the set so the order of the vector
+  // is the same as original function order.
+  Function *F = entry->getParent();
+  for (Function::iterator i = F->begin(), e = F->end(); i != e; ++i) {
+    BasicBlock *b = &*i;
+    if (!bbs.count(b))
+      continue;
+
+    new_region->push_back(b);
+
+    // The two checks are independent on purpose: when entry and exit are
+    // the same block, both indices must point at it.
+    if (b == entry)
+      new_region->setEntryBBIndex(new_region->size() - 1);
+    if (b == exit)
+      new_region->setExitBBIndex(new_region->size() - 1);
+  }
+
+  new_region->LocalizeIDLoads();
+
+  assert(new_region->Verify());
+
+  return new_region;
+}
+
+/**
+ * Checks the structural invariants of the region.
+ *
+ * Conditions checked:
+ * 1) Edges coming from outside the region may only target the entry
+ *    block, and only from blocks that contain a barrier.
+ * 2) The terminator of the exit block has exactly one successor
+ *    (other outgoing edges are expected to be purged in replicas).
+ * 3) No block of the region contains a barrier instruction.
+ *
+ * @return true if all conditions hold. On a violation an assertion
+ *         fires; with assertions disabled false is returned.
+ */
+bool
+ParallelRegion::Verify()
+{
+  int entry_edges = 0;
+
+  for (iterator blockIt = begin(), blockEnd = end();
+       blockIt != blockEnd; ++blockIt) {
+    BasicBlock *block = *blockIt;
+
+    for (pred_iterator predIt(block), predEnd(block, true);
+         predIt != predEnd; ++predIt) {
+      BasicBlock *pred = *predIt;
+
+      /* Predecessors inside the region are always fine. */
+      if (count(begin(), end(), pred) != 0)
+        continue;
+
+      if (block != entryBB()) {
+        dumpNames();
+        std::cerr << "suspicious block: " << block->getName().str() << std::endl;
+        std::cerr << "the entry is: " << entryBB()->getName().str() << std::endl;
+
+#if 0
+        block->getParent()->viewCFG();
+#endif
+        assert(0 && "Incoming edges to non-entry block!");
+        return false;
+      }
+
+      if (!Barrier::hasBarrier(pred)) {
+        block->getParent()->viewCFG();
+        assert (0 && "Entry has edges from non-barrier blocks!");
+        return false;
+      }
+
+      ++entry_edges;
+    }
+
+    // if (entry_edges != 1) {
+    //   assert(0 && "Parallel regions must be single entry!");
+    //   return false;
+    // }
+
+    if (exitBB()->getTerminator()->getNumSuccessors() != 1) {
+      assert(0 && "Multiple outgoing edges from exit block!");
+      return false;
+    }
+
+    for (BasicBlock::iterator instIt = block->begin(),
+           instEnd = block->end();
+         instIt != instEnd; ++instIt) {
+      if (isa<Barrier> (instIt)) {
+        assert(0 && "Barrier found inside parallel region!");
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+/**
+ * Tags every instruction of the region that may read or write memory
+ * with "llvm.mem.parallel_loop_access" metadata.
+ *
+ * If an instruction already carries such metadata (nested parallel
+ * loops), the existing loop references are kept and the given
+ * identifier is appended to them.
+ *
+ * Format:
+ * llvm.mem.parallel_loop_access !0
+ *
+ * !0 { metadata !0 }
+ *
+ * In a 2-nested loop:
+ *
+ * llvm.mem.parallel_loop_access !0
+ *
+ * !0 { metadata !1, metadata !2}
+ * !1 { metadata !1 }
+ * !2 { metadata !2 }
+ */
+void
+ParallelRegion::AddParallelLoopMetadata(llvm::MDNode *identifier) {
+
+  for (iterator blockIt = begin(), blockEnd = end();
+       blockIt != blockEnd; ++blockIt) {
+    BasicBlock* bb = *blockIt;
+
+    for (BasicBlock::iterator inst = bb->begin(), instEnd = bb->end();
+         inst != instEnd; ++inst) {
+      if (!inst->mayReadOrWriteMemory())
+        continue;
+
+      /* Start from the loop ids that are already attached, if any. */
+      std::vector<Value*> loopIds;
+      MDNode *previous = inst->getMetadata("llvm.mem.parallel_loop_access");
+      if (previous != NULL) {
+        const unsigned operandCount = previous->getNumOperands();
+        for (unsigned op = 0; op < operandCount; ++op)
+          loopIds.push_back(previous->getOperand(op));
+      }
+      loopIds.push_back(identifier);
+
+      MDNode *merged = MDNode::get(bb->getContext(), loopIds);
+      inst->setMetadata("llvm.mem.parallel_loop_access", merged);
+    }
+  }
+}
+
+/**
+ * Attaches work-item identification metadata to every instruction
+ * of the region.
+ *
+ * Each instruction gets a shared "wi" node that holds the region id and
+ * the (x, y, z) coordinates, and its own "wi_counter" node that holds a
+ * running instruction number starting from 1.
+ */
+void
+ParallelRegion::AddIDMetadata(
+    llvm::LLVMContext& context,
+    std::size_t x,
+    std::size_t y,
+    std::size_t z) {
+
+  IntegerType *int32Ty = Type::getInt32Ty(context);
+
+  Value *regionFields[] = {
+    MDString::get(context, "WI_region"),
+    ConstantInt::get(int32Ty, pRegionId)};
+  MDNode *regionNode = MDNode::get(context, regionFields);
+
+  Value *xyzFields[] = {
+    MDString::get(context, "WI_xyz"),
+    ConstantInt::get(int32Ty, x),
+    ConstantInt::get(int32Ty, y),
+    ConstantInt::get(int32Ty, z)};
+  MDNode *xyzNode = MDNode::get(context, xyzFields);
+
+  Value *dataFields[] = {
+    MDString::get(context, "WI_data"),
+    regionNode,
+    xyzNode};
+  MDNode *dataNode = MDNode::get(context, dataFields);
+
+  int nextNumber = 1;
+  for (iterator blockIt = begin(), blockEnd = end();
+       blockIt != blockEnd; ++blockIt) {
+    BasicBlock *bb = *blockIt;
+    for (BasicBlock::iterator inst = bb->begin();
+         inst != bb->end(); ++inst) {
+      Value *counterFields[] = {
+        MDString::get(context, "WI_counter"),
+        ConstantInt::get(int32Ty, nextNumber)};
+      MDNode *counterNode = MDNode::get(context, counterFields);
+      ++nextNumber;
+
+      inst->setMetadata("wi", dataNode);
+      inst->setMetadata("wi_counter", counterNode);
+    }
+  }
+}
+
+
+/**
+ * Inserts a new basic block to the region, before an old basic block in
+ * the region.
+ *
+ * Assumes the inserted block to be before the other block in control
+ * flow, that is, there should be direct CFG edge from the block to the
+ * other.
+ *
+ * The exit index keeps referring to the old exit block. The entry index
+ * keeps referring to the old entry block, except when 'before' is the
+ * entry block itself: then the new block takes over as the entry.
+ *
+ * @param block  The block to add to the region.
+ * @param before A block that already is in the region.
+ */
+void
+ParallelRegion::AddBlockBefore(llvm::BasicBlock *block, llvm::BasicBlock *before)
+{
+  ParallelRegion::iterator beforePos = find(begin(), end(), before);
+  assert (beforePos != end());
+  if (beforePos == end()) return;
+
+  const std::size_t insertIndex = beforePos - begin();
+
+  /* Inserting at insertIndex pushes every block at that index or later
+     one position further. That includes the exit block also when the
+     new block is inserted directly in front of it, hence '<='. */
+  if (insertIndex <= exitIndex_) ++exitIndex_;
+
+  /* The entry is pushed further only when the insertion point is
+     strictly before it. In case the 'before' block was the old entry
+     node, the new one replaces it as an entry node at the same index
+     and the old one gets pushed forward. */
+  if (insertIndex < entryIndex_) ++entryIndex_;
+
+  insert(beforePos, block);
+}
+
+
+/**
+ * Inserts a new basic block to the region, right after an old basic
+ * block in the region vector.
+ *
+ * The entry and exit indices keep referring to the same blocks as
+ * before the insertion.
+ *
+ * @param block The block to add to the region.
+ * @param after A block that already is in the region.
+ */
+void
+ParallelRegion::AddBlockAfter(llvm::BasicBlock *block, llvm::BasicBlock *after)
+{
+  ParallelRegion::iterator afterPos = find(begin(), end(), after);
+  assert (afterPos != end());
+  if (afterPos == end()) return;
+
+  /* The new block lands one position after 'after'. Every block at
+     that index or later is pushed one position further, so the entry
+     and exit indices have to follow if they are among those. */
+  const std::size_t insertIndex = (afterPos - begin()) + 1;
+  if (insertIndex <= exitIndex_) ++exitIndex_;
+  if (insertIndex <= entryIndex_) ++entryIndex_;
+
+  insert(afterPos + 1, block);
+}
+
+/** Returns true if the given basic block belongs to this region. */
+bool
+ParallelRegion::HasBlock(llvm::BasicBlock *bb)
+{
+  const iterator last = end();
+  return find(begin(), last, bb) != last;
+}
+
+/**
+ * Returns the instruction that loads the Z dimension of the work item
+ * id in this region.
+ *
+ * The load is created at the first insertion point of the entry block
+ * on the first call and the same instruction is returned afterwards.
+ */
+llvm::Instruction*
+ParallelRegion::LocalIDZLoad()
+{
+  if (LocalIDZLoadInstr == NULL) {
+    llvm::BasicBlock *entry = entryBB();
+    IRBuilder<> builder(entry->getFirstInsertionPt());
+    llvm::Module *module = entry->getParent()->getParent();
+    LocalIDZLoadInstr =
+      builder.CreateLoad(module->getGlobalVariable(POCL_LOCAL_ID_Z_GLOBAL));
+  }
+  return LocalIDZLoadInstr;
+}
+
+/**
+ * Returns the instruction that loads the Y dimension of the work item
+ * id in this region.
+ *
+ * The load is created at the first insertion point of the entry block
+ * on the first call and the same instruction is returned afterwards.
+ */
+llvm::Instruction*
+ParallelRegion::LocalIDYLoad()
+{
+  if (LocalIDYLoadInstr == NULL) {
+    llvm::BasicBlock *entry = entryBB();
+    IRBuilder<> builder(entry->getFirstInsertionPt());
+    llvm::Module *module = entry->getParent()->getParent();
+    LocalIDYLoadInstr =
+      builder.CreateLoad(module->getGlobalVariable(POCL_LOCAL_ID_Y_GLOBAL));
+  }
+  return LocalIDYLoadInstr;
+}
+
+/**
+ * Returns the instruction that loads the X dimension of the work item
+ * id in this region.
+ *
+ * The load is created at the first insertion point of the entry block
+ * on the first call and the same instruction is returned afterwards.
+ */
+llvm::Instruction*
+ParallelRegion::LocalIDXLoad()
+{
+  if (LocalIDXLoadInstr == NULL) {
+    llvm::BasicBlock *entry = entryBB();
+    IRBuilder<> builder(entry->getFirstInsertionPt());
+    llvm::Module *module = entry->getParent()->getParent();
+    LocalIDXLoadInstr =
+      builder.CreateLoad(module->getGlobalVariable(POCL_LOCAL_ID_X_GLOBAL));
+  }
+  return LocalIDXLoadInstr;
+}
+
+/**
+ * Injects a call to printf() right before the given instruction.
+ *
+ * If the module has no function called "printf" yet, a variadic
+ * declaration "i32 printf(i8*, ...)" is added to it with the C calling
+ * convention, a nocapture attribute on the first parameter and a
+ * nounwind function attribute.
+ *
+ * @param before    The instruction before which the call is inserted.
+ * @param formatStr The format string; it is emitted as a global string.
+ * @param params    The values passed after the format string.
+ */
+void
+ParallelRegion::InjectPrintF
+(llvm::Instruction *before, std::string formatStr,
+ std::vector<Value*>& params)
+{
+  IRBuilder<> builder(before);
+  llvm::Module *M = before->getParent()->getParent()->getParent();
+
+  llvm::Value *stringArg =
+    builder.CreateGlobalString(formatStr);
+
+  /* generated with help from http://llvm.org/demo/index.cgi */
+  Function* printfFunc = M->getFunction("printf");
+  if (printfFunc == NULL) {
+    PointerType* PointerTy_4 = PointerType::get(IntegerType::get(M->getContext(), 8), 0);
+
+    std::vector<Type*> FuncTy_6_args;
+    FuncTy_6_args.push_back(PointerTy_4);
+
+    FunctionType* FuncTy_6 =
+      FunctionType::get
+      (/*Result=*/IntegerType::get(M->getContext(), 32),
+       /*Params=*/FuncTy_6_args,
+       /*isVarArg=*/true);
+
+    printfFunc =
+      Function::Create
+      (/*Type=*/FuncTy_6,
+       /*Linkage=*/GlobalValue::ExternalLinkage,
+       /*Name=*/"printf", M);
+    printfFunc->setCallingConv(CallingConv::C);
+
+    /* Index 1 is the first parameter, index 4294967295 (~0U) denotes
+       the function itself. */
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+    AttrListPtr func_printf_PAL;
+#else
+    AttributeSet func_printf_PAL;
+#endif
+    {
+#ifdef LLVM_3_1
+      SmallVector<AttributeWithIndex, 4> Attrs;
+      AttributeWithIndex PAWI;
+      PAWI.Index = 1U;
+      PAWI.Attrs = Attribute::NoCapture;
+      Attrs.push_back(PAWI);
+      PAWI.Index = 4294967295U;
+      PAWI.Attrs = Attribute::NoUnwind;
+      Attrs.push_back(PAWI);
+      func_printf_PAL = AttrListPtr::get(Attrs.begin(), Attrs.end());
+#elif defined LLVM_3_2
+      SmallVector<AttributeWithIndex, 4> Attrs;
+      Attrs.push_back(AttributeWithIndex::get(M->getContext(), 1U, Attributes::NoCapture));
+      Attrs.push_back(AttributeWithIndex::get(M->getContext(), 4294967295U, Attributes::NoUnwind));
+      func_printf_PAL = AttrListPtr::get(M->getContext(), Attrs);
+#else
+      /* AttributeSet is immutable: addAttribute() returns the extended
+         set instead of modifying the receiver, so the result has to be
+         assigned back or the attributes are silently dropped. */
+      func_printf_PAL =
+        func_printf_PAL.addAttribute(M->getContext(), 1U, Attribute::NoCapture);
+      func_printf_PAL =
+        func_printf_PAL.addAttribute(M->getContext(), 4294967295U, Attribute::NoUnwind);
+#endif
+    }
+    printfFunc->setAttributes(func_printf_PAL);
+  }
+
+  /* Build a constant GEP with indices (0, 0) over the global string to
+     use as the format argument of the call. */
+  std::vector<Constant*> const_ptr_8_indices;
+
+  ConstantInt* const_int64_9 = ConstantInt::get(M->getContext(), APInt(64, StringRef("0"), 10));
+  const_ptr_8_indices.push_back(const_int64_9);
+  const_ptr_8_indices.push_back(const_int64_9);
+  assert (isa<Constant>(stringArg));
+  Constant* const_ptr_8 =
+    ConstantExpr::getGetElementPtr
+    (cast<Constant>(stringArg), const_ptr_8_indices);
+
+  std::vector<Value*> args;
+  args.push_back(const_ptr_8);
+  args.insert(args.end(), params.begin(), params.end());
+
+  CallInst::Create(printfFunc, args, "", before);
+}
+
+/**
+ * Makes the given block the exit block of the region.
+ *
+ * The block must already be in the region; otherwise an assertion
+ * fires and the exit index is left untouched.
+ */
+void
+ParallelRegion::SetExitBB(llvm::BasicBlock *block)
+{
+  const iterator pos = find(begin(), end(), block);
+  if (pos != end())
+    {
+      setExitBBIndex(pos - begin());
+      return;
+    }
+  assert (false && "The block was not found in the PRegion!");
+}
+
+/**
+ * Injects a printf call before the terminator of the exit block that
+ * prints the id of the region and the local id of the work item.
+ *
+ * Meant as a debugging aid for control flow problems.
+ */
+void
+ParallelRegion::InjectRegionPrintF()
+{
+  llvm::Module *M = entryBB()->getParent()->getParent();
+
+#if 0
+  // it should reuse equal strings anyways
+  const char* FORMAT_STR_VAR = ".pocl.pRegion_debug_str";
+  llvm::Value *stringArg = M->getGlobalVariable(FORMAT_STR_VAR);
+  if (stringArg == NULL)
+    {
+      IRBuilder<> builder(entryBB());
+      stringArg = builder.CreateGlobalString("PR %d WI %u %u %u\n", FORMAT_STR_VAR);
+    }
+#endif
+
+  ConstantInt* regionIdConst =
+    ConstantInt::get(M->getContext(), APInt(32, pRegionId, 10));
+
+  /* The id loads are requested in x, y, z order, as before. */
+  llvm::Instruction *idX = LocalIDXLoad();
+  llvm::Instruction *idY = LocalIDYLoad();
+  llvm::Instruction *idZ = LocalIDZLoad();
+
+  std::vector<Value*> printfArgs;
+  printfArgs.push_back(regionIdConst);
+  printfArgs.push_back(idX);
+  printfArgs.push_back(idY);
+  printfArgs.push_back(idZ);
+
+  InjectPrintF(exitBB()->getTerminator(), "PR %d WI %u %u %u\n", printfArgs);
+}
+
+/**
+ * Injects printf calls that print, in hex, the value of every named
+ * non-pointer instruction of the region.
+ *
+ * Each printout is placed before the terminator of the block that
+ * holds the printed instruction. Meant as a debugging aid for data
+ * flow problems.
+ */
+void
+ParallelRegion::InjectVariablePrintouts()
+{
+  for (ParallelRegion::iterator blockIt = begin();
+       blockIt != end(); ++blockIt)
+    {
+      llvm::BasicBlock *bb = *blockIt;
+      for (llvm::BasicBlock::iterator instIt = bb->begin();
+           instIt != bb->end(); ++instIt)
+        {
+          llvm::Instruction *value = instIt;
+
+          const bool printable =
+            value->hasName() && !isa<PointerType>(value->getType());
+          if (!printable) continue;
+
+          /* The builder is only used for creating the name string. */
+          IRBuilder<> builder(exitBB()->getTerminator());
+          std::string valueName = value->getName().str();
+
+          std::vector<Value*> printfArgs;
+          printfArgs.push_back(builder.CreateGlobalString(valueName));
+          printfArgs.push_back(value);
+
+          InjectPrintF(value->getParent()->getTerminator(), "variable %s == %x\n", printfArgs);
+        }
+    }
+}
+
+/**
+ * Localizes all the loads to the work-item identifiers.
+ *
+ * In case the code inside the region queries the WI id, it
+ * should not (re)use one that is loaded in another region, but
+ * one that is loaded in the same region. Otherwise, it ends
+ * up using the last id the previous PR work-item loop got.
+ * This caused problems in cases where the local id was stored
+ * to a temporary variable in an earlier region and that temp
+ * was reused later.
+ *
+ * The function scans the operands of all instructions of the region
+ * for loads from the local id variables and replaces them with the
+ * loads that are inside the parallel region.
+ */
+void
+ParallelRegion::LocalizeIDLoads()
+{
+  llvm::Module *M = entryBB()->getParent()->getParent();
+  llvm::Value *localIdZ = M->getNamedGlobal(POCL_LOCAL_ID_Z_GLOBAL);
+  llvm::Value *localIdY = M->getNamedGlobal(POCL_LOCAL_ID_Y_GLOBAL);
+  llvm::Value *localIdX = M->getNamedGlobal(POCL_LOCAL_ID_X_GLOBAL);
+
+  /* Check the globals before the intra-PR loads are created: creating
+     the loads already uses the globals, so checking afterwards would
+     be too late to report the problem. */
+  assert (localIdZ != NULL && localIdY != NULL && localIdX != NULL &&
+          "The local id globals were not created.");
+
+  /* The local id loads inside the parallel region. */
+  llvm::Instruction* xLoad = LocalIDXLoad();
+  llvm::Instruction* yLoad = LocalIDYLoad();
+  llvm::Instruction* zLoad = LocalIDZLoad();
+
+  for (ParallelRegion::iterator i = begin();
+       i != end(); ++i)
+    {
+      llvm::BasicBlock *bb = *i;
+      for (llvm::BasicBlock::iterator instrI = bb->begin();
+           instrI != bb->end(); ++instrI)
+        {
+          llvm::Instruction *instr = instrI;
+          if (instr == xLoad || instr == yLoad || instr == zLoad) continue;
+
+          /* Search all operands of the instruction. If any of them is
+             using a local id, replace it with the intra-PR load from the
+             id variable. */
+          for (unsigned opr = 0; opr < instr->getNumOperands(); ++opr)
+            {
+              llvm::LoadInst *load =
+                dyn_cast<llvm::LoadInst>(instr->getOperand(opr));
+              if (load == NULL) continue;
+              /* Already one of the intra-PR loads, nothing to do. */
+              if (load == xLoad || load == yLoad || load == zLoad) continue;
+
+              llvm::Value *source = load->getPointerOperand();
+              if (source == localIdZ)
+                instr->setOperand(opr, zLoad);
+              else if (source == localIdY)
+                instr->setOperand(opr, yLoad);
+              else if (source == localIdX)
+                instr->setOperand(opr, xLoad);
+            }
+        }
+    }
+}
diff --git a/src/llvmopencl/ParallelRegion.h b/src/llvmopencl/ParallelRegion.h
new file mode 100644
index 0000000..9313983
--- /dev/null
+++ b/src/llvmopencl/ParallelRegion.h
@@ -0,0 +1,127 @@
+// Class definition for parallel regions, a group of BasicBlocks that
+// each kernel should run in parallel.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_PARALLEL_REGION_H
+#define _POCL_PARALLEL_REGION_H
+
+#include "BarrierBlock.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/BasicBlock.h"
+#include "llvm/LLVMContext.h"
+#else
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/LLVMContext.h"
+#endif
+#include "llvm/Support/CFG.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/ADT/SmallVector.h"
+#include <vector>
+
+namespace pocl {
+
+#define POCL_LOCAL_ID_X_GLOBAL "_local_id_x"
+#define POCL_LOCAL_ID_Y_GLOBAL "_local_id_y"
+#define POCL_LOCAL_ID_Z_GLOBAL "_local_id_z"
+
+class Kernel;
+
+ // A parallel region: an ordered list of basic blocks together with the
+ // indices of its entry and exit blocks.
+ //
+ // TODO Cleanup: this should not inherit vector but contain it.
+ // It now exposes too much to the clients and leads to hard
+ // to track errors when the API is changed.
+ class ParallelRegion : public std::vector<llvm::BasicBlock *> {
+ public:
+ typedef llvm::SmallVector<ParallelRegion *, 8> ParallelRegionVector;
+
+ ParallelRegion(int forcedRegionId=-1);
+
+ /* BarrierBlock *getEntryBarrier(); */
+ ParallelRegion *replicate(llvm::ValueToValueMapTy &map,
+ const llvm::Twine &suffix);
+ void remap(llvm::ValueToValueMapTy &map);
+ void purge();
+ void chainAfter(ParallelRegion *region);
+ // Stores (x, y, z) to the local id globals at the top of the entry block.
+ void insertPrologue(unsigned x, unsigned y, unsigned z);
+ // Same as insertPrologue(), but for an arbitrary block; local id globals
+ // that do not exist in the module are skipped.
+ static void insertLocalIdInit(llvm::BasicBlock* entry,
+ unsigned x,
+ unsigned y,
+ unsigned z);
+ // Debug helpers: dump the blocks / print the block names to stdout.
+ void dump();
+ void dumpNames();
+ void setEntryBBIndex(std::size_t index) { entryIndex_ = index; }
+ void setExitBBIndex(std::size_t index) { exitIndex_ = index; }
+ // Sets the exit index to the position of the given block, which must
+ // already be in the region.
+ void SetExitBB(llvm::BasicBlock *block);
+ // Insert 'block' to the region vector right before / right after a block
+ // that already is in the region.
+ void AddBlockBefore(llvm::BasicBlock *block, llvm::BasicBlock *before);
+ void AddBlockAfter(llvm::BasicBlock *block, llvm::BasicBlock *after);
+
+ // Both accessors use at(), so an out of range index throws.
+ llvm::BasicBlock* exitBB() { return at(exitIndex_); }
+ llvm::BasicBlock* entryBB() { return at(entryIndex_); }
+ // Attaches "wi" and "wi_counter" metadata to every instruction of the
+ // region.
+ void AddIDMetadata(llvm::LLVMContext& context,
+ std::size_t x = 0,
+ std::size_t y = 0,
+ std::size_t z = 0);
+
+ // Attaches "llvm.mem.parallel_loop_access" metadata to the instructions
+ // of the region that may read or write memory.
+ void AddParallelLoopMetadata(llvm::MDNode *identifier);
+
+ // Returns true if the block is in the region.
+ bool HasBlock(llvm::BasicBlock *bb);
+
+ // Debug helpers that inject printf calls to the region.
+ void InjectRegionPrintF();
+ void InjectVariablePrintouts();
+
+ void InjectPrintF
+ (llvm::Instruction *before, std::string formatStr,
+ std::vector<llvm::Value*>& params);
+
+ // Allocates a region from the given blocks, stored in the order in which
+ // they appear in the function of the entry block.
+ static ParallelRegion *
+ Create(const llvm::SmallPtrSet<llvm::BasicBlock *, 8>& bbs,
+ llvm::BasicBlock *entry, llvm::BasicBlock *exit);
+
+ static void GenerateTempNames(llvm::BasicBlock *bb);
+
+ // Return the load of the respective local id global in the entry block
+ // of the region; the load is created on the first call and cached.
+ llvm::Instruction* LocalIDXLoad();
+ llvm::Instruction* LocalIDYLoad();
+ llvm::Instruction* LocalIDZLoad();
+
+ // Rewrites operands that use a load of a local id global to use the
+ // loads returned by LocalID{X,Y,Z}Load() instead.
+ void LocalizeIDLoads();
+
+ private:
+ // Cached results of LocalID{X,Y,Z}Load().
+ llvm::Instruction* LocalIDXLoadInstr;
+ llvm::Instruction* LocalIDYLoadInstr;
+ llvm::Instruction* LocalIDZLoadInstr;
+
+ // Checks the region invariants; called from Create() inside an assert.
+ bool Verify();
+ /// The indices of entry and exit, not pointers, for finding the BBs in the
+ /// replicated PRs too.
+ std::size_t exitIndex_;
+ std::size_t entryIndex_;
+
+ /// Identifier for the parallel region.
+ int pRegionId;
+ // NOTE(review): presumably the source of fresh region ids; its use is
+ // not visible in this part of the file.
+ static int idGen;
+
+ };
+
+}
+
+#endif
diff --git a/src/llvmopencl/TargetAddressSpaces.cc b/src/llvmopencl/TargetAddressSpaces.cc
new file mode 100644
index 0000000..bd860cc
--- /dev/null
+++ b/src/llvmopencl/TargetAddressSpaces.cc
@@ -0,0 +1,220 @@
+// Implementation of the TargetAddressSpaces pass.
+//
+// Copyright (c) 2013 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#include <iostream>
+#include <string>
+
+#ifdef LLVM_3_2
+# include <llvm/Instructions.h>
+#else
+# include <llvm/IR/Instructions.h>
+# include <llvm/IR/Module.h>
+
+#endif
+#include <llvm/Transforms/Utils/ValueMapper.h>
+#include "llvm/Transforms/Utils/Cloning.h"
+
+#include "TargetAddressSpaces.h"
+#include "Workgroup.h"
+#include "LLVMUtils.h"
+#include "pocl.h"
+
+#define DEBUG_TARGET_ADDRESS_SPACES
+
+namespace pocl {
+
+using namespace llvm;
+
+/* Registers the pass under the name "target-address-spaces". */
+namespace {
+ static
+ RegisterPass<pocl::TargetAddressSpaces> X
+ ("target-address-spaces",
+ "Convert the 'fake' address space ids to the target specific ones.");
+}
+
+/* The address of ID is passed to the ModulePass constructor below. */
+char TargetAddressSpaces::ID = 0;
+
+TargetAddressSpaces::TargetAddressSpaces() : ModulePass(ID) {
+}
+
+/**
+ * Returns the given type with its address spaces converted according
+ * to the given address space map.
+ *
+ * Pointer types get their address space replaced and their pointee
+ * type converted recursively; array types get their element type
+ * converted recursively. All other types are returned as is.
+ *
+ * NOTE(review): the lookup uses std::map::operator[], so an address
+ * space that has no entry in the map is inserted to it and converted
+ * to address space 0 -- confirm that this is intended.
+ */
+static Type *
+ConvertedType(llvm::Type *type, std::map<unsigned, unsigned> &addrSpaceMap) {
+
+  if (type->isPointerTy()) {
+    const unsigned oldAS = type->getPointerAddressSpace();
+    const unsigned newAS = addrSpaceMap[oldAS];
+    Type *pointee =
+      ConvertedType(type->getPointerElementType(), addrSpaceMap);
+    return PointerType::get(pointee, newAS);
+  }
+
+  if (type->isArrayTy()) {
+    Type *element =
+      ConvertedType(type->getArrayElementType(), addrSpaceMap);
+    return ArrayType::get(element, type->getArrayNumElements());
+  }
+
+  /* TODO: pointers inside structs */
+  return type;
+}
+
+/**
+ * Converts the address spaces in the type of the given value in place.
+ *
+ * Only values of pointer type are touched.
+ *
+ * @return true if the type of the value was changed.
+ */
+static bool
+UpdateAddressSpace(llvm::Value& val, std::map<unsigned, unsigned> &addrSpaceMap) {
+  Type *oldType = val.getType();
+
+  if (oldType->isPointerTy()) {
+    Type *converted = ConvertedType(oldType, addrSpaceMap);
+    if (converted != oldType) {
+      val.mutateType(converted);
+      return true;
+    }
+  }
+  return false;
+}
+
+
+/**
+ * Converts the address space ids used in the module to the ones of the
+ * target, for the targets that have a mapping defined below.
+ *
+ * The target is picked from the part of the target triple that precedes
+ * the first dash. Only "tce" has a mapping; for every other architecture
+ * the module is left untouched and false is returned.
+ *
+ * For a remapped target the types of the global variables are mutated in
+ * place, every function with a body (except those whose name starts with
+ * "_GLOBAL") is cloned to a new function with a converted signature, the
+ * direct calls are redirected to the clones, the kernel metadata is
+ * regenerated and the original functions are erased.
+ *
+ * @return false if nothing was done, true otherwise.
+ */
+bool
+TargetAddressSpaces::runOnModule(llvm::Module &M) {
+
+ /* The architecture is the triple up to the first '-', or the whole
+ triple if it has no dash. */
+ std::string triple = M.getTargetTriple();
+ std::string arch = triple;
+ size_t dash = triple.find("-");
+ if (dash != std::string::npos) {
+ arch = triple.substr(0, dash);
+ }
+
+ /* Maps an address space id used in the module to the target one. */
+ std::map<unsigned, unsigned> addrSpaceMap;
+
+ if (arch == "x86_64") {
+ /* For x86_64 the default isel seems to work with the
+ fake address spaces. Skip the processing as it causes
+ an overhead and is not fully implemented.
+ */
+ return false;
+ } else if (arch == "tce") {
+ /* TCE requires the remapping. */
+ addrSpaceMap[POCL_ADDRESS_SPACE_GLOBAL] = 3;
+ addrSpaceMap[POCL_ADDRESS_SPACE_LOCAL] = 4;
+ /* LLVM 3.2 detects 'constant' as cuda_constant (5) in the fake
+ address space map. Add it for compatibility. */
+ addrSpaceMap[5] = addrSpaceMap[POCL_ADDRESS_SPACE_CONSTANT] = 5;
+
+ } else {
+ /* Assume the fake address space map works directly in case not
+ overridden here. */
+ return false;
+ }
+
+ /* NOTE(review): 'changed' is only written to; the function returns
+ true unconditionally once this point has been reached. */
+ bool changed = false;
+ /* Handle global variables. */
+ llvm::Module::global_iterator globalI = M.global_begin();
+ llvm::Module::global_iterator globalE = M.global_end();
+ for (; globalI != globalE; ++globalI) {
+ llvm::Value &global = *globalI;
+ changed |= UpdateAddressSpace(global, addrSpaceMap);
+ }
+
+ /* Original function -> its clone with the converted signature. */
+ FunctionMapping funcReplacements;
+ std::vector<llvm::Function*> unhandledFuncs;
+
+ /* Collect the functions to process first because we add
+ a new function per modified function which invalidates
+ the Module's function iterator. */
+ for (llvm::Module::iterator functionI = M.begin(), functionE = M.end();
+ functionI != functionE; ++functionI) {
+ /* Skip declarations and the functions named "_GLOBAL...". */
+ if (functionI->empty() || functionI->getName().startswith("_GLOBAL"))
+ continue;
+ unhandledFuncs.push_back(functionI);
+ }
+
+ for (std::vector<llvm::Function*>::iterator i = unhandledFuncs.begin(),
+ e = unhandledFuncs.end(); i != e; ++i) {
+ llvm::Function &F = **i;
+
+ /* Convert the FunctionType. Because there is no mutator API in
+ LLVM for this, we need to recreate the whole darn function :( */
+ SmallVector<Type *, 8> parameters;
+ /* Note: this 'i'/'e' pair shadows the outer loop iterators. */
+ for (Function::const_arg_iterator i = F.arg_begin(),
+ e = F.arg_end();
+ i != e; ++i)
+ parameters.push_back(ConvertedType(i->getType(), addrSpaceMap));
+
+ llvm::FunctionType *ft = FunctionType::get
+ (ConvertedType(F.getReturnType(), addrSpaceMap),
+ parameters, F.isVarArg());
+
+ /* The clone is created unnamed and then takes over the name of the
+ original function. */
+ llvm::Function *newFunc = Function::Create(ft, F.getLinkage(), "", &M);
+ newFunc->takeName(&F);
+
+ /* Map the arguments of the original to the ones of the clone and
+ copy the argument names over. */
+ ValueToValueMapTy vv;
+ Function::arg_iterator j = newFunc->arg_begin();
+ for (Function::const_arg_iterator i = F.arg_begin(),
+ e = F.arg_end();
+ i != e; ++i) {
+ j->setName(i->getName());
+ vv[i] = j;
+ ++j;
+ }
+
+ SmallVector<ReturnInst *, 1> ri;
+
+ /* Type remapper handed to CloneFunctionInto(); it converts every
+ type it is asked about with ConvertedType(). */
+ class AddressSpaceReMapper : public ValueMapTypeRemapper {
+ public:
+ AddressSpaceReMapper(std::map<unsigned, unsigned> &addrSpaceMap) :
+ addrSpaceMap_(addrSpaceMap) {}
+ Type* remapType(Type *type) {
+ Type *newType = ConvertedType(type, addrSpaceMap_);
+ if (newType == type) return type;
+ return newType;
+ }
+ private:
+ std::map<unsigned, unsigned>& addrSpaceMap_;
+ } asvtm(addrSpaceMap);
+
+ CloneFunctionInto(newFunc, &F, vv, true, ri, "", NULL, &asvtm);
+ funcReplacements[&F] = newFunc;
+ }
+
+ /* Replace all references to the old function to the new one. */
+ /* Only CallInst instructions are inspected here. */
+ llvm::Module::iterator fI = M.begin();
+ llvm::Module::iterator fE = M.end();
+ for (; fI != fE; ++fI) {
+ llvm::Function &F = *fI;
+ for (llvm::Function::iterator bbi = F.begin(), bbe = F.end(); bbi != bbe;
+ ++bbi)
+ for (llvm::BasicBlock::iterator ii = bbi->begin(), ie = bbi->end(); ii != ie;
+ ++ii) {
+ llvm::Instruction *instr = ii;
+ if (!isa<CallInst>(instr)) continue;
+ llvm::CallInst *call = dyn_cast<CallInst>(instr);
+ llvm::Function *calledF = call->getCalledFunction();
+ /* Callees without a replacement are left as they are. */
+ if (funcReplacements.find(calledF) == funcReplacements.end()) continue;
+
+ call->setCalledFunction(funcReplacements[calledF]);
+ }
+ }
+
+ regenerate_kernel_metadata(M, funcReplacements);
+
+ /* Delete the old functions. */
+ for (FunctionMapping::iterator i = funcReplacements.begin(),
+ e = funcReplacements.end(); i != e; ++i) {
+ i->first->eraseFromParent();
+ }
+
+ return true;
+}
+
+}
diff --git a/src/llvmopencl/TargetAddressSpaces.h b/src/llvmopencl/TargetAddressSpaces.h
new file mode 100644
index 0000000..1a080c8
--- /dev/null
+++ b/src/llvmopencl/TargetAddressSpaces.h
@@ -0,0 +1,54 @@
+// Header for TargetAddressSpaces, an LLVM pass that converts the
+// generic address space ids to the target specific ones.
+//
+// Copyright (c) 2013 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_TARGET_ADDRESS_SPACES_H
+#define _POCL_TARGET_ADDRESS_SPACES_H
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Function.h"
+#else
+#include "llvm/IR/Function.h"
+#endif
+
+#include "llvm/Pass.h"
+
+namespace pocl {
+ /* pocl uses the fixed address space ids forced by the clang's
+ -ffake-address-space-map internally until the end to be able to
+ detect the different OpenCL address spaces unambiguously, regardless
+ of the target. This pass converts the fake address space ids to
+ the target-specific ones, if required by the code generator of that
+ target. */
+ class TargetAddressSpaces : public llvm::ModulePass {
+ public:
+ /* Pass identification; its address is given to the ModulePass
+ constructor. */
+ static char ID;
+
+ TargetAddressSpaces();
+ virtual ~TargetAddressSpaces() {};
+
+ /* Runs the conversion on the module. Returns false if the module
+ was left untouched. */
+ virtual bool runOnModule(llvm::Module &M);
+ };
+}
+
+#endif
diff --git a/src/llvmopencl/VariableUniformityAnalysis.cc b/src/llvmopencl/VariableUniformityAnalysis.cc
new file mode 100644
index 0000000..4362524
--- /dev/null
+++ b/src/llvmopencl/VariableUniformityAnalysis.cc
@@ -0,0 +1,382 @@
+// Implementation for VariableUniformityAnalysis function pass.
+//
+// Copyright (c) 2013 Pekka Jääskeläinen / Tampere University of Technology
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#include <sstream>
+#include <iostream>
+
+#ifdef LLVM_3_2
+#include "llvm/Metadata.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/ValueSymbolTable.h"
+#include "llvm/DataLayout.h"
+#else
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/IR/DataLayout.h"
+#endif
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Analysis/PostDominators.h"
+
+#include "WorkitemHandler.h"
+#include "Kernel.h"
+#include "VariableUniformityAnalysis.h"
+#include "Barrier.h"
+
+//#define DEBUG_UNIFORMITY_ANALYSIS
+
+namespace pocl {
+
+// Storage for the pass identifier that the constructor hands to
+// FunctionPass.
+char VariableUniformityAnalysis::ID = 0;
+
+using namespace llvm;
+
+// Static registration object: makes the pass known under the command line
+// name "uniformity" together with the description string below.
+// NOTE(review): the two trailing 'false' arguments are presumably
+// RegisterPass' CFGOnly and is_analysis flags -- confirm against the LLVM
+// version in use, since this pass behaves like an analysis.
+static
+RegisterPass<VariableUniformityAnalysis> X(
+ "uniformity",
+ "Analyses the variables of the function for uniformity (same value across WIs).",
+ false, false);
+
+/// Constructs the pass and passes the class-wide identifier on to the
+/// FunctionPass base class. No other state needs initialising.
+VariableUniformityAnalysis::VariableUniformityAnalysis()
+  : FunctionPass(ID) {}
+
+
+/// Declares which analyses this pass needs and which ones it keeps valid.
+///
+/// \param AU usage record filled in for the pass manager.
+void
+VariableUniformityAnalysis::getAnalysisUsage(llvm::AnalysisUsage &AU) const {
+  // Analyses that must be available before this pass runs. DominatorTree
+  // is listed because LoopInfo requires it.
+  AU.addRequired<PostDominatorTree>();
+  AU.addRequired<LoopInfo>();
+  AU.addRequired<DominatorTree>();
+
+  // The same analyses are declared as still valid after this pass.
+  AU.addPreserved<PostDominatorTree>();
+  AU.addPreserved<LoopInfo>();
+  AU.addPreserved<DominatorTree>();
+
+// TODO This was turned off because of compilation error
+#if 0
+#ifdef LLVM_3_1
+  AU.addRequired<TargetData>();
+  AU.addPreserved<TargetData>();
+#else
+  AU.addRequired<DataLayout>();
+  AU.addPreserved<DataLayout>();
+#endif
+#endif
+}
+
+/// Resets the cached results for F and runs the eager part of the analysis:
+/// canonical induction variables of the loops returned by LoopInfo and the
+/// basic block divergence walk. Everything else is computed on demand by
+/// isUniform().
+///
+/// \return always false; the function itself is never modified.
+bool
+VariableUniformityAnalysis::runOnFunction(Function &F) {
+  llvm::Function *func = &F;
+
+  // Start from a clean slate for this function.
+  uniformityCache_[func].clear();
+
+  // Mark the canonical induction variable PHI of each loop as uniform: if a
+  // loop has one, its per-iteration update should be uniform. Note that this
+  // does not yet imply that all the work-items execute the loop the same
+  // number of times!
+  llvm::LoopInfo &loops = getAnalysis<LoopInfo>();
+  for (llvm::LoopInfo::iterator loopIt = loops.begin(), loopEnd = loops.end();
+       loopIt != loopEnd; ++loopIt) {
+    llvm::PHINode *indVar = (*loopIt)->getCanonicalInductionVariable();
+    if (indVar == NULL)
+      continue;
+#ifdef DEBUG_UNIFORMITY_ANALYSIS
+    std::cerr << "### canonical induction variable, assuming uniform:";
+    indVar->dump();
+#endif
+    setUniform(func, indVar);
+  }
+
+  // The entry block is uniform by definition; propagate from there.
+  llvm::BasicBlock *entry = &F.getEntryBlock();
+  setUniform(func, entry);
+  analyzeBBDivergence(func, entry, entry);
+  return false;
+}
+
+/**
+ * BB divergence analysis.
+ *
+ * Define:
+ * Uniform BB. A basic block which is known to be executed by all or none
+ * of the work-items, that is, a BB where it's known safe to add a barrier.
+ *
+ * Divergent/varying BB. A basic block where work-items *might* diverge.
+ * That is, it cannot be proven that all work-items execute the BB.
+ *
+ * Propagate the information from the entry downwards. Note that the
+ * implementation recurses into each not yet analyzed successor in turn,
+ * i.e. it walks the CFG depth first. Every visited BB gets a cache entry
+ * before its successors are examined and already cached BBs are not
+ * revisited; this avoids infinite recursion with loop back edges and enables
+ * to keep book of the "last seen" uniform BB.
+ *
+ * The conditions to mark a BB 'uniform':
+ *
+ * a) the function entry
+ * b) BBs that post-dominate at least one uniform BB (try the previously
+ * found one)
+ * c) BBs that are branched to directly from a uniform BB using a uniform branch.
+ *
+ * Otherwise, assume divergent (might not be *proven* to be one!).
+ *
+ * Only llvm::BranchInst terminators are followed; a BB ending in any other
+ * terminator stops the walk along that path.
+ *
+ * @param f the function being analysed (selects the cache to update).
+ * @param bb the basic block to classify.
+ * @param previousUniformBB the uniform BB most recently seen on the path
+ * that led to bb.
+ */
+void
+VariableUniformityAnalysis::analyzeBBDivergence
+(llvm::Function *f, llvm::BasicBlock *bb, llvm::BasicBlock *previousUniformBB) {
+
+
+ // Becomes bb itself as soon as bb is proven uniform; handed on to the
+ // successors at the end.
+ llvm::BasicBlock *newPreviousUniformBB = previousUniformBB;
+
+ llvm::BranchInst *br =
+ dyn_cast<llvm::BranchInst>(previousUniformBB->getTerminator());
+
+ if (br == NULL) {
+ // this is most likely a function with a single basic block, the entry node, which
+ // ends with a ret
+ return;
+ }
+
+ // Condition c)
+ // The branch leaving the previous uniform BB is unconditional or has a
+ // uniform condition, and bb is one of its direct successors.
+ if ((!br->isConditional() || isUniform(f, br->getCondition()))) {
+ for (unsigned suc = 0, end = br->getNumSuccessors(); suc < end; ++suc) {
+ if (br->getSuccessor(suc) == bb) {
+ setUniform(f, bb, true);
+ newPreviousUniformBB = bb;
+ break;
+ }
+ }
+ }
+
+ // Condition b)
+ // Only tried when condition c) did not already mark bb as uniform.
+ if (newPreviousUniformBB != bb) {
+ llvm::PostDominatorTree *PDT = &getAnalysis<PostDominatorTree>();
+ if (PDT->dominates(bb, previousUniformBB)) {
+ setUniform(f, bb, true);
+ newPreviousUniformBB = bb;
+ }
+ }
+
+ /* Assume diverging. */
+ // This also guarantees that bb has a cache entry before recursing, which
+ // is what the successor check below relies on to terminate.
+ if (!isUniformityAnalyzed(f, bb))
+ setUniform(f, bb, false);
+
+ llvm::BranchInst *nextbr = dyn_cast<llvm::BranchInst>(bb->getTerminator());
+
+ if (nextbr == NULL) return; /* ret */
+
+ /* Propagate the data downward. */
+ for (unsigned suc = 0, end = nextbr->getNumSuccessors(); suc < end; ++suc) {
+ llvm::BasicBlock *nextbb = nextbr->getSuccessor(suc);
+ if (!isUniformityAnalyzed(f, nextbb)) {
+ analyzeBBDivergence(f, nextbb, newPreviousUniformBB);
+ }
+ }
+}
+
+/// Tells whether a uniformity verdict (uniform or varying) has already been
+/// recorded for \p v in the cache of function \p f.
+///
+/// Looking up the per-function index creates an empty one when \p f has not
+/// been seen before, which is why the cache member is mutable.
+bool
+VariableUniformityAnalysis::isUniformityAnalyzed(llvm::Function *f, llvm::Value *v) const {
+  const UniformityIndex &perFunction = uniformityCache_[f];
+  return perFunction.find(v) != perFunction.end();
+}
+
+/**
+ * Simple uniformity analysis that recursively analyses all the
+ * operands affecting the value.
+ *
+ * Known uniform Values:
+ * a) function arguments (llvm::Argument)
+ * b) integer constants (only llvm::ConstantInt is recognised below)
+ * c) the entry basic block of the function
+ * d) loads from the work-group level id/size globals listed in the
+ * LoadInst case
+ *
+ * Every verdict is stored in the per-function cache, so a Value is
+ * classified at most once until the cache is cleared.
+ *
+ * @param f the function whose cache is consulted and updated.
+ * @param v the value to classify.
+ * @return true if v is considered uniform, false if it is assumed varying.
+ */
+bool
+VariableUniformityAnalysis::isUniform(llvm::Function *f, llvm::Value* v) {
+
+ // Fast path: reuse an earlier verdict.
+ UniformityIndex &cache = uniformityCache_[f];
+ UniformityIndex::const_iterator i = cache.find(v);
+ if (i != cache.end()) {
+ return (*i).second;
+ }
+
+ // Only the entry block is classified here; any other BasicBlock without
+ // a cache entry falls through to the generic code at the end.
+ if (llvm::BasicBlock *bb = dyn_cast<llvm::BasicBlock>(v)) {
+ if (bb == &f->getEntryBlock()) {
+ setUniform(f, v, true);
+ return true;
+ }
+ }
+
+ if (isa<llvm::Argument>(v)) {
+ setUniform(f, v, true);
+ return true;
+ }
+
+ if (isa<llvm::ConstantInt>(v)) {
+ setUniform(f, v, true);
+ return true;
+ }
+
+ if (isa<llvm::AllocaInst>(v)) {
+ /* Allocas might or might not be divergent. These are produced
+ from work-item private arrays or the PHIsToAllocas. It depends
+ what is written to them whether they are really divergent.
+
+ We need to figure out if any of the stores to the alloca contain
+ work-item id dependent data. Take a white listing approach that
+ detects the ex-phi allocas of loop iteration variables of non-diverging
+ loops.
+
+ Currently the following case is white listed:
+ a) are scalars
+ b) are accessed only with loads and stores (e.g. address not taken)
+ c) stored data is uniform
+
+ Because alloca data can be modified in loops and thus be dependent on
+ itself, we need a bit involved mechanism to handle it. First create
+ a copy of the uniformity cache, then assume the alloca itself is uniform,
+ then check if all the stores to the alloca contain uniform data. If
+ our initial assumption was wrong, restore the cache from the backup.
+
+ Note that the backup is a copy of the whole cache (all functions), not
+ only of the index of f.
+ */
+ UniformityCache backupCache(uniformityCache_);
+ // Optimistic assumption; revised after the user scan below.
+ setUniform(f, v);
+
+ bool isUniformAlloca = true;
+ llvm::Instruction *instruction = dyn_cast<llvm::AllocaInst>(v);
+ for (Instruction::use_iterator ui = instruction->use_begin(),
+ ue = instruction->use_end();
+ ui != ue; ++ui) {
+ Instruction *user;
+ // Users that are not instructions are ignored.
+ if ((user = dyn_cast<Instruction> (*ui)) == NULL) continue;
+
+ llvm::StoreInst *store = dyn_cast<llvm::StoreInst>(user);
+ if (store) {
+ // NOTE(review): only the stored value is inspected. If the alloca
+ // itself is the value operand (its address being stored somewhere
+ // else), the optimistic cache entry set above makes this check
+ // pass -- confirm that this is intended.
+ if (!isUniform(f, store->getValueOperand())) {
+ isUniformAlloca = false;
+ break;
+ }
+ } else if (dyn_cast<llvm::LoadInst>(user) != NULL) {
+ // Loads from the alloca do not affect its uniformity.
+ } else {
+#ifdef DEBUG_UNIFORMITY_ANALYSIS
+ std::cerr << "### alloca has a suspicious user" << std::endl;
+ user->dump();
+#endif
+ isUniformAlloca = false;
+ break;
+ }
+ }
+
+ if (!isUniformAlloca) {
+ // restore the old uniform data as our guess was wrong
+ uniformityCache_ = backupCache;
+ }
+ // Record the final verdict (after a possible restore of the backup).
+ setUniform(f, v, isUniformAlloca);
+
+ return isUniformAlloca;
+ }
+
+ /* TODO: global memory loads are uniform in case they are accessing
+ the higher scope ids (group_id_?). */
+ // Loads whose pointer operand is one of the module globals named below
+ // are treated as uniform. Any other load continues to the generic
+ // operand scan at the end of the function.
+ if (isa<llvm::LoadInst>(v)) {
+ llvm::LoadInst *load = dyn_cast<llvm::LoadInst>(v);
+ llvm::Value *pointer = load->getPointerOperand();
+ llvm::Module *M = load->getParent()->getParent()->getParent();
+
+ if (pointer == M->getGlobalVariable("_group_id_x") ||
+ pointer == M->getGlobalVariable("_group_id_y") ||
+ pointer == M->getGlobalVariable("_group_id_z") ||
+ pointer == M->getGlobalVariable("_work_dim") ||
+ pointer == M->getGlobalVariable("_num_groups_x") ||
+ pointer == M->getGlobalVariable("_num_groups_y") ||
+ pointer == M->getGlobalVariable("_num_groups_z") ||
+ pointer == M->getGlobalVariable("_global_offset_x") ||
+ pointer == M->getGlobalVariable("_global_offset_y") ||
+ pointer == M->getGlobalVariable("_global_offset_z") ||
+ pointer == M->getGlobalVariable("_local_size_x") ||
+ pointer == M->getGlobalVariable("_local_size_y") ||
+ pointer == M->getGlobalVariable("_local_size_z")) {
+
+ setUniform(f, v, true);
+ return true;
+ }
+ }
+
+ if (isa<llvm::PHINode>(v)) {
+ /* TODO: PHINodes need control flow analysis:
+ even if the values are uniform, the selected
+ value depends on the preceding basic block which
+ might depend on the ID. Assume they are not uniform
+ for now in general and treat the loop iteration
+ variable as a special case (set externally, e.g. by
+ runOnFunction() through setUniform()).
+
+ TODO: PHINodes can depend (indirectly or directly) on itself in loops
+ so it would need infinite recursion checking.
+ */
+ setUniform(f, v, false);
+ return false;
+ }
+
+ // Anything that is not an instruction at this point (for example a
+ // constant other than ConstantInt) is conservatively treated as varying.
+ llvm::Instruction *instr = dyn_cast<llvm::Instruction>(v);
+ if (instr == NULL) {
+ setUniform(f, v, false);
+ return false;
+ }
+ // not computed previously, scan all operands of the instruction
+ // and figure out their uniformity recursively
+ for (unsigned opr = 0; opr < instr->getNumOperands(); ++opr) {
+ llvm::Value *operand = instr->getOperand(opr);
+ if (!isUniform(f, operand)) {
+ setUniform(f, v, false);
+ return false;
+ }
+ }
+ // All operands are uniform, hence so is the instruction.
+ setUniform(f, v, true);
+ return true;
+}
+
+/// Records the uniformity verdict for \p v in the cache of function \p f,
+/// overwriting any earlier verdict.
+///
+/// \param f the function owning the value.
+/// \param v the value (instruction, argument, basic block, ...) to mark.
+/// \param isUniform true for uniform, false for varying.
+void
+VariableUniformityAnalysis::setUniform(llvm::Function *f,
+                                       llvm::Value *v,
+                                       bool isUniform) {
+  uniformityCache_[f][v] = isUniform;
+
+#ifdef DEBUG_UNIFORMITY_ANALYSIS
+  // Trace every verdict; basic blocks are printed by name only.
+  std::cerr << "### " << (isUniform ? "uniform " : "varying ");
+  if (!isa<llvm::BasicBlock>(v)) {
+    v->dump();
+  } else {
+    std::cerr << "BB: " << v->getName().str() << std::endl;
+  }
+#endif
+}
+
+}
diff --git a/src/llvmopencl/VariableUniformityAnalysis.h b/src/llvmopencl/VariableUniformityAnalysis.h
new file mode 100644
index 0000000..88175a8
--- /dev/null
+++ b/src/llvmopencl/VariableUniformityAnalysis.h
@@ -0,0 +1,70 @@
+// Header for VariableUniformityAnalysis function pass.
+//
+// Copyright (c) 2013 Pekka Jääskeläinen / Tampere University of Technology
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef POCL_VARIABLE_UNIFORMITY_ANALYSIS_H
+#define POCL_VARIABLE_UNIFORMITY_ANALYSIS_H
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Function.h"
+#else
+#include "llvm/IR/Function.h"
+#endif
+
+#include "llvm/Pass.h"
+
+// The uniformity cache below is built from std::map; include it directly
+// so that this header does not depend on what the LLVM headers pull in.
+#include <map>
+
+namespace pocl {
+  /**
+   * Analyses the variables in the function to figure out if a variable
+   * value is
+   *
+   * a) 'uniform', i.e., always same for all work-items in the *same work-group*
+   * b) 'varying', i.e., somehow dependent on the work-item id
+   *
+   * For safety, 'varying' is assumed, unless certain of a).
+   */
+  class VariableUniformityAnalysis : public llvm::FunctionPass {
+  public:
+    /// Pass identifier member handed to the FunctionPass base class.
+    static char ID;
+
+    VariableUniformityAnalysis();
+
+    /// Declares the analyses this pass requires and preserves.
+    virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+
+    /// Pass entry point, called once per function.
+    virtual bool runOnFunction(llvm::Function &F);
+
+    /// Returns true if value \p v of function \p f is considered uniform.
+    virtual bool isUniform(llvm::Function *f, llvm::Value* v);
+
+    /// Stores a uniformity verdict for \p v; marks it uniform by default.
+    virtual void setUniform(llvm::Function *f, llvm::Value *v, bool isUniform=true);
+
+    /// Classifies basic block \p bb as uniform or divergent, given the
+    /// previously seen uniform block \p previousUniformBB.
+    virtual void analyzeBBDivergence(llvm::Function *f,
+                                     llvm::BasicBlock *bb,
+                                     llvm::BasicBlock *previousUniformBB);
+
+  private:
+
+    /// True if a verdict for \p val has already been stored for \p f.
+    bool isUniformityAnalyzed(llvm::Function *f, llvm::Value *val) const;
+
+    /// Maps a value to its verdict: true = uniform, false = varying.
+    typedef std::map<llvm::Value*, bool> UniformityIndex;
+    /// One UniformityIndex per analysed function.
+    typedef std::map<llvm::Function *, UniformityIndex> UniformityCache;
+    /// Declared mutable so that const members can look up an index in it.
+    mutable UniformityCache uniformityCache_;
+
+  };
+}
+
+#endif
diff --git a/src/llvmopencl/WIVectorize.cc b/src/llvmopencl/WIVectorize.cc
new file mode 100644
index 0000000..e234392
--- /dev/null
+++ b/src/llvmopencl/WIVectorize.cc
@@ -0,0 +1,3252 @@
+//===- WIVectorize.cpp - A Work Item Vectorizer -------------------------===//
+//
+// This code has been adapted from BBVectorize of the LLVM project.
+// The original file comment:
+//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// This file implements a basic-block vectorization pass. The algorithm was
+// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral,
+// et al. It works by looking for chains of pairable operations and then
+// pairing them.
+//
+//===----------------------------------------------------------------------===//
+//
+// WIVectorize:
+//
+// Additional options are provided to vectorize only candidates from different
+// work items according to metadata provided by 'pocl' frontend
+// (launchpad.net/pocl).
+//
+// Additional option is also available to vectorize loads and stores only.
+// Still work in progress by vladimir guzma [at] tut fi.
+//
+//===----------------------------------------------------------------------===//
+
+#define WIV_NAME "wi-vectorize"
+#define DEBUG_TYPE WIV_NAME
+#include "config.h"
+#ifdef LLVM_3_1
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/TypeBuilder.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Type.h"
+#include "llvm/Metadata.h"
+#elif defined LLVM_3_2
+#include "llvm/IRBuilder.h"
+#include "llvm/TypeBuilder.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Type.h"
+#include "llvm/Metadata.h"
+#include "llvm/TargetTransformInfo.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/TypeBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#endif
+#include "llvm/Pass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Transforms/Vectorize.h"
+#include <algorithm>
+#include <map>
+#include <iostream>
+using namespace llvm;
+
+// Command line switches of the wi-vectorize pass. All of them are hidden
+// options; the option names are part of the external interface.
+
+// When set (the default), no TargetTransformInfo is looked up by the pass.
+static cl::opt<bool>
+IgnoreTargetInfo("wi-vectorize-ignore-target-info", cl::init(true),
+  cl::Hidden, cl::desc("Ignore target information"));
+
+static cl::opt<unsigned>
+ReqChainDepth("wi-vectorize-req-chain-depth", cl::init(3), cl::Hidden,
+  cl::desc("The required chain depth for vectorization"));
+
+static cl::opt<unsigned>
+VectorWidth("wi-vectorize-vector-width", cl::init(8), cl::Hidden,
+  cl::desc("The width of the machine vector in words."));
+
+static cl::opt<bool>
+NoMath("wi-vectorize-no-math", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize floating-point math intrinsics"));
+
+static cl::opt<bool>
+NoFMA("wi-vectorize-no-fma", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
+
+// NoMemOps and MemOpsOnly are temporarily overridden by runOnBasicBlock()
+// while it runs its two vectorization rounds.
+static cl::opt<bool>
+NoMemOps("wi-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize loads and stores"));
+
+static cl::opt<bool>
+AlignedOnly("wi-vectorize-aligned-only", cl::init(false), cl::Hidden,
+  cl::desc("Only generate aligned loads and stores"));
+
+static cl::opt<bool>
+MemOpsOnly("wi-vectorize-mem-ops-only", cl::init(false), cl::Hidden,
+  cl::desc("Try to vectorize loads and stores only"));
+
+static cl::opt<bool>
+NoFP("wi-vectorize-no-fp", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize floating-point operations"));
+
+static cl::opt<bool>
+NoCMP("wi-vectorize-no-cmp", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize comparison operations"));
+
+// Help text fixed: it used to read "based no loop counter".
+static cl::opt<bool>
+NoCount("wi-vectorize-no-counters", cl::init(false), cl::Hidden,
+  cl::desc("Forbid vectorization based on loop counter "
+           "arithmetic"));
+
+// Help text fixed: the LLVM instruction is called getelementptr.
+static cl::opt<bool>
+NoGEP("wi-vectorize-no-GEP", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize getelementptr operations"));
+
+// Extra tracing switches; they only exist when NDEBUG is not defined. Each
+// one is a hidden boolean option that defaults to false and selects one
+// phase of the pass for debug output.
+#ifndef NDEBUG
+static cl::opt<bool>
+DebugInstructionExamination("wi-vectorize-debug-instruction-examination",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, output information on the"
+ " instruction-examination process"));
+static cl::opt<bool>
+DebugCandidateSelection("wi-vectorize-debug-candidate-selection",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, output information on the"
+ " candidate-selection process"));
+static cl::opt<bool>
+DebugPairSelection("wi-vectorize-debug-pair-selection",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, output information on the"
+ " pair-selection process"));
+static cl::opt<bool>
+DebugCycleCheck("wi-vectorize-debug-cycle-check",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, output information on the"
+ " cycle-checking process"));
+#endif
+
+// Statistic counter named NumFusedOps, declared with the description below.
+STATISTIC(NumFusedOps, "Number of operations fused by wi-vectorize");
+
+// Forward declaration of the factory function for this pass; its
+// definition is not in this part of the file.
+namespace llvm {
+ FunctionPass* createWIVectorizePass();
+}
+namespace {
+ struct WIVectorize : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ WIVectorize() : FunctionPass(ID) {}
+
+ // Shorthand types used by the member functions declared below.
+ // A pair of values.
+ typedef std::pair<Value *, Value *> ValuePair;
+ // A value pair together with a size_t (a depth, going by the name).
+ typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
+ typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
+ // Iterator pairs over the Value* and ValuePair multimaps.
+ typedef std::pair<std::multimap<Value *, Value *>::iterator,
+ std::multimap<Value *, Value *>::iterator> VPIteratorPair;
+ typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator,
+ std::multimap<ValuePair, ValuePair>::iterator>
+ VPPIteratorPair;
+ typedef std::vector<Value *> ValueVector;
+ typedef DenseMap<Value*, ValueVector*> ValueVectorMap;
+
+ // Analysis handles; all of them are (re)assigned at the start of
+ // runOnFunction(). AA and SE are required analyses.
+ AliasAnalysis *AA;
+ ScalarEvolution *SE;
+ // TD is fetched with getAnalysisIfAvailable() and may therefore be null.
+ // TTI/VTTI do not exist for LLVM 3.1 and are set to 0 when the
+ // wi-vectorize-ignore-target-info option is on.
+#ifdef LLVM_3_1
+ TargetData *TD;
+#elif defined LLVM_3_2
+ DataLayout *TD;
+ TargetTransformInfo *TTI;
+ const VectorTargetTransformInfo *VTTI;
+#else
+ DataLayout *TD;
+ TargetTransformInfo *TTI;
+ const TargetTransformInfo *VTTI;
+#endif
+ // NOTE(review): the three containers below are not used in the part of
+ // the file visible here; document them where they are filled in.
+ DenseMap<Value*, Value*> storedSources;
+ DenseMap<std::pair<int,int>, ValueVector*> stridedOps;
+ std::multimap<Value*, Value*> flippedStoredSources;
+ // FIXME: const correct?
+
+ bool vectorizePairs(BasicBlock &BB);
+
+ bool vectorizePhiNodes(BasicBlock &BB);
+
+ bool vectorizeAllocas(BasicBlock& BB);
+
+ void replaceUses(BasicBlock& BB,
+ AllocaInst& oldAlloca,
+ AllocaInst& newAlloca,
+ int indx);
+
+ Type* newAllocaType(Type* start, unsigned int width);
+
+ bool removeDuplicates(BasicBlock &BB);
+
+ void dropUnused(BasicBlock& BB);
+
+ bool getCandidatePairs(BasicBlock &BB,
+ BasicBlock::iterator &Start,
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts);
+
+ bool getCandidateAllocas(BasicBlock &BB,
+ std::multimap<int, ValueVector *>& candidateAllocas);
+
+ void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs);
+
+ void buildDepMap(BasicBlock &BB,
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &PairableInstUsers);
+
+ void choosePairs(std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<Value *, Value *>& ChosenPairs);
+
+ void fuseChosenPairs(BasicBlock &BB,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<Value *, Value *>& ChosenPairs);
+
+ bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
+
+ bool areInstsCompatible(Instruction *I, Instruction *J,
+ bool IsSimpleLoadStore);
+
+ bool areInstsCompatibleFromDifferentWi(Instruction *I, Instruction *J);
+
+ bool trackUsesOfI(DenseSet<Value *> &Users,
+ AliasSetTracker &WriteSet, Instruction *I,
+ Instruction *J, bool UpdateUsers = true,
+ std::multimap<Value *, Value *> *LoadMoveSet = 0);
+
+ void computePairsConnectedTo(
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ ValuePair P);
+
+ bool pairsConflict(ValuePair P, ValuePair Q,
+ DenseSet<ValuePair> &PairableInstUsers,
+ std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0);
+
+ bool pairWillFormCycle(ValuePair P,
+ std::multimap<ValuePair, ValuePair> &PairableInstUsers,
+ DenseSet<ValuePair> &CurrentPairs);
+
+ void pruneTreeFor(
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<ValuePair, size_t> &Tree,
+ DenseSet<ValuePair> &PrunedTree, ValuePair J,
+ bool UseCycleCheck);
+
+ void buildInitialTreeFor(
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<ValuePair, size_t> &Tree, ValuePair J);
+
+ void findBestTreeFor(
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
+ size_t &BestEffSize, VPIteratorPair ChoiceRange,
+ bool UseCycleCheck);
+
+ Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
+ Instruction *J, unsigned o, bool FlipMemInputs);
+
+ void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
+ unsigned NumElem, unsigned MaskOffset, unsigned NumInElem,
+ unsigned IdxOffset, std::vector<Constant*> &Mask);
+
+ Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
+ Instruction *J);
+
+ Value *getReplacementInput(LLVMContext& Context, Instruction *I,
+ Instruction *J, unsigned o, bool FlipMemInputs);
+
+ Value* CommonShuffleSource(Instruction *I, Instruction *J);
+
+ void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
+ Instruction *J, SmallVector<Value *, 3> &ReplacedOperands,
+ bool FlipMemInputs);
+
+ void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
+ Instruction *J, Instruction *K,
+ Instruction *&InsertionPt, Instruction *&K1,
+ Instruction *&K2, bool FlipMemInputs);
+
+ void collectPairLoadMoveSet(BasicBlock &BB,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ std::multimap<Value *, Value *> &LoadMoveSet,
+ Instruction *I);
+
+ void collectLoadMoveSet(BasicBlock &BB,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ std::multimap<Value *, Value *> &LoadMoveSet);
+
+ void moveUsesOfIAfterJ(BasicBlock &BB,
+ std::multimap<Value *, Value *> &LoadMoveSet,
+ Instruction *&InsertionPt,
+ Instruction *I, Instruction *J);
+
+ void collectPtrInfo(std::vector<Value *> &PairableInsts,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseSet<Value *> &LowPtrInsts);
+
+ // Module set-up hook. The pass has nothing to prepare, so the module
+ // argument is ignored and false is returned.
+ bool doInitialization(Module& /*unused*/)
+ {
+   return false;
+ }
+ // Module tear-down hook. Nothing to clean up here: the module argument
+ // is ignored and false is returned.
+ bool doFinalization(Module& /*unused*/)
+ {
+   return false;
+ }
+ // Pass entry point: refreshes the analysis handles and then runs the
+ // vectorizer on every basic block of Func.
+ // Returns true if any basic block was changed.
+ virtual bool runOnFunction(Function &Func) {
+   // Required analyses.
+   AA = &getAnalysis<AliasAnalysis>();
+   SE = &getAnalysis<ScalarEvolution>();
+
+   // Optional, LLVM version dependent target information.
+#ifdef LLVM_3_1
+   TD = getAnalysisIfAvailable<TargetData>();
+#else
+   TD = getAnalysisIfAvailable<DataLayout>();
+   if (IgnoreTargetInfo)
+     TTI = 0;
+   else
+     TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+#ifdef LLVM_3_2
+   VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0;
+#else
+   VTTI = TTI;
+#endif
+#endif
+
+   // Every block is processed, even after an earlier one reported a change.
+   bool anyChange = false;
+   for (Function::iterator BBI = Func.begin(); BBI != Func.end(); ++BBI) {
+     if (runOnBasicBlock(*BBI))
+       anyChange = true;
+   }
+   return anyChange;
+ }
+
+ // Vectorizes one basic block: allocas first, then one or two rounds of
+ // pair fusing (see the cases below), and finally PHI node vectorization
+ // and duplicate removal if anything changed.
+ // The MemOpsOnly and NoMemOps options are overridden temporarily and
+ // restored before returning. Returns true if BB was changed.
+ virtual bool runOnBasicBlock(BasicBlock &BB) {
+
+ bool changed = false;
+
+ // First try to create vectors of all allocas, if there are any
+ changed |= vectorizeAllocas(BB);
+ // The fusing loops below run at most once per power of two up to
+ // VectorWidth (v = 2, 4, ...) and stop as soon as a round fuses
+ // nothing.
+ bool vectorizeTwice = false;
+
+
+ // There are 3 possible cases of vectorization in regards to memory
+ // operations:
+ // 1: Explicitly forbid vectorization of mem ops (NoMemOps)
+ // 2: Allow only vectorization of mem ops (MemOpsOnly)
+ // 3: Vectorize mem ops as well as everything else
+ // In cases 1 and 2, the following test makes sure vectorization is
+ // run only once.
+ // For case 3, we first run vectorization of memory operations only
+ // and then we run vectorization of everything else. In between
+ // we remove unused operations, which are typically memory
+ // access computations that are not needed anymore and their vectorization
+ // is a waste of resources. Instruction combiner is not able to get rid
+ // of those on its own once they are in vectors.
+
+ // Store original values of two variables. They can be changed below
+ // but have to be restored before calling this for next BB.
+ bool originalMemOpsOnly = MemOpsOnly;
+ bool originalNoMemOps = NoMemOps;
+ if (!MemOpsOnly && !NoMemOps) {
+ MemOpsOnly = true;
+ vectorizeTwice = true;
+ }
+#if 0
+#ifdef LLVM_3_3
+ if (TTI) {
+ std::cerr << " settign new vector width" << std::endl;
+ unsigned WidestRegister = TTI->getRegisterBitWidth(true);
+ VectorWidth = WidestRegister/32;
+ std::cerr << VectorWidth << std::endl;
+ }
+#endif
+#endif
+
+ // First round. In case 3 this is the memory-operations-only round; in
+ // cases 1 and 2 it is the only round and runs with the user's settings.
+ for (unsigned v = 2, n = 1; v <= VectorWidth;
+ v *= 2, ++n) {
+ DEBUG(dbgs() << "WIV: fusing memm only in loop #" << n <<
+ " for " << BB.getName() << " in " <<
+ BB.getParent()->getName() << "...\n");
+ if (vectorizePairs(BB)) {
+ dropUnused(BB);
+ changed = true;
+ }
+ else
+ break;
+ }
+ // Second round (case 3 only): everything except memory operations.
+ if (vectorizeTwice) {
+ MemOpsOnly = false;
+ NoMemOps = true;
+ for (unsigned v = 2, n = 1; v <= VectorWidth;
+ v *= 2, ++n) {
+ DEBUG(dbgs() << "WIV: fusing loop #" << n <<
+ " for " << BB.getName() << " in " <<
+ BB.getParent()->getName() << "...\n");
+ if (vectorizePairs(BB)) {
+ dropUnused(BB);
+ changed = true;
+ }
+ else
+ break;
+ }
+ }
+
+ // Post-processing; note that a change made only by vectorizeAllocas()
+ // also triggers it.
+ if (changed) {
+ vectorizePhiNodes(BB);
+ removeDuplicates(BB);
+ }
+
+ DEBUG(dbgs() << "WIV: done!\n");
+ // Put the global options back for the next basic block.
+ MemOpsOnly = originalMemOpsOnly;
+ NoMemOps = originalNoMemOps;
+ return changed;
+ }
+
+ // Announces the pass dependencies: alias analysis and scalar evolution
+ // are both required and preserved, and the CFG is left untouched.
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+   FunctionPass::getAnalysisUsage(AU);
+
+   AU.addRequired<AliasAnalysis>();
+   AU.addPreserved<AliasAnalysis>();
+
+   AU.addRequired<ScalarEvolution>();
+   AU.addPreserved<ScalarEvolution>();
+
+   AU.setPreservesCFG();
+ }
+ // Returns the vector type obtained by widening ElemTy by the configured
+ // VectorWidth: a scalar type becomes a vector of VectorWidth elements, and
+ // a vector type of N elements becomes a vector of N * VectorWidth elements
+ // of the same scalar type.
+ // (The former trailing "return VectorType::get(ElemTy, 2)" could never be
+ // reached because both branches returned, so it has been removed.)
+ static inline VectorType *getVecTypeForVector(Type *ElemTy) {
+   unsigned numElem = VectorWidth;
+   if (VectorType *VTy = dyn_cast<VectorType>(ElemTy))
+     numElem = VTy->getNumElements() * VectorWidth;
+
+   return VectorType::get(ElemTy->getScalarType(), numElem);
+ }
+ // Returns the vector type able to hold both operands of a pair side by
+ // side. Each operand contributes its element count (1 for a scalar), so
+ // the result has the sum of both counts and the shared scalar type.
+ // Both types must have the same scalar type.
+ static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) {
+   assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() &&
+          "Cannot form vector from incompatible scalar types");
+
+   VectorType *FirstVec = dyn_cast<VectorType>(ElemTy);
+   VectorType *SecondVec = dyn_cast<VectorType>(Elem2Ty);
+
+   unsigned FirstLanes = FirstVec ? FirstVec->getNumElements() : 1;
+   unsigned SecondLanes = SecondVec ? SecondVec->getNumElements() : 1;
+
+   return VectorType::get(ElemTy->getScalarType(), FirstLanes + SecondLanes);
+ }
+
+ // Builds the name for a value that replaces (part of) instruction I:
+ // "<name of I>.v.i<o>" for inputs and "<name of I>.v.r<o>" for results,
+ // followed by ".<n>" when n is non-zero. Unnamed instructions yield an
+ // empty name.
+ std::string getReplacementName(Instruction *I, bool IsInput, unsigned o,
+                                unsigned n = 0) {
+   if (!I->hasName())
+     return "";
+
+   std::string Name = I->getName().str();
+   Name += IsInput ? ".v.i" : ".v.r";
+   Name += utostr(o);
+   if (n > 0) {
+     Name += ".";
+     Name += utostr(n);
+   }
+   return Name;
+ }
+
+ // Returns the weight that the provided value contributes to the length of
+ // a chain of candidate pairs (one weight per pair; both members of a pair
+ // are assumed to weigh the same). The chain length is compared against
+ // the chain-length threshold to decide whether a chain is worth
+ // vectorizing, and longer chains are preferred over shorter ones.
+ // Note: when this function returns 0, the resulting instructions are
+ // not actually fused.
+ //
+ // Weights:
+ //   - InsertElement / ExtractElement: 0. They cannot be usefully fused,
+ //     and because the pass itself generates many of them they would
+ //     distort the metric in later iterations. A zero weight lets the pass
+ //     ignore them while still following dependency chains through them.
+ //   - Load / Store: ReqChainDepth (the full value), so that load/store
+ //     pairs will vectorize.
+ //   - Everything else: 1.
+ static inline size_t getDepthFactor(Value *V) {
+   const bool IsElementMove =
+     isa<InsertElementInst>(V) || isa<ExtractElementInst>(V);
+   if (IsElementMove)
+     return 0;
+
+   const bool IsMemoryOp = isa<LoadInst>(V) || isa<StoreInst>(V);
+   if (!IsMemoryOp)
+     return 1;
+
+   return ReqChainDepth;
+ }
+ // Returns the cost of an instruction with the given opcode, queried from
+ // VTTI with the types T1 and T2. Loads and stores are not handled here.
+ // When LLVM_3_1 is defined no cost model is consulted and every
+ // instruction costs 1. Opcodes without a dedicated case also cost 1.
+ unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) {
+#ifdef LLVM_3_1
+   return 1;
+#else
+   switch (Opcode) {
+   // Zero-cost opcodes. Scalar GEPs are usually lowered to the instruction
+   // addressing mode, and at the moment no vector GEPs are generated.
+   case Instruction::GetElementPtr:
+   case Instruction::PHI:
+     return 0;
+
+   // Control flow.
+   case Instruction::Br:
+     return VTTI->getCFInstrCost(Opcode);
+
+   // Comparisons and selects.
+   case Instruction::ICmp:
+   case Instruction::FCmp:
+   case Instruction::Select:
+     return VTTI->getCmpSelInstrCost(Opcode, T1, T2);
+
+   // Arithmetic, shifts and bitwise logic.
+   case Instruction::Add:
+   case Instruction::Sub:
+   case Instruction::Mul:
+   case Instruction::UDiv:
+   case Instruction::SDiv:
+   case Instruction::URem:
+   case Instruction::SRem:
+   case Instruction::FAdd:
+   case Instruction::FSub:
+   case Instruction::FMul:
+   case Instruction::FDiv:
+   case Instruction::FRem:
+   case Instruction::Shl:
+   case Instruction::LShr:
+   case Instruction::AShr:
+   case Instruction::And:
+   case Instruction::Or:
+   case Instruction::Xor:
+     return VTTI->getArithmeticInstrCost(Opcode, T1);
+
+   // Conversions; shuffles are costed through the same query.
+   case Instruction::Trunc:
+   case Instruction::ZExt:
+   case Instruction::SExt:
+   case Instruction::FPTrunc:
+   case Instruction::FPExt:
+   case Instruction::FPToUI:
+   case Instruction::FPToSI:
+   case Instruction::UIToFP:
+   case Instruction::SIToFP:
+   case Instruction::PtrToInt:
+   case Instruction::IntToPtr:
+   case Instruction::BitCast:
+   case Instruction::ShuffleVector:
+     return VTTI->getCastInstrCost(Opcode, T1, T2);
+
+   default:
+     return 1;
+   }
+#endif
+ }
+ // Determines the relative offset of two loads, two stores or two GEPs,
+ // returning true if the offset could be determined to be a usable
+ // constant. This function assumes that I and J are instructions of the
+ // same kind and have the same type.
+ //
+ // Outputs:
+ //   IPtr / JPtr            - the compared values: the pointer operands for
+ //                            loads and stores, the first index operand for
+ //                            GEPs.
+ //   IAlignment / JAlignment, IAddressSpace / JAddressSpace
+ //                          - taken from the load/store; all set to 0 for
+ //                            GEPs.
+ //   OffsetInElmts          - for loads and stores, the distance from I to J
+ //                            in units of the pointee store size. If
+ //                            OffsetInElmts == 1 then J accesses the memory
+ //                            directly after I; if OffsetInElmts == -1 then
+ //                            I accesses the memory directly after J. For
+ //                            GEPs it is the raw constant difference of the
+ //                            two first indices.
+ //
+ // Return value:
+ //   - GEPs whose index type is not SCEVable: OffsetInElmts is forced to 2
+ //     and true is returned.
+ //   - GEPs with a constant index difference: true only if the absolute
+ //     difference is greater than 1.
+ //   - Loads/stores with a constant byte offset: true only if the offset is
+ //     a whole multiple of the pointee store size.
+ //   - Otherwise false.
+ bool getPairPtrInfo(Instruction *I, Instruction *J,
+     Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
+     unsigned &IAddressSpace, unsigned &JAddressSpace,
+     int64_t &OffsetInElmts) {
+   OffsetInElmts = 0;
+   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+     LoadInst *LJ = cast<LoadInst>(J);
+     IPtr = LI->getPointerOperand();
+     JPtr = LJ->getPointerOperand();
+     IAlignment = LI->getAlignment();
+     JAlignment = LJ->getAlignment();
+     IAddressSpace = LI->getPointerAddressSpace();
+     JAddressSpace = LJ->getPointerAddressSpace();
+   } else if (isa<GetElementPtrInst>(I)) {
+     // No alignment or address space is read from a GEP here. Give every
+     // out-parameter a defined value up front so that callers, which pass
+     // in uninitialized locals, never observe garbage on this path.
+     IAlignment = 0;
+     JAlignment = 0;
+     IAddressSpace = 0;
+     JAddressSpace = 0;
+     Instruction::op_iterator it = cast<GetElementPtrInst>(I)->idx_begin();
+     IPtr = *it;
+     Instruction::op_iterator jt = cast<GetElementPtrInst>(J)->idx_begin();
+     JPtr = *jt;
+     if (!IPtr || !JPtr)
+       return false;
+   } else {
+     StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J);
+     IPtr = SI->getPointerOperand();
+     JPtr = SJ->getPointerOperand();
+     IAlignment = SI->getAlignment();
+     JAlignment = SJ->getAlignment();
+     IAddressSpace = SI->getPointerAddressSpace();
+     JAddressSpace = SJ->getPointerAddressSpace();
+   }
+   if ((isa<GetElementPtrInst>(I) && !SE->isSCEVable(IPtr->getType()))
+       || (isa<GetElementPtrInst>(J) && !SE->isSCEVable(JPtr->getType()))) {
+     // Assume the getelementptr is already a vector one, so its index
+     // operand is a vector as well and scalar evolution cannot analyze it.
+     OffsetInElmts = 2;
+     return true;
+   }
+   const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
+   const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
+
+   // If this is a trivial offset, then we'll get something like
+   // 1*sizeof(type). With target data, which we need anyway, this will get
+   // constant folded into a number.
+   const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
+   if (const SCEVConstant *ConstOffSCEV =
+         dyn_cast<SCEVConstant>(OffsetSCEV)) {
+     ConstantInt *IntOff = ConstOffSCEV->getValue();
+     int64_t Offset = IntOff->getSExtValue();
+     if (isa<GetElementPtrInst>(I)) {
+       // For GEPs the indices themselves were compared, so the difference
+       // is reported as is.
+       OffsetInElmts = Offset;
+       return (abs64(Offset)) > 1;
+     }
+     Type *VTy = cast<PointerType>(IPtr->getType())->getElementType();
+     int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy);
+
+     // When the pointee types differ and J lies below I, measure the
+     // distance in units of J's pointee type instead.
+     Type *VTy2 = cast<PointerType>(JPtr->getType())->getElementType();
+     if (VTy != VTy2 && Offset < 0) {
+       int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2);
+       OffsetInElmts = Offset/VTy2TSS;
+       return (abs64(Offset) % VTy2TSS) == 0;
+     }
+     OffsetInElmts = Offset/VTyTSS;
+
+     return (abs64(Offset) % VTyTSS) == 0;
+   }
+   return false;
+ }
+
+ // Returns true if the provided CallInst calls an intrinsic that this pass
+ // is allowed to vectorize. Calls without a known callee and calls to
+ // non-intrinsic functions are rejected. The math intrinsics are accepted
+ // unless NoMath is set; fma is accepted unless NoFMA is set.
+ bool isVectorizableIntrinsic(CallInst* I) {
+   Function *Callee = I->getCalledFunction();
+   if (Callee == 0)
+     return false;
+
+   // An ID of zero (not an intrinsic) falls through to the default case.
+   const unsigned ID = Callee->getIntrinsicID();
+   switch (ID) {
+   case Intrinsic::fma:
+     return !NoFMA;
+
+   case Intrinsic::sqrt:
+   case Intrinsic::pow:
+   case Intrinsic::powi:
+   case Intrinsic::sin:
+   case Intrinsic::cos:
+   case Intrinsic::exp:
+   case Intrinsic::exp2:
+   case Intrinsic::log:
+   case Intrinsic::log2:
+   case Intrinsic::log10:
+     return !NoMath;
+
+   default:
+     return false;
+   }
+ }
+
+ // Scans the half-open multimap range [PairRange.first, PairRange.second)
+ // and returns true as soon as an entry whose mapped value equals J is
+ // found; returns false if the range holds no such entry.
+ template <typename V>
+ bool isSecondInIteratorPair(V J, std::pair<
+     typename std::multimap<V, V>::iterator,
+     typename std::multimap<V, V>::iterator> PairRange) {
+   typename std::multimap<V, V>::iterator Cur = PairRange.first;
+   while (Cur != PairRange.second) {
+     if (Cur->second == J)
+       return true;
+     ++Cur;
+   }
+   return false;
+ }
+ };
+ // In some cases, instructions did not get combined correctly by previous
+ // passes. For example, with a large number of replicated work items, a
+ // scalar load of a constant happened for the first work item and then the
+ // exact same load again in the 15th and 30th work item, while the work
+ // items in between reused the previous value.
+ // Also, vectorization leads to situations where a scalar value needs to be
+ // replicated to create a vector, but a separate vector was created each
+ // time the value was to be used.
+ // This function fixes that by searching the block, starting at its first
+ // insertion point, for instructions that are identical (as decided by
+ // Instruction::isIdenticalTo) to an earlier one, replacing all uses of the
+ // later instruction with the earlier one and erasing the later one.
+ // Allocas are never used as the surviving instruction.
+ // Returns true if at least one instruction was removed.
+ bool WIVectorize::removeDuplicates(BasicBlock &BB) {
+   bool changed = false;
+   BasicBlock::iterator End = BB.end();
+
+   for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != End; ++I) {
+     // The alloca test does not depend on the candidate duplicate, so it is
+     // done once per instruction instead of once per compared pair.
+     if (isa<AllocaInst>(I))
+       continue;
+
+     BasicBlock::iterator J = llvm::next(I);
+     while (J != End) {
+       if (!I->isIdenticalTo(J)) {
+         ++J;
+         continue;
+       }
+
+       // Remember the successor before J is erased.
+       BasicBlock::iterator K = llvm::next(J);
+       J->replaceAllUsesWith(I);
+       AA->replaceWithNewValue(J, I);
+       SE->forgetValue(J);
+       J->eraseFromParent();
+       J = K;
+       changed = true;
+     }
+   }
+
+   return changed;
+ }
+ // Replace phi nodes of individual variables with a phi node of the vector
+ // they originated from.
+ //
+ // The function works in two phases over the instructions in front of the
+ // first insertion point of BB:
+ //   1. For each phi node, collect the later phi nodes that match it: same
+ //      number of incoming values, same incoming blocks, and for every
+ //      incoming value either both map to the same entry in storedSources
+ //      or both are constants. A phi node becomes a vectorization root only
+ //      if exactly VectorWidth - 1 matching phi nodes were found.
+ //   2. For each root, create a vector phi node, extract the individual
+ //      elements after the first insertion point, redirect all uses of the
+ //      scalar phi nodes to those extracts and erase the scalar phi nodes.
+ //
+ // Always returns true.
+ // NOTE(review): candidate vectors allocated below are not freed in this
+ // function, including those that are never stored into valueMap.
+ bool WIVectorize::vectorizePhiNodes(BasicBlock &BB) {
+ BasicBlock::iterator Start = BB.begin();
+ BasicBlock::iterator End = BB.getFirstInsertionPt();
+
+ ValueVectorMap valueMap;
+ // Phase 1: find groups of matching phi nodes.
+ for (BasicBlock::iterator I = Start; I != End; ++I) {
+ PHINode* node = dyn_cast<PHINode>(I);
+ if (node) {
+ ValueVector* candidateVector = new ValueVector;
+ for (BasicBlock::iterator J = llvm::next(I);
+ J != End; ++J) {
+ PHINode* node2 = dyn_cast<PHINode>(J);
+ if (node2) {
+ bool match = true;
+ if (node->getNumIncomingValues() !=
+ node2->getNumIncomingValues())
+ continue;
+
+ for (unsigned int i = 0;
+ i < node->getNumIncomingValues(); i++) {
+ Value* v1 = node->getIncomingValue(i);
+ Value* v2 = node2->getIncomingValue(i);
+ if (node->getIncomingBlock(i) !=
+ node2->getIncomingBlock(i)) {
+ match = false;
+ }
+ // Stored sources contain the original value from
+ // which the one in the phi node was extracted.
+ DenseMap<Value*, Value*>::iterator vi =
+ storedSources.find(v1);
+ if (vi != storedSources.end()) {
+ DenseMap<Value*, Value*>::iterator ji =
+ storedSources.find(v2);
+ if (ji != storedSources.end() &&
+ (*vi).second == (*ji).second) {
+ } else {
+ match = false;
+ }
+ } else {
+ // The incoming value can also be a constant. Note that
+ // only "both are constants" is checked here; the
+ // comparison of the constant values is commented out.
+ Constant* const1 = dyn_cast<Constant>(v1);
+ Constant* const2 = dyn_cast<Constant>(v2);
+ if (!(const1 && const2)) /* &&
+ const1->getValue() == const2->getValue())) */{
+ match = false;
+ }
+ }
+ }
+ if (match)
+ candidateVector->push_back(node2);
+ }
+ }
+ // Only a complete group (root plus VectorWidth - 1 candidates) is kept.
+ if (candidateVector->size() == VectorWidth -1) {
+ Value* newV = cast<Value>(node);
+ valueMap[newV] = candidateVector;
+ }
+ }
+ }
+ // Phase 2: actually create the new vector phi nodes.
+ for (DenseMap<Value*, ValueVector*>::iterator i =
+ valueMap.begin(); i != valueMap.end(); i++) {
+ ValueVector& v = *(*i).second;
+ PHINode* orig = cast<PHINode>((*i).first);
+ Type *IType = orig->getType();
+ Type *VType = getVecTypeForVector(IType);
+ // The vector phi node is inserted directly before the original one.
+ PHINode* phi = PHINode::Create(VType, orig->getNumIncomingValues(),
+ getReplacementName(orig, false,0), orig);
+ // Add incoming pairs to the phi node. An incoming value found in
+ // storedSources is replaced by its stored source; any other incoming
+ // value is cast to a constant and splatted to VectorWidth elements.
+ // Note: the loop variable i and the local BB below shadow the outer
+ // iterator i and the function parameter BB.
+ for (unsigned int i = 0; i < orig->getNumIncomingValues(); i++) {
+ Value* inc = orig->getIncomingValue(i);
+ BasicBlock* BB = orig->getIncomingBlock(i);
+ DenseMap<Value*, Value*>::iterator iter =
+ storedSources.find(inc);
+ if (iter != storedSources.end()) {
+ phi->addIncoming((*iter).second, BB);
+ } else {
+ Constant* origConst = cast<Constant>(inc);
+ Constant* cons = ConstantVector::getSplat(
+ VectorWidth, origConst);
+ phi->addIncoming(cons, BB);
+ }
+ }
+ // Extract scalar values from the phi node to be used in the body
+ // of the basic block. Replacing their uses causes the instruction
+ // combiner to find extractelement -> insertelement pairs and drop
+ // them, leaving direct use of the vector.
+ LLVMContext& Context = BB.getContext();
+ BasicBlock::iterator toFill = BB.getFirstInsertionPt();
+ int index = 0;
+
+ // Find from the user of the original phi node in which position it
+ // is inserted to the vector before being used by a vector instruction.
+ // We have to extract it from the same position of the vector phi node.
+ // If no insertelement user with a constant index exists, index keeps
+ // its initial value of 0.
+ Instruction::use_iterator useiter = orig->use_begin();
+ while (useiter != orig->use_end()) {
+ llvm::User* tmp = *useiter;
+ if (isa<InsertElementInst>(tmp)) {
+ Value* in = tmp->getOperand(2);
+ if (isa<ConstantInt>(in)) {
+ index =
+ cast<ConstantInt>(in)->getZExtValue();
+ break;
+ }
+ }
+ useiter++;
+ }
+
+ //}
+ Value *X = ConstantInt::get(Type::getInt32Ty(Context), index);
+ Instruction* other = ExtractElementInst::Create(phi, X,
+ getReplacementName(phi, false, 0));
+ // The extract is placed after the instruction at the first insertion
+ // point, and then takes over all uses of the original scalar phi node.
+ other->insertAfter(toFill);
+ orig->replaceAllUsesWith(other);
+ AA->replaceWithNewValue(orig, other);
+ SE->forgetValue(orig);
+ orig->eraseFromParent();
+ // ins tracks the most recently inserted extract so that the following
+ // extracts are chained after it.
+ Instruction* ins = other;
+ for (unsigned int i = 0; i < v.size(); i++) {
+ Instruction* tmp = cast<Instruction>(v[i]);
+ // Find from the user of the matching phi node in which position it
+ // is inserted to the vector before being used by a vector instruction.
+ // We have to extract it from the same position of the vector phi node.
+ // If no such user is found, index keeps the value left over from the
+ // previous lookup.
+ Instruction::use_iterator ui = tmp->use_begin();
+ while (ui != tmp->use_end()) {
+ llvm::User* user = *ui;
+ if (isa<InsertElementInst>(user)) {
+ Value* in = user->getOperand(2);
+ if (isa<ConstantInt>(in)) {
+ index =
+ cast<ConstantInt>(in)->getZExtValue();
+ break;
+ }
+ }
+ ui++;
+ }
+ X = ConstantInt::get(Type::getInt32Ty(Context), index);
+ Instruction* other = ExtractElementInst::Create(phi, X,
+ getReplacementName(phi, false, index));
+ other->insertAfter(ins);
+
+ tmp->replaceAllUsesWith(other);
+ AA->replaceWithNewValue(tmp, other);
+ SE->forgetValue(tmp);
+ tmp->eraseFromParent();
+ ins = other;
+ }
+
+ }
+ return true;
+ }
+ // Runs a single vectorization iteration over the provided basic block and
+ // returns true if the block was changed. The iteration consists of:
+ //   1. collecting candidate pairs and pairable instructions,
+ //   2. computing which candidate pairs are connected,
+ //   3. building the dependency map between pairable instructions,
+ //   4. choosing the pairs to fuse, and
+ //   5. fusing the chosen pairs.
+ // Nothing is modified when there are no pairable instructions or no pair
+ // gets chosen.
+ bool WIVectorize::vectorizePairs(BasicBlock &BB) {
+   BasicBlock::iterator Start = BB.getFirstInsertionPt();
+
+   // Step 1: candidates.
+   std::multimap<Value *, Value *> CandidatePairs;
+   std::vector<Value *> PairableInsts;
+   getCandidatePairs(BB, Start, CandidatePairs, PairableInsts);
+   if (PairableInsts.empty())
+     return false;
+
+   // Step 2: a good pairing is one such that the users of the pair are
+   // also paired. This defines a (directed) forest over the pairs such
+   // that two pairs are connected iff the second pair uses the first.
+   // It only matters that both members of the second pair use some element
+   // of the first pair (to allow for splatting).
+   std::multimap<ValuePair, ValuePair> ConnectedPairs;
+   computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs);
+
+   // Step 3: the pairable-instruction dependency map.
+   DenseSet<ValuePair> PairableInstUsers;
+   buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
+
+   // Step 4: for each variable, pick the pairing with the largest tree
+   // meeting the depth requirement on at least one branch, then select all
+   // pairings that are part of that tree and remove them from the list of
+   // available pairings and pairable variables.
+   DenseMap<Value *, Value *> ChosenPairs;
+   choosePairs(CandidatePairs, PairableInsts, ConnectedPairs,
+               PairableInstUsers, ChosenPairs);
+   if (ChosenPairs.empty())
+     return false;
+
+   // Copies of the selection that are handed to the fusing step.
+   std::vector<Value *> AllPairableInsts(PairableInsts.begin(),
+                                         PairableInsts.end());
+   DenseMap<Value *, Value *> AllChosenPairs;
+   AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
+
+   NumFusedOps += AllChosenPairs.size();
+
+   // Step 5: replace the paired instructions with vector instructions.
+   // Each operand must be replaced with a vector operand, formed by using
+   // build_vector on the old operands. The replaced values are then
+   // replaced with a vector_extract on the result. Subsequent optimization
+   // passes should coalesce the build/extract combinations.
+   fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs);
+
+   return true;
+ }
+
+ // This function returns true if the provided instruction is capable of being
+ // fused into a vector instruction. This determination is based only on the
+ // type and other attributes of the instruction.
+ //
+ // IsSimpleLoadStore is an output: it is reset to false on entry and set to
+ // the result of isSimple() when I is a load or a store (even if the
+ // function then returns false).
+ //
+ // Accepted instruction kinds are vectorizable intrinsic calls, simple
+ // loads and stores, non-pointer casts, single-index GEPs, compares and
+ // binary operators, each subject to the corresponding option flags
+ // (MemOpsOnly, NoMemOps, NoGEP, NoCMP, NoFP).
+ bool WIVectorize::isInstVectorizable(Instruction *I,
+ bool &IsSimpleLoadStore) {
+ IsSimpleLoadStore = false;
+
+ // With MemOpsOnly only loads, stores and GEPs are considered at all.
+ if (MemOpsOnly &&
+ !(isa<LoadInst>(I) || isa<StoreInst>(I) || isa<GetElementPtrInst>(I)))
+ return false;
+
+ // Per-kind checks. Any instruction kind not listed here is rejected by
+ // the final else branch unless it is a binary operator.
+ if (CallInst *C = dyn_cast<CallInst>(I)) {
+ if (!isVectorizableIntrinsic(C)) {
+ return false;
+
+ }
+ } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+ // Vectorize simple loads if possible:
+ IsSimpleLoadStore = L->isSimple();
+ if (!IsSimpleLoadStore || NoMemOps) {
+ return false;
+ }
+ } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
+ // Vectorize simple stores if possible:
+ IsSimpleLoadStore = S->isSimple();
+ if (!IsSimpleLoadStore || NoMemOps) {
+ return false;
+ }
+ } else if (CastInst *C = dyn_cast<CastInst>(I)) {
+ // We can vectorize casts, but not casts of pointer types, etc.
+
+ Type *SrcTy = C->getSrcTy();
+ if (!SrcTy->isSingleValueType() || SrcTy->isPointerTy()) {
+ return false;
+ }
+ Type *DestTy = C->getDestTy();
+ if (!DestTy->isSingleValueType() || DestTy->isPointerTy()) {
+ return false;
+ }
+ } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) {
+ // Currently, vector GEPs exist only with one index.
+ if (G->getNumIndices() != 1 || NoMemOps || NoGEP)
+ return false;
+ } else if (isa<CmpInst>(I)) {
+ if (NoCMP)
+ return false;
+ } else if (!(I->isBinaryOp())){ /*|| isa<ShuffleVectorInst>(I) ||
+ isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) {*/
+ return false;
+ }
+ // We can't vectorize memory operations without target data
+ if (TD == 0 && IsSimpleLoadStore)
+ return false;
+
+ // T1 is the type produced (or, for stores, stored) by the instruction;
+ // T2 is the source type for casts and equal to T1 otherwise.
+ Type *T1, *T2;
+ if (isa<StoreInst>(I)) {
+ // For stores, it is the value type, not the pointer type that matters
+ // because the value is what will come from a vector register.
+
+ Value *IVal = cast<StoreInst>(I)->getValueOperand();
+ T1 = IVal->getType();
+ } else {
+ T1 = I->getType();
+ }
+
+ if (I->isCast())
+ T2 = cast<CastInst>(I)->getSrcTy();
+ else
+ T2 = T1;
+
+ // Not every type can be vectorized...
+ if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
+ !(VectorType::isValidElementType(T2) || T2->isVectorTy())) {
+ return false;
+ }
+ // Reject types wider than (VectorWidth*32)/2 bits.
+ if ((T1->getPrimitiveSizeInBits() > (VectorWidth*32)/2 ||
+ T2->getPrimitiveSizeInBits() > (VectorWidth*32)/2)) {
+ return false;
+ }
+
+ // Floating point vectorization can be disabled
+ if (I->getType()->isFloatingPointTy() && NoFP)
+ return false;
+
+ // Do not vectorize pointer types. Currently do not work with LLVM 3.1.
+ if (!isa<GetElementPtrInst>(I) &&
+ (T1->getScalarType()->isPointerTy() ||
+ T2->getScalarType()->isPointerTy()))
+ return false;
+ // Check if the instruction can be a loop counter; we do not vectorize
+ // those since they have to be the same for all work items we are
+ // vectorizing and computations of load/store indexes usually depend on
+ // them. Instruction combiner pass will remove duplicates.
+ if (SE->isSCEVable(I->getType())) {
+ const SCEV* sc = SE->getSCEV(I);
+ // S is only used to test that the SCEV is an add-recurrence.
+ if (const SCEVAddRecExpr* S = dyn_cast<SCEVAddRecExpr>(sc)) {
+ if (I->hasNUses(2)) {
+ // Loop counter instruction is used in the comparison
+ // operation before branch and with the phi node.
+ // Any more uses indicates that the instruction is also
+ // used as part of some computation and possibly needs
+ // to get vectorized.
+ bool compare = false;
+ bool phi = false;
+ for (Value::use_iterator it = I->use_begin();
+ it != I->use_end();
+ it++) {
+ if (isa<CmpInst>(*it))
+ compare = true;
+ if (isa<PHINode>(*it))
+ phi = true;
+ }
+ if (compare && phi)
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+ // Returns true if the two provided instructions originate from different
+ // work items but from the same position in the original work-item code,
+ // as described by their "wi" and "wi_counter" metadata.
+ //
+ // The "wi" metadata node is expected to have three operands, the third of
+ // which (index 2) is a node with four operands whose operands 1..3 hold
+ // the X, Y and Z work-item coordinates. Operand 1 of the "wi_counter" node
+ // holds the counter that identifies the original instruction.
+ //
+ // Returns false when
+ //   - either instruction lacks "wi" or "wi_counter" metadata, or the
+ //     metadata does not have the expected node / integer operands,
+ //   - MemOpsOnly is set and the instructions are not both loads, both
+ //     stores or both GEPs,
+ //   - both instructions carry the same X/Y/Z triplet (same work item), or
+ //   - the counters differ (different line in the original work item).
+ bool WIVectorize::areInstsCompatibleFromDifferentWi(Instruction *I,
+                                                      Instruction *J) {
+
+   MDNode* mi = I->getMetadata("wi");
+   MDNode* mj = J->getMetadata("wi");
+   if (mi == NULL || mj == NULL) {
+     return false;
+   }
+   if (MemOpsOnly &&
+       !((isa<LoadInst>(I) && isa<LoadInst>(J)) ||
+         (isa<StoreInst>(I) && isa<StoreInst>(J)) ||
+         (isa<GetElementPtrInst>(I) && isa<GetElementPtrInst>(J)))) {
+     return false;
+   }
+   assert(mi->getNumOperands() == 3);
+   assert(mj->getNumOperands() == 3);
+
+   // Third operand of the MDNode (index 2) contains the MDNode with the
+   // XYZ triplet. Operands may be null or of another kind, so the casts
+   // are checked instead of being dereferenced blindly.
+   MDNode* iXYZ = dyn_cast_or_null<MDNode>(mi->getOperand(2));
+   MDNode* jXYZ = dyn_cast_or_null<MDNode>(mj->getOperand(2));
+   if (iXYZ == NULL || jXYZ == NULL) {
+     return false;
+   }
+   assert(iXYZ->getNumOperands() == 4);
+   assert(jXYZ->getNumOperands() == 4);
+
+   // Compare the X, Y and Z coordinates (operands 1, 2 and 3).
+   bool sameWorkItem = true;
+   for (unsigned dim = 1; dim <= 3; ++dim) {
+     ConstantInt *CIDim = dyn_cast_or_null<ConstantInt>(iXYZ->getOperand(dim));
+     ConstantInt *CJDim = dyn_cast_or_null<ConstantInt>(jXYZ->getOperand(dim));
+     if (CIDim == NULL || CJDim == NULL) {
+       return false;
+     }
+     if (CIDim->getValue() != CJDim->getValue()) {
+       sameWorkItem = false;
+     }
+   }
+   if (sameWorkItem) {
+     // Same work item, no vectorizing
+     return false;
+   }
+
+   // The counter metadata is attached separately and may be missing even
+   // when "wi" is present.
+   MDNode* ci = I->getMetadata("wi_counter");
+   MDNode* cj = J->getMetadata("wi_counter");
+   if (ci == NULL || cj == NULL ||
+       ci->getNumOperands() < 2 || cj->getNumOperands() < 2) {
+     return false;
+   }
+
+   ConstantInt *CI = dyn_cast_or_null<ConstantInt>(ci->getOperand(1));
+   ConstantInt *CJ = dyn_cast_or_null<ConstantInt>(cj->getOperand(1));
+   if (CI == NULL || CJ == NULL) {
+     return false;
+   }
+   if (CI->getValue() != CJ->getValue()) {
+     // different line in the original work item
+     // we do not want to vectorize operations that do not match
+     return false;
+   }
+   return true;
+ }
+ // Computes the two types that characterize instruction I.
+ //   T1: the type of the stored value for stores (the value is what will
+ //       come from a vector register, not the pointer); the instruction's
+ //       own type otherwise.
+ //   T2: the source type for casts, the condition type for selects, the
+ //       type of operand 0 for shuffles and compares, and T1 otherwise.
+ static inline void getInstructionTypes(Instruction *I,
+                                        Type *&T1, Type *&T2) {
+   if (StoreInst *Store = dyn_cast<StoreInst>(I))
+     T1 = Store->getValueOperand()->getType();
+   else
+     T1 = I->getType();
+
+   if (CastInst *Cast = dyn_cast<CastInst>(I)) {
+     T2 = Cast->getSrcTy();
+   } else if (SelectInst *Sel = dyn_cast<SelectInst>(I)) {
+     T2 = Sel->getCondition()->getType();
+   } else if (ShuffleVectorInst *Shuf = dyn_cast<ShuffleVectorInst>(I)) {
+     T2 = Shuf->getOperand(0)->getType();
+   } else if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
+     T2 = Cmp->getOperand(0)->getType();
+   } else {
+     T2 = T1;
+   }
+ }
+
+ // This function returns true if the two provided instructions are compatible
+ // (meaning that they can be fused into a vector instruction). This assumes
+ // that I has already been determined to be vectorizable and that J is not
+ // in the use tree of I.
+ //
+ // IsSimpleLoadStore tells whether I is a simple load or store; together
+ // with GEPs these take the pointer-offset path below. Shuffles are only
+ // compatible when both masks are constant. When LLVM_3_1 is not defined
+ // and VTTI is available, other instructions are additionally subject to
+ // a cost comparison. powi calls require equal second arguments.
+ bool WIVectorize::areInstsCompatible(Instruction *I, Instruction *J,
+ bool IsSimpleLoadStore) {
+ DEBUG( if (DebugInstructionExamination) dbgs() << "WIV: looking at " << *I <<
+ " <-> " << *J << "\n");
+
+ // Loads and stores can be merged if they have different alignments,
+ // but are otherwise the same.
+ // NOTE(review): LI, LJ, SI and SJ are declared but not used in this
+ // function.
+ LoadInst *LI, *LJ;
+ StoreInst *SI, *SJ;
+ if (!J->isSameOperationAs(I)) {
+ return false;
+ }
+ Type *IT1, *IT2, *JT1, *JT2;
+ getInstructionTypes(I, IT1, IT2);
+ getInstructionTypes(J, JT1, JT2);
+
+ if (IsSimpleLoadStore || isa<GetElementPtrInst>(I)) {
+ Value *IPtr, *JPtr;
+ unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
+ int64_t OffsetInElmts = 0;
+ bool foundPointer =
+ getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
+ IAddressSpace, JAddressSpace, OffsetInElmts);
+ // Adjacent accesses: candidates for a single vector load/store.
+ if ( foundPointer && abs64(OffsetInElmts) == 1) {
+ Type *aTypeI = isa<StoreInst>(I) ?
+ cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
+ Type *aTypeJ = isa<StoreInst>(J) ?
+ cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
+ Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
+ // An aligned load or store is possible only if the instruction
+ // with the lower offset has an alignment suitable for the
+ // vector type.
+
+ unsigned BottomAlignment = IAlignment;
+ if (OffsetInElmts < 0) BottomAlignment = JAlignment;
+
+ unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
+ if (AlignedOnly) {
+ if (BottomAlignment < VecAlignment) {
+ return false;
+ }
+ }
+#ifndef LLVM_3_1
+ // Reject the pair when the vector memory operation is costlier than
+ // the two scalar ones combined.
+ if (VTTI) {
+ unsigned ICost = VTTI->getMemoryOpCost(I->getOpcode(), I->getType(),
+ IAlignment, IAddressSpace);
+ unsigned JCost = VTTI->getMemoryOpCost(J->getOpcode(), J->getType(),
+ JAlignment, JAddressSpace);
+ unsigned VCost = VTTI->getMemoryOpCost(I->getOpcode(), VType,
+ BottomAlignment,
+ IAddressSpace);
+ if (VCost > ICost + JCost)
+ return false;
+
+ // We don't want to fuse to a type that will be split, even
+ // if the two input types will also be split and there is no other
+ // associated cost.
+ unsigned VParts = VTTI->getNumberOfParts(VType);
+ if (VParts > 1)
+ return false;
+ else if (!VParts && VCost == ICost + JCost)
+ return false;
+
+ }
+#endif
+ } else if(foundPointer && abs64(OffsetInElmts)>1){
+ // Strided accesses: GEPs are accepted, loads and stores are only
+ // recorded for analysis and then rejected.
+ if (isa<GetElementPtrInst>(I)) {
+ return true;
+ }
+ // Collect information on memory accesses with stride.
+ // This is not useful for anything, just to analyze code a bit.
+ if (I->getMetadata("wi") != NULL) {
+ MDNode* md = I->getMetadata("wi");
+ MDNode* mdCounter = I->getMetadata("wi_counter");
+ MDNode* mdRegion = dyn_cast<MDNode>(md->getOperand(1));
+
+ unsigned CI =
+ cast<ConstantInt>(mdCounter->getOperand(1))->getZExtValue();
+ unsigned RI =
+ cast<ConstantInt>(mdRegion->getOperand(1))->getZExtValue();
+ // stridedOps is keyed by the (region, counter) pair.
+ std::pair<int, int> index = std::pair<int,int>(RI,CI);
+ DenseMap<std::pair<int,int>, ValueVector*>::iterator it =
+ stridedOps.find(index);
+ ValueVector* v = NULL;
+ if (it != stridedOps.end()) {
+ v = (*it).second;
+ } else {
+ v = new ValueVector;
+ }
+ v->push_back(I);
+ v->push_back(J);
+ // When the key was found above, v is the vector that is already
+ // stored under it.
+ stridedOps.insert(
+ std::pair< std::pair<int, int>, ValueVector*>(index, v));
+ }
+ return false;
+ } else {
+ return false;
+ }
+ } else if (isa<ShuffleVectorInst>(I)) {
+ // Only merge two shuffles if they're both constant
+ return isa<Constant>(I->getOperand(2)) &&
+ isa<Constant>(J->getOperand(2));
+ // FIXME: We may want to vectorize non-constant shuffles also.
+#ifdef LLVM_3_1
+ }
+#else
+ } else if (VTTI) {
+ // Generic cost check for all remaining instruction kinds.
+ unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
+ unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
+ Type *VT1 = getVecTypeForPair(IT1, JT1),
+ *VT2 = getVecTypeForPair(IT2, JT2);
+ unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2);
+
+ if (VCost > ICost + JCost) {
+ return false;
+ }
+ // We don't want to fuse to a type that will be split, even
+ // if the two input types will also be split and there is no other
+ // associated cost.
+ unsigned VParts1 = VTTI->getNumberOfParts(VT1),
+ VParts2 = VTTI->getNumberOfParts(VT2);
+ if (VParts1 > 1 || VParts2 > 1)
+ return false;
+ else if ((!VParts1 || !VParts2) && VCost == ICost + JCost)
+ return false;
+
+ //CostSavings = ICost + JCost - VCost;
+ }
+#endif
+ // The powi intrinsic is special because only the first argument is
+ // vectorized, the second arguments must be equal.
+ CallInst *CI = dyn_cast<CallInst>(I);
+ Function *FI;
+ if (CI && (FI = CI->getCalledFunction()) &&
+ FI->getIntrinsicID() == Intrinsic::powi) {
+
+ Value *A1I = CI->getArgOperand(1),
+ *A1J = cast<CallInst>(J)->getArgOperand(1);
+ const SCEV *A1ISCEV = SE->getSCEV(A1I),
+ *A1JSCEV = SE->getSCEV(A1J);
+ return (A1ISCEV == A1JSCEV);
+ }
+ return true;
+ }
+
+ // Determines whether J uses I, where Users is the set of instructions
+ // already known to depend on I. J counts as a user when
+ //   - J is already a member of Users (for example because it belongs to a
+ //     selected pair),
+ //   - one of J's operands is I itself or a member of Users, or
+ //   - J may read from memory, LoadMoveSet is provided, and LoadMoveSet
+ //     contains an entry with key J whose mapped value is I.
+ // LoadMoveSet is a previously-computed multimap where the key is the
+ // memory-based user instruction and the value is the instruction to be
+ // compared with I. In this implementation no alias query is made when
+ // LoadMoveSet is null; memory dependencies are then not detected.
+ // If UpdateUsers is true and J uses I, J is added to Users and, when J may
+ // write to memory, to WriteSet as well.
+ // Returns true if J uses I.
+ bool WIVectorize::trackUsesOfI(DenseSet<Value *> &Users,
+                     AliasSetTracker &WriteSet, Instruction *I,
+                     Instruction *J, bool UpdateUsers,
+                     std::multimap<Value *, Value *> *LoadMoveSet) {
+   // Already recorded as a user?
+   bool Found = Users.count(J) != 0;
+
+   // Otherwise look for a direct or transitive use through the operands.
+   for (User::op_iterator Op = J->op_begin(), OpEnd = J->op_end();
+        !Found && Op != OpEnd; ++Op) {
+     Value *Operand = *Op;
+     Found = (I == Operand) || Users.count(Operand) != 0;
+   }
+
+   // Otherwise consult the precomputed memory dependencies, if any.
+   if (!Found && LoadMoveSet && J->mayReadFromMemory()) {
+     VPIteratorPair JPairRange = LoadMoveSet->equal_range(J);
+     Found = isSecondInIteratorPair<Value*>(I, JPairRange);
+   }
+
+   if (!Found || !UpdateUsers)
+     return Found;
+
+   if (J->mayWriteToMemory())
+     WriteSet.add(J);
+   Users.insert(J);
+   return true;
+ }
+
+ // This function iterates over all instruction pairs in the provided
+ // basic block and collects all candidate pairs for vectorization.
+ bool WIVectorize::getCandidatePairs(BasicBlock &BB,
+ BasicBlock::iterator &Start,
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts) {
+ BasicBlock::iterator E = BB.end();
+ LLVMContext& context = BB.getContext();
+
+ if (Start == E) return false;
+
+ std::multimap<int, ValueVector*> temporary;
+ for (BasicBlock::iterator I = Start++; I != E; ++I) {
+
+ if (I->getMetadata("wi") == NULL)
+ continue;
+ bool IsSimpleLoadStore;
+ if (!isInstVectorizable(I, IsSimpleLoadStore)) {
+ continue;
+ }
+
+ MDNode* md = I->getMetadata("wi");
+ MDNode* mdCounter = I->getMetadata("wi_counter");
+ MDNode* mdRegion = dyn_cast<MDNode>(md->getOperand(1));
+
+ unsigned CI = cast<ConstantInt>(mdCounter->getOperand(1))->getZExtValue();
+ unsigned RI = cast<ConstantInt>(mdRegion->getOperand(1))->getZExtValue();
+
+ std::multimap<int,ValueVector*>::iterator itb = temporary.lower_bound(CI);
+ std::multimap<int,ValueVector*>::iterator ite = temporary.upper_bound(CI);
+ ValueVector* tmpVec = NULL;
+ while(itb != ite) {
+ if (I->isSameOperationAs(cast<Instruction>((*(*itb).second)[0]))) {
+ // Test also if instructions are from same region.
+ MDNode* tmpMD =
+ cast<Instruction>((*(*itb).second)[0])->getMetadata("wi");
+ MDNode* tmpRINode = dyn_cast<MDNode>(tmpMD->getOperand(1));
+ unsigned tmpRI =
+ cast<ConstantInt>(tmpRINode->getOperand(1))->getZExtValue();
+ if (RI == tmpRI)
+ tmpVec = (*itb).second;
+ }
+ itb++;
+ }
+ if (tmpVec == NULL) {
+ tmpVec = new ValueVector;
+ temporary.insert(std::pair<int, ValueVector*>(CI, tmpVec));
+ }
+ tmpVec->push_back(I);
+ }
+ DenseSet<Value *> Users;
+ AliasSetTracker WriteSet(*AA);
+ for (std::multimap<int, ValueVector*>::iterator insIt = temporary.begin();
+ insIt != temporary.end(); insIt++) {
+ ValueVector* tmpVec = (*insIt).second;
+ // Prevent creation of vectors shorter then the vector width in case
+ // vectorization of asymetric counters is disabled.
+ if (tmpVec->size() % 2 != 0 && NoCount) {
+ continue;
+ }
+
+ if (tmpVec->size() % 2 != 0 && !MemOpsOnly) {
+
+ // Ok, this is extremely ugly, however this code is specific for
+ // for situation where the base address of some array is computed
+ // one way and the addresses for the rest of the work items are
+ // computed other way. E.g.
+ // id_0 = x*y*z
+ // id_1 = id_0 + const
+ // id_2 = id_0 + const + const
+ // ...
+ // Therefore only applicable to add operation.
+ // It should bring some performance improvements when targetting TTA.
+
+ // NOTE: results are opposide of what is expected.
+ // With NoCount set to true, the vectorization of loop counter arithmetic
+ // operations is actually prevented. The ProgramPartitioner is assigning
+ // them to the lanes. This seems to provide better performance.
+ // With NoCount set to false, the vectorization of loop counter
+ // arithmetic is allowed, creating better bitcode, but when mapped
+ // to TTA, performance is much worse.
+
+ Instruction* tmp = cast<Instruction>((*tmpVec)[0]);
+ if ( !(tmpVec->size() == 1 ||
+ tmp->getType()->isVectorTy() ||
+ tmp->getOpcode() != Instruction::Add)) {
+
+ bool identity = false;
+ bool argumentOperand = false;
+ // If none of the arguments to add is constant
+ // we do not replace it with identity, neither if operand
+ // is function argument since that can be used in different
+ // blocks.
+ for (unsigned o = 0; o < tmp->getNumOperands(); ++o) {
+ if (isa<ConstantInt>(tmp->getOperand(o))) {
+ identity = true;
+ }
+ if (isa<Argument>(tmp->getOperand(o))) {
+ argumentOperand = true;
+ }
+ }
+ if (!identity || argumentOperand)
+ continue;
+
+ Instruction* K = tmp->clone();
+ if ((*tmpVec)[0]->hasName()) {
+ std::string name = (*tmpVec)[0]->getName().str() + "_temp_0";
+ K->setName(name);
+ }
+
+ if (tmp->getMetadata("wi") != NULL) {
+ MDNode* md = tmp->getMetadata("wi");
+ MDNode* xyz = dyn_cast<MDNode>(md->getOperand(2));
+ MDNode* region = dyn_cast<MDNode>(md->getOperand(1));
+ ConstantInt *CIX =
+ dyn_cast<ConstantInt>(xyz->getOperand(1));
+ ConstantInt *CIY =
+ dyn_cast<ConstantInt>(xyz->getOperand(2));
+ ConstantInt *CIZ =
+ dyn_cast<ConstantInt>(xyz->getOperand(3));
+ if (CIX->getValue() == 1) {
+ Value *v2[] = {
+ MDString::get(context, "WI_xyz"),
+ ConstantInt::get(Type::getInt32Ty(context), 0),
+ CIY,
+ CIZ};
+ MDNode* newXYZ = MDNode::get(context, v2);
+ Value *v[] = {
+ MDString::get(context, "WI_data"),
+ region,
+ newXYZ};
+ MDNode* mdNew = MDNode::get(context, v);
+ K->setMetadata("wi", mdNew);
+ K->setMetadata("wi_counter", tmp->getMetadata("wi_counter"));
+ }
+ }
+ for (unsigned o = 0; o < K->getNumOperands(); ++o) {
+ if (isa<ConstantInt>(K->getOperand(o))) {
+ K->setOperand(o,
+ ConstantInt::get(K->getOperand(o)->getType(), 0));
+ }
+ }
+
+ Value* original = NULL;
+ for (unsigned o = 0; o < K->getNumOperands(); ++o) {
+ if (!isa<PHINode>(K->getOperand(o)) &&
+ isa<Instruction>(K->getOperand(o))) {
+ original = K->getOperand(o);
+ }
+ }
+ if (original != NULL) {
+ K->insertAfter(cast<Instruction>(original));
+ std::vector<User*> usesToReplace;
+ for (Value::use_iterator it = original->use_begin();
+ it != original->use_end();
+ it++) {
+ bool usedInVec = false;
+ if (*it != K) {
+ if (!NoCount) {
+ for (unsigned int j = 0; j < tmpVec->size(); j++) {
+ if ((*it) == (*tmpVec)[j]) {
+ usedInVec = true;
+ break;
+ }
+ }
+ }
+ if (!usedInVec) {
+ usesToReplace.push_back(*it);
+ }
+ }
+ }
+ for (unsigned int j = 0; j < usesToReplace.size(); j++) {
+ usesToReplace[j]->replaceUsesOfWith(original, K);
+ }
+ } else {
+ K->insertBefore(tmp);
+ }
+ tmpVec->insert(tmpVec->begin(), K);
+ }
+ }
+
+ // Create actual candidate pairs
+ for (unsigned j = 0; j < tmpVec->size()/2; j++) {
+ Instruction* I = cast<Instruction>((*tmpVec)[2*j]);
+ Instruction* J = cast<Instruction>((*tmpVec)[2*j+1]);
+ if (!areInstsCompatibleFromDifferentWi(I,J)) continue;
+ bool IsSimpleLoadStore;
+
+ if (!isInstVectorizable(I, IsSimpleLoadStore)) {
+ break;
+ }
+
+ if (!areInstsCompatible(I, J, IsSimpleLoadStore)) {
+ break;
+ }
+
+ // Determine if J uses I, if so, exit the loop.
+ bool UsesI = trackUsesOfI(Users, WriteSet, I, J, true);
+ if (UsesI) {
+ break;
+ }
+
+ if (!PairableInsts.size() ||
+ PairableInsts[PairableInsts.size()-1] != I) {
+ PairableInsts.push_back(I);
+ }
+ CandidatePairs.insert(ValuePair(I, J));
+ }
+ }
+ return false;
+ }
+
+ // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that
+ // it looks for pairs such that both members have an input which is an
+ // output of PI or PJ. Every connection found is inserted into ConnectedPairs keyed by P.
+ void WIVectorize::computePairsConnectedTo(
+ std::multimap<Value *, Value *> &CandidatePairs, // only queried through equal_range() in this function
+ std::vector<Value *>& /*PairableInsts*/, // unused
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs, // output: receives <P, connected pair> entries
+ ValuePair P) { // the pair whose users are inspected
+ StoreInst *SI, *SJ; // scratch for the dyn_cast<StoreInst> tests below
+ // For each possible pairing for this variable, look at the uses of
+ // the first value...
+ for (Value::use_iterator I = P.first->use_begin(),
+ E = P.first->use_end(); I != E; ++I) {
+ if (isa<LoadInst>(*I)) {
+ // A pair cannot be connected to a load because the load only takes one
+ // operand (the address) and it is a scalar even after vectorization.
+ continue;
+ } else if ((SI = dyn_cast<StoreInst>(*I)) &&
+ P.first == SI->getPointerOperand()) {
+ // Similarly, a pair cannot be connected to a store through its
+ // pointer operand.
+ continue;
+ }
+ VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); // candidate pairs keyed by this user of P.first
+
+ // For each use of the first variable, look for uses of the second
+ // variable...
+ for (Value::use_iterator J = P.second->use_begin(),
+ E2 = P.second->use_end(); J != E2; ++J) {
+
+ if ((SJ = dyn_cast<StoreInst>(*J)) &&
+ P.second == SJ->getPointerOperand())
+ continue; // stores reached through their pointer operand are skipped here as well
+
+ VPIteratorPair JPairRange = CandidatePairs.equal_range(*J); // candidate pairs keyed by this user of P.second
+
+ // Look for <I, J>:
+ if (isSecondInIteratorPair<Value*>(*J, IPairRange))
+ ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+
+ // Look for <J, I>:
+ if (isSecondInIteratorPair<Value*>(*I, JPairRange))
+ ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I)));
+ }
+ // Look for cases where just the first value in the pair is used by
+ // both members of another pair (splatting).
+ for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) { // E is still P.first->use_end() from the outer loop
+ if ((SJ = dyn_cast<StoreInst>(*J)) &&
+ P.first == SJ->getPointerOperand())
+ continue;
+
+ if (isSecondInIteratorPair<Value*>(*J, IPairRange))
+ ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+ }
+ }
+ // Look for cases where just the second value in the pair is used by
+ // both members of another pair (splatting).
+ for (Value::use_iterator I = P.second->use_begin(),
+ E = P.second->use_end(); I != E; ++I) {
+ if (isa<LoadInst>(*I)) {
+ continue; // same load exclusion as in the first loop
+ } else if ((SI = dyn_cast<StoreInst>(*I)) &&
+ P.second == SI->getPointerOperand()) {
+ continue; // same store-pointer exclusion as in the first loop
+ }
+ VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); // candidate pairs keyed by this user of P.second
+
+ for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) { // E is P.second->use_end() declared by the enclosing loop
+ if ((SJ = dyn_cast<StoreInst>(*J)) &&
+ P.second == SJ->getPointerOperand())
+ continue;
+
+ if (isSecondInIteratorPair<Value*>(*J, IPairRange))
+ ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+ }
+ }
+ }
+
+ // Determines which candidate pairs are connected to one another. Two pairs
+ // are connected when some output of the first pair is an input to both
+ // members of the second pair. For every pairable instruction, each candidate
+ // pair keyed by it is handed to computePairsConnectedTo(), which accumulates
+ // the connections in ConnectedPairs.
+ void WIVectorize::computeConnectedPairs(
+     std::multimap<Value *, Value *> &CandidatePairs,
+     std::vector<Value *> &PairableInsts,
+     std::multimap<ValuePair, ValuePair> &ConnectedPairs) {
+   typedef std::multimap<Value *, Value *>::iterator CandIter;
+
+   const size_t NumPairable = PairableInsts.size();
+   for (size_t Idx = 0; Idx != NumPairable; ++Idx) {
+     // All candidate pairs whose first member is this pairable instruction.
+     VPIteratorPair Choices = CandidatePairs.equal_range(PairableInsts[Idx]);
+
+     CandIter Cur = Choices.first;
+     while (Cur != Choices.second) {
+       computePairsConnectedTo(CandidatePairs, PairableInsts,
+                               ConnectedPairs, *Cur);
+       ++Cur;
+     }
+   }
+
+   DEBUG(dbgs() << "WIV: found " << ConnectedPairs.size()
+                << " pair connections.\n");
+ }
+
+ // Fills PairableInstUsers with tuples <A, B> where A is a member of some
+ // candidate pair and B was recorded by trackUsesOfI() as a user of A while
+ // scanning the instructions that follow A in the basic block. In other
+ // words, <A, B> is present when B depends on the output of A.
+ void WIVectorize::buildDepMap(
+     BasicBlock &BB,
+     std::multimap<Value *, Value *> &CandidatePairs,
+     std::vector<Value *>& /*PairableInsts*/,
+     DenseSet<ValuePair> &PairableInstUsers) {
+   // Gather every value that appears on either side of a candidate pair.
+   DenseSet<Value *> PairMembers;
+   std::multimap<Value *, Value *>::iterator Cand = CandidatePairs.begin();
+   std::multimap<Value *, Value *>::iterator CandEnd = CandidatePairs.end();
+   while (Cand != CandEnd) {
+     PairMembers.insert(Cand->first);
+     PairMembers.insert(Cand->second);
+     ++Cand;
+   }
+
+   // Walk the block; for each pair member, record the users found among the
+   // instructions that come after it.
+   BasicBlock::iterator BlockEnd = BB.end();
+   for (BasicBlock::iterator Inst = BB.getFirstInsertionPt();
+        Inst != BlockEnd; ++Inst) {
+     if (PairMembers.find(Inst) != PairMembers.end()) {
+       // Fresh user set and write tracker for every pair member.
+       DenseSet<Value *> InstUsers;
+       AliasSetTracker Writes(*AA);
+       BasicBlock::iterator Later = llvm::next(Inst);
+       while (Later != BlockEnd) {
+         (void) trackUsesOfI(InstUsers, Writes, Inst, Later);
+         ++Later;
+       }
+
+       for (DenseSet<Value *>::iterator U = InstUsers.begin(),
+            UEnd = InstUsers.end(); U != UEnd; ++U)
+         PairableInstUsers.insert(ValuePair(Inst, *U));
+     }
+   }
+ }
+
+ // Reports whether pairs P and Q are mutual users of each other: some member
+ // of Q uses a member of P and, at the same time, some member of P uses a
+ // member of Q. Such pairs cannot be fused simultaneously.
+ //
+ // When PairableInstUserMap is non-null, each one-directional use relation
+ // that was observed is additionally recorded in it (P -> Q when Q uses P,
+ // Q -> P when P uses Q), skipping entries that are already present.
+ bool WIVectorize::pairsConflict(ValuePair P, ValuePair Q,
+     DenseSet<ValuePair> &PairableInstUsers,
+     std::multimap<ValuePair, ValuePair> *PairableInstUserMap) {
+   Value *const PMembers[2] = { P.first, P.second };
+   Value *const QMembers[2] = { Q.first, Q.second };
+
+   // Probe all four member combinations in both directions. An entry
+   // <p, q> in PairableInstUsers marks a use of P by Q, and <q, p> the
+   // reverse.
+   bool QUsesP = false;
+   bool PUsesQ = false;
+   for (unsigned pi = 0; pi < 2; ++pi) {
+     for (unsigned qi = 0; qi < 2; ++qi) {
+       if (PairableInstUsers.count(ValuePair(PMembers[pi], QMembers[qi])))
+         QUsesP = true;
+       if (PairableInstUsers.count(ValuePair(QMembers[qi], PMembers[pi])))
+         PUsesQ = true;
+     }
+   }
+
+   if (PairableInstUserMap != 0) {
+     // FIXME: This edge insertion, rather than the cycle check itself, is the
+     // expensive part of cycle checking. It needs profiling and probably a
+     // different data structure (as do most uses of std::multimap).
+     if (PUsesQ) {
+       VPPIteratorPair EdgesFromQ = PairableInstUserMap->equal_range(Q);
+       const bool AlreadyThere = isSecondInIteratorPair(P, EdgesFromQ);
+       if (!AlreadyThere)
+         PairableInstUserMap->insert(VPPair(Q, P));
+     }
+     if (QUsesP) {
+       VPPIteratorPair EdgesFromP = PairableInstUserMap->equal_range(P);
+       const bool AlreadyThere = isSecondInIteratorPair(Q, EdgesFromP);
+       if (!AlreadyThere)
+         PairableInstUserMap->insert(VPPair(P, Q));
+     }
+   }
+
+   // Only a mutual dependency counts as a conflict.
+   return QUsesP && PUsesQ;
+ }
+
+ // Walks the use graph stored in PairableInstUserMap, starting at P and
+ // following only pairs contained in CurrentPairs, to find out whether the
+ // walk leads back to P. Returns true when such a cycle exists.
+ bool WIVectorize::pairWillFormCycle(ValuePair P,
+     std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
+     DenseSet<ValuePair> &CurrentPairs) {
+   typedef std::multimap<ValuePair, ValuePair>::iterator UserMapIter;
+
+   DEBUG(if (DebugCycleCheck)
+          dbgs() << "WIV: starting cycle check for : " << *P.first << " <-> "
+                 << *P.second << "\n");
+
+   // PairableInstUserMap also holds non-direct associations, so remember
+   // which pairs were already expanded to avoid walking them again.
+   DenseSet<ValuePair> Seen;
+   SmallVector<ValuePair, 32> Worklist;
+   Worklist.push_back(P);
+
+   // Depth-first traversal driven by an explicit stack.
+   while (!Worklist.empty()) {
+     ValuePair Cur = Worklist.pop_back_val();
+     Seen.insert(Cur);
+
+     DEBUG(if (DebugCycleCheck)
+            dbgs() << "WIV: cycle check visiting: " << *Cur.first << " <-> "
+                   << *Cur.second << "\n");
+
+     VPPIteratorPair Successors = PairableInstUserMap.equal_range(Cur);
+     for (UserMapIter Edge = Successors.first;
+          Edge != Successors.second; ++Edge) {
+       // Reaching the start pair again closes a cycle.
+       if (Edge->second == P) {
+         DEBUG(dbgs()
+                << "WIV: rejected to prevent non-trivial cycle formation: "
+                << *Edge->first.first << " <-> " << *Edge->first.second << "\n");
+         return true;
+       }
+
+       // Follow the edge only into relevant pairs not expanded so far.
+       if (!CurrentPairs.count(Edge->second) || Seen.count(Edge->second))
+         continue;
+       Worklist.push_back(Edge->second);
+     }
+   }
+
+   return false;
+ }
+
+ // This function builds the initial tree of connected pairs with the
+ // pair J at the root. Each pair reached is stored in Tree together with a depth value; conflicting pairs are not removed here.
+ void WIVectorize::buildInitialTreeFor(
+ std::multimap<Value *, Value *> &CandidatePairs, // consulted to confirm that a child pair is still a candidate
+ std::vector<Value *>& /*PairableInsts*/, // unused
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs, // edges: pair -> pairs connected to it
+ DenseSet<ValuePair>& /*PairableInstUsers*/, // unused
+ DenseMap<Value *, Value *>& /*ChosenPairs*/, // unused
+ DenseMap<ValuePair, size_t> &Tree, ValuePair J) { // Tree is the output map; J is the root pair
+ // Each of these pairs is viewed as the root node of a Tree. The Tree
+ // is then walked (depth-first). As this happens, we keep track of
+ // the pairs that compose the Tree and the maximum depth of the Tree.
+ SmallVector<ValuePairWithDepth, 32> Q; // explicit stack of <pair, accumulated depth>
+ // General depth-first post-order traversal:
+ Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
+ do {
+ ValuePairWithDepth QTop = Q.back(); // peek only; the entry is popped once no child remains to be pushed
+
+ // Push each child onto the queue:
+ bool MoreChildren = false;
+ size_t MaxChildDepth = QTop.second; // starts at this node's own accumulated depth
+ VPPIteratorPair qtRange = ConnectedPairs.equal_range(QTop.first);
+ for (std::multimap<ValuePair, ValuePair>::iterator k = qtRange.first;
+ k != qtRange.second; ++k) {
+ // Make sure that this child pair is still a candidate:
+ bool IsStillCand = false;
+ VPIteratorPair checkRange =
+ CandidatePairs.equal_range(k->second.first);
+ for (std::multimap<Value *, Value *>::iterator m = checkRange.first;
+ m != checkRange.second; ++m) {
+ if (m->second == k->second.second) {
+ IsStillCand = true;
+ break;
+ }
+ }
+
+ if (IsStillCand) {
+ DenseMap<ValuePair, size_t>::iterator C = Tree.find(k->second);
+ if (C == Tree.end()) {
+ size_t d = getDepthFactor(k->second.first);
+ Q.push_back(ValuePairWithDepth(k->second, QTop.second+d)); // child depth = parent depth + child's depth factor
+ MoreChildren = true;
+ } else {
+ MaxChildDepth = std::max(MaxChildDepth, C->second); // child already recorded: fold its depth into the maximum
+ }
+ }
+ }
+
+ if (!MoreChildren) { // every still-candidate child is already in Tree (or there is none)
+ // Record the current pair as part of the Tree:
+ Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
+ Q.pop_back();
+ }
+ } while (!Q.empty());
+ }
+
+ // Given some initial tree, prune it by removing conflicting pairs (pairs
+ // that cannot be simultaneously chosen for vectorization). The surviving pairs are inserted into PrunedTree.
+ void WIVectorize::pruneTreeFor(
+ std::multimap<Value *, Value *> &/*CandidatePairs*/, // unused
+ std::vector<Value *> &/*PairableInsts*/, // unused
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs, // children of a pair are looked up here
+ DenseSet<ValuePair> &PairableInstUsers, // forwarded to pairsConflict()
+ std::multimap<ValuePair, ValuePair> &PairableInstUserMap, // passed to pairsConflict()/pairWillFormCycle() when UseCycleCheck is set
+ DenseMap<Value *, Value *> &ChosenPairs, // pairs chosen earlier; children conflicting with them are skipped
+ DenseMap<ValuePair, size_t> &Tree, // unpruned tree: pair -> depth
+ DenseSet<ValuePair> &PrunedTree, ValuePair J, // PrunedTree is the output set; J is the root pair
+ bool UseCycleCheck) { // enables user-map recording and the cycle walk
+ SmallVector<ValuePairWithDepth, 32> Q; // explicit stack of pairs accepted but not yet expanded
+ // General depth-first post-order traversal:
+ Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
+ do {
+ ValuePairWithDepth QTop = Q.pop_back_val();
+ PrunedTree.insert(QTop.first); // anything popped from Q has already been accepted
+
+ // Visit each child, pruning as necessary...
+ DenseMap<ValuePair, size_t> BestChildren; // children of QTop kept so far, with their Tree depth
+ VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first);
+ for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first;
+ K != QTopRange.second; ++K) {
+ DenseMap<ValuePair, size_t>::iterator C = Tree.find(K->second);
+ if (C == Tree.end()) continue; // child is not part of the unpruned tree
+
+ // This child is in the Tree, now we need to make sure it is the
+ // best of any conflicting children. There could be multiple
+ // conflicting children, so first, determine if we're keeping
+ // this child, then delete conflicting children as necessary.
+
+ // It is also necessary to guard against pairing-induced
+ // dependencies. Consider instructions a .. x .. y .. b
+ // such that (a,b) are to be fused and (x,y) are to be fused
+ // but a is an input to x and b is an output from y. This
+ // means that y cannot be moved after b but x must be moved
+ // after b for (a,b) to be fused. In other words, after
+ // fusing (a,b) we have y .. a/b .. x where y is an input
+ // to a/b and x is an output to a/b: x and y can no longer
+ // be legally fused. To prevent this condition, we must
+ // make sure that a child pair added to the Tree is not
+ // both an input and output of an already-selected pair.
+
+ // Pairing-induced dependencies can also form from more complicated
+ // cycles. The pair vs. pair conflicts are easy to check, and so
+ // that is done explicitly for "fast rejection", and because for
+ // child vs. child conflicts, we may prefer to keep the current
+ // pair in preference to the already-selected child.
+ DenseSet<ValuePair> CurrentPairs; // pairs handed to pairWillFormCycle() below
+
+ bool CanAdd = true;
+ for (DenseMap<ValuePair, size_t>::iterator C2
+ = BestChildren.begin(), E2 = BestChildren.end();
+ C2 != E2; ++C2) {
+ if (C2->first.first == C->first.first ||
+ C2->first.first == C->first.second ||
+ C2->first.second == C->first.first ||
+ C2->first.second == C->first.second ||
+ pairsConflict(C2->first, C->first, PairableInstUsers,
+ UseCycleCheck ? &PairableInstUserMap : 0)) {
+ if (C2->second >= C->second) { // the kept child is at least as deep: it wins (ties included)
+ CanAdd = false;
+ break;
+ }
+
+ CurrentPairs.insert(C2->first);
+ }
+ }
+ if (!CanAdd) continue;
+
+ // Even worse, this child could conflict with another node already
+ // selected for the Tree. If that is the case, ignore this child.
+ for (DenseSet<ValuePair>::iterator T = PrunedTree.begin(),
+ E2 = PrunedTree.end(); T != E2; ++T) {
+ if (T->first == C->first.first ||
+ T->first == C->first.second ||
+ T->second == C->first.first ||
+ T->second == C->first.second ||
+ pairsConflict(*T, C->first, PairableInstUsers,
+ UseCycleCheck ? &PairableInstUserMap : 0)) {
+ CanAdd = false;
+ break;
+ }
+
+ CurrentPairs.insert(*T);
+ }
+ if (!CanAdd) continue;
+
+ // And check the queue too...
+ for (SmallVector<ValuePairWithDepth, 32>::iterator C2 = Q.begin(),
+ E2 = Q.end(); C2 != E2; ++C2) {
+ if (C2->first.first == C->first.first ||
+ C2->first.first == C->first.second ||
+ C2->first.second == C->first.first ||
+ C2->first.second == C->first.second ||
+ pairsConflict(C2->first, C->first, PairableInstUsers,
+ UseCycleCheck ? &PairableInstUserMap : 0)) {
+ CanAdd = false;
+ break;
+ }
+
+ CurrentPairs.insert(C2->first);
+ }
+ if (!CanAdd) continue;
+
+ // Last but not least, check for a conflict with any of the
+ // already-chosen pairs.
+ for (DenseMap<Value *, Value *>::iterator C2 =
+ ChosenPairs.begin(), E2 = ChosenPairs.end();
+ C2 != E2; ++C2) {
+ if (pairsConflict(*C2, C->first, PairableInstUsers,
+ UseCycleCheck ? &PairableInstUserMap : 0)) {
+ CanAdd = false;
+ break;
+ }
+
+ CurrentPairs.insert(*C2);
+ }
+ if (!CanAdd) continue;
+
+ // To check for non-trivial cycles formed by the addition of the
+ // current pair we've formed a list of all relevant pairs, now use a
+ // graph walk to check for a cycle. We start from the current pair and
+ // walk the use tree to see if we again reach the current pair. If we
+ // do, then the current pair is rejected.
+
+ // FIXME: It may be more efficient to use a topological-ordering
+ // algorithm to improve the cycle check. This should be investigated.
+ if (UseCycleCheck &&
+ pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
+ continue;
+
+ // This child can be added, but we may have chosen it in preference
+ // to an already-selected child. Check for this here, and if a
+ // conflict is found, then remove the previously-selected child
+ // before adding this one in its place.
+ for (DenseMap<ValuePair, size_t>::iterator C2
+ = BestChildren.begin(); C2 != BestChildren.end();) {
+ if (C2->first.first == C->first.first ||
+ C2->first.first == C->first.second ||
+ C2->first.second == C->first.first ||
+ C2->first.second == C->first.second ||
+ pairsConflict(C2->first, C->first, PairableInstUsers)) // NOTE(review): no user-map argument here, unlike the calls above; presumably a default declared with the class - confirm
+ BestChildren.erase(C2++); // advance before erasing so the loop iterator is not the erased one
+ else
+ ++C2;
+ }
+
+ BestChildren.insert(ValuePairWithDepth(C->first, C->second));
+ }
+
+ for (DenseMap<ValuePair, size_t>::iterator C
+ = BestChildren.begin(), E2 = BestChildren.end();
+ C != E2; ++C) {
+ size_t DepthF = getDepthFactor(C->first.first);
+ Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF)); // surviving children are expanded in later iterations
+ }
+ } while (!Q.empty());
+ }
+
+ // This function finds the best tree of mutually-compatible connected
+ // pairs, given the choice of root pairs as an iterator range. The winner is reported through BestTree, BestMaxDepth and BestEffSize.
+ void WIVectorize::findBestTreeFor(
+ std::multimap<Value *, Value *> &CandidatePairs, // forwarded to the tree builders
+ std::vector<Value *> &PairableInsts, // forwarded to the tree builders
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs, // forwarded to the tree builders
+ DenseSet<ValuePair> &PairableInstUsers, // used for conflict checks
+ std::multimap<ValuePair, ValuePair> &PairableInstUserMap, // used only when UseCycleCheck is set
+ DenseMap<Value *, Value *> &ChosenPairs, // pairs chosen earlier; conflicting roots are skipped
+ DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, // outputs, overwritten when a better tree is found
+ size_t &BestEffSize, VPIteratorPair ChoiceRange, // BestEffSize: in/out best effective size; ChoiceRange: root pairs to try
+ bool UseCycleCheck) { // enables user-map recording and the cycle walk
+ for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first;
+ J != ChoiceRange.second; ++J) {
+
+ // Before going any further, make sure that this pair does not
+ // conflict with any already-selected pairs (see comment below
+ // near the Tree pruning for more details).
+ DenseSet<ValuePair> ChosenPairSet; // set copy of the chosen pairs visited before a conflict, for the cycle walk
+ bool DoesConflict = false;
+ for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
+ E = ChosenPairs.end(); C != E; ++C) {
+ if (pairsConflict(*C, *J, PairableInstUsers,
+ UseCycleCheck ? &PairableInstUserMap : 0)) {
+ DoesConflict = true;
+ break;
+ }
+
+ ChosenPairSet.insert(*C);
+ }
+ if (DoesConflict) continue;
+
+ if (UseCycleCheck &&
+ pairWillFormCycle(*J, PairableInstUserMap, ChosenPairSet))
+ continue;
+
+ DenseMap<ValuePair, size_t> Tree; // unpruned tree rooted at *J: pair -> depth
+ buildInitialTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
+ PairableInstUsers, ChosenPairs, Tree, *J);
+
+ // Because we'll keep the child with the largest depth, the largest
+ // depth is still the same in the unpruned Tree.
+ size_t MaxDepth = Tree.lookup(*J);
+
+ DEBUG(if (DebugPairSelection) dbgs() << "WIV: found Tree for pair {"
+ << *J->first << " <-> " << *J->second << "} of depth " <<
+ MaxDepth << " and size " << Tree.size() << "\n");
+
+ // At this point the Tree has been constructed, but, may contain
+ // contradictory children (meaning that different children of
+ // some tree node may be attempting to fuse the same instruction).
+ // So now we walk the tree again, in the case of a conflict,
+ // keep only the child with the largest depth. To break a tie,
+ // favor the first child.
+
+ DenseSet<ValuePair> PrunedTree;
+ pruneTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
+ PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree,
+ PrunedTree, *J, UseCycleCheck);
+
+ size_t EffSize = 0; // sum of the depth factors of the first member of every pruned pair
+ for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
+ E = PrunedTree.end(); S != E; ++S)
+ EffSize += getDepthFactor(S->first);
+
+ DEBUG(if (DebugPairSelection)
+ dbgs() << "WIV: found pruned Tree for pair {"
+ << *J->first << " <-> " << *J->second << "} of depth " <<
+ MaxDepth << " and size " << PrunedTree.size() <<
+ " (effective size: " << EffSize << ")\n");
+#if defined LLVM_3_1
+ if (MaxDepth >= ReqChainDepth && EffSize > BestEffSize) {
+#else
+ if ((VTTI || MaxDepth >= ReqChainDepth) && EffSize > BestEffSize) { // a non-null VTTI bypasses the minimum-depth requirement
+#endif
+ BestMaxDepth = MaxDepth;
+ BestEffSize = EffSize;
+ BestTree = PrunedTree;
+ }
+ }
+ }
+
+ // Given the list of candidate pairs, this function selects those
+ // that will be fused into vector instructions. Selected pairs are added to ChosenPairs, and CandidatePairs is pruned as pairs are selected.
+ void WIVectorize::choosePairs(
+ std::multimap<Value *, Value *> &CandidatePairs, // in/out: pairs sharing a value with a chosen pair are erased
+ std::vector<Value *> &PairableInsts, // instructions tried as the first member of a root pair, in order
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs, // forwarded to findBestTreeFor()
+ DenseSet<ValuePair> &PairableInstUsers, // forwarded to findBestTreeFor()
+ DenseMap<Value *, Value *>& ChosenPairs) { // output: first member -> second member of each selected pair
+ bool UseCycleCheck = true; // the cycle check is unconditionally enabled here
+ std::multimap<ValuePair, ValuePair> PairableInstUserMap; // shared across all iterations; passed to findBestTreeFor()
+ for (std::vector<Value *>::iterator I = PairableInsts.begin(),
+ E = PairableInsts.end(); I != E; ++I) {
+ // The number of possible pairings for this variable:
+ size_t NumChoices = CandidatePairs.count(*I);
+ if (!NumChoices) continue;
+
+ VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I);
+
+ // The best pair to choose and its tree:
+ size_t BestMaxDepth = 0, BestEffSize = 0;
+ DenseSet<ValuePair> BestTree;
+ findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
+ PairableInstUsers, PairableInstUserMap, ChosenPairs,
+ BestTree, BestMaxDepth, BestEffSize, ChoiceRange,
+ UseCycleCheck);
+
+ // A tree has been chosen (or not) at this point. If no tree was
+ // chosen, then this instruction, I, cannot be paired (and is no longer
+ // considered).
+
+ DEBUG(if (BestTree.size() > 0)
+ dbgs() << "WIV: selected pairs in the best tree for: "
+ << *cast<Instruction>(*I) << "\n");
+
+ for (DenseSet<ValuePair>::iterator S = BestTree.begin(),
+ SE2 = BestTree.end(); S != SE2; ++S) {
+ // Insert the members of this tree into the list of chosen pairs.
+ ChosenPairs.insert(ValuePair(S->first, S->second));
+ DEBUG(dbgs() << "WIV: selected pair: " << *S->first << " <-> " <<
+ *S->second << "\n");
+
+ // Remove all candidate pairs that have values in the chosen tree.
+ for (std::multimap<Value *, Value *>::iterator K =
+ CandidatePairs.begin(); K != CandidatePairs.end();) {
+ if (K->first == S->first || K->second == S->first ||
+ K->second == S->second || K->first == S->second) {
+ // Don't remove the actual pair chosen so that it can be used
+ // in subsequent tree selections.
+ if (!(K->first == S->first && K->second == S->second))
+ CandidatePairs.erase(K++); // post-increment keeps K valid across the erase
+ else
+ ++K;
+ } else {
+ ++K;
+ }
+ }
+ }
+ }
+
+ DEBUG(dbgs() << "WIV: selected " << ChosenPairs.size() << " pairs.\n");
+ }
+
+ // Returns the value that is to be used as the pointer input to the vector
+ // instruction that fuses I with J.
+ //
+ // I, J           - the two memory instructions being fused.
+ // o              - operand index, used only to build the replacement name.
+ // FlipMemInputs  - when true, J's pointer is used as the base and the new
+ //                  bitcast is inserted before J; otherwise I's pointer is
+ //                  used and the bitcast is inserted before I.
+ //
+ // The result is a newly created BitCastInst that casts the chosen pointer
+ // to a pointer to the fused vector type. It carries I's "wi" and
+ // "wi_counter" metadata when I has "wi" metadata.
+ Value *WIVectorize::getReplacementPointerInput(LLVMContext& /*Context*/,
+     Instruction *I, Instruction *J, unsigned o,
+     bool FlipMemInputs) {
+   Value *IPtr, *JPtr;
+   unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
+   int64_t OffsetInElmts;
+
+   // Note: the analysis might fail here, that is why the pair order has
+   // been precomputed (OffsetInElmts must be unused here).
+   (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
+                         IAddressSpace, JAddressSpace,
+                         OffsetInElmts);
+
+   // The pointer value is taken to be the one with the lowest offset; the
+   // caller communicates which one that is through FlipMemInputs.
+   Value *VPtr = FlipMemInputs ? JPtr : IPtr;
+
+   // If the pointer source is itself a bitcast, go directly to the value it
+   // casts from instead of stacking a second cast on top of it.
+   if (BitCastInst *SrcCast = dyn_cast<BitCastInst>(VPtr))
+     VPtr = SrcCast->getOperand(0);
+
+   // Build "pointer to fused vector type" in the address space of I's pointer.
+   Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType();
+   Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType();
+   Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
+   Type *VArgPtrType = PointerType::get(VArgType,
+     cast<PointerType>(IPtr->getType())->getAddressSpace());
+   BitCastInst* b = new BitCastInst(VPtr, VArgPtrType,
+                                    getReplacementName(I, true, o),
+                                    /* insert before */ FlipMemInputs ? J : I);
+
+   // Propagate the work-item metadata of I to the new cast.
+   if (I->getMetadata("wi") != NULL) {
+     b->setMetadata("wi", I->getMetadata("wi"));
+     b->setMetadata("wi_counter", I->getMetadata("wi_counter"));
+   }
+   return b;
+ }
+
+ // Writes NumElem/2 entries of the fused shuffle mask, taken from the mask of
+ // the shuffle instruction J, into Mask starting at index MaskOffset.
+ // Undefined (negative) lanes stay undefined. Every other lane index is
+ // shifted by IdxOffset, and lanes that refer to the second input vector
+ // (index >= NumInElem) are shifted by a further NumInElem.
+ void WIVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
+     unsigned NumElem, unsigned MaskOffset, unsigned NumInElem,
+     unsigned IdxOffset, std::vector<Constant*> &Mask) {
+   IntegerType *Int32Ty = Type::getInt32Ty(Context);
+   const unsigned HalfElems = NumElem / 2;
+
+   for (unsigned Lane = 0; Lane != HalfElems; ++Lane) {
+     const int SrcIdx = cast<ShuffleVectorInst>(J)->getMaskValue(Lane);
+
+     Constant *Entry;
+     if (SrcIdx >= 0) {
+       unsigned Adjusted = SrcIdx + (int) IdxOffset;
+       if (SrcIdx >= (int) NumInElem)
+         Adjusted += (int) NumInElem;
+       Entry = ConstantInt::get(Int32Ty, Adjusted);
+     } else {
+       // A negative mask value denotes an undefined lane.
+       Entry = UndefValue::get(Int32Ty);
+     }
+
+     Mask[Lane + MaskOffset] = Entry;
+   }
+ }
+
+ // Builds the shuffle mask operand for the vector instruction that fuses the
+ // shuffles I and J: the mask of J is appended to the mask of I, with the lane
+ // indices adjusted by fillNewShuffleMask().
+ Value *WIVectorize::getReplacementShuffleMask(LLVMContext& Context,
+     Instruction *I, Instruction *J) {
+   // The fused result type fixes the length of the final mask.
+   Type *FusedTy = getVecTypeForPair(I->getType(), J->getType());
+   const unsigned FusedLanes = cast<VectorType>(FusedTy)->getNumElements();
+   std::vector<Constant*> FusedMask(FusedLanes);
+
+   // Number of lanes in the first input vector of I.
+   Type *InputTy = I->getOperand(0)->getType();
+   const unsigned InputLanes = cast<VectorType>(InputTy)->getNumElements();
+
+   // First half: the mask of I, stored from index 0 with no index offset.
+   fillNewShuffleMask(Context, I, FusedLanes, 0, InputLanes, 0, FusedMask);
+
+   // Second half: the mask of J, stored from the midpoint, with its indices
+   // offset by InputLanes.
+   fillNewShuffleMask(Context, J, FusedLanes, FusedLanes/2, InputLanes,
+                      InputLanes, FusedMask);
+
+   return ConstantVector::get(FusedMask);
+ }
+
+ // Returns the value that storedSources associates with both I and J when
+ // the two entries exist and are identical; returns NULL otherwise.
+ Value *WIVectorize::CommonShuffleSource(Instruction *I, Instruction *J) {
+   DenseMap<Value*, Value*>::iterator SrcOfI = storedSources.find(I);
+   DenseMap<Value*, Value*>::iterator SrcOfJ = storedSources.find(J);
+
+   // Both instructions need a recorded source.
+   if (SrcOfI == storedSources.end() || SrcOfJ == storedSources.end())
+     return NULL;
+
+   if (SrcOfI->second != SrcOfJ->second)
+     return NULL;
+
+   return SrcOfI->second;
+ }
+ // Returns the value to be used as the specified operand of the vector
+ // instruction that fuses I with J.
+ Value *WIVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
+ Instruction *J, unsigned o, bool FlipMemInputs) {
+ Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
+ Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
+
+ // Compute the fused vector type for this operand
+ Type *ArgType = I->getOperand(o)->getType();
+ Type *ArgTypeJ = J->getOperand(o)->getType();
+ VectorType *VArgType = getVecTypeForPair(ArgType, ArgTypeJ);
+ Instruction *L = I, *H = J;
+ if (FlipMemInputs) {
+ L = J;
+ H = I;
+ }
+
+ if (ArgType->isVectorTy()) {
+ ShuffleVectorInst *LSV
+ = dyn_cast<ShuffleVectorInst>(L->getOperand(o));
+ ShuffleVectorInst *HSV
+ = dyn_cast<ShuffleVectorInst>(H->getOperand(o));
+ if (LSV && HSV &&
+ LSV->getOperand(0)->getType() == HSV->getOperand(0)->getType() &&
+ LSV->getOperand(1)->getType() == HSV->getOperand(1)->getType() &&
+ LSV->getOperand(2)->getType() == HSV->getOperand(2)->getType()) {
+ if (LSV->getOperand(0) == HSV->getOperand(0) &&
+ LSV->getOperand(1) == HSV->getOperand(1)) {
+ if (LSV->getOperand(2)->getType()->getVectorNumElements() ==
+ HSV->getOperand(2)->getType()->getVectorNumElements()) {
+ unsigned elems =
+ LSV->getOperand(2)->getType()->getVectorNumElements();
+ bool continous = true;
+ bool identical = true;
+ unsigned start = cast<ShuffleVectorInst>(LSV)->getMaskValue(0);
+ for (unsigned i = 0; i < elems; i++) {
+ unsigned m = cast<ShuffleVectorInst>(LSV)->getMaskValue(i);
+ if (m != i)
+ continous = false;
+ if (m != start)
+ identical = false;
+ unsigned n = cast<ShuffleVectorInst>(HSV)->getMaskValue(i);
+ if (n != i + elems)
+ continous = false;
+ if (n != start)
+ identical = false;
+ }
+ // This is the case where both sources come from same value and
+ // are in order. e.g. 0,1,2,3,4,5,6,7, as produced when
+ // replacing outputs of vector operation.
+ if (continous && VArgType->getVectorNumElements() == elems*2) {
+ return LSV->getOperand(0);
+ }
+ // This is case where single value of input vector is replicated
+ // to whole output. Eventually should turn to buildvector MI.
+ if (identical) {
+ unsigned numElem =
+ cast<VectorType>(VArgType)->getNumElements();
+ std::vector<Constant*> Mask(numElem);
+ for (unsigned v = 0; v < numElem; ++v)
+ Mask[v] =
+ ConstantInt::get(Type::getInt32Ty(Context), start);
+
+ Instruction *BV = new ShuffleVectorInst(
+ (start < numElem/2) ?
+ LSV->getOperand(0):
+ LSV->getOperand(1),
+ UndefValue::get(LSV->getOperand(0)->getType()),
+ ConstantVector::get(Mask),
+ getReplacementName(I, true, o));
+ if (LSV->getMetadata("wi") != NULL) {
+ BV->setMetadata("wi", LSV->getMetadata("wi"));
+ BV->setMetadata("wi_counter", LSV->getMetadata("wi_counter"));
+ }
+ BV->insertBefore(J);
+ return BV;
+ }
+ }
+ }
+#if 0
+ // This was made obsolete by test for continuity of shuffle indexes above
+ // and should be removed after futher tests for performance degradation.
+ Value* res = CommonShuffleSource(LSV, HSV);
+ if (res &&
+ res->getType()->getVectorNumElements() ==
+ VArgType->getVectorNumElements()) {
+ return res;
+ }
+#endif
+ }
+ InsertElementInst *LIN
+ = dyn_cast<InsertElementInst>(L->getOperand(o));
+ InsertElementInst *HIN
+ = dyn_cast<InsertElementInst>(H->getOperand(o));
+
+ unsigned numElem = cast<VectorType>(VArgType)->getNumElements();
+ if (LIN && HIN) {
+ Instruction *newIn = InsertElementInst::Create(
+ UndefValue::get(VArgType),
+ LIN->getOperand(1),
+ LIN->getOperand(2),
+ getReplacementName(I, true, o, 1));
+ if (I->getMetadata("wi")) {
+ newIn->setMetadata("wi", I->getMetadata("wi"));
+ newIn->setMetadata("wi_counter", I->getMetadata("wi_counter"));
+ }
+ newIn->insertBefore(J);
+
+ LIN = dyn_cast<InsertElementInst>(LIN->getOperand(0));
+ int counter = 2;
+ int rounds = 0;
+ while (rounds < 2) {
+ while(LIN) {
+ unsigned Indx = cast<ConstantInt>(LIN->getOperand(2))->getZExtValue();
+ Indx += rounds * (numElem/2);
+ Value *newIndx = ConstantInt::get(Type::getInt32Ty(Context), Indx);
+ newIn = InsertElementInst::Create(
+ newIn,
+ LIN->getOperand(1),
+ newIndx,
+ getReplacementName(I, true, o ,counter));
+ counter++;
+ if (I->getMetadata("wi")) {
+ newIn->setMetadata("wi", I->getMetadata("wi"));
+ newIn->setMetadata("wi_counter", I->getMetadata("wi_counter"));
+ }
+ newIn->insertBefore(J);
+ LIN = dyn_cast<InsertElementInst>(LIN->getOperand(0));
+ }
+ rounds ++;
+ LIN = HIN;
+ }
+ return newIn;
+
+ }
+ std::vector<Constant*> Mask(numElem);
+ for (unsigned v = 0; v < numElem; ++v)
+ Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+
+ Instruction *BV = new ShuffleVectorInst(L->getOperand(o),
+ H->getOperand(o),
+ ConstantVector::get(Mask),
+ getReplacementName(I, true, o));
+ if (L->getMetadata("wi") != NULL) {
+ BV->setMetadata("wi", L->getMetadata("wi"));
+ BV->setMetadata("wi_counter", L->getMetadata("wi_counter"));
+ }
+ BV->insertBefore(J);
+ return BV;
+ }
+
+ // If these two inputs are the output of another vector instruction,
+ // then we should use that output directly. It might be necessary to
+ // permute it first. [When pairings are fused recursively, you can
+ // end up with cases where a large vector is decomposed into scalars
+ // using extractelement instructions, then built into size-2
+ // vectors using insertelement and then into larger vectors using
+ // shuffles. InstCombine does not simplify all of these cases well,
+ // and so we make sure that shuffles are generated here when possible.]
+ ExtractElementInst *LEE
+ = dyn_cast<ExtractElementInst>(L->getOperand(o));
+ ExtractElementInst *HEE
+ = dyn_cast<ExtractElementInst>(H->getOperand(o));
+
+ if (LEE && HEE &&
+ LEE->getOperand(0)->getType() == HEE->getOperand(0)->getType()) {
+ VectorType *EEType = cast<VectorType>(LEE->getOperand(0)->getType());
+ unsigned LowIndx = cast<ConstantInt>(LEE->getOperand(1))->getZExtValue();
+ unsigned HighIndx = cast<ConstantInt>(HEE->getOperand(1))->getZExtValue();
+ if (LEE->getOperand(0) == HEE->getOperand(0)) {
+ if (LowIndx == 0 && HighIndx == 1)
+ return LEE->getOperand(0);
+
+ std::vector<Constant*> Mask(2);
+ Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx);
+ Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx);
+
+ Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0),
+ UndefValue::get(EEType),
+ ConstantVector::get(Mask),
+ getReplacementName(I, true, o));
+ if (I->getMetadata("wi") != NULL) {
+ BV->setMetadata("wi", I->getMetadata("wi"));
+ BV->setMetadata("wi_counter", I->getMetadata("wi_counter"));
+ }
+ BV->insertBefore(J);
+ return BV;
+ }
+
+ std::vector<Constant*> Mask(2);
+ HighIndx += EEType->getNumElements();
+ Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx);
+ Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx);
+
+ Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0),
+ HEE->getOperand(0),
+ ConstantVector::get(Mask),
+ getReplacementName(I, true, o));
+ if (I->getMetadata("wi") != NULL) {
+ BV->setMetadata("wi", I->getMetadata("wi"));
+ BV->setMetadata("wi_counter", I->getMetadata("wi_counter"));
+ }
+ BV->insertBefore(J);
+ return BV;
+ }
+
+ Instruction *BV1 = InsertElementInst::Create(
+ UndefValue::get(VArgType),
+ L->getOperand(o), CV0,
+ getReplacementName(I, true, o, 1));
+ if (I->getMetadata("wi") != NULL) {
+ BV1->setMetadata("wi", I->getMetadata("wi"));
+ BV1->setMetadata("wi_counter", I->getMetadata("wi_counter"));
+ }
+
+ BV1->insertBefore(I);
+
+ Instruction *BV2 = InsertElementInst::Create(BV1, H->getOperand(o),
+ CV1,
+ getReplacementName(I, true, o, 2));
+ if (J->getMetadata("wi") != NULL) {
+ BV2->setMetadata("wi",J->getMetadata("wi"));
+ BV2->setMetadata("wi_counter",J->getMetadata("wi_counter"));
+ }
+ BV2->insertBefore(J);
+ return BV2;
+ }
+
+ // Fills ReplacedOperands with the operands of the vector instruction that
+ // fuses I with J. ReplacedOperands must already have one slot per operand
+ // of I; slot o receives the replacement for operand o.
+ void WIVectorize::getReplacementInputsForPair(LLVMContext& Context,
+                      Instruction *I, Instruction *J,
+                      SmallVector<Value *, 3> &ReplacedOperands,
+                      bool FlipMemInputs) {
+   const unsigned NumOperands = I->getNumOperands();
+
+   // Operands are visited from last to first so that the store pointer is
+   // looked at first and we know whether or not the inputs must be flipped.
+   for (unsigned o = NumOperands; o-- > 0;) {
+     const bool IsLastOperand = (o == NumOperands - 1);
+
+     if (isa<LoadInst>(I) || (isa<StoreInst>(I) && o == 1)) {
+       // This is the pointer for a load/store instruction.
+       ReplacedOperands[o] =
+         getReplacementPointerInput(Context, I, J, o, FlipMemInputs);
+       continue;
+     }
+
+     if (CallInst *Call = dyn_cast<CallInst>(I)) {
+       unsigned IID = Call->getCalledFunction()->getIntrinsicID();
+
+       if (IsLastOperand) {
+         // The last operand of a call is the callee: swap in the declaration
+         // of the same intrinsic for the fused vector type.
+         Module *M = I->getParent()->getParent()->getParent();
+         Type *VArgType = getVecTypeForPair(I->getType(), J->getType());
+
+         // FIXME: is it safe to do this here?
+         ReplacedOperands[o] =
+           Intrinsic::getDeclaration(M, (Intrinsic::ID) IID, VArgType);
+         continue;
+       }
+
+       if (IID == Intrinsic::powi && o == 1) {
+         // The second argument of powi is a single integer and we've already
+         // checked that both arguments are equal. As a result, we just keep
+         // I's second argument.
+         ReplacedOperands[o] = I->getOperand(o);
+         continue;
+       }
+       // Any other call operand is handled like an ordinary input below.
+     } else if (isa<ShuffleVectorInst>(I) && IsLastOperand) {
+       // The last operand of a shuffle is its mask.
+       ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
+       continue;
+     }
+
+     ReplacedOperands[o] =
+       getReplacementInput(Context, I, J, o, FlipMemInputs);
+   }
+ }
+ // As with the aliasing information, SCEV can also change because of
+ // vectorization. This information is used to compute relative pointer
+ // offsets; the necessary information is cached here prior to fusion.
+ //
+ // For every chosen pair (I, J) whose first member is a load, store or
+ // getelementptr, one member is inserted into LowPtrInsts: I when
+ // getPairPtrInfo reports a positive OffsetInElmts, J otherwise.
+ void WIVectorize::collectPtrInfo(std::vector<Value *> &PairableInsts,
+                      DenseMap<Value *, Value *> &ChosenPairs,
+                      DenseSet<Value *> &LowPtrInsts) {
+   for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
+        PIE = PairableInsts.end(); PI != PIE; ++PI) {
+     DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
+     if (P == ChosenPairs.end()) continue;
+
+     Instruction *I = cast<Instruction>(P->first);
+     Instruction *J = cast<Instruction>(P->second);
+
+     if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<GetElementPtrInst>(I))
+       continue;
+
+     // All out-parameters start from a defined value. A getelementptr pair
+     // is allowed to fail the analysis below, and OffsetInElmts is still
+     // read afterwards; leaving it uninitialised made that read undefined.
+     Value *IPtr = NULL, *JPtr = NULL;
+     unsigned IAlignment = 0, JAlignment = 0;
+     unsigned IAddressSpace = 0, JAddressSpace = 0;
+     int64_t OffsetInElmts = 0;
+
+     const bool Adjacent =
+       getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
+                      IAddressSpace, JAddressSpace, OffsetInElmts) &&
+       abs64(OffsetInElmts) == 1;
+
+     // Loads and stores must have been proven adjacent before they were
+     // chosen; only getelementptr pairs may get here without that proof.
+     if (!Adjacent && !isa<GetElementPtrInst>(I))
+       llvm_unreachable("Pre-fusion pointer analysis failed");
+
+     Value *LowPI = (OffsetInElmts > 0) ? I : J;
+     LowPtrInsts.insert(LowPI);
+   }
+ }
+
+ // This function creates two values that represent the outputs of the
+ // original I and J instructions. These are generally vector shuffles
+ // or extracts. In many cases, these will end up being unused and, thus,
+ // eliminated by later passes.
+ void WIVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
+ Instruction *J, Instruction *K,
+ Instruction *&InsertionPt,
+ Instruction *&K1, Instruction *&K2,
+ bool FlipMemInputs) {
+ Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); // i32 0, used as an extract index
+ Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); // i32 1, used as an extract index
+
+ if (isa<StoreInst>(I)) { // stores: only alias analysis is updated; K1, K2 and InsertionPt are left untouched
+ AA->replaceWithNewValue(I, K);
+ AA->replaceWithNewValue(J, K);
+ } else {
+ Type *IType = I->getType();
+ Type *JType = J->getType();
+
+ VectorType *VType = getVecTypeForPair(IType, JType);
+
+ if (IType->isVectorTy()) { // I is itself a vector: split K with two shuffles
+ unsigned numElem = cast<VectorType>(IType)->getNumElements();
+ std::vector<Constant*> Mask1(numElem), Mask2(numElem);
+ for (unsigned v = 0; v < numElem; ++v) {
+ Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); // indexes 0 .. numElem-1
+ Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElem+v); // indexes numElem .. 2*numElem-1
+ }
+
+ K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
+ ConstantVector::get(
+ FlipMemInputs ? Mask2 : Mask1),
+ getReplacementName(K, false, 1));
+ K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
+ ConstantVector::get(
+ FlipMemInputs ? Mask1 : Mask2),
+ getReplacementName(K, false, 2));
+ storedSources.insert(ValuePair(FlipMemInputs ? K1 : K2, K)); // both K1 and K2 end up mapped to K; the flag only changes insertion order
+ storedSources.insert(ValuePair(FlipMemInputs ? K2 : K1, K));
+ flippedStoredSources.insert(ValuePair(K, FlipMemInputs ? K1 : K2)); // reverse map: K -> K1 and K -> K2
+ flippedStoredSources.insert(ValuePair(K, FlipMemInputs ? K2 : K1));
+ Instruction* L = I;
+ Instruction* H = J;
+ if (FlipMemInputs) { // swap which of I and J is treated as L and which as H
+ L = J;
+ H = I;
+ }
+ VPIteratorPair v1 =
+ flippedStoredSources.equal_range(L); // every value currently recorded under key L
+ for (std::multimap<Value*, Value*>::iterator ii = v1.first;
+ ii != v1.second; ii++) {
+ storedSources.erase((*ii).second); // re-key this value from L to K in both maps
+ storedSources.insert(ValuePair((*ii).second,K));
+ flippedStoredSources.insert(ValuePair(K, (*ii).second));
+ storedSources.erase(L);
+ }
+ flippedStoredSources.erase(L); // L is about to be replaced, drop its reverse entries
+ VPIteratorPair v2 = flippedStoredSources.equal_range(H); // same re-keying for H
+ for (std::multimap<Value*, Value*>::iterator ji = v2.first;
+ ji != v2.second; ji++) {
+ storedSources.erase((*ji).second);
+ storedSources.insert(ValuePair((*ji).second,K));
+ flippedStoredSources.insert(ValuePair(K, (*ji).second));
+ storedSources.erase(H);
+ }
+ flippedStoredSources.erase(H);
+ } else { // I is not a vector: one extractelement per original output
+ K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0,
+ getReplacementName(K, false, 1));
+ K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1,
+ getReplacementName(K, false, 2));
+ storedSources.insert(ValuePair(K1,K));
+ storedSources.insert(ValuePair(K2,K));
+ flippedStoredSources.insert(ValuePair(K, K1));
+ flippedStoredSources.insert(ValuePair(K, K2));
+ }
+ if (I->getMetadata("wi") != NULL) { // K1 stands in for I, so it inherits I's work-item metadata
+ K1->setMetadata("wi", I->getMetadata("wi"));
+ K1->setMetadata("wi_counter", I->getMetadata("wi_counter"));
+ }
+ if (J->getMetadata("wi") != NULL) { // K2 stands in for J
+ K2->setMetadata("wi", J->getMetadata("wi"));
+ K2->setMetadata("wi_counter", J->getMetadata("wi_counter"));
+ }
+
+ K1->insertAfter(K); // resulting order: K, K1, K2
+ K2->insertAfter(K1);
+ InsertionPt = K2; // the caller inserts moved instructions after K2
+ }
+ }
+
+ // Move all uses of the instruction I (including pairing-induced uses) that
+ // sit between I and J to after InsertionPt. InsertionPt is advanced to the
+ // last instruction moved.
+ void WIVectorize::moveUsesOfIAfterJ(BasicBlock &/*BB*/,
+                      std::multimap<Value *, Value *> &LoadMoveSet,
+                      Instruction *&InsertionPt,
+                      Instruction *I, Instruction *J) {
+   DenseSet<Value *> Users;
+   AliasSetTracker WriteSet(*AA);
+
+   // Begin with the first instruction past I and stop on reaching J.
+   BasicBlock::iterator Cursor = llvm::next(BasicBlock::iterator(I));
+   while (cast<Instruction>(Cursor) != J) {
+     Instruction *Candidate = Cursor;
+     const bool DependsOnI =
+       trackUsesOfI(Users, WriteSet, I, Cursor, true, &LoadMoveSet);
+
+     // Step past the candidate before it is possibly unlinked.
+     ++Cursor;
+     if (!DependsOnI)
+       continue;
+
+     // Move this instruction.
+     Candidate->removeFromParent();
+     Candidate->insertAfter(InsertionPt);
+     InsertionPt = Candidate;
+   }
+ }
+
+
+ // Collect all load instructions that are in the move set of a given first
+ // pair member. These loads depend on the first instruction, I, and so need
+ // to be moved after J (the second instruction) when the pair is fused.
+ // Each such instruction is recorded in LoadMoveSet as (instruction, I).
+ void WIVectorize::collectPairLoadMoveSet(BasicBlock &BB,
+                      DenseMap<Value *, Value *> &/*ChosenPairs*/,
+                      std::multimap<Value *, Value *> &LoadMoveSet,
+                      Instruction *I) {
+   DenseSet<Value *> Users;
+   AliasSetTracker WriteSet(*AA);
+
+   // Note: We cannot end the scan when we reach J because J could be moved
+   // farther down the use chain by another instruction pairing. Also, J
+   // could be before I if this is an inverted input. So the scan runs from
+   // the first instruction past I to the end of the block.
+   BasicBlock::iterator Cursor = llvm::next(BasicBlock::iterator(I));
+   BasicBlock::iterator End = BB.end();
+   while (cast<Instruction>(Cursor) != End) {
+     if (trackUsesOfI(Users, WriteSet, I, Cursor) &&
+         Cursor->mayReadFromMemory())
+       LoadMoveSet.insert(ValuePair(Cursor, I));
+     ++Cursor;
+   }
+ }
+
+ // When both loads/stores and the computation of their pointers are chosen
+ // for vectorization, alias analysis can start returning different query
+ // results as instruction pairs are fused. Because the algorithm relies on
+ // finding the same use trees here as were found earlier, the necessary
+ // aliasing information is precomputed here (one move set per chosen pair)
+ // and then manually updated during the fusion process.
+ void WIVectorize::collectLoadMoveSet(BasicBlock &BB,
+                      std::vector<Value *> &PairableInsts,
+                      DenseMap<Value *, Value *> &ChosenPairs,
+                      std::multimap<Value *, Value *> &LoadMoveSet) {
+   typedef std::vector<Value *>::size_type Index;
+   for (Index Idx = 0, Count = PairableInsts.size(); Idx != Count; ++Idx) {
+     DenseMap<Value *, Value *>::iterator Found =
+       ChosenPairs.find(PairableInsts[Idx]);
+
+     // Only instructions that are the first member of a chosen pair matter.
+     if (Found != ChosenPairs.end())
+       collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet,
+                              cast<Instruction>(Found->first));
+   }
+ }
+
+ // This function fuses the chosen instruction pairs into vector instructions,
+ // taking care to preserve any needed scalar outputs and, then, it reorders
+ // the remaining instructions as needed (users of the first member of the
+ // pair need to be moved to after the location of the second member of the
+ // pair because the vector instruction is inserted in the location of the
+ // pair's second member).
+ //
+ // BB            block whose instructions are fused in place.
+ // PairableInsts candidate instructions, used for the pre-fusion analyses.
+ // ChosenPairs   first -> second map of pairs to fuse; entries are consumed
+ //               (erased) as the pairs are fused.
+ void WIVectorize::fuseChosenPairs(BasicBlock &BB,
+                      std::vector<Value *> &PairableInsts,
+                      DenseMap<Value *, Value *> &ChosenPairs) {
+   LLVMContext& Context = BB.getContext();
+
+   // During the vectorization process, the order of the pairs to be fused
+   // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
+   // list. After a pair is fused, the flipped pair is removed from the list.
+   std::vector<ValuePair> FlippedPairs;
+   FlippedPairs.reserve(ChosenPairs.size());
+   for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
+        E = ChosenPairs.end(); P != E; ++P)
+     FlippedPairs.push_back(ValuePair(P->second, P->first));
+   for (std::vector<ValuePair>::iterator P = FlippedPairs.begin(),
+        E = FlippedPairs.end(); P != E; ++P)
+     ChosenPairs.insert(*P);
+
+   // Cache the alias and pointer-offset information now; both analyses may
+   // answer differently once instructions start being rewritten.
+   std::multimap<Value *, Value *> LoadMoveSet;
+   collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet);
+   DenseSet<Value *> LowPtrInsts;
+   collectPtrInfo(PairableInsts, ChosenPairs, LowPtrInsts);
+
+   DEBUG(dbgs() << "WIV: initial: \n" << BB << "\n");
+
+   for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
+     DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI);
+     if (P == ChosenPairs.end()) {
+       ++PI;
+       continue;
+     }
+
+     if (getDepthFactor(P->first) == 0) {
+       // These instructions are not really fused, but are tracked as though
+       // they are. Any case in which it would be interesting to fuse them
+       // will be taken care of by InstCombine.
+       --NumFusedOps;
+       ++PI;
+       continue;
+     }
+
+     Instruction *I = cast<Instruction>(P->first),
+                 *J = cast<Instruction>(P->second);
+
+     DEBUG(dbgs() << "WIV: fusing: " << *I <<
+           " <-> " << *J << "\n");
+
+     // Remove the pair and flipped pair from the list.
+     DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second);
+     assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
+     ChosenPairs.erase(FP);
+     ChosenPairs.erase(P);
+
+     // Memory-like pairs are flipped when I is not the member that was
+     // recorded as the low-address one before fusion started.
+     bool FlipMemInputs = false;
+     if (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<GetElementPtrInst>(I))
+       FlipMemInputs = (LowPtrInsts.find(I) == LowPtrInsts.end());
+     unsigned NumOperands = I->getNumOperands();
+     SmallVector<Value *, 3> ReplacedOperands(NumOperands);
+     getReplacementInputsForPair(Context, I, J, ReplacedOperands,
+                                 FlipMemInputs);
+
+     // Make a copy of the original operation, change its type to the vector
+     // type and replace its operands with the vector operands.
+     Instruction *K = I->clone();
+     if (I->hasName()) K->takeName(I);
+
+     if (I->getMetadata("wi") != NULL) {
+       K->setMetadata("wi", I->getMetadata("wi"));
+       K->setMetadata("wi_counter", I->getMetadata("wi_counter"));
+     }
+     if (!isa<StoreInst>(K))
+       K->mutateType(getVecTypeForPair(I->getType(), J->getType()));
+
+     for (unsigned o = 0; o < NumOperands; ++o)
+       K->setOperand(o, ReplacedOperands[o]);
+
+     // If we've flipped the memory inputs, make sure that we take the correct
+     // alignment. FlipMemInputs can also be set for a getelementptr pair,
+     // which has no alignment; the former unconditional cast<LoadInst>(K)
+     // was invalid for that case, so the instruction kind is checked first.
+     if (FlipMemInputs) {
+       if (StoreInst *StoreK = dyn_cast<StoreInst>(K))
+         StoreK->setAlignment(cast<StoreInst>(J)->getAlignment());
+       else if (LoadInst *LoadK = dyn_cast<LoadInst>(K))
+         LoadK->setAlignment(cast<LoadInst>(J)->getAlignment());
+     }
+
+     K->insertAfter(J);
+
+     // Instruction insertion point:
+     Instruction *InsertionPt = K;
+     Instruction *K1 = 0, *K2 = 0;
+     replaceOutputsOfPair(Context, I, J, K, InsertionPt, K1, K2,
+                          FlipMemInputs);
+
+     // The use tree of the first original instruction must be moved to after
+     // the location of the second instruction. The entire use tree of the
+     // first instruction is disjoint from the input tree of the second
+     // (by definition), and so commutes with it.
+
+     moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J);
+
+     if (!isa<StoreInst>(I)) {
+       I->replaceAllUsesWith(K1);
+       J->replaceAllUsesWith(K2);
+       AA->replaceWithNewValue(I, K1);
+       AA->replaceWithNewValue(J, K2);
+     }
+
+     // Instructions that may read from memory may be in the load move set.
+     // Once an instruction is fused, we no longer need its move set, and so
+     // the values of the map never need to be updated. However, when a load
+     // is fused, we need to merge the entries from both instructions in the
+     // pair in case those instructions were in the move set of some other
+     // yet-to-be-fused pair. The loads in question are the keys of the map.
+     if (I->mayReadFromMemory()) {
+       std::vector<ValuePair> NewSetMembers;
+       VPIteratorPair IPairRange = LoadMoveSet.equal_range(I);
+       VPIteratorPair JPairRange = LoadMoveSet.equal_range(J);
+       for (std::multimap<Value *, Value *>::iterator N = IPairRange.first;
+            N != IPairRange.second; ++N)
+         NewSetMembers.push_back(ValuePair(K, N->second));
+       for (std::multimap<Value *, Value *>::iterator N = JPairRange.first;
+            N != JPairRange.second; ++N)
+         NewSetMembers.push_back(ValuePair(K, N->second));
+       for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(),
+            AE = NewSetMembers.end(); A != AE; ++A)
+         LoadMoveSet.insert(*A);
+     }
+
+     // Before removing I, set the iterator to the next instruction.
+     PI = llvm::next(BasicBlock::iterator(I));
+     if (cast<Instruction>(PI) == J)
+       ++PI;
+
+     SE->forgetValue(I);
+     SE->forgetValue(J);
+     I->eraseFromParent();
+     J->eraseFromParent();
+   }
+
+   DEBUG(dbgs() << "WIV: final: \n" << BB << "\n");
+ }
+ // Erases every shufflevector, extractelement, insertelement and bitcast
+ // instruction in BB that has no users, repeating until no further
+ // instruction can be removed (erasing one instruction can leave its
+ // operands without users).
+ //
+ // The block is swept from its last instruction to its first. Unlike the
+ // previous implementation this also examines the first instruction of the
+ // block, tolerates an empty block, and does not restart the scan from the
+ // end after every single erase.
+ void WIVectorize::dropUnused(BasicBlock& BB) {
+   bool changed;
+   do {
+     changed = false;
+     BasicBlock::iterator I = BB.end();
+     while (I != BB.begin()) {
+       --I;
+
+       if (!isa<ShuffleVectorInst>(*I) &&
+           !isa<ExtractElementInst>(*I) &&
+           !isa<InsertElementInst>(*I) &&
+           !isa<BitCastInst>(*I))
+         continue;
+
+       if (!I->use_empty())
+         continue;
+
+       // Park the iterator on the successor (which survives the erase) so
+       // that the next --I lands on the predecessor of the dead instruction.
+       Instruction *Dead = &(*I);
+       ++I;
+       SE->forgetValue(Dead);
+       Dead->eraseFromParent();
+       changed = true;
+     }
+   } while (changed);
+ }
+
+ // Replace uses of the old alloca with the new alloca.
+ // Only getelementptr, bitcast, load and store users are handled
+ // at the moment; any other user is skipped and left unchanged.
+ // In case the original alloca was an array, the getelementptr and bitcast cases apply.
+ void WIVectorize::replaceUses(BasicBlock& BB,
+ AllocaInst& oldAlloca,
+ AllocaInst& newAlloca,
+ int indx) {
+
+ LLVMContext& Context = BB.getContext();
+ Instruction::use_iterator useiter = oldAlloca.use_begin();
+
+ while (useiter != oldAlloca.use_end()) {
+ llvm::User* tmp = *useiter;
+
+ if (isa<BitCastInst>(tmp)) {
+ // Create a new bitcast from the new alloca to the same type
+ // as the old bitcast had. This is the situation where the
+ // alloca is cast to i8* followed by
+ // call void @llvm.lifetime.start(i64 -1, i8* %XYZ) nounwind
+ BitCastInst* bitCast = cast<BitCastInst>(tmp);
+ IRBuilder<> builder(bitCast);
+ BitCastInst* newBitcast =
+ cast<BitCastInst>(builder.CreateBitCast(
+ &newAlloca, bitCast->getDestTy(), bitCast->getName()));
+
+ if (bitCast->getMetadata("wi") != NULL) {
+ newBitcast->setMetadata("wi", bitCast->getMetadata("wi"));
+ newBitcast->setMetadata("wi_counter", bitCast->getMetadata("wi_counter"));
+ }
+
+ bitCast->replaceAllUsesWith(newBitcast);
+ AA->replaceWithNewValue(bitCast, newBitcast);
+ SE->forgetValue(bitCast);
+ bitCast->eraseFromParent();
+
+ useiter = oldAlloca.use_begin(); // the use list changed, restart from its head
+ continue;
+ }
+
+ if (isa<GetElementPtrInst>(tmp)) {
+ // The original getelementptr contains a number of indexes
+ // that indicate how to access an element of the allocated
+ // memory. Since we changed the innermost type to an
+ // array, we add an index into that array such as:
+ // Original alloca:
+ // %A = alloca [20 x [8 x i32]], align 4
+ // Original getelementptr:
+ // %68 = getelementptr inbounds [20 x [8 x i32]]* %A, i32 0, i32 %X, i32 0
+ // New alloca:
+ // %A = alloca [20 x [8 x [2 x i32]]], align 4
+ // New getelementptr:
+ // %68 = getelementptr inbounds [20 x [8 x [2 x i32]]]* %A, i32 0, i32 %X, i32 0, i32 0
+
+ GetElementPtrInst* gep = cast<GetElementPtrInst>(tmp);
+ std::vector<llvm::Value *> gepArgs;
+ // Collect the original indexes of the getelementptr (operand 0 is the pointer)
+ for (unsigned int i = 1; i <= gep->getNumIndices(); i++) {
+ gepArgs.push_back(gep->getOperand(i));
+ }
+ // Add the index into the newly created innermost array
+ Value *V = ConstantInt::get(Type::getInt32Ty(Context), indx);
+ gepArgs.push_back(V);
+ IRBuilder<> builder(gep);
+ GetElementPtrInst* newGep =
+ cast<GetElementPtrInst>(
+ builder.CreateGEP(&newAlloca, gepArgs, gep->getName()));
+ newGep->setIsInBounds(gep->isInBounds());
+
+ if (gep->getMetadata("wi") != NULL) {
+ newGep->setMetadata("wi", gep->getMetadata("wi"));
+ newGep->setMetadata("wi_counter", gep->getMetadata("wi_counter"));
+ }
+
+ gep->replaceAllUsesWith(newGep);
+ AA->replaceWithNewValue(gep, newGep);
+ SE->forgetValue(gep);
+ gep->eraseFromParent();
+ useiter = oldAlloca.use_begin(); // the use list changed, restart from its head
+ continue;
+ }
+ if (isa<StoreInst>(tmp)) {
+ // This is tricky: the original alloca was for a base type such
+ // as i32 or float, so the variable was used directly.
+ // Now it is an array, so we have to add a getelementptr.
+ StoreInst* store = cast<StoreInst>(tmp);
+ std::vector<llvm::Value *> gepArgs;
+ Value *V = ConstantInt::get(Type::getInt32Ty(Context), indx);
+ gepArgs.push_back(V);
+ IRBuilder<> builder(store);
+ GetElementPtrInst* newGep =
+ cast<GetElementPtrInst>(builder.CreateGEP(&newAlloca, gepArgs));
+ if (store->getMetadata("wi") != NULL) {
+ newGep->setMetadata("wi", store->getMetadata("wi"));
+ newGep->setMetadata("wi_counter", store->getMetadata("wi_counter"));
+ }
+
+ for (unsigned int i = 0; i < store->getNumOperands(); i++) {
+ // Either of the store operands could be the alloca: we either
+ // store to the allocated memory, or we are storing the pointer
+ // to the memory (this is a rather dumb thing to do).
+ if (store->getOperand(i) == &oldAlloca) {
+ IRBuilder<> builder(store);
+ BitCastInst* newBitcast =
+ cast<BitCastInst>(builder.CreateBitCast(
+ newGep, store->getOperand(i)->getType()));
+ if (store->getMetadata("wi") != NULL) {
+ newBitcast->setMetadata("wi", store->getMetadata("wi"));
+ newBitcast->setMetadata("wi_counter", store->getMetadata("wi_counter"));
+ }
+ store->setOperand(i, newBitcast); // the store itself is kept, only its operand changes
+ }
+ }
+ useiter = oldAlloca.use_begin(); // the use list changed, restart from its head
+ continue;
+ }
+ if (isa<LoadInst>(tmp)) {
+ // This is tricky: the original alloca was for a base type such
+ // as i32 or float, so the variable was used directly.
+ // Now it is an array, so we have to add a getelementptr.
+
+ LoadInst* load = cast<LoadInst>(tmp);
+ std::vector<llvm::Value *> gepArgs;
+ Value *V = ConstantInt::get(Type::getInt32Ty(Context), indx);
+ gepArgs.push_back(V);
+ IRBuilder<> builder(load);
+ GetElementPtrInst* newGep =
+ cast<GetElementPtrInst>(builder.CreateGEP(&newAlloca, gepArgs));
+ if (load->getMetadata("wi") != NULL) {
+ newGep->setMetadata("wi", load->getMetadata("wi"));
+ newGep->setMetadata("wi_counter", load->getMetadata("wi_counter"));
+ }
+
+ for (unsigned int i = 0; i < load->getNumOperands(); i++) {
+ // Find the operand of the load that was the old alloca and
+ // use a bitcast to point it to the getelementptr result.
+ // There must be a better way to do this.
+ if (load->getOperand(i) == &oldAlloca) {
+ IRBuilder<> builder(load);
+ BitCastInst* newBitcast =
+ cast<BitCastInst>(builder.CreateBitCast(
+ newGep, load->getOperand(i)->getType()));
+ if (load->getMetadata("wi") != NULL) {
+ newBitcast->setMetadata("wi", load->getMetadata("wi"));
+ newBitcast->setMetadata("wi_counter", load->getMetadata("wi_counter"));
+ }
+ load->setOperand(i, newBitcast); // the load itself is kept, only its operand changes
+ }
+ }
+ useiter = oldAlloca.use_begin(); // the use list changed, restart from its head
+ continue;
+ }
+ useiter++; // unsupported kind of user: leave it alone and look at the next use
+ }
+ }
+
+ // Computes the allocated type of the combined ("vector") alloca.
+ // Array types are rebuilt with their element type converted recursively;
+ // the innermost non-pointer first-class type T becomes [width x T].
+ // Any other type is returned unchanged, which callers treat as
+ // "alloca will not be replaced".
+ Type* WIVectorize::newAllocaType(Type* start, unsigned int width) {
+
+   if (ArrayType *Arr = dyn_cast<ArrayType>(start)) {
+     // Still an array: keep this dimension and convert what it contains.
+     int Count = Arr->getNumElements();
+     Type *Inner = newAllocaType(Arr->getElementType(), width);
+     return ArrayType::get(Inner, Count);
+   }
+
+   // Type that is not recognized: just return it.
+   if (start->isPointerTy() || !start->isFirstClassType())
+     return start;
+
+   // Recursion stopping point: e.g. i32 becomes [width x i32].
+   return ArrayType::get(start, width);
+ }
+
+ // In case there is a private variable in the kernel that does not fit into
+ // a register (a multidimensional array for example), allocas are
+ // defined to create the necessary memory space for the variable.
+ // Those are then replicated for each of the work items.
+ // This pass attempts to combine those allocas to create an 'interleaved'
+ // memory allocation that can then be accessed by vector loads and stores
+ // as described below:
+ //
+ // __kernel xyz() {
+ //
+ // int A[100][100][100];
+ // ...
+ // }
+ // Will become after replication with 2 work items:
+ //
+ // %A = alloca [100 x [100 x [100 x i32]]], align 4
+ // %A_wi_1_0_0 = alloca [100 x [100 x [100 x i32]]], align 4
+ //
+ // This in turn will be converted here to:
+ // %A = alloca [100 x [100 x [100 x [2 x i32]]]], align 4
+ // and the respective getelementptr instructions are given an
+ // additional parameter to select the correct member of the pair.
+ //
+ // NOTE: This works only for arrays ATM; the scalar type allocas
+ // as produced by the phistoallocas pass required for the work loops
+ // are skipped for now.
+ //
+ // Returns true when at least one group of allocas was replaced.
+
+ bool WIVectorize::vectorizeAllocas(BasicBlock& BB) {
+
+   typedef std::multimap<int, ValueVector*> AllocaGroups;
+   AllocaGroups allocas;
+   getCandidateAllocas(BB, allocas);
+   bool changed = false;
+
+   for (AllocaGroups::iterator insIt = allocas.begin();
+        insIt != allocas.end(); insIt++) {
+     ValueVector* tmpVec = (*insIt).second;
+     // Create as 'wide' an alloca as the number of elements found,
+     // which could be smaller than the vector width or larger.
+     // Should be the same as the work group dimensions for work item
+     // replicas or the number of unrolled loops with work item loops.
+     unsigned int allocaWidth = tmpVec->size();
+     // No point vectorizing one alloca only
+     if (allocaWidth <= 1)
+       continue;
+
+     AllocaInst* I = cast<AllocaInst>((*tmpVec)[0]);
+     Type* startType = I->getAllocatedType();
+     if (!startType->isArrayTy())
+       continue;
+     // Find the new type for the alloca by recursively searching through
+     // the dimensions of the array
+     Type* newType = newAllocaType(startType, allocaWidth);
+
+     // No new type was found, alloca type not supported.
+     if (newType == startType)
+       continue;
+
+     changed = true;
+     IRBuilder<> builder(
+       BB.getParent()->getEntryBlock().getFirstInsertionPt());
+     llvm::AllocaInst *merged =
+       builder.CreateAlloca(newType, 0, I->getName().str() + "_allocamix");
+     merged->setAlignment(I->getAlignment());
+
+     if (I->getMetadata("wi") != NULL) {
+       merged->setMetadata("wi", I->getMetadata("wi"));
+       merged->setMetadata("wi_counter", I->getMetadata("wi_counter"));
+     }
+
+     // Redirect the uses of every alloca of the group (the first one
+     // included) to the merged alloca, then erase the old alloca.
+     for (unsigned int i = 0; i < allocaWidth; i++) {
+       AllocaInst* J = cast<AllocaInst>((*tmpVec)[i]);
+       MDNode* mj = J->getMetadata("wi");
+       assert(mj != NULL && mj->getNumOperands() == 3);
+       // Operand 2 of the "wi" node is itself an MDNode; its operand 1 is
+       // used as the index of this alloca inside the merged array.
+       // cast<> replaces the former unchecked dyn_cast<> dereference.
+       MDNode* jXYZ = cast<MDNode>(mj->getOperand(2));
+       assert(jXYZ->getNumOperands() == 4);
+       int index =
+         cast<ConstantInt>(jXYZ->getOperand(1))->getZExtValue();
+
+       replaceUses(BB, *J, *merged, index);
+       SE->forgetValue(J);
+       J->eraseFromParent();
+     }
+   }
+
+   // The groups were heap-allocated by getCandidateAllocas and are owned by
+   // the local map; release them (they were previously leaked).
+   for (AllocaGroups::iterator insIt = allocas.begin();
+        insIt != allocas.end(); insIt++)
+     delete (*insIt).second;
+
+   return changed;
+ }
+
+ // Closely related to getCandidatePairs, except that this one only
+ // picks AllocaInst instructions that carry "wi" metadata, and it returns
+ // all matching instances of an AllocaInst at the same time.
+ //
+ // Allocas are grouped into heap-allocated ValueVectors stored in
+ // 'temporary', keyed by the value read from operand 1 of their
+ // "wi_counter" metadata. Two allocas share a group when they have that
+ // same key, are the same operation (isSameOperationAs) and agree on the
+ // value read from operand 1 of operand 1 of their "wi" metadata (the
+ // "region" in the code below). The caller owns the ValueVectors.
+ // Always returns true.
+ bool WIVectorize::getCandidateAllocas(BasicBlock &BB,
+                      std::multimap<int, ValueVector*>& temporary) {
+
+   // The former "I = Start++" also incremented Start, which is invalid when
+   // the block's first insertion point is already its end.
+   BasicBlock::iterator E = BB.end();
+   for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
+
+     if (!isa<AllocaInst>(I))
+       continue;
+     // TODO: This is a bit tricky, should it be possible
+     // to create a vector of allocas that do not have metadata?
+     MDNode* md = I->getMetadata("wi");
+     MDNode* mdCounter = I->getMetadata("wi_counter");
+     // Without both nodes the alloca cannot be keyed; skip it instead of
+     // dereferencing a null node.
+     if (md == NULL || mdCounter == NULL)
+       continue;
+
+     MDNode* mdRegion = cast<MDNode>(md->getOperand(1));
+
+     unsigned CI = cast<ConstantInt>(mdCounter->getOperand(1))->getZExtValue();
+     unsigned RI = cast<ConstantInt>(mdRegion->getOperand(1))->getZExtValue();
+
+     std::multimap<int,ValueVector*>::iterator itb = temporary.lower_bound(CI);
+     std::multimap<int,ValueVector*>::iterator ite = temporary.upper_bound(CI);
+     ValueVector* tmpVec = NULL;
+     // Look through every group with the same key; when several match, the
+     // last matching one is used.
+     while (itb != ite) {
+       Instruction* First = cast<Instruction>((*(*itb).second)[0]);
+       if (I->isSameOperationAs(First)) {
+         // Test also if the instructions are from the same region.
+         MDNode* tmpMD = First->getMetadata("wi");
+         MDNode* tmpRINode = cast<MDNode>(tmpMD->getOperand(1));
+         unsigned tmpRI =
+           cast<ConstantInt>(tmpRINode->getOperand(1))->getZExtValue();
+         if (RI == tmpRI)
+           tmpVec = (*itb).second;
+       }
+       itb++;
+     }
+     if (tmpVec == NULL) {
+       tmpVec = new ValueVector;
+       temporary.insert(std::pair<int, ValueVector*>(CI, tmpVec));
+     }
+     tmpVec->push_back(I);
+   }
+   // NOTE(review): the result of areInstsCompatibleFromDifferentWi is not
+   // acted upon, so this loop does not filter anything. It is kept as is
+   // because the intended handling of incompatible pairs is not visible
+   // here - TODO confirm and either act on the result or remove the loop.
+   for (std::multimap<int, ValueVector*>::iterator insIt = temporary.begin();
+        insIt != temporary.end(); insIt++) {
+     ValueVector* tmpVec = (*insIt).second;
+     for (unsigned j = 0; j < tmpVec->size()/2; j++) {
+       Instruction* I = cast<Instruction>((*tmpVec)[2*j]);
+       Instruction* J = cast<Instruction>((*tmpVec)[2*j+1]);
+       if (!areInstsCompatibleFromDifferentWi(I,J))
+         continue;
+     }
+   }
+   return true;
+ }
+
+}
+char WIVectorize::ID = 0; // definition of the pass's static ID member
+RegisterPass<WIVectorize>
+ X("wi-vectorize", "Work item vectorization."); // registers the pass under the name "wi-vectorize"
+
+FunctionPass *createWIVectorizePass() { // factory: returns a newly allocated WIVectorize pass
+ return new WIVectorize();
+}
+
diff --git a/src/llvmopencl/WorkItemAliasAnalysis.cc b/src/llvmopencl/WorkItemAliasAnalysis.cc
new file mode 100644
index 0000000..1d1fba7
--- /dev/null
+++ b/src/llvmopencl/WorkItemAliasAnalysis.cc
@@ -0,0 +1,119 @@
+/*
+ Copyright (c) 2012 Tampere University of Technology.
+ Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files (the "Software"),
+ to deal in the Software without restriction, including without limitation
+ the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ and/or sell copies of the Software, and to permit persons to whom the
+ Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
+ */
+/**
+ * @file WorkItemAliasAnalysis.cc
+ *
+ * Definition of WorkItemAliasAnalysis class.
+ *
+ * @author Vladimír Guzma 2012
+ */
+
+#include "WorkItemAliasAnalysis.h"
+using namespace pocl;
+
+// Register this pass...
+char WorkItemAliasAnalysis::ID = 0;
+// The registration objects are given internal linkage so that their short
+// names cannot collide with identically named registration objects defined
+// in other translation units.
+static RegisterPass<WorkItemAliasAnalysis>
+  X("wi-aa", "Work item alias analysis.", false, false);
+// Register it also to pass group
+static RegisterAnalysisGroup<AliasAnalysis> Y(X);
+
+/// Returns a newly heap-allocated instance of the work item alias analysis.
+ImmutablePass *createWorkItemAliasAnalysisPass() {
+  return new WorkItemAliasAnalysis();
+}
+
+extern "C" {
+  /// Same as createWorkItemAliasAnalysisPass(), exported with C linkage so
+  /// the symbol name is not mangled.
+  ImmutablePass*
+  create_workitem_aa_plugin() {
+    return new WorkItemAliasAnalysis();
+  }
+}
+
+/// Declares that this analysis modifies nothing and then lets the
+/// AliasAnalysis base class add its own requirements to AU.
+void WorkItemAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const
+{
+  // This pass only answers queries; every other analysis stays valid.
+  AU.setPreservesAll();
+
+  AliasAnalysis::getAnalysisUsage(AU);
+}
+
+/**
+ * Returns operand Idx of MD as an MDNode, or NULL when MD is NULL, the
+ * operand index is out of range, the operand is empty or it is not an MDNode.
+ */
+static const MDNode *
+getMDNodeOperand(const MDNode *MD, unsigned Idx) {
+  if (MD == NULL || Idx >= MD->getNumOperands())
+    return NULL;
+  return dyn_cast_or_null<MDNode>(MD->getOperand(Idx));
+}
+
+/**
+ * Returns operand Idx of MD as a ConstantInt, or NULL when MD is NULL, the
+ * operand index is out of range, the operand is empty or it is not a
+ * ConstantInt.
+ */
+static const ConstantInt *
+getConstantIntOperand(const MDNode *MD, unsigned Idx) {
+  if (MD == NULL || Idx >= MD->getNumOperands())
+    return NULL;
+  return dyn_cast_or_null<ConstantInt>(MD->getOperand(Idx));
+}
+
+/**
+ * Test if memory locations are from different work items from same region.
+ * Then they can not alias.
+ *
+ * The decision is based on the "wi" metadata attached to the instructions
+ * that produce the two pointers: operand 1 of that node holds the region id
+ * (as operand 1 of a nested node) and operand 2 holds the work item
+ * coordinates (operands 1..3 of a nested node). Whenever the metadata is
+ * missing or does not have that layout, no conclusion is drawn and the query
+ * is forwarded to the next alias analysis.
+ *
+ * @param LocA First memory location.
+ * @param LocB Second memory location.
+ * @return NoAlias when either location is empty, when the pointers are in
+ *         different address spaces, or when both come from the same region
+ *         but different work items; otherwise the answer of the chained
+ *         analysis.
+ */
+AliasAnalysis::AliasResult
+WorkItemAliasAnalysis::alias(const Location &LocA,
+                             const Location &LocB) {
+  // If either of the memory references is empty, it doesn't matter what the
+  // pointer values are. This allows the code below to ignore this special
+  // case.
+  if (LocA.Size == 0 || LocB.Size == 0)
+    return NoAlias;
+
+  // Pointers from different address spaces do not alias
+  if (cast<PointerType>(LocA.Ptr->getType())->getAddressSpace() !=
+      cast<PointerType>(LocB.Ptr->getType())->getAddressSpace()) {
+    return NoAlias;
+  }
+
+  // In case code is created by pocl, we can also use metadata.
+  const Instruction *valA = dyn_cast<Instruction>(LocA.Ptr);
+  const Instruction *valB = dyn_cast<Instruction>(LocB.Ptr);
+  if (valA != NULL && valB != NULL) {
+    const MDNode *mdA = valA->getMetadata("wi");
+    const MDNode *mdB = valB->getMetadata("wi");
+
+    // Compare region ID. If they are same, different work items
+    // imply no aliasing. If regions are different or work items
+    // are same anything can happen.
+    // Fall back to other AAs.
+    const ConstantInt *regionA =
+      getConstantIntOperand(getMDNodeOperand(mdA, 1), 1);
+    const ConstantInt *regionB =
+      getConstantIntOperand(getMDNodeOperand(mdB, 1), 1);
+
+    if (regionA != NULL && regionB != NULL &&
+        regionA->getValue() == regionB->getValue()) {
+      // Now we have both locations from same region. Check for different
+      // work items.
+      const MDNode *iXYZ = getMDNodeOperand(mdA, 2);
+      const MDNode *jXYZ = getMDNodeOperand(mdB, 2);
+
+      bool coordinatesKnown = true;
+      bool sameWorkItem = true;
+      // Operands 1, 2 and 3 hold the x, y and z coordinates.
+      for (unsigned dim = 1; dim <= 3; ++dim) {
+        const ConstantInt *ci = getConstantIntOperand(iXYZ, dim);
+        const ConstantInt *cj = getConstantIntOperand(jXYZ, dim);
+        if (ci == NULL || cj == NULL) {
+          // Malformed metadata: do not guess, let the next analysis decide.
+          coordinatesKnown = false;
+          break;
+        }
+        if (ci->getValue() != cj->getValue())
+          sameWorkItem = false;
+      }
+
+      if (coordinatesKnown && !sameWorkItem)
+        return NoAlias;
+    }
+  }
+
+  // Forward the query to the next analysis.
+  return AliasAnalysis::alias(LocA, LocB);
+}
diff --git a/src/llvmopencl/WorkItemAliasAnalysis.h b/src/llvmopencl/WorkItemAliasAnalysis.h
new file mode 100644
index 0000000..5c07a02
--- /dev/null
+++ b/src/llvmopencl/WorkItemAliasAnalysis.h
@@ -0,0 +1,75 @@
+/*
+ Copyright (c) 2012 Tampere University of Technology.
+ Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files (the "Software"),
+ to deal in the Software without restriction, including without limitation
+ the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ and/or sell copies of the Software, and to permit persons to whom the
+ Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
+ */
+/**
+ * @file WorkItemAliasAnalysis.h
+ *
+ * Declaration of WorkItemAliasAnalysis class.
+ *
+ * @author Vladimír Guzma 2012
+ */
+
+#ifndef POCL_WORK_ITEM_ALIAS_ANALYSIS_H
+#define POCL_WORK_ITEM_ALIAS_ANALYSIS_H
+
+#include "config.h"
+#include <iostream>
+
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Pass.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Metadata.h"
+#include "llvm/Constants.h"
+#else
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Constants.h"
+#endif
+
+using namespace llvm;
+
+namespace pocl {
+/// WorkItemAliasAnalysis - This is a simple alias analysis
+/// implementation that uses pocl metadata to make sure memory accesses from
+/// different work items are not aliasing.
+class WorkItemAliasAnalysis : public llvm::ImmutablePass, public llvm::AliasAnalysis {
+public:
+  /// Pass identification; its address is what identifies the pass.
+  static char ID;
+  WorkItemAliasAnalysis() : ImmutablePass(ID) {}
+
+  /// getAdjustedAnalysisPointer - This method is used when a pass implements
+  /// an analysis interface through multiple inheritance. If needed, it
+  /// should override this to adjust the this pointer as needed for the
+  /// specified pass info.
+  virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
+    if (PI == &AliasAnalysis::ID)
+      return static_cast<AliasAnalysis*>(this);
+    return this;
+  }
+
+  /// Hooks this object into the alias analysis machinery.
+  virtual void initializePass() {
+    InitializeAliasAnalysis(this);
+  }
+
+private:
+  /// Declares what this pass preserves and requires.
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+  /// Answers an alias query for two memory locations.
+  virtual AliasResult alias(const Location &LocA, const Location &LocB);
+
+};
+}
+
+#endif
+
diff --git a/src/llvmopencl/Workgroup.cc b/src/llvmopencl/Workgroup.cc
new file mode 100644
index 0000000..85cd84f
--- /dev/null
+++ b/src/llvmopencl/Workgroup.cc
@@ -0,0 +1,619 @@
+// LLVM module pass to create the single function (fully inlined)
+// and parallelized kernel for an OpenCL workgroup.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "Workgroup.h"
+
+#include "CanonicalizeBarriers.h"
+#include "BarrierTailReplication.h"
+#include "WorkitemReplication.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "config.h"
+#ifdef LLVM_3_1
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/TypeBuilder.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Module.h"
+#elif defined LLVM_3_2
+#include "llvm/IRBuilder.h"
+#include "llvm/TypeBuilder.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/TypeBuilder.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Module.h"
+#endif
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cstdio>
+#include <map>
+#include <iostream>
+
+#include "pocl.h"
+
+#define STRING_LENGTH 32
+
+using namespace std;
+using namespace llvm;
+using namespace pocl;
+
+static void noaliasArguments(Function *F);
+static Function *createLauncher(Module &M, Function *F);
+static void privatizeContext(Module &M, Function *F);
+static void createWorkgroup(Module &M, Function *F);
+static void createWorkgroupFast(Module &M, Function *F);
+
+// extern cl::opt<string> Header;
+// extern cl::list<int> LocalSize;
+
+/* Command line option "kernel": the name of the kernel function to process
+   in this kernel compiler launch. Defaults to the empty string. */
+cl::opt<string> KernelName("kernel",
+                           cl::desc("Kernel function name"),
+                           cl::value_desc("kernel"),
+                           cl::init(""));
+
+namespace llvm {
+
+  typedef struct _pocl_context PoclContext;
+
+  /**
+   * TypeBuilder specialization that produces the LLVM struct type used for
+   * the pocl context: one 32-bit integer followed by three 3-element arrays
+   * of integers whose width is the configured size_t width. The Fields enum
+   * names the struct members in that order.
+   */
+  template<bool xcompile> class TypeBuilder<PoclContext, xcompile> {
+  public:
+    /**
+     * Builds the context struct type for the width last given to
+     * setSizeTWidth().
+     *
+     * @return The struct type, or NULL when the configured width is neither
+     *         32 nor 64 (after asserting in builds with assertions enabled).
+     */
+    static StructType *get(LLVMContext &Context) {
+      if (size_t_width == 64)
+        {
+          return StructType::get
+            (TypeBuilder<types::i<32>, xcompile>::get(Context),
+             TypeBuilder<types::i<64>[3], xcompile>::get(Context),
+             TypeBuilder<types::i<64>[3], xcompile>::get(Context),
+             TypeBuilder<types::i<64>[3], xcompile>::get(Context),
+             NULL);
+        }
+      else if (size_t_width == 32)
+        {
+          return StructType::get
+            (TypeBuilder<types::i<32>, xcompile>::get(Context),
+             TypeBuilder<types::i<32>[3], xcompile>::get(Context),
+             TypeBuilder<types::i<32>[3], xcompile>::get(Context),
+             TypeBuilder<types::i<32>[3], xcompile>::get(Context),
+             NULL);
+        }
+
+      assert (false && "Unsupported size_t width.");
+      // Reached only when assertions are compiled out; never fall off the
+      // end of a function that must return a value.
+      return NULL;
+    }
+
+    /**
+     * We compile for various targets with various widths for the size_t
+     * type that depends on the pointer type.
+     *
+     * This should be set when the correct type is known. This is a hack
+     * until a better way is found. It's not thread safe, e.g. if one
+     * compiles multiple Modules for multiple different pointer widths in
+     * a same process with multiple threads. */
+    static void setSizeTWidth(int width) {
+      size_t_width = width;
+    }
+
+    /** Indices of the members of the context struct. */
+    enum Fields {
+      WORK_DIM,
+      NUM_GROUPS,
+      GROUP_ID,
+      GLOBAL_OFFSET
+    };
+  private:
+    /** Width in bits used for the array elements; 0 until configured. */
+    static int size_t_width;
+
+  };
+
+  template<bool xcompile>
+  int TypeBuilder<PoclContext, xcompile>::size_t_width = 0;
+
+} // namespace llvm
+
+char Workgroup::ID = 0;
+static RegisterPass<Workgroup> X("workgroup", "Workgroup creation pass");
+
+
+/**
+ * Creates the launcher and work group functions for every kernel to process
+ * in the module.
+ *
+ * Steps: configure the size_t width of the context struct type from the
+ * module pointer size, give every defined function internal linkage, create
+ * a non-inlinable launcher plus the two work group entry functions for each
+ * kernel, and finally give the "barrier" function a body that just returns.
+ *
+ * @param M Module to transform.
+ * @return Always true.
+ */
+bool
+Workgroup::runOnModule(Module &M)
+{
+  if (M.getPointerSize() == llvm::Module::Pointer64)
+    {
+      TypeBuilder<PoclContext, true>::setSizeTWidth(64);
+    }
+  else if (M.getPointerSize() == llvm::Module::Pointer32)
+    {
+      TypeBuilder<PoclContext, true>::setSizeTWidth(32);
+    }
+  else
+    {
+      assert (false && "Target has an unsupported pointer width.");
+    }
+
+  for (Module::iterator i = M.begin(), e = M.end(); i != e; ++i) {
+    if (!i->isDeclaration())
+      i->setLinkage(Function::InternalLinkage);
+  }
+
+  // Collect the kernels before creating anything: the launcher and work
+  // group functions are appended to the module, and walking the function
+  // list while it grows would hand the generated functions back to
+  // isKernelToProcess() (which accepts every function when there is no
+  // kernel metadata and no kernel name was given).
+  SmallVector<Function *, 8> kernels;
+  for (Module::iterator i = M.begin(), e = M.end(); i != e; ++i) {
+    if (isKernelToProcess(*i))
+      kernels.push_back(&*i);
+  }
+
+  for (unsigned k = 0, ke = kernels.size(); k != ke; ++k) {
+    Function *L = createLauncher(M, kernels[k]);
+
+#if defined LLVM_3_2
+    L->addFnAttr(Attributes::NoInline);
+#else
+    L->addFnAttr(Attribute::NoInline);
+#endif
+
+    privatizeContext(M, L);
+
+    createWorkgroup(M, L);
+    createWorkgroupFast(M, L);
+  }
+
+  Function *barrier = cast<Function>
+    (M.getOrInsertFunction("barrier",
+                           Type::getVoidTy(M.getContext()),
+                           Type::getInt32Ty(M.getContext()),
+                           NULL));
+
+  // Body of barrier: a single block that returns void.
+  BasicBlock *bb = BasicBlock::Create(M.getContext(), "", barrier);
+  ReturnInst::Create(M.getContext(), 0, bb);
+
+  return true;
+}
+
+/**
+ * Marks every pointer-typed parameter of the given function as noalias.
+ *
+ * @param F Function whose parameters are annotated.
+ */
+static void
+noaliasArguments(Function *F)
+{
+  FunctionType *fnType = F->getFunctionType();
+  const unsigned paramCount = fnType->getNumParams();
+
+  for (unsigned idx = 0; idx != paramCount; ++idx) {
+    if (!isa<PointerType> (fnType->getParamType(idx)))
+      continue;
+
+    // Attribute index 0 denotes the return value, so parameters start at 1.
+    F->setDoesNotAlias(idx + 1);
+  }
+}
+
+/**
+ * Emits code that copies one three-element array member of the pocl context
+ * struct into the module globals named globalPrefix followed by 'x', 'y'
+ * and 'z'. Globals that do not exist in the module are skipped.
+ *
+ * @param builder         Builder positioned where the code is emitted.
+ * @param M               Module searched for the destination globals.
+ * @param context         Pointer to the context struct.
+ * @param field           Index of the array member in the context struct.
+ * @param globalPrefix    Destination global name without the dimension
+ *                        letter.
+ * @param use64BitIndices Emit the element GEPs with 64-bit instead of
+ *                        32-bit indices.
+ */
+static void
+copyContextArrayToGlobals(IRBuilder<> &builder, Module &M, Value *context,
+                          unsigned field, const char *globalPrefix,
+                          bool use64BitIndices)
+{
+  Value *fieldPtr = builder.CreateStructGEP(context, field);
+
+  for (int i = 0; i < 3; ++i) {
+    std::string name(globalPrefix);
+    name += static_cast<char>('x' + i);
+
+    GlobalVariable *gv = M.getGlobalVariable(name);
+    if (gv == NULL)
+      continue;
+
+    Value *element = use64BitIndices
+      ? builder.CreateConstGEP2_64(fieldPtr, 0, i)
+      : builder.CreateConstGEP2_32(fieldPtr, 0, i);
+    builder.CreateStore(builder.CreateLoad(element), gv);
+  }
+}
+
+/**
+ * Creates the launcher function for kernel F.
+ *
+ * The launcher is named "_" followed by the kernel name, has external
+ * linkage and takes the kernel parameters plus a trailing pointer to the
+ * pocl context struct. Its body copies the context members into the
+ * matching module globals (_work_dim, _group_id_*, _num_groups_*,
+ * _global_offset_*; missing globals are skipped), calls the kernel and
+ * returns. The kernel call is then inlined into the launcher.
+ *
+ * @param M Module that receives the launcher.
+ * @param F Kernel function to wrap.
+ * @return The newly created launcher function.
+ */
+static Function *
+createLauncher(Module &M, Function *F)
+{
+  // Launcher signature: the kernel parameters followed by the context.
+  SmallVector<Type *, 8> sv;
+  for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+       i != e; ++i)
+    sv.push_back (i->getType());
+  sv.push_back(TypeBuilder<PoclContext*, true>::get(M.getContext()));
+
+  FunctionType *ft = FunctionType::get(Type::getVoidTy(M.getContext()),
+                                       ArrayRef<Type *> (sv),
+                                       false);
+
+  std::string funcName = F->getName().str();
+
+  Function *L = Function::Create(ft,
+                                 Function::ExternalLinkage,
+                                 "_" + funcName,
+                                 &M);
+
+  // Forward the launcher's leading parameters to the kernel; afterwards ai
+  // refers to the trailing context parameter.
+  SmallVector<Value *, 8> arguments;
+  Function::arg_iterator ai = L->arg_begin();
+  for (unsigned i = 0, e = F->getArgumentList().size(); i != e; ++i) {
+    arguments.push_back(ai);
+    ++ai;
+  }
+  Value *context = &*ai;
+
+  /* Copy the function attributes to transfer noalias etc. from the
+     original kernel which will be inlined into the launcher. */
+  L->setAttributes(F->getAttributes());
+
+  IRBuilder<> builder(BasicBlock::Create(M.getContext(), "", L));
+
+  Value *workDimPtr =
+    builder.CreateStructGEP(context,
+                            TypeBuilder<PoclContext, true>::WORK_DIM);
+  GlobalVariable *workDimGlobal = M.getGlobalVariable("_work_dim");
+  if (workDimGlobal != NULL) {
+    Value *v = builder.CreateLoad(builder.CreateConstGEP1_32(workDimPtr, 0));
+    builder.CreateStore(v, workDimGlobal);
+  }
+
+  const bool use64BitIndices =
+    (M.getPointerSize() == llvm::Module::Pointer64);
+
+  copyContextArrayToGlobals(builder, M, context,
+                            TypeBuilder<PoclContext, true>::GROUP_ID,
+                            "_group_id_", use64BitIndices);
+  copyContextArrayToGlobals(builder, M, context,
+                            TypeBuilder<PoclContext, true>::NUM_GROUPS,
+                            "_num_groups_", use64BitIndices);
+  copyContextArrayToGlobals(builder, M, context,
+                            TypeBuilder<PoclContext, true>::GLOBAL_OFFSET,
+                            "_global_offset_", use64BitIndices);
+
+  CallInst *c = builder.CreateCall(F, ArrayRef<Value*>(arguments));
+  builder.CreateRetVoid();
+
+  InlineFunctionInfo IFI;
+  InlineFunction(c, IFI);
+
+  return L;
+}
+
+/**
+ * Replaces the uses of one module global inside F with a private stack
+ * slot. Does nothing when the module has no global of that name.
+ *
+ * The stack slot is created through builder, carries the global's name and
+ * is initialized with the global's initializer when it has one. Only direct
+ * instruction operands in F are rewritten.
+ *
+ * @param M       Module searched for the global.
+ * @param F       Function whose instructions are rewritten.
+ * @param builder Builder that determines where the alloca and the
+ *                initializing store are inserted.
+ * @param name    Name of the global to privatize.
+ */
+static void
+privatizeGlobal(Module &M, Function *F, IRBuilder<> &builder,
+                const std::string &name)
+{
+  GlobalVariable *gv = M.getGlobalVariable(name);
+  if (gv == NULL)
+    return;
+
+  AllocaInst *slot = builder.CreateAlloca(gv->getType()->getElementType(),
+                                          0, name);
+  if (gv->hasInitializer())
+    builder.CreateStore(gv->getInitializer(), slot);
+
+  for (Function::iterator bb = F->begin(), be = F->end(); bb != be; ++bb) {
+    for (BasicBlock::iterator ii = bb->begin(), ie = bb->end();
+         ii != ie; ++ii)
+      ii->replaceUsesOfWith(gv, slot);
+  }
+}
+
+/**
+ * Privatizes the three globals named prefix followed by 'x', 'y' and 'z'.
+ */
+static void
+privatizeDimensions(Module &M, Function *F, IRBuilder<> &builder,
+                    const char *prefix)
+{
+  for (int i = 0; i < 3; ++i) {
+    std::string name(prefix);
+    name += static_cast<char>('x' + i);
+    privatizeGlobal(M, F, builder, name);
+  }
+}
+
+/**
+ * Makes the work-item context variables private to function F.
+ *
+ * For each of the globals _local_id_*, _local_size_*, _work_dim,
+ * _num_groups_*, _group_id_* and _global_offset_* that exists in the
+ * module, a stack slot is allocated before the first non-PHI instruction of
+ * the entry block and the uses of the global in F are redirected to it.
+ * Each global is handled on its own, so a missing global never causes
+ * another variable's stack slot to be used in its place.
+ *
+ * @param M Module holding the context globals.
+ * @param F Function to rewrite.
+ */
+static void
+privatizeContext(Module &M, Function *F)
+{
+  IRBuilder<> builder(F->getEntryBlock().getFirstNonPHI());
+
+  privatizeDimensions(M, F, builder, "_local_id_");
+  privatizeDimensions(M, F, builder, "_local_size_");
+  privatizeGlobal(M, F, builder, "_work_dim");
+  privatizeDimensions(M, F, builder, "_num_groups_");
+  privatizeDimensions(M, F, builder, "_group_id_");
+  privatizeDimensions(M, F, builder, "_global_offset_");
+}
+
+/**
+ * Creates a work group launcher function (called KERNELNAME_workgroup)
+ * that assumes kernel pointer arguments are stored as pointers to the
+ * actual buffers and that scalar data is loaded from the default memory.
+ *
+ * The created function takes an array of i8 pointers (one slot per kernel
+ * argument) and a pointer to the pocl context. The last parameter of F is
+ * expected to be the context pointer; it is forwarded directly and no slot
+ * of the argument array is read for it.
+ *
+ * @param M Module that receives the function.
+ * @param F Function to call; its last parameter receives the context.
+ */
+static void
+createWorkgroup(Module &M, Function *F)
+{
+  IRBuilder<> builder(M.getContext());
+
+  FunctionType *ft =
+    TypeBuilder<void(types::i<8>*[],
+                     PoclContext*), true>::get(M.getContext());
+
+  std::string funcName = F->getName().str();
+
+  Function *workgroup =
+    dyn_cast<Function>(M.getOrInsertFunction(funcName + "_workgroup", ft));
+  assert(workgroup != NULL);
+
+  builder.SetInsertPoint(BasicBlock::Create(M.getContext(), "", workgroup));
+
+  Function::arg_iterator ai = workgroup->arg_begin();
+  Value *argBuffer = &*ai;
+  ++ai;
+  Value *context = &*ai;
+
+  // Every parameter of F except the trailing context pointer is read from
+  // the argument array.
+  const unsigned totalParams = F->getArgumentList().size();
+  assert(totalParams > 0 && "Expected a trailing context parameter.");
+  const unsigned numKernelArgs = totalParams > 0 ? totalParams - 1 : 0;
+
+  SmallVector<Value*, 8> arguments;
+  Function::const_arg_iterator ii = F->arg_begin();
+  for (unsigned i = 0; i != numKernelArgs; ++i, ++ii) {
+    Type *t = ii->getType();
+
+    Value *gep = builder.CreateGEP(argBuffer,
+        ConstantInt::get(IntegerType::get(M.getContext(), 32), i));
+    Value *pointer = builder.CreateLoad(gep);
+
+    /* If it's a pass by value pointer argument, we just pass the pointer
+     * as is to the function, no need to load from it first. */
+    Value *value;
+    if (ii->hasByValAttr()) {
+      value = builder.CreateBitCast(pointer, t);
+    } else {
+      value = builder.CreateBitCast(pointer, t->getPointerTo());
+      value = builder.CreateLoad(value);
+    }
+
+    arguments.push_back(value);
+  }
+
+  arguments.push_back(context);
+
+  builder.CreateCall(F, ArrayRef<Value*>(arguments));
+  builder.CreateRetVoid();
+}
+
+/**
+ * Creates a work group launcher more suitable for the heterogeneous
+ * host-device setup (called KERNELNAME_workgroup_fast).
+ *
+ * 1) Pointer arguments are stored directly as pointers to the
+ *    buffers in the argument buffer.
+ *
+ * 2) Scalar values are loaded from the global memory address
+ *    space.
+ *
+ * This should minimize copying of data and memory allocation
+ * at the device.
+ *
+ * The last parameter of F is expected to be the context pointer; it is
+ * forwarded directly and no slot of the argument array is read for it.
+ *
+ * @param M Module that receives the function.
+ * @param F Function to call; its last parameter receives the context.
+ */
+static void
+createWorkgroupFast(Module &M, Function *F)
+{
+  IRBuilder<> builder(M.getContext());
+
+  FunctionType *ft =
+    TypeBuilder<void(types::i<8>*[],
+                     PoclContext*), true>::get(M.getContext());
+
+  std::string funcName = F->getName().str();
+  Function *workgroup =
+    dyn_cast<Function>(M.getOrInsertFunction(funcName + "_workgroup_fast", ft));
+  assert(workgroup != NULL);
+
+  builder.SetInsertPoint(BasicBlock::Create(M.getContext(), "", workgroup));
+
+  Function::arg_iterator ai = workgroup->arg_begin();
+  Value *argBuffer = &*ai;
+  ++ai;
+  Value *context = &*ai;
+
+  // Every parameter of F except the trailing context pointer is read from
+  // the argument array.
+  const unsigned totalParams = F->getArgumentList().size();
+  assert(totalParams > 0 && "Expected a trailing context parameter.");
+  const unsigned numKernelArgs = totalParams > 0 ? totalParams - 1 : 0;
+
+  SmallVector<Value*, 8> arguments;
+  Function::const_arg_iterator ii = F->arg_begin();
+  for (unsigned i = 0; i != numKernelArgs; ++i, ++ii) {
+    Type *t = ii->getType();
+    Value *gep = builder.CreateGEP(argBuffer,
+        ConstantInt::get(IntegerType::get(M.getContext(), 32), i));
+    Value *pointer = builder.CreateLoad(gep);
+
+    if (t->isPointerTy()) {
+      if (!ii->hasByValAttr()) {
+        /* Assume the pointer is directly in the arg array. */
+        arguments.push_back(builder.CreateBitCast(pointer, t));
+        continue;
+      }
+
+      /* It's a pass by value pointer argument, use the underlying
+       * element type in subsequent load. */
+      t = t->getPointerElementType();
+    }
+
+    /* Assume the pointer points to data in the global memory space.
+     * If it's a pass by value pointer argument, we just pass the pointer
+     * as is to the function, no need to load from it first. */
+    Value *value = builder.CreateBitCast(
+        pointer, t->getPointerTo(POCL_ADDRESS_SPACE_GLOBAL));
+    if (!ii->hasByValAttr()) {
+      value = builder.CreateLoad(value);
+    }
+
+    arguments.push_back(value);
+  }
+
+  arguments.push_back(context);
+
+  builder.CreateCall(F, ArrayRef<Value*>(arguments));
+  builder.CreateRetVoid();
+}
+
+
+/**
+ * Tells whether the kernel compiler should process the given function.
+ *
+ * When the module has no "opencl.kernels" named metadata, the decision is
+ * made by name: every function qualifies if the KernelName option is empty,
+ * otherwise only the function with that name. When the metadata exists, the
+ * function qualifies only if it is listed as first operand of one of the
+ * metadata entries.
+ *
+ * @param F Function to test.
+ * @return true when F should be processed.
+ */
+bool
+Workgroup::isKernelToProcess(const Function &F)
+{
+  const Module *parent = F.getParent();
+  NamedMDNode *kernelList = parent->getNamedMetadata("opencl.kernels");
+
+  if (kernelList == NULL) {
+    // No kernel metadata: fall back to selecting by name.
+    if (KernelName == "")
+      return true;
+    return F.getName() == KernelName;
+  }
+
+  const unsigned count = kernelList->getNumOperands();
+  for (unsigned idx = 0; idx != count; ++idx) {
+    MDNode *entry = kernelList->getOperand(idx);
+
+    // globaldce might have removed uncalled kernels
+    if (entry->getOperand(0) == NULL)
+      continue;
+
+    Function *candidate = cast<Function>(entry->getOperand(0));
+    if (candidate == &F)
+      return true;
+  }
+
+  return false;
+}
diff --git a/src/llvmopencl/Workgroup.h b/src/llvmopencl/Workgroup.h
new file mode 100644
index 0000000..26d7bfd
--- /dev/null
+++ b/src/llvmopencl/Workgroup.h
@@ -0,0 +1,48 @@
+// Header for Workgroup.cc module pass.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_WORKGROUP_H
+#define _POCL_WORKGROUP_H
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Module.h"
+#endif
+#include "llvm/Pass.h"
+
+namespace pocl {
+
+  /// Module pass implemented in Workgroup.cc.
+  class Workgroup : public llvm::ModulePass {
+  public:
+    /// Pass identification; its address is what identifies the pass.
+    static char ID;
+
+    Workgroup() : ModulePass(ID) {}
+
+    /// Runs the pass on module M.
+    virtual bool runOnModule(llvm::Module &M);
+
+    /// Tells whether F is a kernel the kernel compiler should process.
+    static bool isKernelToProcess(const llvm::Function &F);
+
+  };
+
+}
+
+#endif
diff --git a/src/llvmopencl/WorkitemHandler.cc b/src/llvmopencl/WorkitemHandler.cc
new file mode 100644
index 0000000..90ed294
--- /dev/null
+++ b/src/llvmopencl/WorkitemHandler.cc
@@ -0,0 +1,278 @@
+// LLVM function pass to replicate the kernel body for all work items
+// in a work group.
+//
+// Copyright (c) 2011-2012 Carlos Sánchez de La Lama / URJC and
+// Pekka Jääskeläinen / TUT
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+#include <sstream>
+#include <iostream>
+
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Metadata.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/ValueSymbolTable.h"
+#else
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#endif
+#include "llvm/Support/CommandLine.h"
+#include "WorkitemHandler.h"
+#include "Kernel.h"
+
+//#define DEBUG_REFERENCE_FIXING
+
+namespace pocl {
+
+using namespace llvm;
+
+// Hidden boolean command line option "add-wi-metadata"; defaults to false.
+// Declared extern in WorkitemHandler.h so other passes can query it.
+cl::opt<bool>
+AddWIMetadata("add-wi-metadata", cl::init(false), cl::Hidden,
+ cl::desc("Adds a work item identifier to each of the instruction in work items."));
+
+
+// Forwards the pass identifier of the concrete subclass to FunctionPass.
+WorkitemHandler::WorkitemHandler(char& ID)
+  : FunctionPass(ID) {}
+
+// Base implementation: does nothing to F and reports "not modified".
+bool WorkitemHandler::runOnFunction(Function &F) {
+  const bool modified = false;
+  return modified;
+}
+
+/**
+ * Prepares the per-kernel state shared by the work-item handlers.
+ *
+ * - Sets LocalSizeX/Y/Z from the "opencl.kernel_wg_size_info" named metadata
+ *   entry whose first operand is K; when no entry matches, every dimension
+ *   defaults to 1 (a single work-item).
+ * - Sets size_t_width to 32 or 64 according to the module pointer size.
+ * - Creates (or looks up) the three local id globals and places them in the
+ *   "far" section.
+ *
+ * @param K the kernel function being processed.
+ */
+void
+WorkitemHandler::Initialize(Kernel *K)
+{
+  llvm::Module *M = K->getParent();
+
+  /* Fallback when the module carries no size info for this kernel: one
+     work-item in each dimension. */
+  LocalSizeX = 1;
+  LocalSizeY = 1;
+  LocalSizeZ = 1;
+
+// TODO: are we searching reqd_workgroup_size here? If so, we need to enforce it.
+  llvm::NamedMDNode *size_info = M->getNamedMetadata("opencl.kernel_wg_size_info");
+  if (size_info) {
+    for (unsigned i = 0, e = size_info->getNumOperands(); i != e; ++i) {
+      llvm::MDNode *KernelSizeInfo = size_info->getOperand(i);
+      /* Operand 0 names the kernel, operands 1..3 hold the x, y, z sizes. */
+      if (KernelSizeInfo->getOperand(0) == K) {
+        LocalSizeX = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(1)))->getLimitedValue();
+        LocalSizeY = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(2)))->getLimitedValue();
+        LocalSizeZ = (llvm::cast<ConstantInt>(KernelSizeInfo->getOperand(3)))->getLimitedValue();
+      }
+    }
+  }
+
+  if (M->getPointerSize() == llvm::Module::Pointer64)
+    size_t_width = 64;
+  else if (M->getPointerSize() == llvm::Module::Pointer32)
+    size_t_width = 32;
+  else
+    assert (false && "Only 32 and 64 bit size_t widths supported.");
+
+  /* The local ids are integers as wide as the target's size_t. */
+  llvm::Type *localIdType = IntegerType::get(K->getContext(), size_t_width);
+
+  localIdZ = M->getOrInsertGlobal(POCL_LOCAL_ID_Z_GLOBAL, localIdType);
+  localIdY = M->getOrInsertGlobal(POCL_LOCAL_ID_Y_GLOBAL, localIdType);
+  localIdX = M->getOrInsertGlobal(POCL_LOCAL_ID_X_GLOBAL, localIdType);
+
+  /* Look the globals up again as GlobalVariables so their section can be set. */
+  GlobalVariable *gvx = M->getNamedGlobal(POCL_LOCAL_ID_X_GLOBAL);
+  GlobalVariable *gvy = M->getNamedGlobal(POCL_LOCAL_ID_Y_GLOBAL);
+  GlobalVariable *gvz = M->getNamedGlobal(POCL_LOCAL_ID_Z_GLOBAL);
+  gvx->setSection(StringRef("far"));
+  gvy->setSection(StringRef("far"));
+  gvz->setSection(StringRef("far"));
+
+  //Value *lsx = M->getOrInsertGlobal("_local_size_x", localIdType);
+  //Value *lsy = M->getOrInsertGlobal("_local_size_y", localIdType);
+  //Value *lsz = M->getOrInsertGlobal("_local_size_z", localIdType);
+  //GlobalVariable *gsx = M->getNamedGlobal("_local_size_x");
+  //GlobalVariable *gsy = M->getNamedGlobal("_local_size_y");
+  //GlobalVariable *gsz = M->getNamedGlobal("_local_size_z");
+  //gsx->setSection(StringRef("far"));
+  //gsy->setSection(StringRef("far"));
+  //gsz->setSection(StringRef("far"));
+}
+
+/* Tells whether the definition feeding operand i of instruction I dominates
+   that use. The operand must itself be an Instruction. */
+bool
+WorkitemHandler::dominatesUse
+(llvm::DominatorTree *DT, Instruction &I, unsigned i) {
+  Instruction *def = cast<Instruction>(I.getOperand(i));
+  BasicBlock *defBlock = def->getParent();
+
+  if (PHINode *phi = dyn_cast<PHINode>(&I))
+    {
+      // A PHI "uses" its operand in the matching predecessor block, so the
+      // definition has to dominate that predecessor rather than the PHI.
+      unsigned incomingIdx = PHINode::getIncomingValueNumForOperand(i);
+      BasicBlock *incomingBB = phi->getIncomingBlock(incomingIdx);
+      if (!incomingBB)
+        return false;
+      return DT->dominates(defBlock, incomingBB);
+    }
+
+  // Ordinary instruction: a definition in the same block is accepted as is,
+  // otherwise the dominator tree decides.
+  if (defBlock == I.getParent())
+    return true;
+  return DT->dominates(def, &I);
+}
+
+/* Fixes the undominated variable uses.
+
+ These appear when a conditional barrier kernel is replicated to
+ form a copy of the *same basic block* in the alternative
+ "barrier path".
+
+ E.g., from
+
+ A -> [exit], A -> B -> [exit]
+
+ a replicated CFG as follows, is created:
+
+ A1 -> (T) A2 -> [exit1], A1 -> (F) A2' -> B1, B2 -> [exit2]
+
+ The regions are correct because of the barrier semantics
+ of "all or none". In case any barrier enters the [exit1]
+ from A1, all must (because there's a barrier in the else
+ branch).
+
+ Here at A2 and A2' one creates the same variables.
+ However, B2 does not know which copy
+ to refer to, the ones created in A2 or ones in A2' (correct).
+ The mapping data contains only one possibility, the
+ one that was placed there last. Thus, the instructions in B2
+ might end up referring to the variables defined in A2
+ which do not dominate them.
+
+ The variable references are fixed by exploiting the knowledge
+ of the naming convention of the cloned variables.
+
+ One potential alternative way would be to collect the refmaps per BB,
+ not globally. Then as a final phase traverse through the
+ basic blocks starting from the beginning and propagating the
+ reference data downwards, the data from the new BB overwriting
+ the old one. This should ensure the reachability without
+ the costly dominance analysis.
+*/
+// Walks every instruction operand in F and, for each instruction-valued
+// operand whose definition does not dominate the use, replaces it with a
+// same-named clone ("<base>", "<base>.pocl_1", "<base>.pocl_2", ...) looked
+// up in the function's value symbol table. Aborts the process if no
+// candidate name resolves. Returns true if any operand was replaced.
+bool
+WorkitemHandler::fixUndominatedVariableUses(llvm::DominatorTree *DT,
+ llvm::Function &F)
+{
+ bool changed = false;
+ // Recompute the dominator tree for the current state of F.
+ DT->runOnFunction(F);
+
+ for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i)
+ {
+ llvm::BasicBlock *bb = i;
+ for (llvm::BasicBlock::iterator ins = bb->begin(), inse = bb->end();
+ ins != inse; ++ins)
+ {
+ for (unsigned opr = 0; opr < ins->getNumOperands(); ++opr)
+ {
+ // Only operands that are themselves instructions can be undominated.
+ if (!isa<Instruction>(ins->getOperand(opr))) continue;
+ Instruction *operand = cast<Instruction>(ins->getOperand(opr));
+ if (dominatesUse(DT, *ins, opr))
+ continue;
+#ifdef DEBUG_REFERENCE_FIXING
+ std::cout << "### dominance error!" << std::endl;
+ operand->dump();
+ std::cout << "### does not dominate:" << std::endl;
+ ins->dump();
+#endif
+ // Strip a trailing ".pocl_<n>" suffix to recover the base name.
+ StringRef baseName;
+ std::pair< StringRef, StringRef > pieces =
+ operand->getName().rsplit('.');
+ if (pieces.second.startswith("pocl_"))
+ baseName = pieces.first;
+ else
+ baseName = operand->getName();
+
+ Value *alternative = NULL;
+
+ // Try the base name first (copy_i == 0), then each numbered copy.
+ unsigned int copy_i = 0;
+ do {
+ std::ostringstream alternativeName;
+ alternativeName << baseName.str();
+ if (copy_i > 0)
+ alternativeName << ".pocl_" << copy_i;
+
+ alternative =
+ F.getValueSymbolTable().lookup(alternativeName.str());
+
+ if (alternative != NULL)
+ {
+ // The operand is rewritten before the check; a candidate that
+ // fails the check is overwritten by the next one tried.
+ ins->setOperand(opr, alternative);
+ if (dominatesUse(DT, *ins, opr))
+ break;
+ }
+
+ // Missing names are tolerated until the index exceeds 10000.
+ if (copy_i > 10000 && alternative == NULL)
+ break; /* ran out of possibilities */
+ ++copy_i;
+ } while (true);
+
+ if (alternative != NULL)
+ {
+#ifdef DEBUG_REFERENCE_FIXING
+ std::cout << "### found the alternative:" << std::endl;
+ alternative->dump();
+#endif
+ changed |= true;
+ } else {
+#ifdef DEBUG_REFERENCE_FIXING
+ std::cout << "### didn't fiund an alternative for" << std::endl;
+ operand->dump();
+ std::cerr << "### BB:" << std::endl;
+ operand->getParent()->dump();
+ std::cerr << "### the user BB:" << std::endl;
+ ins->getParent()->dump();
+#endif
+ // No recovery is attempted: the failure is reported and the process ends.
+ std::cerr << "Could not find a dominating alternative variable." << std::endl;
+ abort();
+ }
+ }
+ }
+ }
+ return changed;
+}
+
+/**
+ * Moves the phi nodes in the beginning of the src to the beginning of
+ * the dst.
+ *
+ * MergeBlockIntoPredecessor function from llvm discards the phi nodes
+ * of the replicated BB because it has only one entry.
+ */
+void
+WorkitemHandler::movePhiNodes(llvm::BasicBlock* src, llvm::BasicBlock* dst)
+{
+  // Keep taking the first instruction of src for as long as it is a PHI and
+  // re-insert it in front of dst's first non-PHI instruction.
+  for (;;)
+    {
+      PHINode *leadingPhi = dyn_cast<PHINode>(src->begin());
+      if (leadingPhi == NULL)
+        break;
+      leadingPhi->moveBefore(dst->getFirstNonPHI());
+    }
+}
+
+
+} // namespace pocl
diff --git a/src/llvmopencl/WorkitemHandler.h b/src/llvmopencl/WorkitemHandler.h
new file mode 100644
index 0000000..6654fa8
--- /dev/null
+++ b/src/llvmopencl/WorkitemHandler.h
@@ -0,0 +1,73 @@
+// Header for WorkitemHandler, a parent class for all implementations of
+// work item handling.
+//
+// Copyright (c) 2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_WORKITEM_HANDLER_H
+#define _POCL_WORKITEM_HANDLER_H
+
+#include "config.h"
+#if (defined LLVM_3_1 or defined LLVM_3_2)
+#include "llvm/Function.h"
+#else
+#include "llvm/IR/Function.h"
+#endif
+
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+
+namespace llvm {
+ class DominatorTree;
+}
+
+namespace pocl {
+ class Workgroup;
+ class Kernel;
+
+ // Common base class of the work-item handling function passes. It is
+ // abstract: getAnalysisUsage() must be provided by each subclass.
+ class WorkitemHandler : public llvm::FunctionPass {
+ public:
+
+ // ID is the pass identifier of the concrete subclass; it is forwarded to
+ // llvm::FunctionPass.
+ WorkitemHandler(char& ID);
+
+ virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const = 0;
+ // The base implementation leaves F untouched and returns false.
+ virtual bool runOnFunction(llvm::Function &F);
+
+ // Fills in the local sizes, size_t_width and the local id globals below
+ // for kernel K.
+ virtual void Initialize(pocl::Kernel *K);
+
+ protected:
+
+ // Moves the leading PHI nodes of src in front of dst's first non-PHI
+ // instruction.
+ void movePhiNodes(llvm::BasicBlock* src, llvm::BasicBlock* dst);
+ // Re-points operands whose definition does not dominate the use to a
+ // same-named clone; returns true if anything was changed.
+ bool fixUndominatedVariableUses(llvm::DominatorTree *DT, llvm::Function &F);
+ // True if the definition of operand i of I dominates that use.
+ bool dominatesUse(llvm::DominatorTree *DT, llvm::Instruction &I, unsigned i);
+
+ // Work-group size per dimension, set by Initialize().
+ int LocalSizeX, LocalSizeY, LocalSizeZ;
+
+ // Bit width (32 or 64) of the integer type used for the local ids,
+ // chosen by Initialize() from the module's pointer size.
+ unsigned size_t_width;
+
+ /* The global variables that store the current local id. */
+ llvm::Value *localIdZ, *localIdY, *localIdX;
+
+ };
+
+ // Command line flag defined in WorkitemHandler.cc ("add-wi-metadata").
+ extern llvm::cl::opt<bool> AddWIMetadata;
+}
+
+#endif
diff --git a/src/llvmopencl/WorkitemHandlerChooser.cc b/src/llvmopencl/WorkitemHandlerChooser.cc
new file mode 100644
index 0000000..4fcd226
--- /dev/null
+++ b/src/llvmopencl/WorkitemHandlerChooser.cc
@@ -0,0 +1,111 @@
+// LLVM function pass to select the best way to create a work group
+// function for a kernel and work group size.
+//
+// Copyright (c) 2012 Pekka Jääskeläinen / Tampere University of Technology
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#define DEBUG_TYPE "workitem-loops"
+
+#include "WorkitemHandlerChooser.h"
+#include "WorkitemLoops.h"
+#include "WorkitemReplication.h"
+#include "Workgroup.h"
+#include "CanonicalizeBarriers.h"
+#include "Kernel.h"
+
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+
+#include <iostream>
+
+using namespace llvm;
+using namespace pocl;
+
+namespace {
+ // Registers WorkitemHandlerChooser under the name
+ // "workitem-handler-chooser"; both trailing RegisterPass flags are false.
+ static
+ RegisterPass<WorkitemHandlerChooser> X(
+ "workitem-handler-chooser",
+ "Finds the best way to handle work-items to produce a multi-WG function.",
+ false, false);
+
+}
+
+namespace pocl {
+
+// Pass identifier passed to the WorkitemHandler base class constructor.
+char WorkitemHandlerChooser::ID = 0;
+
+// The chooser only records a decision, so it declares that every analysis
+// is preserved.
+void WorkitemHandlerChooser::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+}
+
+
+// Records which work-item handler should process kernel F. With the
+// selection logic below compiled out (#if 0), the answer is always
+// POCL_WIH_LOOPS. Never modifies F, hence always returns false.
+bool
+WorkitemHandlerChooser::runOnFunction(Function &F)
+{
+ // Functions that are not kernels to process are ignored.
+ if (!Workgroup::isKernelToProcess(F))
+ return false;
+
+ Kernel *K = cast<Kernel> (&F);
+ Initialize(K);
+
+#if 0
+ // Disabled: selection driven by the POCL_WORK_GROUP_METHOD and
+ // POCL_FULL_REPLICATION_THRESHOLD environment variables.
+ std::string method = "auto";
+ if (getenv("POCL_WORK_GROUP_METHOD") != NULL)
+ {
+ method = getenv("POCL_WORK_GROUP_METHOD");
+ if (method == "repl" || method == "workitemrepl")
+ chosenHandler_ = POCL_WIH_FULL_REPLICATION;
+ else if (method == "loops" || method == "workitemloops")
+ chosenHandler_ = POCL_WIH_LOOPS;
+ else if (method != "auto")
+ {
+ std::cerr << "Unknown work group generation method. Using 'auto'." << std::endl;
+ method = "auto";
+ }
+ }
+
+ if (method == "auto")
+ {
+ size_t ReplThreshold = 2;
+ if (getenv("POCL_FULL_REPLICATION_THRESHOLD") != NULL)
+ {
+ ReplThreshold = atoi(getenv("POCL_FULL_REPLICATION_THRESHOLD"));
+ }
+
+ if (LocalSizeX*LocalSizeY*LocalSizeZ <= ReplThreshold)
+ {
+ chosenHandler_ = POCL_WIH_FULL_REPLICATION;
+ }
+ else
+ {
+ chosenHandler_ = POCL_WIH_LOOPS;
+ }
+ }
+#else
+ // Active branch: work-item loops are used unconditionally.
+ chosenHandler_ = POCL_WIH_LOOPS;
+#endif
+
+ return false;
+}
+
+}
diff --git a/src/llvmopencl/WorkitemHandlerChooser.h b/src/llvmopencl/WorkitemHandlerChooser.h
new file mode 100644
index 0000000..ae317e3
--- /dev/null
+++ b/src/llvmopencl/WorkitemHandlerChooser.h
@@ -0,0 +1,52 @@
+// Header for WorkitemHandlerChooser function pass.
+//
+// Copyright (c) 2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_WORKITEM_HANDLER_CHOOSER_H
+#define _POCL_WORKITEM_HANDLER_CHOOSER_H
+
+#include "WorkitemHandler.h"
+
+namespace pocl {
+ class Workgroup;
+
+ // Function pass that stores which work-item handling strategy should be
+ // applied; other passes read the result through chosenHandler().
+ class WorkitemHandlerChooser : public pocl::WorkitemHandler {
+ public:
+ // Pass identifier forwarded to the WorkitemHandler constructor.
+ static char ID;
+
+ // The strategies a caller can be told to use.
+ enum WorkitemHandlerType {
+ POCL_WIH_FULL_REPLICATION,
+ POCL_WIH_LOOPS
+ };
+
+ // The handler defaults to POCL_WIH_LOOPS until runOnFunction() sets it.
+ WorkitemHandlerChooser() : pocl::WorkitemHandler(ID),
+ chosenHandler_(POCL_WIH_LOOPS) {}
+
+ virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+ virtual bool runOnFunction(llvm::Function &F);
+
+ // Returns the currently stored handler type.
+ WorkitemHandlerType chosenHandler() { return chosenHandler_; }
+ private:
+ WorkitemHandlerType chosenHandler_;
+ };
+}
+
+#endif
diff --git a/src/llvmopencl/WorkitemLoops.cc b/src/llvmopencl/WorkitemLoops.cc
new file mode 100644
index 0000000..91eb055
--- /dev/null
+++ b/src/llvmopencl/WorkitemLoops.cc
@@ -0,0 +1,1061 @@
+// LLVM function pass to create loops that run all the work items
+// in a work group while respecting barrier synchronization points.
+//
+// Copyright (c) 2012-2014 Pekka Jääskeläinen / Tampere University of Technology
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#define DEBUG_TYPE "workitem-loops"
+
+#include "WorkitemLoops.h"
+#include "Workgroup.h"
+#include "Barrier.h"
+#include "Kernel.h"
+#include "config.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Support/CommandLine.h"
+#ifdef LLVM_3_1
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/TypeBuilder.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/ValueSymbolTable.h"
+#elif defined LLVM_3_2
+#include "llvm/IRBuilder.h"
+#include "llvm/TypeBuilder.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/ValueSymbolTable.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/TypeBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#endif
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#include <llvm/Support/InstIterator.h>
+#include "WorkitemHandlerChooser.h"
+
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <vector>
+
+//#define DUMP_RESULT_CFG
+
+#ifdef DUMP_RESULT_CFG
+#include "llvm/Analysis/CFGPrinter.h"
+#endif
+
+//#define DEBUG_WORK_ITEM_LOOPS
+
+using namespace llvm;
+using namespace pocl;
+
+namespace {
+ // Registers WorkitemLoops under the pass name "workitemloops".
+ static
+ RegisterPass<WorkitemLoops> X("workitemloops",
+ "Workitem loop generation pass");
+}
+
+// Storage for the static pass identifier of WorkitemLoops.
+char WorkitemLoops::ID = 0;
+
+// Declares the analyses this pass requires: dominator tree, post-dominator
+// tree, loop info and the WorkitemHandlerChooser result. The data layout
+// requirement is currently compiled out.
+void
+WorkitemLoops::getAnalysisUsage(AnalysisUsage &AU) const
+{
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<PostDominatorTree>();
+ AU.addRequired<LoopInfo>();
+// TODO - Removed due to compilation error
+#if 0
+#ifdef LLVM_3_1
+ AU.addRequired<TargetData>();
+#else
+ AU.addRequired<DataLayout>();
+#endif
+#endif
+ AU.addRequired<pocl::WorkitemHandlerChooser>();
+}
+
+// Pass entry point. Processes F only when it is a kernel to process and the
+// chooser selected POCL_WIH_LOOPS; otherwise returns false without touching
+// it. Returns true if ProcessFunction() or the dominance fix-up changed F.
+bool
+WorkitemLoops::runOnFunction(Function &F)
+{
+ if (!Workgroup::isKernelToProcess(F))
+ return false;
+
+ if (getAnalysis<pocl::WorkitemHandlerChooser>().chosenHandler() !=
+ pocl::WorkitemHandlerChooser::POCL_WIH_LOOPS)
+ return false;
+
+ // Cache the analysis results in members used by the helper methods.
+ DT = &getAnalysis<DominatorTree>();
+ LI = &getAnalysis<LoopInfo>();
+ PDT = &getAnalysis<PostDominatorTree>();
+
+ tempInstructionIndex = 0;
+
+#if 0
+ std::cerr << "### original:" << std::endl;
+ F.viewCFG();
+#endif
+
+ bool changed = ProcessFunction(F);
+#ifdef DUMP_RESULT_CFG
+ FunctionPass* cfgPrinter = createCFGOnlyPrinterPass();
+ cfgPrinter->runOnFunction(F);
+#endif
+
+#if 0
+ std::cerr << "### after:" << std::endl;
+ F.viewCFG();
+#endif
+
+ // Repair operands whose definitions no longer dominate their uses.
+ changed |= fixUndominatedVariableUses(DT, F);
+
+#if 0
+ /* Split large BBs so we can print the Dot without it crashing. */
+ bool fchanged = false;
+ const int MAX_INSTRUCTIONS_PER_BB = 70;
+ do {
+ fchanged = false;
+ for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
+ BasicBlock *b = i;
+
+ if (b->size() > MAX_INSTRUCTIONS_PER_BB + 1)
+ {
+ int count = 0;
+ BasicBlock::iterator splitPoint = b->begin();
+ while (count < MAX_INSTRUCTIONS_PER_BB || isa<PHINode>(splitPoint))
+ {
+ ++splitPoint;
+ ++count;
+ }
+ SplitBlock(b, splitPoint, this);
+ fchanged = true;
+ break;
+ }
+ }
+
+ } while (fchanged);
+
+ F.viewCFG();
+#endif
+ // Drop the per-function bookkeeping before the next function is visited.
+ contextArrays.clear();
+ tempInstructionIds.clear();
+
+ return changed;
+}
+
+// Wraps the blocks between entryBB and exitBB into one work-item loop level.
+//
+// region          receives the loop's parallel metadata node.
+// entryBB         first block of the loop body; renamed "pregion.for.body".
+// exitBB          last block of the body; must have exactly one successor.
+// peeledFirst     when true the counter starts from the value stored in
+//                 localIdXFirstVar (which is then reset to 0) instead of 0.
+// localIdVar      pointer to the loop counter; must not be NULL.
+// LocalSizeForDim constant trip bound, used only when lsizeDim is NULL.
+// addIncBlock     when true AppendIncBlock(exitBB, localIdVar) is called
+//                 (presumably adds the "for.inc" block - defined elsewhere).
+// lsizeDim        when non-NULL, the value the counter is compared against.
+//
+// Returns the pair (for.init block, for.end block) of the created loop.
+std::pair<llvm::BasicBlock *, llvm::BasicBlock *>
+WorkitemLoops::CreateLoopAround
+(ParallelRegion &region,
+ llvm::BasicBlock *entryBB, llvm::BasicBlock *exitBB,
+ bool peeledFirst, llvm::Value *localIdVar, size_t LocalSizeForDim,
+ bool addIncBlock, llvm::Instruction *lsizeDim)
+{
+ assert (localIdVar != NULL);
+
+ /*
+
+ Generate a structure like this for each loop level (x,y,z):
+
+ for.init:
+
+ ; if peeledFirst is false:
+ store i32 0, i32* %_local_id_x, align 4
+
+ ; if peeledFirst is true (assume the 0,0,0 iteration has been executed earlier)
+ ; assume _local_id_x_first is initialized to 1 in the peeled pregion copy
+ store _local_id_x_first, i32* %_local_id_x, align 4
+ store i32 0, %_local_id_x_first
+
+ br label %for.body
+
+ for.body:
+
+ ; the parallel region code here
+
+ br label %for.inc
+
+ for.inc:
+
+ ; Separated inc and cond check blocks for easier loop unrolling later on.
+ ; Can then chain N times for.body+for.inc to unroll.
+
+ %2 = load i32* %_local_id_x, align 4
+ %inc = add nsw i32 %2, 1
+
+ store i32 %inc, i32* %_local_id_x, align 4
+ br label %for.cond
+
+ for.cond:
+
+ ; loop header, compare the id to the local size
+ %0 = load i32* %_local_id_x, align 4
+ %cmp = icmp ult i32 %0, i32 123
+ br i1 %cmp, label %for.body, label %for.end
+
+ for.end:
+
+ OPTIMIZE: Use a separate iteration variable across all the loops to iterate the context
+ data arrays to avoid needing multiplications to find the correct location, and to
+ enable easy vectorization of loading the context data when there are parallel iterations.
+ */
+
+ llvm::BasicBlock *loopBodyEntryBB = entryBB;
+ llvm::LLVMContext &C = loopBodyEntryBB->getContext();
+ llvm::Function *F = loopBodyEntryBB->getParent();
+ loopBodyEntryBB->setName("pregion.for.body");
+
+ assert (exitBB->getTerminator()->getNumSuccessors() == 1);
+
+ // Remember where the region used to continue; for.end will branch there.
+ llvm::BasicBlock *oldExit = exitBB->getTerminator()->getSuccessor(0);
+
+ llvm::BasicBlock *forInitBB =
+ BasicBlock::Create(C, "pregion.for.init", F, loopBodyEntryBB);
+
+ llvm::BasicBlock *loopEndBB =
+ BasicBlock::Create(C, "pregion.for.end", F, exitBB);
+
+ llvm::BasicBlock *forCondBB =
+ BasicBlock::Create(C, "pregion.for.cond", F, exitBB);
+
+ // Recompute dominance now that new blocks exist; it is queried below.
+ DT->runOnFunction(*F);
+
+ // F->viewCFG();
+ /* Fix the old edges jumping to the region to jump to the basic block
+ that starts the created loop. Back edges should still point to the
+ old basic block so we preserve the old loops. */
+ // Snapshot the predecessors first: the terminators are edited in the
+ // second loop.
+ BasicBlockVector preds;
+ llvm::pred_iterator PI =
+ llvm::pred_begin(entryBB),
+ E = llvm::pred_end(entryBB);
+
+ for (; PI != E; ++PI)
+ {
+ llvm::BasicBlock *bb = *PI;
+ preds.push_back(bb);
+ }
+
+ for (BasicBlockVector::iterator i = preds.begin();
+ i != preds.end(); ++i)
+ {
+ llvm::BasicBlock *bb = *i;
+ /* Do not fix loop edges inside the region. The loop
+ is replicated as a whole to the body of the wi-loop.*/
+ if (DT->dominates(loopBodyEntryBB, bb))
+ continue;
+ bb->getTerminator()->replaceUsesOfWith(loopBodyEntryBB, forInitBB);
+ }
+
+ // Emit for.init: set the starting value of the loop counter.
+ IRBuilder<> builder(forInitBB);
+
+ if (peeledFirst)
+ {
+ builder.CreateStore(builder.CreateLoad(localIdXFirstVar), localIdVar);
+ builder.CreateStore
+ (ConstantInt::get(IntegerType::get(C, size_t_width), 0), localIdXFirstVar);
+ }
+ else
+ {
+ builder.CreateStore
+ (ConstantInt::get(IntegerType::get(C, size_t_width), 0), localIdVar);
+ }
+
+ builder.CreateBr(loopBodyEntryBB);
+
+ // Route the end of the body to the loop condition block.
+ exitBB->getTerminator()->replaceUsesOfWith(oldExit, forCondBB);
+ if (addIncBlock)
+ {
+ AppendIncBlock(exitBB, localIdVar);
+ }
+
+ // Emit for.cond: compare the counter against the bound for this dimension.
+ builder.SetInsertPoint(forCondBB);
+
+ llvm::Value *cmpResult;
+ if (lsizeDim == NULL)
+ {
+ cmpResult =
+ builder.CreateICmpULT
+ (builder.CreateLoad(localIdVar),
+ (ConstantInt::get
+ (IntegerType::get(C, size_t_width),
+ LocalSizeForDim))
+ );
+ }
+ else
+ {
+ cmpResult =
+ builder.CreateICmpULT
+ (builder.CreateLoad(localIdVar),
+ lsizeDim
+ );
+ }
+
+ Instruction *loopBranch =
+ builder.CreateCondBr(cmpResult, loopBodyEntryBB, loopEndBB);
+
+ /* Add the metadata to mark a parallel loop. The metadata
+ refer to a loop-unique dummy metadata that is not merged
+ automatically. */
+
+ /* This creation of the identifier metadata is copied from
+ LLVM's MDBuilder::createAnonymousTBAARoot(). */
+ MDNode *Dummy = MDNode::getTemporary(C, ArrayRef<Value*>());
+ MDNode *Root = MDNode::get(C, Dummy);
+ // At this point we have
+ // !0 = metadata !{} <- dummy
+ // !1 = metadata !{metadata !0} <- root
+ // Replace the dummy operand with the root node itself and delete the dummy.
+ Root->replaceOperandWith(0, Root);
+ MDNode::deleteTemporary(Dummy);
+ // We now have
+ // !1 = metadata !{metadata !1} <- self-referential root
+
+ loopBranch->setMetadata("llvm.loop.parallel", Root);
+ region.AddParallelLoopMetadata(Root);
+
+ // Emit for.end: continue with whatever followed the region originally.
+ builder.SetInsertPoint(loopEndBB);
+ builder.CreateBr(oldExit);
+
+ return std::make_pair(forInitBB, loopEndBB);
+}
+
+// Returns the first of the original parallel regions that contains bb, or
+// NULL when no region does.
+ParallelRegion*
+WorkitemLoops::RegionOfBlock(llvm::BasicBlock *bb)
+{
+  typedef ParallelRegion::ParallelRegionVector::iterator RegionIter;
+
+  RegionIter cursor = original_parallel_regions->begin();
+  RegionIter last = original_parallel_regions->end();
+  while (cursor != last)
+    {
+      ParallelRegion *candidate = *cursor;
+      if (candidate->HasBlock(bb))
+        return candidate;
+      ++cursor;
+    }
+  return NULL;
+}
+
+// PreAnalyze kernel function, find out dimension (borrowed from wga)
+// PreCreate local sizes which are workgroup invariant
+//
+// Sets maxDim (1..3) from the constant arguments of the get_local_id /
+// get_global_id calls found in F, then creates get_local_size(d) call
+// instructions for each used dimension and stores them in lsizeX/Y/Z.
+// The calls are created without an insertion point here. lsizeY and lsizeZ
+// are only assigned when maxDim exceeds 1 and 2 respectively.
+void WorkitemLoops::FindKernelDim(Function &F)
+{
+ maxDim = 1;
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I)
+ if (CallInst * callInst = dyn_cast<CallInst>(&*I))
+ {
+ // Indirect calls have no known callee and are ignored.
+ if (!callInst->getCalledFunction()) continue;
+ std::string functionName(callInst->getCalledFunction()->getName());
+
+ if (functionName == "get_local_id" ||
+ functionName == "get_global_id")
+ {
+ Value *arg = callInst->getArgOperand(0);
+ if (ConstantInt * constInt = dyn_cast<ConstantInt>(arg))
+ {
+ // Clamp the index to MAX_DIMENSIONS-1, then raise maxDim to index+1.
+ unsigned int dimIdx = constInt->getSExtValue();
+ dimIdx = (MAX_DIMENSIONS-1 < dimIdx) ? MAX_DIMENSIONS-1 : dimIdx;
+ maxDim = (maxDim < dimIdx + 1) ? dimIdx+1 : maxDim;
+ }
+
+ /*-------------------------------------------------------------
+ * if the work group function has a variable argument, then
+ * assume worst case and return 3 loop levels are needed.
+ *------------------------------------------------------------*/
+ else
+ {
+ maxDim = 3;
+ break;
+ }
+ }
+ }
+
+ // Declare (or reuse) "i32 get_local_size(i32)" in the module.
+ llvm::Module *M = F.getParent();
+ llvm::Type *Int32 = IntegerType::get(M->getContext(), 32);
+ FunctionType *ft = FunctionType::get
+ (/*Result=*/ Int32,
+ /*Params=*/ Int32,
+ /*isVarArg=*/ false);
+ // NOTE(review): dyn_cast yields NULL if the returned constant is not a
+ // Function; the result is used unchecked below - confirm this cannot occur.
+ Function *f_localsize =
+ dyn_cast<Function>(M->getOrInsertFunction("get_local_size", ft));
+ SmallVector<Value *, 4> argsx, argsy, argsz;
+ argsx.push_back(ConstantInt::get(Int32, 0));
+ lsizeX = CallInst::Create(f_localsize, ArrayRef<Value *>(argsx));
+ if (maxDim > 1)
+ {
+ argsy.push_back(ConstantInt::get(Int32, 1));
+ lsizeY = CallInst::Create(f_localsize, ArrayRef<Value *>(argsy));
+ }
+ if (maxDim > 2)
+ {
+ argsz.push_back(ConstantInt::get(Int32, 2));
+ lsizeZ = CallInst::Create(f_localsize, ArrayRef<Value *>(argsz));
+ }
+}
+
+bool
+WorkitemLoops::ProcessFunction(Function &F)
+{
+ Kernel *K = cast<Kernel> (&F);
+ Initialize(K);
+
+#if 0 // TODO: do something for reqd_work_group_size
+ unsigned workItemCount = LocalSizeX*LocalSizeY*LocalSizeZ;
+ if (workItemCount == 1)
+ {
+ K->addLocalSizeInitCode(LocalSizeX, LocalSizeY, LocalSizeZ);
+ ParallelRegion::insertLocalIdInit(&F.getEntryBlock(), 0, 0, 0);
+ return true;
+ }
+#endif
+
+ FindKernelDim(F);
+
+ original_parallel_regions =
+ K->getParallelRegions(LI);
+
+ IRBuilder<> builder(F.getEntryBlock().getFirstInsertionPt());
+ localIdXFirstVar =
+ builder.CreateAlloca
+ (IntegerType::get(F.getContext(), size_t_width), 0, ".pocl.local_id_x_init");
+
+ // F.viewCFGOnly();
+
+#if 0
+ std::cerr << "### Original" << std::endl;
+ F.viewCFG();
+#endif
+
+#if 0
+ for (ParallelRegion::ParallelRegionVector::iterator
+ i = original_parallel_regions->begin(),
+ e = original_parallel_regions->end();
+ i != e; ++i)
+ {
+ ParallelRegion *region = (*i);
+ region->InjectRegionPrintF();
+ region->InjectVariablePrintouts();
+ }
+#endif
+
+ /* Count how many parallel regions share each entry node to
+ detect diverging regions that need to be peeled. */
+ std::map<llvm::BasicBlock*, int> entryCounts;
+
+ for (ParallelRegion::ParallelRegionVector::iterator
+ i = original_parallel_regions->begin(),
+ e = original_parallel_regions->end();
+ i != e; ++i)
+ {
+ ParallelRegion *region = (*i);
+#ifdef DEBUG_WORK_ITEM_LOOPS
+ std::cerr << "### Adding context save/restore for PR: ";
+ region->dumpNames();
+#endif
+ FixMultiRegionVariables(region);
+ entryCounts[region->entryBB()]++;
+ }
+
+#if 0
+ std::cerr << "### After context code addition:" << std::endl;
+ F.viewCFG();
+#endif
+ std::map<ParallelRegion*, bool> peeledRegion;
+ for (ParallelRegion::ParallelRegionVector::iterator
+ i = original_parallel_regions->begin(),
+ e = original_parallel_regions->end();
+ i != e; ++i)
+ {
+
+ llvm::ValueToValueMapTy reference_map;
+ ParallelRegion *original = (*i);
+
+#ifdef DEBUG_WORK_ITEM_LOOPS
+ std::cerr << "### handling region:" << std::endl;
+ original->dumpNames();
+ //F.viewCFGOnly();
+#endif
+
+ /* In case of conditional barriers, the first iteration
+ has to be peeled so we know which branch to execute
+ with the work item loop. In case there are more than one
+ parallel region sharing an entry BB, it's a diverging
+ region.
+
+ Post dominance of entry by exit does not work in case the
+ region is inside a loop and the exit block is in the path
+ towards the loop exit (and the function exit).
+ */
+ bool peelFirst = entryCounts[original->entryBB()] > 1;
+
+ peeledRegion[original] = peelFirst;
+
+ std::pair<llvm::BasicBlock *, llvm::BasicBlock *> l;
+ // the original predecessor nodes of which successor
+ // should be fixed if not peeling
+ BasicBlockVector preds;
+
+ bool unrolled = false;
+ if (peelFirst)
+ {
+#ifdef DEBUG_WORK_ITEM_LOOPS
+ std::cerr << "### conditional region, peeling the first iteration" << std::endl;
+#endif
+ ParallelRegion *replica =
+ original->replicate(reference_map, ".peeled_wi");
+ replica->chainAfter(original);
+ replica->purge();
+
+ l = std::make_pair(replica->entryBB(), replica->exitBB());
+ }
+ else
+ {
+ llvm::pred_iterator PI =
+ llvm::pred_begin(original->entryBB()),
+ E = llvm::pred_end(original->entryBB());
+
+ for (; PI != E; ++PI)
+ {
+ llvm::BasicBlock *bb = *PI;
+ if (DT->dominates(original->entryBB(), bb) &&
+ (RegionOfBlock(original->entryBB()) ==
+ RegionOfBlock(bb)))
+ continue;
+ preds.push_back(bb);
+ }
+
+#if 0
+ int unrollCount;
+ if (getenv("POCL_WILOOPS_MAX_UNROLL_COUNT") != NULL)
+ unrollCount = atoi(getenv("POCL_WILOOPS_MAX_UNROLL_COUNT"));
+ else
+ unrollCount = 1;
+ /* Find a two's exponent unroll count, if available. */
+ while (unrollCount >= 1)
+ {
+ if (LocalSizeX % unrollCount == 0 &&
+ unrollCount <= LocalSizeX)
+ {
+ break;
+ }
+ unrollCount /= 2;
+ }
+
+ if (unrollCount > 1) {
+ ParallelRegion *prev = original;
+ llvm::BasicBlock *lastBB =
+ AppendIncBlock(original->exitBB(), localIdX);
+ original->AddBlockAfter(lastBB, original->exitBB());
+ original->SetExitBB(lastBB);
+
+ if (AddWIMetadata)
+ original->AddIDMetadata(F.getContext(), 0);
+
+ for (int c = 1; c < unrollCount; ++c)
+ {
+ ParallelRegion *unrolled =
+ original->replicate(reference_map, ".unrolled_wi");
+ unrolled->chainAfter(prev);
+ prev = unrolled;
+ lastBB = unrolled->exitBB();
+ if (AddWIMetadata)
+ unrolled->AddIDMetadata(F.getContext(), c);
+ }
+ unrolled = true;
+ l = std::make_pair(original->entryBB(), lastBB);
+ } else {
+ l = std::make_pair(original->entryBB(), original->exitBB());
+ }
+#else
+ l = std::make_pair(original->entryBB(), original->exitBB());
+#endif
+ }
+
+ l = CreateLoopAround(*original, l.first, l.second, peelFirst, localIdX,
+ LocalSizeX, true, lsizeX);
+ if (maxDim > 1)
+ l = CreateLoopAround(*original, l.first, l.second, false, localIdY,
+ LocalSizeY, true, lsizeY);
+ if (maxDim > 2)
+ l = CreateLoopAround(*original, l.first, l.second, false, localIdZ,
+ LocalSizeZ, true, lsizeZ);
+
+ /* Loop edges coming from another region mean B-loops which means
+ we have to fix the loop edge to jump to the beginning of the wi-loop
+ structure, not its body. This has to be done only for non-peeled
+ blocks as the semantics is correct in the other case (the jump is
+ to the beginning of the peeled iteration). */
+ if (!peelFirst)
+ {
+ for (BasicBlockVector::iterator i = preds.begin();
+ i != preds.end(); ++i)
+ {
+ llvm::BasicBlock *bb = *i;
+ bb->getTerminator()->replaceUsesOfWith
+ (original->entryBB(), l.first);
+ }
+ }
+ }
+
+ // for the peeled regions we need to add a prologue
+ // that initializes the local ids and the first iteration
+ // counter
+ for (ParallelRegion::ParallelRegionVector::iterator
+ i = original_parallel_regions->begin(),
+ e = original_parallel_regions->end();
+ i != e; ++i)
+ {
+ ParallelRegion *pr = (*i);
+
+ if (!peeledRegion[pr]) continue;
+ pr->insertPrologue(0, 0, 0);
+ builder.SetInsertPoint(pr->entryBB()->getFirstInsertionPt());
+ builder.CreateStore
+ (ConstantInt::get(IntegerType::get(F.getContext(), size_t_width), 1),
+ localIdXFirstVar);
+ }
+
+ // Creating lsize* values have been hoisted up
+ // K->addLocalSizeInitCode(LocalSizeX, LocalSizeY, LocalSizeZ);
+ llvm::Instruction *inspt = F.getEntryBlock().getFirstNonPHI();
+ inspt->getParent()->getInstList().insert(inspt, lsizeX);
+ if (maxDim > 1)
+ inspt->getParent()->getInstList().insert(inspt, lsizeY);
+ if (maxDim > 2)
+ inspt->getParent()->getInstList().insert(inspt, lsizeZ);
+ // llvm::GlobalVariable *gvx = M->getGlobalVariable("_local_size_x");
+ // llvm::GlobalVariable *gvy = M->getGlobalVariable("_local_size_y");
+ // llvm::GlobalVariable *gvz = M->getGlobalVariable("_local_size_z");
+ // llvm::Instruction *storex = new StoreInst(lsizeX, gvx, inspt);
+ // llvm::Instruction *storey = new StoreInst(lsizeY, gvy, inspt);
+ // llvm::Instruction *storez = new StoreInst(lsizeZ, gvz, inspt);
+
+
+ ParallelRegion::insertLocalIdInit(&F.getEntryBlock(), 0, 0, 0);
+
+#if 0
+ F.viewCFG();
+#endif
+
+ return true;
+}
+
+/*
+ * Context-saves the values that are produced in the given parallel
+ * region and consumed by an instruction of another parallel region.
+ *
+ * Works in three passes: index the instructions of the region, collect
+ * the definitions that have a user outside the region (but inside some
+ * region), and finally hand each collected definition over to
+ * AddContextSaveRestore().
+ */
+void
+WorkitemLoops::FixMultiRegionVariables(ParallelRegion *region)
+{
+  /* Pass 1: membership index of the region's instructions, so the
+     "is this user inside the region" test below is a cheap lookup. */
+  InstructionIndex regionInstructions;
+  for (BasicBlockVector::iterator bbIt = region->begin();
+       bbIt != region->end(); ++bbIt)
+    {
+      llvm::BasicBlock *block = *bbIt;
+      for (llvm::BasicBlock::iterator it = block->begin();
+           it != block->end(); ++it)
+        regionInstructions.insert(&*it);
+    }
+
+  /* Pass 2: pick the definitions that escape to another region. */
+  InstructionVec escapingDefinitions;
+  for (BasicBlockVector::iterator bbIt = region->begin();
+       bbIt != region->end(); ++bbIt)
+    {
+      llvm::BasicBlock *block = *bbIt;
+      for (llvm::BasicBlock::iterator it = block->begin();
+           it != block->end(); ++it)
+        {
+          llvm::Instruction *definition = &*it;
+          if (ShouldNotBeContextSaved(definition)) continue;
+
+          for (Instruction::use_iterator useIt = definition->use_begin(),
+                 useEnd = definition->use_end();
+               useIt != useEnd; ++useIt)
+            {
+              Instruction *consumer = dyn_cast<Instruction> (*useIt);
+              if (consumer == NULL) continue;
+
+              /* Users in regionless blocks (e.g. the B-loop construct
+                 blocks) do not count; only users that belong to some
+                 other parallel region force a context save. */
+              const bool outsideThisRegion =
+                regionInstructions.find(consumer) == regionInstructions.end();
+              if (outsideThisRegion &&
+                  RegionOfBlock(consumer->getParent()) != NULL)
+                {
+                  escapingDefinitions.push_back(definition);
+                  break;
+                }
+            }
+        }
+    }
+
+  /* Pass 3: add the save/restore code for everything collected. */
+  for (InstructionVec::iterator fixIt = escapingDefinitions.begin();
+       fixIt != escapingDefinitions.end(); ++fixIt)
+    {
+      llvm::Instruction *toFix = *fixIt;
+#ifdef DEBUG_WORK_ITEM_LOOPS
+      std::cerr << "### adding context/save restore for" << std::endl;
+      toFix->dump();
+#endif
+      AddContextSaveRestore(toFix);
+    }
+}
+
+/**
+ * Emits the store that writes the value produced by the given instruction
+ * to the current work-item's slot of its context array.
+ *
+ * The store is inserted after the definition, skipping any PHI nodes that
+ * directly follow it.
+ *
+ * @param instruction The definition to save.
+ * @param alloca      The context array to store to.
+ * @return The created store, or NULL if the instruction is an alloca
+ *         (no store is emitted in that case).
+ */
+llvm::Instruction *
+WorkitemLoops::AddContextSave
+(llvm::Instruction *instruction, llvm::Instruction *alloca)
+{
+  /* A context-saved alloca is replaced by one big alloca holding the data
+     of all the work-items, and users get pointers into that array. The
+     context array itself is thus all the initialization needed. */
+  if (isa<AllocaInst>(instruction))
+    return NULL;
+
+  /* Find the insertion position: the first non-PHI instruction after the
+     definition. */
+  BasicBlock::iterator insertPos = dyn_cast<Instruction>(instruction);
+  for (++insertPos; isa<PHINode>(insertPos); ++insertPos)
+    ;
+
+  IRBuilder<> builder(insertPos);
+
+  /* The id loads made earlier in the region are reused to keep the output
+     free of redundant loads. */
+  ParallelRegion *region = RegionOfBlock(instruction->getParent());
+  assert ("Adding context save outside any region produces illegal code." &&
+          region != NULL);
+
+  /* Linearized slot index:
+     idx + idy * sizex + idz * (sizey * sizex) */
+  llvm::Value *slot = region->LocalIDXLoad();
+  if (maxDim > 1)
+    {
+      llvm::Value *rowOffset =
+        builder.CreateMul(region->LocalIDYLoad(), lsizeX);
+      slot = builder.CreateAdd(slot, rowOffset);
+    }
+  if (maxDim > 2)
+    {
+      llvm::Value *planeSize = builder.CreateMul(lsizeY, lsizeX);
+      llvm::Value *planeOffset =
+        builder.CreateMul(region->LocalIDZLoad(), planeSize);
+      slot = builder.CreateAdd(slot, planeOffset);
+    }
+
+  std::vector<llvm::Value *> gepArgs(1, slot);
+  llvm::Value *slotAddress = builder.CreateGEP(alloca, gepArgs);
+  return builder.CreateStore(instruction, slotAddress);
+}
+
+/**
+ * Emits the code that fetches the current work-item's copy of a
+ * context-saved value from its context array.
+ *
+ * @param val      Must not be NULL. Only used as the insertion location
+ *                 when 'before' is NULL, in which case it has to be an
+ *                 Instruction.
+ * @param alloca   The context array of the value.
+ * @param before   The instruction before which the restore code is
+ *                 inserted, or NULL to insert before 'val'.
+ * @param isAlloca True if the context-saved instruction was an alloca.
+ * @return The load from the work-item's slot, or, when isAlloca is set,
+ *         the GEP that addresses the slot.
+ */
+llvm::Instruction *
+WorkitemLoops::AddContextRestore
+(llvm::Value *val, llvm::Instruction *alloca, llvm::Instruction *before,
+ bool isAlloca)
+{
+  assert (val != NULL);
+  IRBuilder<> builder(alloca);
+  if (before == NULL)
+    {
+      /* No explicit location was given: restore right before the value
+         itself, which is possible only if it is an instruction. */
+      before = dyn_cast<Instruction>(val);
+      assert (before != NULL && "Unknown context restore location!");
+    }
+  builder.SetInsertPoint(before);
+
+  /* Reuse the id loads earlier in the region, if possible, to
+     avoid messy output with lots of redundant loads. */
+  ParallelRegion *region = RegionOfBlock(before->getParent());
+  assert ("Adding context restore outside any region produces illegal code." &&
+          region != NULL);
+
+  /* Linearized slot index:
+     idz * _local_size_x * _local_size_y + idy * _local_size_x + idx */
+  llvm::Value *linear_index = region->LocalIDXLoad();
+  if (maxDim > 1)
+    linear_index = builder.CreateAdd(linear_index,
+                                     builder.CreateMul(region->LocalIDYLoad(),
+                                                       lsizeX) );
+  if (maxDim > 2)
+    linear_index = builder.CreateAdd(linear_index,
+                                     builder.CreateMul(region->LocalIDZLoad(),
+                                       builder.CreateMul(lsizeY, lsizeX)) );
+
+  std::vector<llvm::Value *> gepArgs;
+  gepArgs.push_back(linear_index);
+
+  /* The base pointer is an instruction, so the GEP is never folded to a
+     constant; cast<> asserts that instead of silently yielding NULL. */
+  llvm::Instruction *gep =
+    cast<Instruction>(builder.CreateGEP(alloca, gepArgs));
+
+  if (isAlloca) {
+    /* In case the context saved instruction was an alloca, we created a
+       context array with pointed-to elements, and now want to return a pointer
+       to the elements to emulate the original alloca. */
+    return gep;
+  }
+  return builder.CreateLoad(gep);
+}
+
+/**
+ * Returns the context array (alloca) for the given Value, creates it if not
+ * found.
+ *
+ * Context arrays are looked up by a generated name of the form
+ * ".<name>.pocl_context", where <name> is the instruction's own name or,
+ * for unnamed instructions, a running integer remembered in
+ * tempInstructionIds. A new array is allocated at the first insertion point
+ * of the function's entry block with one element per work-item (the product
+ * of the local sizes of the dimensions in use).
+ *
+ * @param instruction The instruction whose value is to be context saved.
+ * @return The alloca of the context array.
+ */
+llvm::Instruction *
+WorkitemLoops::GetContextArray(llvm::Instruction *instruction)
+{
+  /*
+   * Unnamed temp instructions need a generated name for the
+   * context array. Create one using a running integer.
+   */
+  std::ostringstream var;
+  var << ".";
+
+  if (!instruction->getName().empty())
+    {
+      var << instruction->getName().str();
+    }
+  else
+    {
+      if (tempInstructionIds.find(instruction) == tempInstructionIds.end())
+        tempInstructionIds[instruction] = tempInstructionIndex++;
+      var << tempInstructionIds[instruction];
+    }
+
+  var << ".pocl_context";
+  std::string varName = var.str();
+
+  StrInstructionMap::iterator existing = contextArrays.find(varName);
+  if (existing != contextArrays.end())
+    return existing->second;
+
+  IRBuilder<> builder(instruction->getParent()->getParent()->getEntryBlock().getFirstInsertionPt());
+
+  /* If the variable to be context saved was itself an alloca,
+     create one big alloca that stores the data of all the
+     work-items and directly return pointers to that array.
+     This enables moving all the allocas to the entry node without
+     breaking the parallel loop.
+     Otherwise we would rely on a dynamic alloca to allocate
+     unique stack space to all the work-items when its wiloop
+     iteration is executed. */
+  llvm::Type *elementType = instruction->getType();
+  if (AllocaInst *originalAlloca = dyn_cast<AllocaInst>(instruction))
+    elementType = originalAlloca->getType()->getElementType();
+
+  /* The element count is parameterized on _local_size_{x,y,z}. */
+  llvm::Value *wgsize = lsizeX;
+  if (maxDim > 1) wgsize = builder.CreateMul(wgsize, lsizeY);
+  if (maxDim > 2) wgsize = builder.CreateMul(wgsize, lsizeZ);
+
+  llvm::Instruction *alloca =
+    builder.CreateAlloca(elementType, wgsize, varName);
+
+  contextArrays[varName] = alloca;
+  return alloca;
+}
+
+
+/**
+ * Adds context save/restore code for the value produced by the
+ * given instruction.
+ *
+ * Gets the context array of the instruction from GetContextArray(), emits
+ * the save with AddContextSave(), and then rewrites every instruction user
+ * that lies in a parallel region to use a value produced by
+ * AddContextRestore() instead of the original one. For a PHI user the
+ * restore is placed before the terminator of the incoming block.
+ *
+ * NOTE(review): if a PHI receives the value from several incoming blocks,
+ * only the last matching incoming block gets the restore code, while
+ * replaceUsesOfWith() rewrites all the matching operands - confirm this
+ * cannot happen here.
+ *
+ * TODO: add only one restore per variable per region.
+ * TODO: add only one load of the id variables per region.
+ * Could be done by having a context restore BB in the beginning of the
+ * region and a context save BB at the end.
+ * TODO: ignore work group variables completely (the iteration variables)
+ * The LLVM should optimize these away but it would improve
+ * the readability of the output during debugging.
+ * TODO: rematerialize some values such as extended values of global
+ * variables (especially global id which is computed from local id) or kernel
+ * argument values instead of allocating stack space for them
+ */
+void
+WorkitemLoops::AddContextSaveRestore
+(llvm::Instruction *instruction) {
+
+ /* Allocate the context data array for the variable. theStore is NULL
+ when the instruction is an alloca (AddContextSave emits no store then). */
+ llvm::Instruction *alloca = GetContextArray(instruction);
+ llvm::Instruction *theStore = AddContextSave(instruction, alloca);
+
+ InstructionVec uses;
+ /* Restore the produced variable before each use to ensure the correct context
+ copy is used.
+
+ We could add the restore only to other regions outside the
+ variable defining region and use the original variable in the defining
+ region due to the SSA virtual registers being unique. However,
+ alloca variables can be redefined also in the same region, thus we
+ need to ensure the correct alloca context position is written, not
+ the original unreplicated one. These variables can be generated by
+ volatile variables, private arrays, and due to the PHIs to allocas
+ pass.
+ */
+
+ /* Find out the uses to fix first as fixing them invalidates
+ the iterator. */
+ for (Instruction::use_iterator ui = instruction->use_begin(),
+ ue = instruction->use_end();
+ ui != ue; ++ui)
+ {
+ Instruction *user;
+ if ((user = dyn_cast<Instruction> (*ui)) == NULL) continue;
+ if (user == theStore) continue; /* the context save itself keeps using the original value */
+ uses.push_back(user);
+ }
+
+ for (InstructionVec::iterator i = uses.begin(); i != uses.end(); ++i)
+ {
+ Instruction *user = *i;
+ Instruction *contextRestoreLocation = user;
+ /* If the user is in a block that doesn't belong to a region,
+ the variable itself must be a "work group variable", that is,
+ not dependent on the work item. Most likely an iteration
+ variable of a for loop with a barrier. Such users are left
+ untouched. */
+ if (RegionOfBlock(user->getParent()) == NULL) continue;
+
+ PHINode* phi = dyn_cast<PHINode>(user);
+ if (phi != NULL)
+ {
+ /* In case of PHI nodes, we cannot just insert the context
+ restore code before it in the same basic block because it is
+ assumed there are no non-phi Instructions before PHIs, which
+ the context restore code would violate. Add the context
+ restore to the incomingBB instead.
+
+ There can be values in the PHINode that are incoming
+ from another region even though the decision BB is within the region.
+ For those values we need to add the context restore code in the
+ incoming BB (which is known to be inside the region due to the
+ assumption of not having to touch PHI nodes in PRentry BBs).
+ */
+
+ /* PHINodes at region entries are broken down earlier. */
+ assert ("Cannot add context restore for a PHI node at the region entry!" &&
+ RegionOfBlock(phi->getParent())->entryBB() != phi->getParent());
+#ifdef DEBUG_WORK_ITEM_LOOPS
+ std::cerr << "### adding context restore code before PHI" << std::endl;
+ user->dump();
+ std::cerr << "### in BB:" << std::endl;
+ user->getParent()->dump();
+#endif
+ BasicBlock *incomingBB = NULL;
+ for (unsigned incoming = 0; incoming < phi->getNumIncomingValues();
+ ++incoming)
+ {
+ Value *val = phi->getIncomingValue(incoming);
+ BasicBlock *bb = phi->getIncomingBlock(incoming);
+ if (val == instruction) incomingBB = bb;
+ }
+ assert (incomingBB != NULL);
+ contextRestoreLocation = incomingBB->getTerminator();
+ }
+ llvm::Value *loadedValue =
+ AddContextRestore
+ (user, alloca, contextRestoreLocation, isa<AllocaInst>(instruction));
+ user->replaceUsesOfWith(instruction, loadedValue);
+#ifdef DEBUG_WORK_ITEM_LOOPS
+ std::cerr << "### done, the user was converted to:" << std::endl;
+ user->dump();
+#endif
+ }
+}
+
+/**
+ * Tells whether the given instruction must be excluded from context
+ * saving.
+ *
+ * This holds for the loads of the local id variables. Replicating them
+ * leads to problems in the conditional branch case, where the header node
+ * of the region is shared across the branches: the header node's id loads
+ * might get context saved, which is a chicken-and-egg problem.
+ *
+ * @return true if the instruction is a load from localIdX, localIdY or
+ *         localIdZ, false otherwise.
+ */
+bool
+WorkitemLoops::ShouldNotBeContextSaved(llvm::Instruction *instr)
+{
+  llvm::LoadInst *idLoad = dyn_cast<llvm::LoadInst>(instr);
+  if (idLoad == NULL)
+    return false;
+
+  return idLoad->getPointerOperand() == localIdZ ||
+         idLoad->getPointerOperand() == localIdY ||
+         idLoad->getPointerOperand() == localIdX;
+}
+
+/**
+ * Splices a new "pregion.for.inc" block between the given block and the
+ * first successor of its terminator. The new block increments the variable
+ * behind localIdVar by one and then branches to the old successor.
+ *
+ * @param after      The block whose terminator is redirected to the new block.
+ * @param localIdVar Pointer to the iteration variable to increment.
+ * @return The created increment block.
+ */
+llvm::BasicBlock *
+WorkitemLoops::AppendIncBlock
+(llvm::BasicBlock* after, llvm::Value *localIdVar)
+{
+  llvm::BasicBlock *successor = after->getTerminator()->getSuccessor(0);
+  assert (successor != NULL);
+
+  llvm::LLVMContext &context = after->getContext();
+  llvm::BasicBlock *incBlock =
+    BasicBlock::Create(context, "pregion.for.inc", after->getParent());
+
+  /* Route the old edge through the increment block. */
+  after->getTerminator()->replaceUsesOfWith(successor, incBlock);
+
+  /* localIdVar = localIdVar + 1 */
+  IRBuilder<> builder(incBlock);
+  llvm::Value *current = builder.CreateLoad(localIdVar);
+  llvm::Value *one =
+    ConstantInt::get(IntegerType::get(context, size_t_width), 1);
+  builder.CreateStore(builder.CreateAdd(current, one), localIdVar);
+
+  builder.CreateBr(successor);
+
+  return incBlock;
+}
diff --git a/src/llvmopencl/WorkitemLoops.h b/src/llvmopencl/WorkitemLoops.h
new file mode 100644
index 0000000..aac4cfa
--- /dev/null
+++ b/src/llvmopencl/WorkitemLoops.h
@@ -0,0 +1,112 @@
+// Header for WorkitemLoops function pass.
+//
+// Copyright (c) 2012 Pekka Jääskeläinen / TUT
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_WORKITEM_LOOPS_H
+#define _POCL_WORKITEM_LOOPS_H
+
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <map>
+#include <vector>
+#include "WorkitemHandler.h"
+#include "ParallelRegion.h"
+
+#define MAX_DIMENSIONS 3u
+
+namespace llvm {
+ class PostDominatorTree;
+}
+
+namespace pocl {
+ class Workgroup;
+
+  /**
+   * Work-item handler (function pass) declared for the work-item loop
+   * based kernel transformation; see WorkitemLoops.cc for the
+   * implementation of the members.
+   */
+  class WorkitemLoops : public pocl::WorkitemHandler {
+
+  public:
+    static char ID;
+
+    /* All the pointer and scalar members start from a defined value so
+       that no indeterminate value can be read before the pass sets them. */
+    WorkitemLoops()
+      : pocl::WorkitemHandler(ID),
+        DT(NULL), LI(NULL), PDT(NULL),
+        original_parallel_regions(NULL),
+        tempInstructionIndex(0),
+        localIdXFirstVar(NULL),
+        maxDim(0),
+        lsizeX(NULL), lsizeY(NULL), lsizeZ(NULL) {}
+
+    virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+    virtual bool runOnFunction(llvm::Function &F);
+
+  private:
+
+    typedef std::vector<llvm::BasicBlock *> BasicBlockVector;
+    typedef std::set<llvm::Instruction* > InstructionIndex;
+    typedef std::vector<llvm::Instruction* > InstructionVec;
+    typedef std::map<std::string, llvm::Instruction*> StrInstructionMap;
+
+    InstructionIndex workGroupVariables;
+
+    llvm::DominatorTree *DT;
+    llvm::LoopInfo *LI;
+    llvm::PostDominatorTree *PDT;
+
+    ParallelRegion::ParallelRegionVector *original_parallel_regions;
+
+    // Context arrays created so far, keyed by their generated names.
+    StrInstructionMap contextArrays;
+
+    virtual bool ProcessFunction(llvm::Function &F);
+
+    void FixMultiRegionVariables(ParallelRegion *region);
+    void AddContextSaveRestore(llvm::Instruction *instruction);
+
+    llvm::Instruction *AddContextSave(llvm::Instruction *instruction, llvm::Instruction *alloca);
+    llvm::Instruction *AddContextRestore
+      (llvm::Value *val, llvm::Instruction *alloca,
+       llvm::Instruction *before=NULL,
+       bool isAlloca=false);
+    llvm::Instruction *GetContextArray(llvm::Instruction *val);
+
+    std::pair<llvm::BasicBlock *, llvm::BasicBlock *>
+    CreateLoopAround
+      (ParallelRegion &region, llvm::BasicBlock *entryBB, llvm::BasicBlock *exitBB,
+       bool peeledFirst, llvm::Value *localIdVar, size_t LocalSizeForDim,
+       bool addIncBlock=true, llvm::Instruction *lsizeDim=NULL);
+    void FindKernelDim(llvm::Function &F);
+
+    llvm::BasicBlock *
+    AppendIncBlock
+      (llvm::BasicBlock* after,
+       llvm::Value *localIdVar);
+
+    ParallelRegion* RegionOfBlock(llvm::BasicBlock *bb);
+
+    bool ShouldNotBeContextSaved(llvm::Instruction *instr);
+
+    // Running integer ids given to unnamed instructions when a name for
+    // their context array has to be generated.
+    std::map<llvm::Instruction*, unsigned> tempInstructionIds;
+    size_t tempInstructionIndex;
+    // An alloca in the kernel which stores the first iteration to execute
+    // in the inner (dimension 0) loop. This is set to 1 in a peeled iteration
+    // to skip the 0, 0, 0 iteration in the loops.
+    llvm::Value *localIdXFirstVar;
+
+    // Used as the number of dimensions when linearizing work-item indices
+    // and sizing the context arrays.
+    unsigned int maxDim;
+    // Local size values of the x, y and z dimensions.
+    llvm::Instruction *lsizeX, *lsizeY, *lsizeZ;
+  };
+}
+
+#endif
diff --git a/src/llvmopencl/WorkitemReplication.cc b/src/llvmopencl/WorkitemReplication.cc
new file mode 100644
index 0000000..b6ea3cd
--- /dev/null
+++ b/src/llvmopencl/WorkitemReplication.cc
@@ -0,0 +1,308 @@
+// LLVM function pass to replicate the kernel body for all work items
+// in a work group.
+//
+// Copyright (c) 2011-2012 Carlos Sánchez de La Lama / URJC and
+// Pekka Jääskeläinen / TUT
+// Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#define DEBUG_TYPE "workitem"
+
+#include "WorkitemReplication.h"
+#include "Workgroup.h"
+#include "Barrier.h"
+#include "Kernel.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "config.h"
+#ifdef LLVM_3_1
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/ValueSymbolTable.h"
+#elif defined LLVM_3_2
+#include "llvm/IRBuilder.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/ValueSymbolTable.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#endif
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "WorkitemHandlerChooser.h"
+
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <vector>
+
+//#define DEBUG_BB_MERGING
+//#define DUMP_RESULT_CFG
+//#define DEBUG_PR_REPLICATION
+
+#ifdef DUMP_RESULT_CFG
+#include "llvm/Analysis/CFGPrinter.h"
+#endif
+
+using namespace llvm;
+using namespace pocl;
+
+STATISTIC(ContextValues, "Number of SSA values which have to be context-saved");
+STATISTIC(ContextSize, "Context size per workitem in bytes");
+
+// Registers the pass under the name "workitemrepl". The anonymous
+// namespace already gives the registration object internal linkage.
+namespace {
+  RegisterPass<WorkitemReplication>
+  X("workitemrepl", "Workitem replication pass");
+}
+
+// Definition of the static pass ID member.
+char WorkitemReplication::ID = 0;
+
+void
+WorkitemReplication::getAnalysisUsage(AnalysisUsage &AU) const
+{
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<LoopInfo>();
+
+/* TODO - removed due to compilation error.
+ The TargetData/DataLayout requirement below is compiled out by the
+ '#if 0', so this pass declares only DominatorTree, LoopInfo and
+ WorkitemHandlerChooser as required analyses.
+ NOTE(review): ProcessFunction() still calls getAnalysis<TargetData>() /
+ getAnalysis<DataLayout>() - confirm that this works without the
+ requirement being declared here. */
+#if 0
+#ifdef LLVM_3_1
+ AU.addRequired<TargetData>();
+#else
+ AU.addRequired<DataLayout>();
+#endif
+#endif
+ AU.addRequired<pocl::WorkitemHandlerChooser>();
+}
+
+/**
+ * Pass entry point. Processes the function only if it is a kernel to
+ * process and the work-item handler chooser selected full replication.
+ *
+ * @return true if the function was changed, false otherwise.
+ */
+bool
+WorkitemReplication::runOnFunction(Function &F)
+{
+  /* Bail out unless this is a kernel that was assigned to this handler. */
+  if (!Workgroup::isKernelToProcess(F) ||
+      getAnalysis<pocl::WorkitemHandlerChooser>().chosenHandler() !=
+      pocl::WorkitemHandlerChooser::POCL_WIH_FULL_REPLICATION)
+    return false;
+
+  DT = &getAnalysis<DominatorTree>();
+  LI = &getAnalysis<LoopInfo>();
+
+  const bool replicated = ProcessFunction(F);
+#ifdef DUMP_RESULT_CFG
+  FunctionPass* cfgPrinter = createCFGPrinterPass();
+  cfgPrinter->runOnFunction(F);
+#endif
+
+  /* Runs regardless of the result of the replication above. */
+  const bool fixedUses = fixUndominatedVariableUses(DT, F);
+  return replicated || fixedUses;
+}
+
+bool
+WorkitemReplication::ProcessFunction(Function &F)
+{ /* Replicates the parallel regions of the kernel F for all the work-items:
+ for every work-item except (0, 0, 0) each original parallel region is
+ replicated, remapped and chained after the same region of the previous
+ work-item (x varies fastest in the linear order), and insertPrologue(x, y, z)
+ is called on every region copy. Always returns true. */
+ Module *M = F.getParent();
+
+// F.viewCFG();
+
+ Kernel *K = cast<Kernel> (&F);
+ Initialize(K);
+
+ // Number of work-items; sizes parallel_regions and reference_map below.
+ // The reference maps are allocated further down. Workitem 0 does not need one.
+ unsigned workitem_count = LocalSizeZ * LocalSizeY * LocalSizeX;
+
+ BasicBlockVector original_bbs;
+ for (Function::iterator i = F.begin(), e = F.end(); i != e; ++i) {
+ if (!Barrier::hasBarrier(i))
+ original_bbs.push_back(i);
+ }
+
+ ParallelRegion::ParallelRegionVector* original_parallel_regions =
+ K->getParallelRegions(LI);
+
+ std::vector<SmallVector<ParallelRegion *, 8> > parallel_regions(workitem_count);
+
+ parallel_regions[0] = *original_parallel_regions;
+
+ /* Enable to get region identification printouts */
+#if 0
+ for (ParallelRegion::ParallelRegionVector::iterator
+ i = original_parallel_regions->begin(),
+ e = original_parallel_regions->end();
+ i != e; ++i)
+ {
+ ParallelRegion *region = (*i);
+ region->InjectRegionPrintF();
+ region->InjectVariablePrintouts();
+ }
+#endif
+
+ /* Measure the required context (variables alive in more than one region).
+ The result only feeds the ContextValues and ContextSize statistics.
+ NOTE(review): getAnalysisUsage() has the TargetData/DataLayout requirement
+ compiled out with '#if 0', yet the analysis is fetched here - confirm. */
+#ifdef LLVM_3_1
+ TargetData &TD = getAnalysis<TargetData>();
+#else
+ DataLayout &TD = getAnalysis<DataLayout>();
+#endif
+
+ for (SmallVector<ParallelRegion *, 8>::iterator
+ i = original_parallel_regions->begin(), e = original_parallel_regions->end();
+ i != e; ++i) {
+ ParallelRegion *pr = (*i);
+
+ for (ParallelRegion::iterator i2 = pr->begin(), e2 = pr->end();
+ i2 != e2; ++i2) {
+ BasicBlock *bb = (*i2);
+
+ for (BasicBlock::iterator i3 = bb->begin(), e3 = bb->end();
+ i3 != e3; ++i3) {
+ for (Value::use_iterator i4 = i3->use_begin(), e4 = i3->use_end();
+ i4 != e4; ++i4) {
+ // Instructions can only be used by instructions.
+ Instruction *user = cast<Instruction> (*i4);
+
+ if (find (pr->begin(), pr->end(), user->getParent()) ==
+ pr->end()) {
+ // User is not in the defining region: count the value once and stop scanning its uses.
+ ++ContextValues;
+ ContextSize += TD.getTypeAllocSize(i3->getType());
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ /* Then replicate the ParallelRegions. reference_map has one value map per
+ work-item other than work-item 0; the map of the work-item with linear
+ index 'index' is reference_map[index - 1].
+ NOTE(review): workitem_count is unsigned, so 'workitem_count - 1' wraps
+ around if any local size is 0 - presumably never the case; verify. */
+ ValueToValueMapTy *const reference_map = new ValueToValueMapTy[workitem_count - 1];
+ for (int z = 0; z < LocalSizeZ; ++z) {
+ for (int y = 0; y < LocalSizeY; ++y) {
+ for (int x = 0; x < LocalSizeX ; ++x) {
+
+ int index =
+ (LocalSizeY * LocalSizeX * z + LocalSizeX * y + x);
+
+ if (index == 0)
+ continue;
+
+ for (SmallVector<ParallelRegion *, 8>::iterator
+ i = original_parallel_regions->begin(),
+ e = original_parallel_regions->end();
+ i != e; ++i) {
+ ParallelRegion *original = (*i);
+ ParallelRegion *replicated =
+ original->replicate
+ (reference_map[index - 1],
+ (".wi_" + Twine(x) + "_" + Twine(y) + "_" + Twine(z)));
+ if (AddWIMetadata)
+ replicated->AddIDMetadata(M->getContext(), x, y, z);
+ parallel_regions[index].push_back(replicated);
+#ifdef DEBUG_PR_REPLICATION
+ std::cerr << "### new replica:" << std::endl;
+ replicated->dump();
+#endif
+ }
+ }
+ }
+ }
+ if (AddWIMetadata) {
+ for (SmallVector<ParallelRegion *, 8>::iterator
+ i = original_parallel_regions->begin(),
+ e = original_parallel_regions->end();
+ i != e; ++i) {
+ ParallelRegion *original = (*i);
+ original->AddIDMetadata(M->getContext(), 0, 0, 0);
+ }
+ }
+
+ for (int z = 0; z < LocalSizeZ; ++z) {
+ for (int y = 0; y < LocalSizeY; ++y) {
+ for (int x = 0; x < LocalSizeX ; ++x) {
+
+ int index =
+ (LocalSizeY * LocalSizeX * z + LocalSizeX * y + x);
+
+ for (unsigned i = 0, e = parallel_regions[index].size(); i != e; ++i) {
+ ParallelRegion *region = parallel_regions[index][i];
+ if (index != 0) {
+ region->remap(reference_map[index - 1]);
+ region->chainAfter(parallel_regions[index - 1][i]);
+ region->purge();
+ }
+ region->insertPrologue(x, y, z);
+ }
+ }
+ }
+ }
+
+ // Try to merge all workitem first block of each region
+ // together (for PHI predecessor correctness). Iterates from the last work-item down; work-item 0 is skipped.
+ for (int z = LocalSizeZ - 1; z >= 0; --z) {
+ for (int y = LocalSizeY - 1; y >= 0; --y) {
+ for (int x = LocalSizeX - 1; x >= 0; --x) {
+
+ int index =
+ (LocalSizeY * LocalSizeX * z + LocalSizeX * y + x);
+
+ if (index == 0)
+ continue;
+
+ for (unsigned i = 0, e = parallel_regions[index].size(); i != e; ++i) {
+ ParallelRegion *region = parallel_regions[index][i];
+ BasicBlock *entry = region->entryBB();
+
+ assert (entry != NULL);
+ BasicBlock *pred = entry->getUniquePredecessor();
+ assert (pred != NULL && "No unique predecessor.");
+#ifdef DEBUG_BB_MERGING
+ std::cerr << "### pred before merge into predecessor " << std::endl;
+ pred->dump();
+ std::cerr << "### entry before merge into predecessor " << std::endl;
+ entry->dump();
+#endif
+ movePhiNodes(entry, pred);
+ }
+ }
+ }
+ }
+
+ // Add the suffixes to original (wi_0_0_0) basic blocks. Only the blocks collected into original_bbs above (those without a barrier) are renamed.
+ for (BasicBlockVector::iterator i = original_bbs.begin(),
+ e = original_bbs.end();
+ i != e; ++i)
+ (*i)->setName((*i)->getName() + ".wi_0_0_0");
+
+ // Initialize local size variables (done at the end to avoid unnecessary
+ // replication).
+ K->addLocalSizeInitCode(LocalSizeX, LocalSizeY, LocalSizeZ);
+
+ delete [] reference_map;
+
+// F.viewCFG();
+
+ return true;
+}
+
diff --git a/src/llvmopencl/WorkitemReplication.h b/src/llvmopencl/WorkitemReplication.h
new file mode 100644
index 0000000..fb5d9d4
--- /dev/null
+++ b/src/llvmopencl/WorkitemReplication.h
@@ -0,0 +1,62 @@
+// Header for WorkitemReplication function pass.
+//
+// Copyright (c) 2011 Universidad Rey Juan Carlos and
+// 2012 Pekka Jääskeläinen / TUT
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef _POCL_WORKITEM_REPLICATION_H
+#define _POCL_WORKITEM_REPLICATION_H
+
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <map>
+#include <vector>
+#include "WorkitemHandler.h"
+
+namespace pocl {
+ class Workgroup;
+
+  /**
+   * Work-item handler (function pass) that replicates the kernel body
+   * for all the work-items of a work-group.
+   */
+  class WorkitemReplication : public pocl::WorkitemHandler {
+
+  public:
+    static char ID;
+
+    WorkitemReplication() : pocl::WorkitemHandler(ID) {}
+
+    virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
+    virtual bool runOnFunction(llvm::Function &F);
+
+  private:
+
+    // Container aliases used by the implementation.
+    typedef std::set<llvm::BasicBlock *> BasicBlockSet;
+    typedef std::vector<llvm::BasicBlock *> BasicBlockVector;
+    typedef std::map<llvm::Value *, llvm::Value *> ValueValueMap;
+
+    // Analysis results; assigned by runOnFunction() before
+    // ProcessFunction() is called.
+    llvm::DominatorTree *DT;
+    llvm::LoopInfo *LI;
+
+    virtual bool ProcessFunction(llvm::Function &F);
+  };
+}
+
+#endif
diff --git a/src/llvmopencl/config.h b/src/llvmopencl/config.h
new file mode 100644
index 0000000..1f1ed9d
--- /dev/null
+++ b/src/llvmopencl/config.h
@@ -0,0 +1 @@
+// Empty on purpose. Satisfies includes from other files.
diff --git a/src/llvmopencl/pocl.h b/src/llvmopencl/pocl.h
new file mode 100644
index 0000000..ae6a66d
--- /dev/null
+++ b/src/llvmopencl/pocl.h
@@ -0,0 +1,49 @@
+/* pocl.h - global pocl declarations.
+
+ Copyright (c) 2011 Universidad Rey Juan Carlos
+ 2011-2014 Pekka Jääskeläinen / Tampere University of Technology
+ Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+/**
+ * @file pocl.h
+ *
+ * This file holds the declarations that are shared by both the libpocl
+ * CL implementation and the kernel compiler. Any other declaration should
+ * be moved to pocl_cl.h in lib/CL or into the kernel compiler directory.
+ * @todo Check if there are extra declarations here that could be moved.
+ */
+#ifndef POCL_H
+#define POCL_H
+
+/*
+ * During pocl kernel compiler transformations we use the fixed address
+ * space ids of clang's -ffake-address-space-map to mark the different
+ * address spaces to keep the processing target-independent. These
+ * are converted to the target's address space map (if any), in a final
+ * kernel compiler pass.
+ */
+#define POCL_ADDRESS_SPACE_PRIVATE 0 /* id used for the private address space */
+#define POCL_ADDRESS_SPACE_GLOBAL 1 /* id used for the global address space */
+#define POCL_ADDRESS_SPACE_LOCAL 2 /* id used for the local address space */
+#define POCL_ADDRESS_SPACE_CONSTANT 3 /* id used for the constant address space */
+
+#endif /* POCL_H */
diff --git a/src/runtime/CMakeLists.txt b/src/runtime/CMakeLists.txt
new file mode 100644
index 0000000..f3d34ab
--- /dev/null
+++ b/src/runtime/CMakeLists.txt
@@ -0,0 +1,59 @@
+# If building for ARM target host then set appropriate clang target
+# Needs to match what's used when using clang to build the kernel
+# See compiler.cpp
+# NOTE(review): within this file HOST_TARGET is only set under HAWKING_BUILD;
+# when it is unset it expands to nothing in CUSTOM_COMMAND below.
+if (HAWKING_BUILD)
+ set(HOST_TARGET -target spir-unknown-unknown-unknown)
+endif()
+
+# If Shamrock build, then we use the builtins.lib built in ../builtins
+if (SHAMROCK_BUILD)
+# embed.py <outfile> <inputs...>: turns builtins.lib into a C header holding
+# its bytes as a char array (stdlib.c.bc.embed.h).
+add_custom_command(
+ OUTPUT stdlib.c.bc.embed.h
+ COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/embed.py
+ ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc.embed.h
+ ${CMAKE_CURRENT_BINARY_DIR}/../builtins/builtins.lib
+ DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/embed.py
+ ${CMAKE_CURRENT_BINARY_DIR}/../builtins/builtins.lib)
+
+# Named target other directories can depend on to get the embedded header.
+add_custom_target(generate_stdlib_c DEPENDS
+ ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc.embed.h)
+# otherwise, this stdlib.c is still being used (but is empty)
+else (SHAMROCK_BUILD)
+ # clang invocation that compiles OpenCL C (-x cl) to LLVM bitcode (-emit-llvm).
+ set(CUSTOM_COMMAND
+ ${CLANG_EXECUTABLE} -c -emit-llvm -x cl -O2 ${HOST_TARGET} -nostdinc -fno-builtin)
+
+# Step 1: stdlib.c -> stdlib.c.bc. Depends on the generated stdlib_impl.h
+# because stdlib.c includes it.
+add_custom_command(
+ OUTPUT stdlib.c.bc
+ COMMAND ${CUSTOM_COMMAND}
+ -I${OCL_BUILTINS_DIR}/include
+ ${CMAKE_CURRENT_SOURCE_DIR}/stdlib.c
+ -I${CMAKE_CURRENT_BINARY_DIR}
+ -o ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc
+ DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/stdlib.c
+ ${CMAKE_CURRENT_BINARY_DIR}/stdlib_impl.h)
+
+# Step 2: stdlib.c.bc -> stdlib.c.bc.embed.h (bitcode embedded as a char array).
+add_custom_command(
+ OUTPUT stdlib.c.bc.embed.h
+ COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/embed.py
+ ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc.embed.h
+ ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc
+ DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/embed.py
+ ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc)
+
+add_custom_target(generate_stdlib_c DEPENDS
+ ${CMAKE_CURRENT_BINARY_DIR}/stdlib.c.bc.embed.h)
+
+# builtins.py <def> <outdir>: generates the four headers listed in OUTPUT from
+# builtins.def into the binary directory.
+add_custom_command(
+ OUTPUT builtins_def.h stdlib_def.h builtins_impl.h stdlib_impl.h
+ COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/builtins.py
+ ${CMAKE_CURRENT_SOURCE_DIR}/builtins.def
+ ${CMAKE_CURRENT_BINARY_DIR}
+ DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/builtins.py
+ ${CMAKE_CURRENT_SOURCE_DIR}/builtins.def)
+
+# Named target that forces generation of all four builtins.py outputs.
+add_custom_target(generate_builtins DEPENDS
+ ${CMAKE_CURRENT_BINARY_DIR}/builtins_def.h
+ ${CMAKE_CURRENT_BINARY_DIR}/builtins_impl.h
+ ${CMAKE_CURRENT_BINARY_DIR}/stdlib_def.h
+ ${CMAKE_CURRENT_BINARY_DIR}/stdlib_impl.h)
+endif(SHAMROCK_BUILD)
diff --git a/src/runtime/builtins.def b/src/runtime/builtins.def
new file mode 100644
index 0000000..b94807b
--- /dev/null
+++ b/src/runtime/builtins.def
@@ -0,0 +1,301 @@
+def vecf : float2 float3 float4 float8 float16
+def veci : int2 int3 int4 int8 int16
+
+def vec : $vecf $veci
+def gentype : float $vecf
+
+// gentype acos(gentype)
+// REPL is defined in src/core/cpu/builtins.cpp
+//native float acos float : x:float
+ //return std::acos(x);
+//end
+
+//native $type acos $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::acos(x[i]);
+//end
+
+// gentype acosh(gentype)
+//native float acosh float : x:float
+ //return boost::math::acosh(x);
+//end
+
+//native $type acosh $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = boost::math::acosh(x[i]);
+//end
+
+// gentype acospi(gentype)
+//func float acospi float : x:float
+ //return acos(x) / M_PI;
+//end
+
+//native $type acospi $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::acos(x[i]) / M_PI;
+//end
+
+// gentype asin (gentype)
+//native float asin float : x:float
+ //return std::asin(x);
+//end
+
+//native $type asin $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::asin(x[i]);
+//end
+
+// gentype asinh (gentype)
+//native float asinh float : x:float
+ //return boost::math::asinh(x);
+//end
+
+//native $type asinh $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = boost::math::asinh(x[i]);
+//end
+
+// gentype asinpi (gentype x)
+//func float asinpi float : x:float
+ //return asin(x) / M_PI;
+//end
+
+//native $type asinpi $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::asin(x[i]) / M_PI;
+//end
+
+// gentype atan (gentype y_over_x)
+//native float atan float : y_over_x:float
+ //return std::atan(y_over_x);
+//end
+
+//native $type atan $vecf : y_over_x:$type
+ //REPL($vecdim)
+ //result[i] = std::atan(y_over_x[i]);
+//end
+
+// gentype atan2 (gentype y, gentype x)
+//func float atan2 float : x:float y:float
+ //return atan(y / x);
+//end
+
+//native $type atan2 $vecf : x:$type y:$type
+ //REPL($vecdim)
+ //result[i] = std::atan(y[i] / x[i]);
+//end
+
+// gentype atanh (gentype)
+//native float atanh float : x:float
+ //return boost::math::atanh(x);
+//end
+
+//native $type atanh $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = boost::math::atanh(x[i]);
+//end
+
+// gentype atanpi (gentype x)
+//func float atanpi float : x:float
+ //return atan(x) / M_PI;
+//end
+
+//native $type atanpi $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::atan(x[i]) / M_PI;
+//end
+
+// gentype atan2pi (gentype y, gentype x)
+//func float atan2pi float : x:float y:float
+ //return atan2(y, x) / M_PI;
+//end
+//
+//native $type atan2pi $vecf : x:$type y:$type
+ //REPL($vecdim)
+ //result[i] = std::atan(y[i] / x[i]) / M_PI;
+//end
+
+// gentype cbrt (gentype)
+//native float cbrt float : x:float
+ //return boost::math::cbrt(x);
+//end
+//
+//native $type cbrt $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = boost::math::cbrt(x[i]);
+//end
+
+// gentype ceil (gentype)
+//native float ceil float : x:float
+ //return std::ceil(x);
+//end
+//
+//native $type ceil $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::ceil(x[i]);
+//end
+
+// gentype copysign (gentype x, gentype y)
+//func $type copysign $gentype : x:$type y:$type
+ //return (
+ //(x < 0.0f & y > 0.0f) |
+ //(x > 0.0f & y < 0.0f)
+ //? -x : x);
+//end
+
+//gentype cos (gentype)
+//native float cos float : x:float
+ //return std::cos(x);
+//end
+
+//native $type cos $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::cos(x[i]);
+//end
+
+// gentype cosh (gentype)
+//native float cosh float : x:float
+ //return std::cosh(x);
+//end
+
+//native $type cosh $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::cosh(x[i]);
+//end
+
+// gentype cospi (gentype x)
+//func $type cospi $gentype : x:$type
+ //return cos(x * (float)M_PI);
+//end
+
+// TODO: gentype erfc (gentype)
+// TODO: gentype erf (gentype)
+
+// gentype exp(gentype x)
+//native float exp float : x:float
+ //return std::exp(x);
+//end
+//
+//native $type exp $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::exp(x[i]);
+//end
+//
+// gentype exp2(gentype x)
+//native float exp2 float : x:float
+ //return exp2f(x);
+//end
+//
+//native $type exp2 $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = exp2f(x[i]);
+//end
+//
+//// gentype exp10(gentype x)
+//native float exp10 float : x:float
+ //return exp10f(x);
+//end
+//
+//native $type exp10 $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = exp10f(x[i]);
+//end
+//
+//// gentype expm1(gentype x)
+//func $type expm1 $gentype : x:$type
+ //return exp(x) - 1.0f;
+//end
+//
+//// gentype fdim(x, y)
+//func $type fdim $gentype : x:$type y:$type
+ //return (x > y ? x - y : 0.0f);
+//end
+//
+// gentype floor(gentype x) (TODO: SSE fast path : float->int->float)
+//native float floor float : x:float
+ //return std::floor(x);
+//end
+//
+//native $type floor $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::floor(x[i]);
+//end
+//
+//// gentype fma(a, b, c) : a*b + c (TODO)
+//func $type fma $gentype : a:$type b:$type c:$type
+ //return (a * b) + c;
+//end
+//
+//// gentype trunc(x)
+//native float trunc float : x:float
+ //return boost::math::trunc(x);
+//end
+//
+//native $type trunc $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = boost::math::trunc(x[i]);
+//end
+//
+//// gentype fmod(x, y)
+//func $type fmod $gentype : x:$type y:$type
+ //return x - y * trunc(x / y);
+//end
+//
+// gentype fract(gentype x, gentype *iptr)
+//func $type fract $gentype : x:$type iptr:*$type
+ //*iptr = floor(x);
+ //return fmin(x - *iptr, 0x1.fffffep-1f);
+//end
+
+// gentype frexp(gentype x, intn *exp)
+//native float frexp float : x:float exp:*int
+ //return std::frexp(x, exp);
+//end
+//
+//native $type frexp $vecf : x:$type exp:*int$vecdim
+ //REPL($vecdim)
+ //result[i] = std::frexp(x[i], &exp[i]);
+//end
+//
+//// gentype sqrt(gentype x)
+//native float sqrt float : x:float
+ //return std::sqrt(x);
+//end
+//
+//native double sqrt double : x:double
+ //return std::sqrt(x);
+//end
+//
+//native double log double : x:double
+ //return std::log(x);
+//end
+//
+//native $type sqrt $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = std::sqrt(x[i]);
+//end
+//
+//// gentype hypot(gentype x, gentype y)
+//func $type hypot $gentype : x:$type y:$type
+ //return sqrt(x*x + y*y);
+//end
+
+// intn ilogb(gentype x)
+//native int ilogb float : x:float
+ //return ilogb(x);
+//end
+
+//native int$vecdim ilogb $vecf : x:$type
+ //REPL($vecdim)
+ //result[i] = ilogb(x[i]);
+//end
+
+// gentype ldexp(gentype x, intn n)
+//native float ldexp float : x:float n:int
+ //return std::ldexp(x, n);
+//end
+
+//native $type ldexp $vecf : x:$type n:int$vecdim
+ //REPL($vecdim)
+ //result[i] = std::ldexp(x[i], n[i]);
+//end
diff --git a/src/runtime/builtins.py b/src/runtime/builtins.py
new file mode 100755
index 0000000..909fee8
--- /dev/null
+++ b/src/runtime/builtins.py
@@ -0,0 +1,380 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+# Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the copyright holder nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+# builtins.py <def> <outdir>
+
+import sys
+
+# One builtin parsed from the definition file: its name, return type, argument
+# list, the concrete types it is instantiated for, and its body text. write()
+# renders it as C/OpenCL C text in one of the KIND_* flavours below.
+class Function:
+ # A single formal argument: its name and its (possibly templated) type string.
+ class Arg:
+ def __init__(self, name, t):
+ self.name = name
+ self.t = t
+
+ KIND_BUILTINS_IMPL = 0 # static function in builtins.cpp
+ KIND_BUILTINS_DEF = 1 # if (name == '__cpu_$name') return (void *)&name;
+ KIND_STDLIB_IMPL = 2 # OpenCL C function in stdlib.c
+ KIND_STDLIB_DEF = 3 # Header in stdlib.h
+ KIND_STDLIB_STUB = 4 # OpenCL C stub in stdlib.c: calls __cpu_$name
+ KIND_STDLIB_STUB_DEF = 5 # __cpu_$name declared in stdlib.c
+
+ # name: function name as written in the definition file.
+ # native: True when the definition line started with 'native' (see Parser).
+ def __init__(self, name, native):
+ self.name = name
+ self.native = native
+
+ self.args = [] # Array <Arg>
+ self.types = [] # Array <str>
+ self.return_type = ''
+ self.body = ''
+
+ def set_return_type(self, ty):
+ self.return_type = ty
+
+ # Appends raw text to the body; no separator or newline is inserted.
+ def append_body(self, body):
+ self.body += body
+
+ def add_arg(self, name, ty):
+ self.args.append(self.Arg(name, ty))
+
+ def add_type(self, ty):
+ self.types.append(ty)
+
+ # Builds the C symbol name for the instantiation on current_type:
+ # '<return type>_<name>', then, if there are arguments, a single '_' followed
+ # by every resolved argument type concatenated without separators, with '*'
+ # written as 'p'.
+ def mangled_name(self, current_type):
+ return_type = self.process_type_name(current_type, self.return_type)
+
+ rs = return_type + '_' + self.name
+ first = True
+
+ for a in self.args:
+ # Only the first argument is preceded by '_'
+ if first:
+ rs += '_'
+ first = False
+
+ arg_type = self.process_type_name(current_type, a.t)
+ rs += arg_type.replace('*', 'p')
+
+ return rs
+
+ # Expands the '$vecdim' and '$type' placeholders of type_name for
+ # current_type. The vector dimension is the trailing one or two digits of
+ # current_type, or '1' when current_type does not end in a digit.
+ # Indexes current_type[-1] (and [-2]) directly, so current_type must not be
+ # empty.
+ def process_type_name(self, current_type, type_name):
+ # Current vector dimension
+ vecdim = '1'
+
+ if current_type[-1].isdigit():
+ if current_type[-2].isdigit():
+ vecdim = current_type[-2:]
+ else:
+ vecdim = current_type[-1]
+
+ # $vecdim expansion
+ return type_name.replace('$vecdim', vecdim).replace('$type', current_type)
+
+ # Returns the comma-separated formal argument list for current_type.
+ # When handle_first_arg is true, vector types (those ending in a digit) are
+ # reduced to their element type and passed as pointers, and a leading
+ # 'result' argument is added if the return type is a vector.
+ def arg_list(self, current_type, handle_first_arg):
+ rs = ''
+ first = True
+ append_arg = None
+
+ # We may need a first "result" arg
+ if handle_first_arg:
+ return_type = self.process_type_name(current_type, self.return_type)
+
+ if return_type[-1].isdigit():
+ # Return is a vector
+ append_arg = self.Arg('result', return_type)
+
+ if append_arg:
+ args = [append_arg] + self.args
+ else:
+ args = self.args
+
+ for arg in args:
+ # Resolve type
+ arg_type = self.process_type_name(current_type, arg.t)
+
+ # A leading '*' in the definition file marks a pointer argument
+ if arg_type[0] == '*':
+ arg_ptr = True
+ arg_type = arg_type[1:]
+ else:
+ arg_ptr = False
+
+ # We need to pass vector arguments as pointers
+ arg_vector = False
+ if handle_first_arg:
+ arg_vector = arg_type[-1].isdigit()
+ arg_type = arg_type.rstrip('0123456789')
+
+ # Build the string
+ if not first:
+ rs += ', '
+ first = False
+
+ rs += arg_type + ' '
+
+ if arg_vector or arg_ptr:
+ rs += '*'
+
+ rs += arg.name
+
+ return rs
+
+ # Renders this function for current_type in the flavour selected by kind
+ # (one of the KIND_* constants) and returns the generated text.
+ def write(self, current_type, kind):
+ # Template:
+ # (static) $ret_type $name($args) {
+ # $body
+ # }
+ rs = ''
+
+ if kind == self.KIND_BUILTINS_IMPL:
+ rs = 'static '
+ elif kind == self.KIND_BUILTINS_DEF:
+ # Lookup-table entry only: no signature or body is emitted for this kind
+ rs += ' else if (name == "__cpu_' + self.mangled_name(current_type) + '")\n'
+ rs += ' return (void *)&' + self.mangled_name(current_type) + ';\n'
+ return rs
+
+ # Calculate return type
+ return_type = self.process_type_name(current_type, self.return_type)
+
+ if (kind == self.KIND_BUILTINS_IMPL or kind == self.KIND_STDLIB_STUB_DEF) \
+ and return_type[-1].isdigit():
+ return_type = 'void' # We'll use a 'result' argument
+
+ rs += return_type + ' '
+
+ # Append mangled name if needed
+ if kind == self.KIND_BUILTINS_IMPL:
+ rs += self.mangled_name(current_type)
+ elif kind == self.KIND_STDLIB_STUB_DEF:
+ rs += '__cpu_' + self.mangled_name(current_type)
+ else:
+ # No need to mangle the name, but add OVERLOAD
+ rs += '_CLC_OVERLOAD ' + self.name
+
+ # Print function args
+ rs += '('
+ rs += self.arg_list(current_type, kind == self.KIND_BUILTINS_IMPL or \
+ kind == self.KIND_STDLIB_STUB_DEF)
+ rs += ')'
+
+ # If only a declaration, end it
+ if kind == self.KIND_STDLIB_DEF or kind == self.KIND_STDLIB_STUB_DEF:
+ rs += ';\n'
+ return rs
+
+ # Add the body
+ rs += '\n{\n'
+
+ if kind == self.KIND_STDLIB_STUB:
+ # Special body : call __cpu_$name
+ return_is_vector = return_type[-1].isdigit()
+ if return_is_vector:
+ # Need to create a temporary
+ rs += ' ' + return_type + ' result;\n'
+ rs += '\n'
+
+ # Call the cpu stub
+ rs += ' '
+ if not return_is_vector:
+ rs += 'return '
+
+ rs += '__cpu_' + self.mangled_name(current_type) + '('
+
+ # Pass the result if needed
+ first = True
+ if return_is_vector:
+ # Cast the vector temporary to a pointer to its element type
+ rs += '(' + return_type.rstrip('0123456789') + ' *)&result'
+ first = False
+
+ # Append the args
+ for arg in self.args:
+ # Resolve type
+ arg_type = self.process_type_name(current_type, arg.t)
+
+ arg_ptr = False
+ if arg_type[0] == '*':
+ arg_type = arg_type[1:]
+ arg_ptr = True
+
+ arg_vector = arg_type[-1].isdigit()
+
+ if not first:
+ rs += ', '
+ first = False
+
+ # We need to pass vector arguments as pointers
+ if arg_vector:
+ rs += '(' + arg_type.rstrip('0123456789') + ' *)'
+ # An argument that is already a pointer is cast but not re-addressed
+ if not arg_ptr:
+ rs += '&'
+
+ rs += arg.name
+
+ # End the call
+ rs += ');\n'
+
+ if return_is_vector:
+ rs += '\n return result;\n'
+
+ rs += '}\n\n'
+ else:
+ # Simply copy the body
+ # (same vector-dimension rule as process_type_name)
+ vecdim = '1'
+
+ if current_type[-1].isdigit():
+ if current_type[-2].isdigit():
+ vecdim = current_type[-2:]
+ else:
+ vecdim = current_type[-1]
+
+ rs += self.body.replace('$type', current_type) \
+ .replace('$vecdim', vecdim)
+ rs += '\n}\n\n'
+
+ return rs
+
+class Generator:
+ builtins_impl_file = 'builtins_impl.h' # static functions
+ builtins_def_file = 'builtins_def.h' # if () in getBuiltin
+ stdlib_impl_file = 'stdlib_impl.h' # stdlib.c functions
+ stdlib_def_file = 'stdlib_def.h' # stdlib.h definitions
+
+ def __init__(self, out_path):
+ self.out_path = out_path
+
+ # Buffers
+ self.builtins_impl_buffer = ''
+ self.builtins_def_buffer = ''
+ self.stdlib_impl_buffer = ''
+ self.stdlib_def_buffer = ''
+
+ def add_function(self, function):
+ for t in function.types:
+ if function.native:
+ self.stdlib_impl_buffer += function.write(t, function.KIND_STDLIB_STUB_DEF)
+ self.stdlib_impl_buffer += function.write(t, function.KIND_STDLIB_STUB)
+ self.stdlib_def_buffer += function.write(t, function.KIND_STDLIB_DEF)
+ self.builtins_impl_buffer += function.write(t, function.KIND_BUILTINS_IMPL)
+ self.builtins_def_buffer += function.write(t, function.KIND_BUILTINS_DEF)
+ else:
+ self.stdlib_def_buffer += function.write(t, function.KIND_STDLIB_DEF)
+ self.stdlib_impl_buffer += function.write(t, function.KIND_STDLIB_IMPL)
+
+ def write(self):
+ of = open(self.out_path + '/' + self.stdlib_def_file, 'w')
+ of.write(self.stdlib_def_buffer)
+ of.close()
+
+ of = open(self.out_path + '/' + self.stdlib_impl_file, 'w')
+ of.write(self.stdlib_impl_buffer)
+ of.close()
+
+ of = open(self.out_path + '/' + self.builtins_def_file, 'w')
+ of.write(self.builtins_def_buffer)
+ of.close()
+
+ of = open(self.out_path + '/' + self.builtins_impl_file, 'w')
+ of.write(self.builtins_impl_buffer)
+ of.close()
+
+class Parser:
+ def __init__(self, generator, def_file_name):
+ self.generator = generator
+ self.def_file_name = def_file_name
+
+ self.defs = {}
+
+ def replace_variable(self, token):
+ result = []
+
+ if token[0] == '$':
+ for tok in self.defs[token[1:]]:
+ result.extend(self.replace_variable(tok))
+ else:
+ result.append(token)
+
+ return result
+
+ def parse(self):
+ def_file = open(self.def_file_name, 'rb')
+ current_function = None
+
+ for line in def_file:
+ if current_function:
+ # End if we encounter an end
+ if line.startswith('end'):
+ self.generator.add_function(current_function)
+ current_function = None
+ else:
+ # Add a line to the body
+ current_function.append_body(line)
+ else:
+ line = line.strip()
+ tokens = line.split(' ')
+ tok = tokens[0]
+
+ if tok == 'def':
+ # A definition : def <variable> : [values]
+ name = tokens[1]
+ values = []
+
+ for token in tokens[3:]:
+ values.extend(self.replace_variable(token))
+
+ self.defs[name] = values
+ elif tok == 'func' or tok == 'native':
+ # Function : func|native <ret_type> <name> [types] : [args]
+ current_function = Function(tokens[2], \
+ tokens[0] == 'native')
+
+ current_function.set_return_type(tokens[1])
+
+ # Explore the types and args
+ in_types = True
+
+ for token in tokens[3:]:
+ if token == ':':
+ in_types = False
+ elif in_types:
+ for ty in self.replace_variable(token):
+ current_function.add_type(ty)
+ else:
+ # Parameters
+ parts = token.split(':')
+ current_function.add_arg(parts[0], parts[1])
+
+ def_file.close()
+
+if __name__ == '__main__':
+ def_file = sys.argv[1]
+ out_dir = sys.argv[2]
+
+ gen = Generator(out_dir)
+ parser = Parser(gen, def_file)
+
+ parser.parse()
+ gen.write()
diff --git a/src/runtime/embed.py b/src/runtime/embed.py
new file mode 100755
index 0000000..e3aca9d
--- /dev/null
+++ b/src/runtime/embed.py
@@ -0,0 +1,76 @@
+#!/usr/bin/python
+# #!/usr/local/bin/python2.6-2.6.4
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the copyright holder nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# embed.py <outfile> <filenames..>
+# <filenames> => <outfile>
+
+import sys
+
+outfile = open(sys.argv[1], 'w')
+name = sys.argv[1].split('/')[-1].replace('.embed.h', '').replace('.', '_')
+
+data = ''
+
+for i in xrange(len(sys.argv) - 1):
+ infile = open(sys.argv[i + 1], 'rb')
+ data += infile.read()
+
+# Header
+outfile.write('#ifndef __%s__\n' % name.upper())
+outfile.write('#define __%s__\n' % name.upper())
+outfile.write('\n')
+outfile.write('const char embed_%s[] =\n' % name)
+
+# Write it in chunks of 80 chars :
+# | "\x00..." (4+1+1 + 4*chars ==> chars = 18)
+index = 0
+
+for c in data:
+ if index == 0:
+ outfile.write(' "')
+
+ outfile.write('\\x%s' % ('%x' % ord(c)).rjust(2, '0'))
+ index += 1
+
+ if index == 18:
+ index = 0
+ outfile.write('"\n')
+
+# We may need to terminate a line
+if index != 0:
+ outfile.write('";\n')
+else:
+ outfile.write(';\n') # Alone on its line, poor semicolon
+
+# Footer
+outfile.write('\n')
+outfile.write('#endif\n')
+
+infile.close()
+outfile.close()
diff --git a/src/runtime/stdlib.c b/src/runtime/stdlib.c
new file mode 100644
index 0000000..9b115df
--- /dev/null
+++ b/src/runtime/stdlib.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * Copyright (c) 2012-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Prototype only: takes a format string followed by variadic arguments and
+ * returns int. NOTE(review): looks printf-like, and the definition is
+ * presumably supplied by the host runtime at link/JIT time -- confirm. */
+int debug(const char *format, ...);
+
+/* WARNING: Due to some device-specific things in stdlib.h, the bitcode stdlib
+ * must only be used by CPUDevice, as it's targeted to the host CPU at Clover's
+ * compilation! */
+
+/*
+ * Built-in functions generated by src/runtime/builtins.py
+ */
+
+#include <clc.h>
+#include <stdlib_impl.h>
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
new file mode 100644
index 0000000..3b4175f
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,30 @@
+# Headers come from the project's include/ directory and from Check;
+# libraries are looked up in the build tree's src/ directory and Check's dirs.
+INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/include ${CHECK_INCLUDE_DIRS})
+LINK_DIRECTORIES(${PROJECT_BINARY_DIR}/src ${CHECK_LIBRARY_DIRS})
+
+# All test sources are linked into a single executable named "tests".
+set(OPENCL_TESTS_SOURCE
+ tests.c
+ test_platform.cpp
+ test_device.cpp
+ test_context.cpp
+ test_commandqueue.cpp
+ test_mem.cpp
+ test_kernel.cpp
+ test_program.cpp
+ test_builtins.cpp
+ )
+
+add_executable(tests ${OPENCL_TESTS_SOURCE})
+target_link_libraries(tests OpenCL z ${CHECK_LIBRARIES} pthread)
+
+# OPENCL_TEST(<executable> <test name>): registers a CTest test called
+# <test name> that runs <executable> with <test name> as its only argument.
+MACRO(OPENCL_TEST EXECUTABLE_NAME TEST_NAME)
+ add_test(${TEST_NAME} ${EXECUTABLE_NAME} ${TEST_NAME})
+ENDMACRO(OPENCL_TEST)
+
+# One CTest entry per test name, all served by the same "tests" binary.
+OPENCL_TEST(tests platform)
+OPENCL_TEST(tests device)
+OPENCL_TEST(tests context)
+OPENCL_TEST(tests commandqueue)
+OPENCL_TEST(tests mem)
+OPENCL_TEST(tests kernel)
+OPENCL_TEST(tests program)
+OPENCL_TEST(tests builtins)
diff --git a/tests/basic_test_failures.lst b/tests/basic_test_failures.lst
new file mode 100644
index 0000000..d5e4871
--- /dev/null
+++ b/tests/basic_test_failures.lst
@@ -0,0 +1,412 @@
+Khronos Basic Test Failures
+===========================
+Khronos Test Version: OpenCL 1.1: April 4, 2010.
+
+Usage:
+% cd opencl_conformance/test_conformance/basic
+% test_basic <test-name>
+
+
+<test-name>:
+Failure Mode:
+Analysis:
+
+hiloeo
+======
+Failure Mode:
+------------
+Runs out of system memory, and crashes the test.
+However, the test is passing all of the subtests before it crashes.
+
+Analysis:
+--------
+valgrind analysis on shamrock showed huge memory leaks around creating and
+deleting programs, which were due to LLVM objects not getting freed. This
+could either be a usage problem, or a bug in LLVM MCJIT execution engine.
+
+
+async_copy_global_to_local.txt
+async_copy_local_to_global.txt
+async_strided_copy_global_to_local.txt
+async_strided_copy_local_to_global.txt
+======================================
+Failure Mode:
+------------
+All of the above 4 tests fail in the same way: Due to the Khronos generated
+CL file not being able to compile. These also fail the same way on
+Keystone EVM (which doesn't use MCJIT).
+
+async_copy_global_to_local...
+Testing char
+program.cl:9:153: error: used type 'event_t' where arithmetic or pointer type is required
+
+ERROR: clBuildProgram failed! (CL_BUILD_PROGRAM_FAILURE from /home/gpitney/opencl_conformance/test_common/harness/kernelHelpers.c:35)
+Original source is: ------------
+
+__kernel void test_fn( const __global char *src, __global char *dst, __local char *localBuffer, int copiesPerWorkgroup, int copiesPerWorkItem )
+{
+ int i;
+ for(i=0; i<copiesPerWorkItem; i++)
+ localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = (char)(char)0;
+ barrier( CLK_LOCAL_MEM_FENCE );
+ event_t event;
+ event = async_work_group_copy( (__local char*)localBuffer, (__global const char*)(src+copiesPerWorkgroup*get_group_id(0)), (size_t)copiesPerWorkgroup, (event_t)0 );
+ wait_group_events( 1, &event );
+ for(i=0; i<copiesPerWorkItem; i++)
+ dst[ get_global_id( 0 )*copiesPerWorkItem+i ] = localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ];
+}
+Build not successful for device "Intel(R) Core(TM) i7-2620M CPU @ 2.70GHz", status: CL_BUILD_ERROR
+Build log for device "Intel(R) Core(TM) i7-2620M CPU @ 2.70GHz" is: ------------
+program.cl:9:153: error: used type 'event_t' where arithmetic or pointer type is required
+
+Analysis:
+--------
+Note the cast of (event_t)0 in the kernel above.
+Per the discussion here: http://comments.gmane.org/gmane.comp.compilers.clang.scm/93008 , it appears the spec is vague on this point, but the Khronos
+test nevertheless expects the cast to compile.
+
+So, it seems a clang patch for OpenCL event_t casts of zero may be required.
+
+kernel_memory_alignment_constant.txt
+====================================
+Failure Mode:
+------------
+This fails due to inability to compile a Khronos test generated CL program.
+
+
+kernel_memory_alignment_constant...
+Device version string: "OpenCL 1.1 "
+Testing char...
+ Testing parameter kernel...
+ Testing constant kernel...
+program.cl:2:17: error: variable in constant address space must be initialized
+program.cl:3:18: error: variable in constant address space must be initialized
+program.cl:4:18: error: variable in constant address space must be initialized
+program.cl:5:18: error: variable in constant address space must be initialized
+program.cl:6:18: error: variable in constant address space must be initialized
+program.cl:7:19: error: variable in constant address space must be initialized
+
+ERROR: clBuildProgram failed! (CL_BUILD_PROGRAM_FAILURE from /home/gpitney/opencl_conformance/test_common/harness/kernelHelpers.c:35)
+Original source is: ------------
+
+ constant char mem0[3];
+ constant char2 mem2[3];
+ constant char3 mem3[3];
+ constant char4 mem4[3];
+ constant char8 mem8[3];
+ constant char16 mem16[3];
+
+kernel void test(global ulong *results)
+{
+ results[0] = (ulong)&mem0;
+ results[1] = (ulong)&mem2;
+ results[2] = (ulong)&mem3;
+ results[3] = (ulong)&mem4;
+ results[4] = (ulong)&mem8;
+ results[5] = (ulong)&mem16;
+}
+
+
+Analysis:
+--------
+Interestingly, this generated CL code compiles on Keystone
+(not using MCJIT, using LLVM 3.3), and the test passes.
+
+Some digging shows this clang error was added after LLVM 3.3
+(LLVM version used by TI Keystone, which explains why it passes there):
+http://lists.cs.uiuc.edu/pipermail/cfe-commits/Week-of-Mon-20131230/096405.html
+
+In this case, the LLVM clang compiler and the Khronos tests are in conflict.
+
+local_kernel_def.txt
+====================
+Failure Mode:
+------------
+This fails due to inability to compile a Khronos test generated CL program.
+
+
+local_kernel_def...
+program.cl:3:23: error: 'tmp_sum' declared as an array with a negative size
+
+ERROR: clBuildProgram failed! (CL_BUILD_PROGRAM_FAILURE from /home/gpitney/opencl_conformance/test_common/harness/kernelHelpers.c:35)
+Original source is: ------------
+__kernel void compute_sum_with_localmem(__global int *a, int n, __global int *sum)
+{
+ __local int tmp_sum[-2147483648];
+ int tid = get_local_id(0);
+ int lsize = get_local_size(0);
+ int i;
+
+[... snip ...]
+
+Analysis:
+--------
+This test also fails on Keystone, but the negative number is (-4).
+
+The Khronos test is casting a size_t value for work group size to an int,
+and printing it into the kernel string using the %d printf() modifier.
+This does not appear to be the right printf() modifier for a size_t, so
+the test code appears to be in error.
+
+parameter_types
+=================
+Failure Mode:
+------------
+Invalid results returned from the test-generated OCL kernel, which uses
+vector parameters of various sizes.
+
+[ ... snip ...]
+Testing vector size 4
+Kernel: __kernel void test_kernel(
+char4 c, uchar4 uc, short4 s, ushort4 us, int4 i, uint4 ui, float4 f,
+__global float4 *result)
+{
+ result[0] = convert_float4(c);
+ result[1] = convert_float4(uc);
+ result[2] = convert_float4(s);
+ result[3] = convert_float4(us);
+ result[4] = convert_float4(i);
+ result[5] = convert_float4(ui);
+ result[6] = f;
+}
+
+Conversion from char4 failed: index 0 got 4.28107e-38, expected 0.
+Conversion from char4 failed: index 2 got 16, expected 2.
+Conversion from char4 failed: index 3 got 1, expected -3.
+Conversion from uchar4 failed: index 0 got 4.28107e-38, expected 16.
+Conversion from uchar4 failed: index 1 got -1, expected 1.
+Conversion from uchar4 failed: index 2 got 18, expected 2.
+Conversion from uchar4 failed: index 3 got 1, expected 3.
+Conversion from short4 failed: index 0 got -19, expected -17.
+Conversion from short4 failed: index 2 got 20, expected 2.
+Conversion from short4 failed: index 3 got 1, expected -3.
+Conversion from ushort4 failed: index 0 got -23, expected 18.
+Conversion from ushort4 failed: index 1 got -1, expected 1.
+Conversion from ushort4 failed: index 2 got 0, expected 2.
+Conversion from ushort4 failed: index 3 got 0, expected 3.
+Conversion from int4 failed: index 0 got 0, expected -19.
+Conversion from int4 failed: index 1 got 0, expected -1.
+Conversion from int4 failed: index 2 got 0, expected 2.
+Conversion from int4 failed: index 3 got 0, expected -3.
+Conversion from uint4 failed: index 0 got 0, expected 20.
+Conversion from uint4 failed: index 1 got 0, expected 1.
+Conversion from uint4 failed: index 2 got 0, expected 2.
+Conversion from uint4 failed: index 3 got 0, expected 3.
+Conversion from float4 failed: index 0 got 0, expected -23.
+Conversion from float4 failed: index 1 got 0, expected -1.
+Conversion from float4 failed: index 2 got 0, expected 2.
+Conversion from float4 failed: index 3 got 0, expected -3.
+Testing vector size 8
+Kernel: __kernel void test_kernel(
+char8 c, uchar8 uc, short8 s, ushort8 us, int8 i, uint8 ui, float8 f,
+__global float8 *result)
+{
+ result[0] = convert_float8(c);
+ result[1] = convert_float8(uc);
+ result[2] = convert_float8(s);
+ result[3] = convert_float8(us);
+ result[4] = convert_float8(i);
+ result[5] = convert_float8(ui);
+ result[6] = f;
+}
+
+Conversion from char8 failed: index 0 got -5.99946e-08, expected 0.
+Conversion from char8 failed: index 2 got 16, expected 2.
+Conversion from char8 failed: index 3 got 1, expected -3.
+Conversion from char8 failed: index 4 got 4.28106e-38, expected 4.
+Conversion from char8 failed: index 5 got -1, expected -5.
+Conversion from char8 failed: index 6 got 18, expected 6.
+Conversion from char8 failed: index 7 got 1, expected -7.
+Conversion from uchar8 failed: index 0 got -19, expected 16.
+Conversion from uchar8 failed: index 1 got -1, expected 1.
+Conversion from uchar8 failed: index 2 got 20, expected 2.
+Conversion from uchar8 failed: index 3 got 1, expected 3.
+Conversion from uchar8 failed: index 4 got -5.99946e-08, expected 4.
+Conversion from uchar8 failed: index 5 got -1, expected 5.
+Conversion from uchar8 failed: index 6 got 0, expected 6.
+Conversion from uchar8 failed: index 7 got 0, expected 7.
+Conversion from short8 failed: index 0 got 0, expected -17.
+Conversion from short8 failed: index 1 got 0, expected -1.
+Conversion from short8 failed: index 2 got 0, expected 2.
+Conversion from short8 failed: index 3 got 0, expected -3.
+Conversion from short8 failed: index 4 got 0, expected 4.
+Conversion from short8 failed: index 5 got 0, expected -5.
+Conversion from short8 failed: index 6 got 0, expected 6.
+Conversion from short8 failed: index 7 got 0, expected -7.
+Conversion from ushort8 failed: index 0 got 0, expected 18.
+Conversion from ushort8 failed: index 1 got 0, expected 1.
+Conversion from ushort8 failed: index 2 got 0, expected 2.
+Conversion from ushort8 failed: index 3 got 0, expected 3.
+Conversion from ushort8 failed: index 4 got 0, expected 4.
+Conversion from ushort8 failed: index 5 got 0, expected 5.
+Conversion from ushort8 failed: index 6 got 0, expected 6.
+Conversion from ushort8 failed: index 7 got 0, expected 7.
+Conversion from int8 failed: index 0 got 0, expected -19.
+Conversion from int8 failed: index 1 got 0, expected -1.
+Conversion from int8 failed: index 2 got 0, expected 2.
+Conversion from int8 failed: index 3 got 0, expected -3.
+Conversion from int8 failed: index 4 got 0, expected 4.
+Conversion from int8 failed: index 5 got 0, expected -5.
+Conversion from int8 failed: index 6 got 0, expected 6.
+Conversion from int8 failed: index 7 got 0, expected -7.
+Conversion from uint8 failed: index 0 got 0, expected 20.
+Conversion from uint8 failed: index 1 got 0, expected 1.
+Conversion from uint8 failed: index 2 got 0, expected 2.
+Conversion from uint8 failed: index 3 got 0, expected 3.
+Conversion from uint8 failed: index 4 got 0, expected 4.
+Conversion from uint8 failed: index 5 got 0, expected 5.
+Conversion from uint8 failed: index 6 got 0, expected 6.
+Conversion from uint8 failed: index 7 got 0, expected 7.
+Conversion from float8 failed: index 0 got 0, expected -23.
+Conversion from float8 failed: index 1 got 0, expected -1.
+Conversion from float8 failed: index 2 got 0, expected 2.
+Conversion from float8 failed: index 3 got 0, expected -3.
+Conversion from float8 failed: index 4 got 0, expected 4.
+Conversion from float8 failed: index 5 got 0, expected -5.
+Conversion from float8 failed: index 6 got 0, expected 6.
+Conversion from float8 failed: index 7 got 0, expected -7.
+Testing vector size 16
+Kernel: __kernel void test_kernel(
+char16 c, uchar16 uc, short16 s, ushort16 us, int16 i, uint16 ui, float16 f,
+__global float16 *result)
+{
+ result[0] = convert_float16(c);
+ result[1] = convert_float16(uc);
+ result[2] = convert_float16(s);
+ result[3] = convert_float16(us);
+ result[4] = convert_float16(i);
+ result[5] = convert_float16(ui);
+ result[6] = f;
+}
+
+Conversion from char16 failed: index 0 got -7.22404e-06, expected 0.
+Conversion from char16 failed: index 2 got 16, expected 2.
+Conversion from char16 failed: index 3 got 1, expected -3.
+Conversion from char16 failed: index 4 got -3.96717e-07, expected 4.
+Conversion from char16 failed: index 5 got -1, expected -5.
+Conversion from char16 failed: index 6 got 18, expected 6.
+Conversion from char16 failed: index 7 got 1, expected -7.
+Conversion from char16 failed: index 8 got 0, expected 8.
+Conversion from char16 failed: index 9 got -1, expected -9.
+Conversion from char16 failed: index 10 got 20, expected 10.
+Conversion from char16 failed: index 11 got 1, expected -11.
+Conversion from char16 failed: index 12 got 4.28106e-38, expected 12.
+Conversion from char16 failed: index 13 got -1, expected -13.
+Conversion from char16 failed: index 14 got 0, expected 14.
+Conversion from char16 failed: index 15 got 0, expected -15.
+Conversion from uchar16 failed: index 0 got 0, expected 16.
+Conversion from uchar16 failed: index 1 got 0, expected 1.
+Conversion from uchar16 failed: index 2 got 0, expected 2.
+Conversion from uchar16 failed: index 3 got 0, expected 3.
+Conversion from uchar16 failed: index 4 got 0, expected 4.
+Conversion from uchar16 failed: index 5 got 0, expected 5.
+Conversion from uchar16 failed: index 6 got 0, expected 6.
+Conversion from uchar16 failed: index 7 got 0, expected 7.
+Conversion from uchar16 failed: index 8 got 0, expected 8.
+Conversion from uchar16 failed: index 9 got 0, expected 9.
+Conversion from uchar16 failed: index 10 got 0, expected 10.
+Conversion from uchar16 failed: index 11 got 0, expected 11.
+Conversion from uchar16 failed: index 12 got -3.96712e-07, expected 12.
+Conversion from uchar16 failed: index 13 got 0, expected 13.
+Conversion from uchar16 failed: index 14 got 0, expected 14.
+Conversion from uchar16 failed: index 15 got 0, expected 15.
+Conversion from short16 failed: index 0 got 0, expected -17.
+Conversion from short16 failed: index 1 got 0, expected -1.
+Conversion from short16 failed: index 2 got 0, expected 2.
+Conversion from short16 failed: index 3 got 0, expected -3.
+Conversion from short16 failed: index 4 got 0, expected 4.
+Conversion from short16 failed: index 5 got 0, expected -5.
+Conversion from short16 failed: index 6 got 0, expected 6.
+Conversion from short16 failed: index 7 got 0, expected -7.
+Conversion from short16 failed: index 8 got 0, expected 8.
+Conversion from short16 failed: index 9 got 0, expected -9.
+Conversion from short16 failed: index 10 got 0, expected 10.
+Conversion from short16 failed: index 11 got 0, expected -11.
+Conversion from short16 failed: index 12 got 0, expected 12.
+Conversion from short16 failed: index 13 got 0, expected -13.
+Conversion from short16 failed: index 14 got 0, expected 14.
+Conversion from short16 failed: index 15 got 0, expected -15.
+Conversion from ushort16 failed: index 0 got 0, expected 18.
+Conversion from ushort16 failed: index 1 got 0, expected 1.
+Conversion from ushort16 failed: index 2 got 0, expected 2.
+Conversion from ushort16 failed: index 3 got 0, expected 3.
+Conversion from ushort16 failed: index 4 got 0, expected 4.
+Conversion from ushort16 failed: index 5 got 0, expected 5.
+Conversion from ushort16 failed: index 6 got 0, expected 6.
+Conversion from ushort16 failed: index 7 got 0, expected 7.
+Conversion from ushort16 failed: index 8 got 0, expected 8.
+Conversion from ushort16 failed: index 9 got 0, expected 9.
+Conversion from ushort16 failed: index 10 got 0, expected 10.
+Conversion from ushort16 failed: index 11 got 0, expected 11.
+Conversion from ushort16 failed: index 12 got 0, expected 12.
+Conversion from ushort16 failed: index 13 got 0, expected 13.
+Conversion from ushort16 failed: index 14 got 0, expected 14.
+Conversion from ushort16 failed: index 15 got 0, expected 15.
+Conversion from int16 failed: index 0 got 0, expected -19.
+Conversion from int16 failed: index 1 got 0, expected -1.
+Conversion from int16 failed: index 2 got 0, expected 2.
+Conversion from int16 failed: index 3 got 0, expected -3.
+Conversion from int16 failed: index 4 got 0, expected 4.
+Conversion from int16 failed: index 5 got 0, expected -5.
+Conversion from int16 failed: index 6 got 0, expected 6.
+Conversion from int16 failed: index 7 got 0, expected -7.
+Conversion from int16 failed: index 8 got 0, expected 8.
+Conversion from int16 failed: index 9 got 0, expected -9.
+Conversion from int16 failed: index 10 got 0, expected 10.
+Conversion from int16 failed: index 11 got 0, expected -11.
+Conversion from int16 failed: index 12 got 0, expected 12.
+Conversion from int16 failed: index 13 got 0, expected -13.
+Conversion from int16 failed: index 14 got 0, expected 14.
+Conversion from int16 failed: index 15 got 0, expected -15.
+Conversion from uint16 failed: index 0 got 0, expected 20.
+Conversion from uint16 failed: index 1 got 0, expected 1.
+Conversion from uint16 failed: index 2 got 0, expected 2.
+Conversion from uint16 failed: index 3 got 0, expected 3.
+Conversion from uint16 failed: index 4 got 0, expected 4.
+Conversion from uint16 failed: index 5 got 0, expected 5.
+Conversion from uint16 failed: index 6 got 0, expected 6.
+Conversion from uint16 failed: index 7 got 0, expected 7.
+Conversion from uint16 failed: index 8 got 0, expected 8.
+Conversion from uint16 failed: index 9 got 0, expected 9.
+Conversion from uint16 failed: index 10 got 0, expected 10.
+Conversion from uint16 failed: index 11 got 0, expected 11.
+Conversion from uint16 failed: index 12 got 0, expected 12.
+Conversion from uint16 failed: index 13 got 0, expected 13.
+Conversion from uint16 failed: index 14 got 0, expected 14.
+Conversion from uint16 failed: index 15 got 0, expected 15.
+
+[ then crashes on next sub-tests]
+
+Analysis:
+--------
+Some rather intense debugging found the culprit: the float<n> * results
+output kernel vector argument was being *modified* by the MCJIT-generated
+ARM assembly kernel code! This was determined by gdb debugging
+via assembly into the JIT'd kernel, and also by inserting callbacks to
+builtin functions to inspect the results pointer argument at entry to and
+exit from the kernel function.
+
+After creating a simplified test case using lli, was able to reproduce the error
+and fix the issue by modifying the intermediate IR of the test case.
+
+However, the same modifications translated into shamrock did not resolve the issue there.
+
+This issue may be the cause of many of the other basic test failures which
+involve vector parameters used in JIT'd ARM kernels.
+
+These other tests fail due to unexpected results being returned from the JIT'ed
+kernels on ARM:
+
+local_kernel_scope
+explicit_s2v_<type>
+fpmath_float4
+intmath_int4
+intmath_long2
+
+TODO:
+=====
+kernel_memory_alignment_local - clSetKernelArg failed.
+vload_local - clSetKernelArg failed
+vstore_local - clSetKernelArg failed
+local_arg_def - clCreateBuffer failed.
+
diff --git a/tests/test_builtins.cpp b/tests/test_builtins.cpp
new file mode 100644
index 0000000..9a6d651
--- /dev/null
+++ b/tests/test_builtins.cpp
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <iostream>
+#include <cstdlib>
+
+#include "test_builtins.h"
+#include "CL/cl.h"
+
+#include <stdint.h>
+
+/*
+ * Sampler test kernel: stores 1 in *rs when the sampler passed as argument 1
+ * differs from a sampler declared in the kernel with normalized coordinates,
+ * mirrored-repeat addressing and nearest filtering. *rs is left untouched
+ * otherwise.
+ */
+const char sampler_source[] =
+ "__kernel void test_case(__global uint *rs, sampler_t sampler) {\n"
+ " sampler_t good_sampler = CLK_NORMALIZED_COORDS_TRUE |\n"
+ " CLK_ADDRESS_MIRRORED_REPEAT |\n"
+ " CLK_FILTER_NEAREST;\n"
+ "\n"
+ " if (sampler != good_sampler) *rs = 1;\n"
+ "}\n";
+
+/*
+ * Barrier test kernel: each work-item zeroes *rs, calls barrier(0) three
+ * times in a loop and then adds 1 to *rs.
+ */
+const char barrier_source[] =
+ "__kernel void test_case(__global uint *rs) {\n"
+ " *rs = 0;\n"
+ " int i; for (i=0; i<3; i++) barrier(0);\n"
+ " *rs += 1;\n"
+ "}\n";
+
+/*
+ * Image test kernel. Failures are reported through *rs:
+ *   1 - image1 is not 4 texels wide
+ *   2 - image1 is not 4 texels high
+ *   3 - image2's channel data type is not CLK_SIGNED_INT16
+ *   4 - image2's channel order is not CLK_RGBA
+ *   5 - the texel read from image3 at (1, 1) fails the range check
+ * After the format checks the kernel writes one texel to image1 (float) and
+ * one to image2 (int) at (3, 1). The final sampled read from image3 at
+ * normalized coordinates is performed but its result is not checked.
+ */
+const char image_source[] =
+ "__kernel void test_case(__global uint *rs, __write_only image2d_t image1,\n"
+ " __write_only image2d_t image2,\n"
+ " __read_only image2d_t image3) {\n"
+ " float4 fcolor;\n"
+ " int4 scolor;\n"
+ " int2 coord;\n"
+ " sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |\n"
+ " CLK_ADDRESS_MIRRORED_REPEAT |\n"
+ " CLK_FILTER_NEAREST;\n"
+ "\n"
+ " if (get_image_width(image1) != 4) *rs = 1;\n"
+ " if (get_image_height(image1) != 4) *rs = 2;\n"
+ " if (get_image_channel_data_type(image2) != CLK_SIGNED_INT16) *rs = 3;\n"
+ " if (get_image_channel_order(image2) != CLK_RGBA) *rs = 4;\n"
+ "\n"
+ " if (*rs != 0) return;\n"
+ "\n"
+ " fcolor.x = 1.0f;\n"
+ " fcolor.y = 0.5f;\n"
+ " fcolor.z = 0.0f;\n"
+ " fcolor.w = 1.0f;\n"
+ "\n"
+ " scolor.x = -3057;\n"
+ " scolor.y = 65;\n"
+ " scolor.z = 0;\n"
+ " scolor.w = 32767;\n"
+ "\n"
+ " coord.x = 3;\n"
+ " coord.y = 1;\n"
+ "\n"
+ " write_imagef(image1, coord, fcolor);\n"
+ " write_imagei(image2, coord, scolor);\n"
+ "\n"
+ " coord.x = 1;\n"
+ " coord.y = 1;\n"
+ " fcolor = read_imagef(image3, 0, coord);\n"
+ " if (fcolor.x < 0.99f || fcolor.y < 0.99f || fcolor.z > 0.01f ||\n"
+ " fcolor.w > 0.01f) { *rs = 5; return; }\n"
+ "\n"
+ " float2 fcoords;\n"
+ " fcoords.x = 0.31f;\n"
+ " fcoords.y = 0.1415f;\n"
+ " fcolor = read_imagef(image3, sampler, fcoords);\n"
+ "}\n";
+
+/*
+ * Math built-ins test kernel. Stores the number of the first failing check
+ * in *rs and returns:
+ *   1 - cos(float2), .y component for input 0.0f
+ *   2 - cos(float) for input 0.0f
+ *   3 - copysign(float, float)
+ *   4 - copysign(float2, float2), .x component
+ *   5 - exp2(3.0f)
+ * *rs is left untouched when every check passes.
+ */
+const char builtins_source[] =
+ "__kernel void test_case(__global uint *rs) {\n"
+ " float2 f2;\n"
+ " float2 f2b;\n"
+ "\n"
+ " f2.x = 1.0f;\n"
+ " f2.y = 0.0f;\n"
+ " f2b.x = -0.5f;\n"
+ " f2b.y = (float)M_PI;\n"
+ "\n"
+ " if (cos(f2).y != 1.0f) { *rs = 1; return; }\n"
+ " if (cos(0.0f) != 1.0f) { *rs = 2; return; }\n"
+ " if (copysign(1.0f, -0.5f) != -1.0f) { *rs = 3; return; }\n"
+ " if (copysign(f2, f2b).x != -1.0f) { *rs = 4; return; }\n"
+ " if (exp2(3.0f) != 8.0f) { *rs = 5; return; }\n"
+ "}\n";
+
+/* Selects the extra kernel arguments and the launch method used by run_kernel(). */
+enum TestCaseKind
+{
+ NormalKind, // result pointer only, launched as a task
+ SamplerKind, // adds a sampler as argument 1, launched as a task
+ BarrierKind, // result pointer only, launched as a 64-work-item NDRange
+ ImageKind // adds three 2D images as arguments 1-3, launched as a task
+};
+
+/*
+ * Builds `source`, runs its "test_case" kernel once on the default device and
+ * returns the host integer that backs the kernel's first argument.
+ *
+ * To ease testing, every kernel takes a pointer to a 32-bit unsigned integer
+ * as argument 0. The integer starts at 0 on the host and the kernel stores a
+ * non-zero value in it to report a failure. `kind` selects the extra kernel
+ * arguments and how the kernel is launched:
+ *   - NormalKind:  no extra argument, clEnqueueTask;
+ *   - SamplerKind: a sampler as argument 1, clEnqueueTask;
+ *   - ImageKind:   three 2D images as arguments 1-3, clEnqueueTask;
+ *   - BarrierKind: no extra argument, 1D NDRange of 64 work-items with a
+ *                  local size of 64.
+ *
+ * Returns the value written by the kernel, or one of the host-side setup
+ * codes 65536..65549 when an OpenCL call fails (see default_error()).
+ *
+ * NOTE(review): the early error returns do not release the OpenCL objects
+ * created before the failing call.
+ */
+static uint32_t run_kernel(const char *source, TestCaseKind kind)
+{
+    cl_platform_id platform = 0;
+    cl_device_id device;
+    cl_context ctx;
+    cl_command_queue queue;
+    cl_program program;
+    cl_int result;
+    cl_kernel kernel;
+    cl_event event;
+    cl_mem rs_buf;
+
+    cl_sampler sampler;
+    cl_mem mem1, mem2, mem3;
+    cl_image_format fmt;
+
+    // 3x3 RGBA source image (one byte per channel) copied into image3
+    unsigned char image2d_data[3*3*4] = {
+        255, 0, 0, 0,       0, 255, 0, 0,       128, 128, 128, 0,
+        0, 0, 255, 0,       255, 255, 0, 0,     0, 128, 0, 0,
+        255, 128, 0, 0,     128, 0, 255, 0,     0, 0, 0, 0
+    };
+
+    uint32_t rs = 0;
+
+    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
+    if (result != CL_SUCCESS) return 65536;
+
+    ctx = clCreateContext(0, 1, &device, 0, 0, &result);
+    if (result != CL_SUCCESS) return 65537;
+
+    queue = clCreateCommandQueue(ctx, device, 0, &result);
+    if (result != CL_SUCCESS) return 65538;
+
+    program = clCreateProgramWithSource(ctx, 1, &source, 0, &result);
+    if (result != CL_SUCCESS) return 65539;
+
+    result = clBuildProgram(program, 1, &device, "", 0, 0);
+    if (result != CL_SUCCESS)
+    {
+        // Print log
+        size_t len = 0;
+
+        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, 0, &len);
+
+        // Only stream a log that was really fetched: malloc() may return a
+        // null pointer, and a buffer the query did not fill is not a valid
+        // C string.
+        char *log = (len > 0) ? (char *)std::malloc(len) : 0;
+
+        if (log && clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
+                                         len, log, 0) == CL_SUCCESS)
+        {
+            std::cout << log << std::endl;
+        }
+        std::free(log);
+
+        return 65540;
+    }
+
+    kernel = clCreateKernel(program, "test_case", &result);
+    if (result != CL_SUCCESS) return 65541;
+
+    // Create the result buffer; it uses `rs` itself as its storage
+    rs_buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+                            sizeof(rs), &rs, &result);
+    if (result != CL_SUCCESS) return 65542;
+
+    result = clSetKernelArg(kernel, 0, sizeof(cl_mem), &rs_buf);
+    if (result != CL_SUCCESS) return 65543;
+
+    // Kind-specific extra arguments
+    switch (kind)
+    {
+        case NormalKind:
+            break;
+
+        case SamplerKind:
+            // Must match the sampler declared inside sampler_source
+            sampler = clCreateSampler(ctx, 1, CL_ADDRESS_MIRRORED_REPEAT, CL_FILTER_NEAREST, &result);
+            if (result != CL_SUCCESS) return 65546;
+
+            result = clSetKernelArg(kernel, 1, sizeof(cl_sampler), &sampler);
+            if (result != CL_SUCCESS) return 65547;
+            break;
+
+        case ImageKind:
+            fmt.image_channel_data_type = CL_UNORM_INT8;
+            fmt.image_channel_order = CL_RGBA;
+
+            // image1: 4x4 write-only, RGBA / UNORM_INT8
+            mem1 = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &fmt, 4, 4, 0, 0, &result);
+            if (result != CL_SUCCESS) return 65548;
+
+            // image3: 3x3 read-only, initialised from image2d_data
+            mem3 = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                   &fmt, 3, 3, 0, image2d_data, &result);
+            if (result != CL_SUCCESS) return 65548;
+
+            fmt.image_channel_data_type = CL_SIGNED_INT16;
+
+            // image2: 4x4 write-only, RGBA / SIGNED_INT16
+            mem2 = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &fmt, 4, 4, 0, 0, &result);
+            if (result != CL_SUCCESS) return 65548;
+
+            result = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem1);
+            if (result != CL_SUCCESS) return 65549;
+
+            result = clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem2);
+            if (result != CL_SUCCESS) return 65549;
+
+            result = clSetKernelArg(kernel, 3, sizeof(cl_mem), &mem3);
+            if (result != CL_SUCCESS) return 65549;
+            break;
+
+        default:
+            break;
+    }
+
+    if (kind == BarrierKind)
+    {
+        // The barrier kernel needs several work-items: one group of 64
+        size_t local_size = 64;
+        size_t global_size = 64;
+
+        result = clEnqueueNDRangeKernel(queue, kernel, 1, 0, &global_size,
+                                        &local_size, 0, 0, &event);
+        if (result != CL_SUCCESS) return 65544;
+    }
+    else
+    {
+        result = clEnqueueTask(queue, kernel, 0, 0, &event);
+        if (result != CL_SUCCESS) return 65544;
+    }
+
+    result = clWaitForEvents(1, &event);
+    if (result != CL_SUCCESS) return 65545;
+
+    if (kind == SamplerKind) clReleaseSampler(sampler);
+    if (kind == ImageKind)
+    {
+        clReleaseMemObject(mem1);
+        clReleaseMemObject(mem2);
+        clReleaseMemObject(mem3);
+    }
+    clReleaseEvent(event);
+    clReleaseMemObject(rs_buf);
+    clReleaseKernel(kernel);
+    clReleaseProgram(program);
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return rs;
+}
+
+/*
+ * Translates a host-side setup code returned by run_kernel() into a message.
+ *
+ * Returns a null pointer for 0 (no error), the matching description for the
+ * codes 65536..65549 and "Unknown error code" for every other value.
+ */
+static const char *default_error(uint32_t errcode)
+{
+    // Descriptions of the contiguous codes 65536..65549, indexed by
+    // (code - 65536).
+    static const char *const messages[] = {
+        "Cannot get a device ID",
+        "Cannot create a context",
+        "Cannot create a command queue",
+        "Cannot create a program with given source",
+        "Cannot build the program",
+        "Cannot create the test_case kernel",
+        "Cannot create a buffer holding a uint32_t",
+        "Cannot set kernel argument",
+        "Cannot enqueue the kernel",
+        "Cannot wait for the event",
+        "Cannot create a sampler",
+        "Cannot set a sampler kernel argument",
+        "Cannot create an Image2D object",
+        "Cannot set image kernel argument"
+    };
+    const uint32_t first_code = 65536;
+    const uint32_t code_count = sizeof(messages) / sizeof(messages[0]);
+
+    if (errcode == 0)
+        return 0;
+
+    if (errcode >= first_code && errcode - first_code < code_count)
+        return messages[errcode - first_code];
+
+    return "Unknown error code";
+}
+
+START_TEST (test_sampler)
+{
+    // The kernel stores 1 when the sampler it receives differs from the one
+    // it declares; any other value is interpreted by default_error().
+    const uint32_t rs = run_kernel(sampler_source, SamplerKind);
+    const char *errstr = (rs == 1) ? "Sampler bitfield invalid"
+                                   : default_error(rs);
+
+    fail_if(
+        errstr != 0,
+        errstr
+    );
+}
+END_TEST
+
+START_TEST (test_barrier)
+{
+    // The test passes only when run_kernel() returns 0x40 (64, the number of
+    // work-items the barrier kernel is launched with).
+    const uint32_t rs = run_kernel(barrier_source, BarrierKind);
+    const char *errstr = default_error(rs);
+
+    fail_if(rs != 0x40, errstr);
+}
+END_TEST
+
+START_TEST (test_image)
+{
+    uint32_t rs = run_kernel(image_source, ImageKind);
+    const char *errstr = 0;
+
+    // Codes 1-5 are stored by the image_source kernel; each message names
+    // the check that sets the code. Everything else is a host-side setup
+    // code (or 0 for success) handled by default_error().
+    switch (rs)
+    {
+        case 1:
+            errstr = "Image1 must have width of 4";
+            break;
+        case 2:
+            // set when get_image_height(image1) != 4
+            errstr = "Image1 must have height of 4";
+            break;
+        case 3:
+            // set when the channel data type is not CLK_SIGNED_INT16
+            errstr = "Image2 must have type SIGNED_INT16";
+            break;
+        case 4:
+            errstr = "Image2 must have channel order RGBA";
+            break;
+        case 5:
+            errstr = "The value read from the image is not good";
+            break;
+        default:
+            errstr = default_error(rs);
+    }
+
+    fail_if(
+        errstr != 0,
+        errstr
+    );
+}
+END_TEST
+
+START_TEST (test_builtins)
+{
+    // Messages for the codes 1..5 stored by the builtins_source kernel,
+    // indexed by (code - 1).
+    static const char *const kernel_errors[] = {
+        "float2 cos(float2) doesn't behave correctly",
+        "float cos(float) doesn't behave correctly",
+        "float copysign(float) doesn't behave correctly",
+        "float2 copysign(float2) doesn't behave correctly",
+        "exp2() doesn't behave correctly"
+    };
+
+    const uint32_t rs = run_kernel(builtins_source, NormalKind);
+    const char *errstr;
+
+    if (rs >= 1 && rs <= 5)
+        errstr = kernel_errors[rs - 1];
+    else
+        errstr = default_error(rs);   // 0 or a host-side setup code
+
+    fail_if(
+        errstr != 0,
+        errstr
+    );
+}
+END_TEST
+
+/*
+ * Creates the Check test case named "builtins" and registers the barrier and
+ * math built-in tests on it.
+ */
+TCase *cl_builtins_tcase_create(void)
+{
+    TCase *const builtins_case = tcase_create("builtins");
+
+    // test_sampler and test_image are defined in this file but are not
+    // registered at the moment.
+    //tcase_add_test(builtins_case, test_sampler);
+    tcase_add_test(builtins_case, test_barrier);
+    //tcase_add_test(builtins_case, test_image);
+    tcase_add_test(builtins_case, test_builtins);
+
+    return builtins_case;
+}
diff --git a/tests/test_builtins.h b/tests/test_builtins.h
new file mode 100644
index 0000000..a9480d5
--- /dev/null
+++ b/tests/test_builtins.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __UTEST_BUILTINS__
+#define __UTEST_BUILTINS__
+
+#include <check.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Returns a newly created Check test case named "builtins" (see test_builtins.cpp). */
+TCase *cl_builtins_tcase_create(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/tests/test_commandqueue.cpp b/tests/test_commandqueue.cpp
new file mode 100644
index 0000000..0119ec5
--- /dev/null
+++ b/tests/test_commandqueue.cpp
@@ -0,0 +1,1027 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <cstring>
+#include <cstdlib>
+#include <iostream>
+
+#include "test_commandqueue.h"
+#include "CL/cl.h"
+
+#include <unistd.h>
+
+START_TEST (test_create_command_queue)
+{
+ // Exercises clCreateCommandQueue() argument validation, then the success path.
+ cl_platform_id plat = NULL;
+ cl_uint plat_count = 0;
+ cl_device_id dev;
+ cl_context context;
+ cl_command_queue cq;
+ cl_int err;
+
+ clGetPlatformIDs(1, &plat, &plat_count);
+ err = clGetDeviceIDs(plat, CL_DEVICE_TYPE_DEFAULT, 1, &dev, NULL);
+ fail_if(
+ err != CL_SUCCESS,
+ "unable to get the default device"
+ );
+
+ context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err);
+ fail_if(
+ !(err == CL_SUCCESS && context != NULL),
+ "unable to create a valid context"
+ );
+ // Missing context
+ cq = clCreateCommandQueue(NULL, dev, 0, &err);
+ fail_if(
+ err != CL_INVALID_CONTEXT,
+ "context must be valid"
+ );
+ // Missing device
+ cq = clCreateCommandQueue(context, NULL, 0, &err);
+ fail_if(
+ err != CL_INVALID_DEVICE,
+ "device cannot be NULL"
+ );
+ // Bogus properties bitfield
+ cq = clCreateCommandQueue(context, dev, 1337, &err);
+ fail_if(
+ err != CL_INVALID_VALUE,
+ "1337 is not a valid value for properties"
+ );
+ // All arguments valid
+ cq = clCreateCommandQueue(context, dev, 0, &err);
+ fail_if(
+ !(err == CL_SUCCESS && cq != NULL),
+ "cannot create a command queue"
+ );
+
+ clReleaseCommandQueue(cq);
+ clReleaseContext(context);
+}
+END_TEST
+
+START_TEST (test_get_command_queue_info)
+{
+ // Queries each clGetCommandQueueInfo() field of a freshly created queue.
+ cl_platform_id plat = NULL;
+ cl_uint plat_count = 0;
+ cl_device_id dev;
+ cl_context context;
+ cl_command_queue cq;
+ cl_int err;
+
+ union {
+ cl_context as_context;
+ cl_device_id as_device;
+ cl_uint as_refcount;
+ cl_command_queue_properties as_properties;
+ } out;
+
+ clGetPlatformIDs(1, &plat, &plat_count);
+ err = clGetDeviceIDs(plat, CL_DEVICE_TYPE_DEFAULT, 1, &dev, NULL);
+ fail_if(
+ err != CL_SUCCESS,
+ "unable to get the default device"
+ );
+
+ context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err);
+ fail_if(
+ !(err == CL_SUCCESS && context != NULL),
+ "unable to create a valid context"
+ );
+
+ cq = clCreateCommandQueue(context, dev, 0, &err);
+ fail_if(
+ !(err == CL_SUCCESS && cq != NULL),
+ "cannot create a command queue"
+ );
+ // Context the queue was created in
+ err = clGetCommandQueueInfo(cq, CL_QUEUE_CONTEXT, sizeof(out.as_context),
+ &out, NULL);
+ fail_if(
+ !(err == CL_SUCCESS && out.as_context == context),
+ "the queue doesn't retain its context"
+ );
+ // Device the queue was created for
+ err = clGetCommandQueueInfo(cq, CL_QUEUE_DEVICE, sizeof(out.as_device),
+ &out, NULL);
+ fail_if(
+ !(err == CL_SUCCESS && out.as_device == dev),
+ "the queue doesn't retain its device"
+ );
+ // Reference count right after creation
+ err = clGetCommandQueueInfo(cq, CL_QUEUE_REFERENCE_COUNT, sizeof(out.as_refcount),
+ &out, NULL);
+ fail_if(
+ !(err == CL_SUCCESS && out.as_refcount == 1),
+ "the queue must have a refcount of 1 when it's created"
+ );
+ // Properties bitfield passed at creation (0)
+ err = clGetCommandQueueInfo(cq, CL_QUEUE_PROPERTIES, sizeof(out.as_properties),
+ &out, NULL);
+ fail_if(
+ !(err == CL_SUCCESS && out.as_properties == 0),
+ "we gave no properties to the command queue"
+ );
+
+ clReleaseCommandQueue(cq);
+ clReleaseContext(context);
+}
+END_TEST
+
+START_TEST (test_object_tree)
+{
+ // Checks the context reference count while a queue exists and after its release.
+ cl_platform_id plat = NULL;
+ cl_uint plat_count = 0;
+ cl_device_id dev;
+ cl_context context;
+ cl_command_queue cq;
+ cl_int err;
+ cl_uint ctx_refs;
+
+ clGetPlatformIDs(1, &plat, &plat_count);
+ err = clGetDeviceIDs(plat, CL_DEVICE_TYPE_DEFAULT, 1, &dev, NULL);
+ fail_if(
+ err != CL_SUCCESS,
+ "unable to get the default device"
+ );
+
+ context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err);
+ fail_if(
+ !(err == CL_SUCCESS && context != NULL),
+ "unable to create a valid context"
+ );
+
+ cq = clCreateCommandQueue(context, dev, 0, &err);
+ fail_if(
+ !(err == CL_SUCCESS && cq != NULL),
+ "cannot create a command queue"
+ );
+ // With the queue alive the context is expected to be referenced twice
+ err = clGetContextInfo(context, CL_CONTEXT_REFERENCE_COUNT, sizeof(ctx_refs),
+ &ctx_refs, NULL);
+ fail_if(
+ !(err == CL_SUCCESS && ctx_refs == 2),
+ "the queue must increment the refcount of its context"
+ );
+
+ clReleaseCommandQueue(cq);
+ // Once the queue is released only the test's own reference is expected
+ err = clGetContextInfo(context, CL_CONTEXT_REFERENCE_COUNT, sizeof(ctx_refs),
+ &ctx_refs, NULL);
+ fail_if(
+ !(err == CL_SUCCESS && ctx_refs == 1),
+ "the queue must decrement the refcount of its context when it's destroyed"
+ );
+
+ clReleaseContext(context);
+}
+END_TEST
+
+static void event_notify(cl_event event, cl_int exec_status, void *user_data)
+{
+ // Event callback: user_data points at an unsigned char flag, which is set to 1.
+ unsigned char *flag = static_cast<unsigned char *>(user_data);
+ *flag = 1;
+}
+
+START_TEST (test_events) // user event gating a queued write, completion callback, map/unmap, profiling
+{
+ cl_device_id device;
+ cl_context ctx;
+ cl_command_queue queue;
+ cl_int result;
+ cl_event user_event, write_event;
+ cl_mem buf;
+
+ cl_platform_id platform = 0;
+ cl_uint num_platforms = 0;
+ clGetPlatformIDs(1, &platform, &num_platforms);
+
+ char s[] = "Original content"; // host memory backing the buffer (CL_MEM_USE_HOST_PTR)
+ unsigned char good = 0; // set to 1 by event_notify
+
+ result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to get the default device"
+ );
+
+ ctx = clCreateContext(0, 1, &device, 0, 0, &result);
+ fail_if(
+ result != CL_SUCCESS || ctx == 0,
+ "unable to create a valid context"
+ );
+ // Profiling is enabled so the timing queries at the end of the test are valid
+ queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &result);
+ fail_if(
+ result != CL_SUCCESS || queue == 0,
+ "cannot create a command queue"
+ );
+
+ user_event = clCreateUserEvent(0, &result);
+ fail_if(
+ result != CL_INVALID_CONTEXT,
+ "0 is not a valid context"
+ );
+
+ user_event = clCreateUserEvent(ctx, &result);
+ fail_if(
+ result != CL_SUCCESS || user_event == 0,
+ "cannot create an user event"
+ );
+
+ buf = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(s), s, &result);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot create a valid CL_MEM_USE_HOST_PTR read-write buffer"
+ );
+
+ // Queue a non-blocking 8-byte write that waits on user_event
+ result = clEnqueueWriteBuffer(queue, buf, 0, 0, 8, "Modified", 1,
+ &user_event, &write_event);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot enqueue an asynchronous write buffer command"
+ );
+
+ result = clSetEventCallback(write_event, CL_SUBMITTED, &event_notify, &good);
+ fail_if(
+ result != CL_INVALID_VALUE,
+ "callback_type must be CL_COMPLETE in OpenCL 1.1"
+ );
+ // The return value is an error code, so compare it with CL_SUCCESS (not the CL_COMPLETE status)
+ result = clSetEventCallback(write_event, CL_COMPLETE, &event_notify, &good);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot register an event callback"
+ );
+
+ sleep(1); // Give the worker threads a chance to do faulty things
+
+ fail_if(
+ good != 0 || strncmp(s, "Original content", sizeof(s)),
+ "at this time, nothing can have happened, the user event isn't complete"
+ );
+
+ // Now we can execute everything
+ result = clSetUserEventStatus(write_event, CL_COMPLETE);
+ fail_if(
+ result != CL_INVALID_EVENT,
+ "write_event is not an user event"
+ );
+
+ result = clSetUserEventStatus(user_event, CL_SUBMITTED);
+ fail_if(
+ result != CL_INVALID_VALUE,
+ "the execution status must be CL_COMPLETE"
+ );
+
+ result = clSetUserEventStatus(user_event, CL_COMPLETE);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot set the user event as completed"
+ );
+
+ // And wait (TODO: More careful checks of this function)
+ result = clWaitForEvents(1, &write_event);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot wait for events"
+ );
+
+ // Checks that all went good
+ fail_if(
+ good != 1,
+ "the callback function must be called when an event is completed"
+ );
+ fail_if(
+ strncmp(s, "Modified content", sizeof(s)),
+ "the buffer must contain \"Modified content\""
+ );
+
+ result = clSetUserEventStatus(user_event, CL_COMPLETE);
+ fail_if(
+ result != CL_INVALID_OPERATION,
+ "we cannot call clSetUserEventStatus two times for an event"
+ );
+
+ // Queue a blocking map of the whole buffer for reading
+ char *data;
+
+ data = (char *) clEnqueueMapBuffer(queue, buf, 1, CL_MAP_READ, 0, sizeof(s),
+ 0, 0, 0, &result);
+ fail_if(
+ result != CL_SUCCESS || !data || strncmp(data, s, sizeof(s)),
+ "unable to map a buffer containing what the buffer contains"
+ );
+
+ result = clEnqueueUnmapMemObject(queue, buf, data, 0, 0, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to unmap a mapped buffer"
+ );
+
+ // Get timing information about the event
+ cl_ulong timing_queued, timing_submit, timing_start, timing_end;
+
+ result = clGetEventProfilingInfo(write_event, CL_PROFILING_COMMAND_QUEUED,
+ sizeof(cl_ulong), &timing_queued, 0);
+ result |= clGetEventProfilingInfo(write_event, CL_PROFILING_COMMAND_SUBMIT,
+ sizeof(cl_ulong), &timing_submit, 0);
+ result |= clGetEventProfilingInfo(write_event, CL_PROFILING_COMMAND_START,
+ sizeof(cl_ulong), &timing_start, 0);
+ result |= clGetEventProfilingInfo(write_event, CL_PROFILING_COMMAND_END,
+ sizeof(cl_ulong), &timing_end, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to get timing information about a profiling-enabled event"
+ );
+ fail_if(
+ !(timing_queued <= timing_submit &&
+ timing_submit <= timing_start &&
+ timing_start <= timing_end),
+ "something went wrong with the timings : they are unordered"
+ );
+
+ clReleaseEvent(write_event);
+ clReleaseEvent(user_event);
+ clReleaseMemObject(buf);
+ clReleaseCommandQueue(queue);
+ clReleaseContext(ctx);
+}
+END_TEST
+
+START_TEST (test_read_write_rect) // write/read/copy of 3D rectangles between host arrays and buffers
+{
+ cl_device_id device;
+ cl_context ctx;
+ cl_command_queue queue;
+ cl_int result;
+ cl_mem buf, buf_part;
+
+ cl_platform_id platform = 0;
+ cl_uint num_platforms = 0;
+ clGetPlatformIDs(1, &platform, &num_platforms);
+
+ // Grid xyz = (5 x 7 x 2)
+ unsigned char grid[70] = {
+ 0, 0, 0, 0, 0,
+ 0, 1, 1, 1, 0,
+ 1, 2, 2, 2, 1,
+ 1, 2, 3, 2, 1,
+ 1, 2, 2, 2, 1,
+ 0, 1, 1, 1, 0,
+ 0, 0, 0, 0, 0,
+
+ 0, 0, 1, 0, 0,
+ 0, 0, 2, 0, 0,
+ 0, 1, 3, 1, 0,
+ 0, 2, 3, 2, 0,
+ 1, 3, 3, 3, 1,
+ 2, 3, 3, 3, 2,
+ 3, 3, 3, 3, 3
+ };
+
+ // Expected sub-block of the grid : 3 x 3 x 2 starting at xyz = (1, 2, 0)
+ unsigned char part[18] = {
+ 2, 2, 2,
+ 2, 3, 2,
+ 2, 2, 2,
+
+ 1, 3, 1,
+ 2, 3, 2,
+ 3, 3, 3
+ };
+
+ unsigned char buffer[70], buffer_part[18]; // host storage behind buf and buf_part
+ size_t host_origin[3] = {0, 0, 0};
+ size_t buf_origin[3] = {0, 0, 0};
+ size_t region[3] = {5, 7, 2};
+
+ result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to get the default device"
+ );
+
+ ctx = clCreateContext(0, 1, &device, 0, 0, &result);
+ fail_if(
+ result != CL_SUCCESS || ctx == 0,
+ "unable to create a valid context"
+ );
+
+ queue = clCreateCommandQueue(ctx, device, 0, &result);
+ fail_if(
+ result != CL_SUCCESS || queue == 0,
+ "cannot create a command queue"
+ );
+
+ buf = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(buffer), buffer, &result);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot create a valid CL_MEM_USE_HOST_PTR read-write buffer"
+ );
+
+ buf_part = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(buffer_part), buffer_part, &result);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot create a buffer for the part that will be read"
+ );
+
+ // Write grid into buffer (blocking, all pitches passed as 0)
+ result = clEnqueueWriteBufferRect(queue, buf, 1, buf_origin, host_origin,
+ region, 0, 0, 0, 0, grid, 0, 0, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot enqueue a blocking write buffer rect event with pitches guessed"
+ );
+ fail_if(
+ std::memcmp(buffer, grid, sizeof(buffer)) != 0,
+ "buffer doesn't contain the data"
+ );
+
+ // Read it back into a temporary region
+ buf_origin[0] = 1;
+ buf_origin[1] = 2;
+ buf_origin[2] = 0;
+ // host_origin remains (0, 0, 0)
+ region[0] = 3;
+ region[1] = 3;
+ region[2] = 2;
+ // Buffer pitches are explicit (5 per row, 5*7 per slice); host pitches are passed as 0
+ result = clEnqueueReadBufferRect(queue, buf, 1, buf_origin, host_origin,
+ region, 5, 5*7, 0, 0, buffer_part, 0, 0, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to queue a blocking read buffer rect event with host pitches guessed"
+ );
+ fail_if(
+ std::memcmp(buffer_part, part, sizeof(part)) != 0,
+ "the part of the buffer was not correctly read"
+ );
+
+ // Clear the temporary region and re-read into it using buf_part
+ std::memset(buffer_part, 0, sizeof(buffer_part));
+ cl_event event;
+
+ result = clEnqueueCopyBufferRect(queue, buf, buf_part, buf_origin,
+ host_origin, region, 5, 5*7, 0, 0, 0, 0, &event);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to queue a copy buffer rect event"
+ );
+
+ result = clWaitForEvents(1, &event);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to wait for the event"
+ );
+
+ fail_if(
+ std::memcmp(buffer_part, part, sizeof(part)) != 0,
+ "the part of the buffer was not correctly read using a buffer"
+ );
+
+ clReleaseEvent(event);
+ clReleaseMemObject(buf_part);
+ clReleaseMemObject(buf);
+ clReleaseCommandQueue(queue);
+ clReleaseContext(ctx);
+}
+END_TEST
+
+START_TEST (test_copy_buffer)
+{
+ // Copies one host-backed buffer into another and compares the host arrays.
+ char payload[] = "This is the data.";
+ char target[] = "Overwrite this...";
+
+ cl_platform_id plat = NULL;
+ cl_uint plat_count = 0;
+ cl_device_id dev;
+ cl_context context;
+ cl_command_queue cq;
+ cl_mem from_mem, to_mem;
+ cl_event copy_done;
+ cl_int err;
+ clGetPlatformIDs(1, &plat, &plat_count);
+
+ err = clGetDeviceIDs(plat, CL_DEVICE_TYPE_DEFAULT, 1, &dev, NULL);
+ fail_if(
+ err != CL_SUCCESS,
+ "unable to get the default device"
+ );
+
+ context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err);
+ fail_if(
+ !(err == CL_SUCCESS && context != NULL),
+ "unable to create a valid context"
+ );
+
+ cq = clCreateCommandQueue(context, dev, 0, &err);
+ fail_if(
+ !(err == CL_SUCCESS && cq != NULL),
+ "cannot create a command queue"
+ );
+ // Both buffers use the host arrays above as their storage
+ from_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(payload), payload, &err);
+ fail_if(
+ err != CL_SUCCESS,
+ "cannot create the source buffer"
+ );
+
+ to_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(target), target, &err);
+ fail_if(
+ err != CL_SUCCESS,
+ "cannot create the destination buffer"
+ );
+ // Whole-buffer copy, offsets 0 on both sides
+ err = clEnqueueCopyBuffer(cq, from_mem, to_mem, 0, 0, sizeof(payload),
+ 0, NULL, &copy_done);
+ fail_if(
+ err != CL_SUCCESS,
+ "unable to queue a copy buffer event"
+ );
+
+ err = clWaitForEvents(1, &copy_done);
+ fail_if(
+ err != CL_SUCCESS,
+ "unable to wait for the event"
+ );
+ // The destination host array must now match the source byte for byte
+ fail_if(
+ std::memcmp(target, payload, sizeof(payload)) != 0,
+ "the buffer wasn't copied"
+ );
+
+ clReleaseEvent(copy_done);
+ clReleaseMemObject(from_mem);
+ clReleaseMemObject(to_mem);
+ clReleaseCommandQueue(cq);
+ clReleaseContext(context);
+}
+END_TEST
+
+START_TEST (test_read_write_image) // write, read and copy of 2D RGBA images; data comparisons are compiled out
+{
+ cl_device_id device;
+ cl_context ctx;
+ cl_command_queue queue;
+ cl_mem image2d, part2d;
+ cl_int result;
+
+ cl_platform_id platform = 0;
+ cl_uint num_platforms = 0;
+ clGetPlatformIDs(1, &platform, &num_platforms);
+ // 3x3 pixels, 4 bytes per pixel (CL_RGBA / CL_UNORM_INT8 below)
+ unsigned char image2d_data_24bpp[3*3*4] = {
+ 255, 0, 0, 0, 0, 255, 0, 0, 128, 128, 128, 0,
+ 0, 0, 255, 0, 255, 255, 0, 0, 0, 128, 0, 0,
+ 255, 128, 0, 0, 128, 0, 255, 0, 0, 0, 0, 0
+ };
+ // Expected top-left 2x2 pixels of the data above
+ unsigned char image2d_part_24bpp[2*2*4] = {
+ 255, 0, 0, 0, 0, 255, 0, 0,
+ 0, 0, 255, 0, 255, 255, 0, 0
+ };
+
+ unsigned char image2d_buffer[3*3*4]; // host storage behind image2d
+ unsigned char image2d_part[2*2*4]; // host storage behind part2d, also the read target
+
+ cl_image_format fmt;
+
+ fmt.image_channel_data_type = CL_UNORM_INT8;
+ fmt.image_channel_order = CL_RGBA;
+
+ size_t origin[3] = {0, 0, 0};
+ size_t region[3] = {3, 3, 1};
+
+ result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to get the default device"
+ );
+
+ ctx = clCreateContext(0, 1, &device, 0, 0, &result);
+ fail_if(
+ result != CL_SUCCESS || ctx == 0,
+ "unable to create a valid context"
+ );
+
+ queue = clCreateCommandQueue(ctx, device, 0, &result);
+ fail_if(
+ result != CL_SUCCESS || queue == 0,
+ "cannot create a command queue"
+ );
+
+ image2d = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &fmt,
+ 3, 3, 0, image2d_buffer, &result);
+ fail_if(
+ result != CL_SUCCESS || image2d == 0,
+ "cannot create a valid 3x3 image2D"
+ );
+ // Validate the handle that was just created (part2d, not image2d)
+ part2d = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &fmt,
+ 2, 2, 0, image2d_part, &result);
+ fail_if(
+ result != CL_SUCCESS || part2d == 0,
+ "cannot create a valid 2x2 image2D"
+ );
+
+ // Write data in buffer
+ result = clEnqueueWriteImage(queue, image2d, 1, origin, region, 0, 0,
+ image2d_data_24bpp, 0, 0, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot enqueue a blocking write image event"
+ );
+
+ // Read it back
+ region[0] = 2;
+ region[1] = 2;
+
+ result = clEnqueueReadImage(queue, image2d, 1, origin, region, 0, 0,
+ image2d_part, 0, 0, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot enqueue a blocking read image event"
+ );
+
+ // Compare
+ #if 0 // images not supported
+ fail_if(
+ std::memcmp(image2d_part, image2d_part_24bpp, sizeof(image2d_part)) != 0,
+ "reading and writing images doesn't produce the correct result"
+ );
+#endif
+
+ // Copy the same 2x2 region into the second image (part2d)
+ cl_event event;
+ std::memset(image2d_part, 0, sizeof(image2d_part));
+
+ result = clEnqueueCopyImage(queue, image2d, part2d, origin, origin,
+ region, 0, 0, &event);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to enqueue a copy image event"
+ );
+
+ result = clWaitForEvents(1, &event);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to wait for events"
+ );
+
+ // Compare
+#if 0 // images not supported
+ fail_if(
+ std::memcmp(image2d_part, image2d_part_24bpp, sizeof(image2d_part)) != 0,
+ "copying images doesn't produce the correct result"
+ );
+#endif
+
+ clReleaseEvent(event);
+ clReleaseMemObject(part2d);
+ clReleaseMemObject(image2d);
+ clReleaseCommandQueue(queue);
+ clReleaseContext(ctx);
+}
+END_TEST
+
+START_TEST (test_copy_image_buffer) // buffer -> image copy, image -> buffer copy, then a blocking image map
+{
+ cl_device_id device;
+ cl_context ctx;
+ cl_command_queue queue;
+ cl_mem image, buffer;
+ cl_int result;
+ cl_event event;
+
+ cl_platform_id platform = 0;
+ cl_uint num_platforms = 0;
+ clGetPlatformIDs(1, &platform, &num_platforms);
+ // 3x3 pixels, 4 bytes per pixel; used as host storage for the image
+ unsigned char image_buffer[3*3*4] = {
+ 255, 0, 0, 0, 0, 255, 0, 0, 0, 0, 255, 0,
+ 128, 0, 0, 0, 0, 128, 0, 0, 0, 0, 128, 0,
+ 64, 0, 0, 0, 0, 64, 0, 0, 0, 0, 64, 0
+ };
+
+ // Square that will be put in image_buffer at (1, 0)
+ unsigned char buffer_buffer[2*2*4+1] = {
+ 33, // Oh, a padding !
+ 255, 255, 255, 0, 255, 0, 255, 0,
+ 0, 255, 255, 0, 255, 255, 0, 0
+ };
+
+ // What we must get once re-reading 2x2 rect at (1, 1)
+ unsigned char correct_data[2*2*4] = {
+ 0, 255, 255, 0, 255, 255, 0, 0,
+ 0, 64, 0, 0, 0, 0, 64, 0
+ };
+
+ cl_image_format fmt;
+
+ fmt.image_channel_data_type = CL_UNORM_INT8;
+ fmt.image_channel_order = CL_RGBA;
+
+ size_t origin[3] = {1, 0, 0};
+ size_t region[3] = {2, 2, 1};
+
+ result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to get the default device"
+ );
+
+ ctx = clCreateContext(0, 1, &device, 0, 0, &result);
+ fail_if(
+ result != CL_SUCCESS || ctx == 0,
+ "unable to create a valid context"
+ );
+
+ queue = clCreateCommandQueue(ctx, device, 0, &result);
+ fail_if(
+ result != CL_SUCCESS || queue == 0,
+ "cannot create a command queue"
+ );
+ // NOTE(review): fmt is CL_RGBA but the failure message below says "bgra" - confirm which is intended
+ image = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &fmt,
+ 3, 3, 0, image_buffer, &result);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to create a 3x3 bgra image"
+ );
+
+ buffer = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(buffer_buffer), buffer_buffer, &result);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to create a buffer object"
+ );
+
+ // Write buffer in image
+ result = clEnqueueCopyBufferToImage(queue, buffer, image, 1, origin, region,
+ 0, 0, &event);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to queue a copy buffer to image event, buffer offset 1, image 2x2 @ (1, 0)"
+ );
+
+ result = clWaitForEvents(1, &event);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot wait for event"
+ );
+
+ clReleaseEvent(event); // released here because `event` is reused by the next enqueue
+
+ // Read it back into buffer, again with an offset
+ origin[1] = 1;
+ result = clEnqueueCopyImageToBuffer(queue, image, buffer, origin, region, 1,
+ 0, 0, &event);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to queue a copy image to buffer event, buffer offset 1, image 2x2 @ (1, 1)"
+ );
+
+ result = clWaitForEvents(1, &event);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot wait for event"
+ );
+ // The data comparison is compiled out, so correct_data is currently unused
+#if 0 // images not supported
+ fail_if(
+ std::memcmp(buffer_buffer + 1, correct_data, sizeof(correct_data)) != 0,
+ "copying data around isn't working the expected way"
+ );
+#endif
+
+
+ // Map the image and check pointers
+ unsigned char *mapped;
+ size_t row_pitch;
+
+ origin[0] = 0;
+ origin[1] = 0;
+ origin[2] = 0;
+ // Blocking read map of the 2x2 region at the origin
+ mapped = (unsigned char *)clEnqueueMapImage(queue, image, 1, CL_MAP_READ,
+ origin, region, &row_pitch, 0, 0,
+ 0, 0, &result);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to map an image"
+ );
+#if 0 // images not supported
+ fail_if(
+ mapped != image_buffer,
+ "mapped aread doesn't match host ptr"
+ );
+#endif
+ // NOTE(review): the mapped region is never unmapped before the image is released - confirm this is intended
+ clReleaseEvent(event);
+ clReleaseMemObject(image);
+ clReleaseMemObject(buffer);
+ clReleaseCommandQueue(queue);
+ clReleaseContext(ctx);
+}
+END_TEST
+
+START_TEST (test_misc_events) // markers, barrier and enqueued waits gated by two user events
+{
+ cl_platform_id platform = 0;
+ cl_uint num_platforms = 0;
+ clGetPlatformIDs(1, &platform, &num_platforms);
+
+ cl_device_id device;
+ cl_context ctx;
+ cl_command_queue queue;
+ cl_int result;
+ cl_event uevent1, uevent2, marker1, marker2;
+
+ result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to get the default device"
+ );
+
+ ctx = clCreateContext(0, 1, &device, 0, 0, &result);
+ fail_if(
+ result != CL_SUCCESS || ctx == 0,
+ "unable to create a valid context"
+ );
+ // Queue created with properties 0 (no out-of-order flag)
+ queue = clCreateCommandQueue(ctx, device, 0, &result);
+ fail_if(
+ result != CL_SUCCESS || queue == 0,
+ "cannot create a command queue"
+ );
+
+ /*
+ * This test will build a command queue blocked by an user event. The events
+ * will be in this order :
+ *
+ * -: UserEvent1
+ * 0: WaitForEvents1 (wait=UserEvent1)
+ * 1: Marker1
+ * -: UserEvent2
+ * 2: WaitForEvents2 (wait=UserEvent2)
+ * 3: Barrier
+ * 4: Marker2 (to check the barrier worked)
+ *
+ * When the command queue is built, we :
+ * - Check that Marker1 is Queued (WaitForEvents waits)
+ * - Set UserEvent1 to Complete
+ * - Check that Marker1 is Complete (WaitForEvents stopped to wait)
+ * - Check that Marker2 is Queued (Barrier is there)
+ * - Set UserEvent2 to Complete
+ * - Check that Marker2 is Complete (no more barrier)
+ */
+ uevent1 = clCreateUserEvent(ctx, &result);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to create UserEvent1"
+ );
+
+ uevent2 = clCreateUserEvent(ctx, &result);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to create UserEvent2"
+ );
+
+ result = clEnqueueWaitForEvents(queue, 1, &uevent1);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to enqueue WaitForEvents(UserEvent1)"
+ );
+
+ result = clEnqueueMarker(queue, &marker1);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to enqueue Marker1"
+ );
+
+ result = clEnqueueWaitForEvents(queue, 1, &uevent2);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to enqueue WaitForEvents(UserEvent2)"
+ );
+
+ result = clEnqueueBarrier(queue);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to enqueue Barrier"
+ );
+
+ result = clEnqueueMarker(queue, &marker2);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to enqueue Marker2"
+ );
+
+ // Now the checks
+ cl_int status;
+
+ result = clGetEventInfo(marker1, CL_EVENT_COMMAND_EXECUTION_STATUS,
+ sizeof(cl_int), &status, 0);
+ fail_if(
+ result != CL_SUCCESS || status != CL_QUEUED,
+ "Marker1 must be Queued"
+ );
+
+ result = clSetUserEventStatus(uevent1, CL_COMPLETE);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to set UserEvent1 to Complete"
+ );
+ // NOTE(review): status is read with no wait in between; presumably markers complete synchronously here - confirm
+ result = clGetEventInfo(marker1, CL_EVENT_COMMAND_EXECUTION_STATUS,
+ sizeof(cl_int), &status, 0);
+ fail_if(
+ result != CL_SUCCESS || status != CL_COMPLETE,
+ "Marker1 must be Complete"
+ );
+
+ result = clGetEventInfo(marker2, CL_EVENT_COMMAND_EXECUTION_STATUS,
+ sizeof(cl_int), &status, 0);
+ fail_if(
+ result != CL_SUCCESS || status != CL_QUEUED,
+ "Marker2 must be Queued"
+ );
+
+ result = clSetUserEventStatus(uevent2, CL_COMPLETE);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to set UserEvent2 to Complete"
+ );
+ // NOTE(review): same immediate status read as for Marker1 above - confirm it cannot race
+ result = clGetEventInfo(marker2, CL_EVENT_COMMAND_EXECUTION_STATUS,
+ sizeof(cl_int), &status, 0);
+ fail_if(
+ result != CL_SUCCESS || status != CL_COMPLETE,
+ "Marker2 must be Complete"
+ );
+
+ clFinish(queue); // its return value is not checked
+
+ clReleaseEvent(uevent1);
+ clReleaseEvent(uevent2);
+ clReleaseEvent(marker1);
+ clReleaseEvent(marker2);
+ clReleaseCommandQueue(queue);
+ clReleaseContext(ctx);
+}
+END_TEST
+
+TCase *cl_commandqueue_tcase_create(void)
+{
+ // Builds the "commandqueue" test case and registers its tests in order.
+ TCase *cq_case = tcase_create("commandqueue");
+ tcase_add_test(cq_case, test_create_command_queue);
+ tcase_add_test(cq_case, test_get_command_queue_info);
+ tcase_add_test(cq_case, test_object_tree);
+ tcase_add_test(cq_case, test_events);
+ tcase_add_test(cq_case, test_read_write_rect);
+ tcase_add_test(cq_case, test_copy_buffer);
+#if 0 // Images not supported
+ tcase_add_test(cq_case, test_read_write_image);
+ tcase_add_test(cq_case, test_copy_image_buffer);
+#endif
+ tcase_add_test(cq_case, test_misc_events);
+ return cq_case;
+}
diff --git a/tests/test_commandqueue.h b/tests/test_commandqueue.h
new file mode 100644
index 0000000..0f2d22e
--- /dev/null
+++ b/tests/test_commandqueue.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __UTEST_COMMANDQUEUE__ /* include guard for the command-queue test header */
+#define __UTEST_COMMANDQUEUE__
+
+#include <check.h> /* Check unit-test framework; declares TCase */
+
+#ifdef __cplusplus
+extern "C" { /* C linkage when this header is included from C++ */
+#endif
+
+TCase *cl_commandqueue_tcase_create(void); /* returns the "commandqueue" TCase built in test_commandqueue.cpp */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __UTEST_COMMANDQUEUE__ */
+
diff --git a/tests/test_context.cpp b/tests/test_context.cpp
new file mode 100644
index 0000000..8ac044d
--- /dev/null
+++ b/tests/test_context.cpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "test_context.h"
+#include "CL/cl.h"
+
+START_TEST (test_create_context)
+{
+ cl_device_id device, wrong_device;
+ cl_int result;
+ cl_context ctx;
+ // Exercises clCreateContext: one invalid argument at a time, then two successful creations.
+ cl_platform_id platform = 0;
+ cl_uint num_platforms = 0;
+ clGetPlatformIDs(1, &platform, &num_platforms);
+
+ struct __attribute__((packed)) { // passed below as a flat, zero-terminated property list
+ cl_context_properties prop_platform;
+ cl_platform_id platform;
+ cl_context_properties null;
+ } _properties;
+
+ const cl_context_properties *properties = (const cl_context_properties *)&_properties;
+
+ result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to get a device"
+ );
+
+ _properties.prop_platform = CL_CONTEXT_PLATFORM;
+ _properties.platform = platform; // must be valid from the first call on; it used to be read uninitialized
+ _properties.null = 0;
+
+ ctx = clCreateContext(properties, 1, 0, 0, 0, &result);
+ fail_if(
+ result != CL_INVALID_VALUE || ctx != 0,
+ "devices cannot be NULL"
+ );
+
+ ctx = clCreateContext(properties, 0, &device, 0, 0, &result);
+ fail_if(
+ result != CL_INVALID_VALUE || ctx != 0,
+ "num_devices cannot be 0"
+ );
+
+ _properties.platform = (cl_platform_id)1337;
+
+ ctx = clCreateContext(properties, 1, &device, 0, 0, &result);
+ fail_if(
+ result != CL_INVALID_PLATFORM || ctx != 0,
+ "1337 is not a valid platform"
+ );
+
+ _properties.platform = platform;
+ _properties.prop_platform = 1337;
+
+ ctx = clCreateContext(properties, 1, &device, 0, 0, &result);
+ fail_if(
+ result != CL_INVALID_PROPERTY || ctx != 0,
+ "1337 is not a valid cl_context_properties"
+ );
+
+ _properties.prop_platform = CL_CONTEXT_PLATFORM;
+
+ ctx = clCreateContext(properties, 1, &device, 0, (void *)&device, &result);
+ fail_if(
+ result != CL_INVALID_VALUE || ctx != 0,
+ "user_data must be NULL if pfn_notify is NULL"
+ );
+
+ wrong_device = 0;
+
+ ctx = clCreateContext(properties, 1, &wrong_device, 0, 0, &result);
+ fail_if(
+ result != CL_INVALID_DEVICE || ctx != 0,
+ "0 is not a valid device"
+ );
+
+ ctx = clCreateContext(properties, 1, &device, 0, 0, &result);
+ fail_if(
+ result != CL_SUCCESS || ctx == 0,
+ "unable to create a valid context"
+ );
+
+ clReleaseContext(ctx);
+
+ ctx = clCreateContext(properties, 1, &device, 0, 0, 0);
+ fail_if(
+ ctx == 0,
+ "errcode_ret can be NULL"
+ );
+
+ clReleaseContext(ctx);
+}
+END_TEST
+
+START_TEST (test_create_context_from_type)
+{
+ // No property list is passed: a device of the default type alone must yield a context.
+ cl_int status;
+ cl_context context =
+ clCreateContextFromType(0, CL_DEVICE_TYPE_DEFAULT, 0, 0, &status);
+
+ const bool created = (status == CL_SUCCESS) && (context != 0);
+ fail_if(!created,
+ "unable to create a valid context with a device of type default");
+
+ clReleaseContext(context);
+}
+END_TEST
+
+START_TEST (test_get_context_info)
+{
+ cl_context ctx;
+ cl_int result;
+ size_t size_ret;
+ // One scratch object receives every query result below; its members share storage.
+ union {
+ cl_uint refcount, num_devices;
+ cl_device_id device;
+ struct __attribute__((packed)) {
+ cl_context_properties prop_platform;
+ cl_platform_id platform;
+ cl_context_properties null;
+ } properties;
+ } context_info;
+
+ const cl_context_properties *properties =
+ (const cl_context_properties *)&context_info.properties;
+
+ // Test for a dummy context
+ ctx = clCreateContextFromType(0, CL_DEVICE_TYPE_DEFAULT, 0, 0, &result);
+ fail_if(
+ result != CL_SUCCESS || ctx == 0,
+ "unable to create a valid context with a device of type default"
+ );
+
+ result = clGetContextInfo(0, CL_CONTEXT_REFERENCE_COUNT, 0, 0, &size_ret);
+ fail_if(
+ result != CL_INVALID_CONTEXT,
+ "0 is not a valid context"
+ );
+
+ result = clGetContextInfo(ctx, 1337, 0, 0, &size_ret);
+ fail_if(
+ result != CL_INVALID_VALUE,
+ "1337 is not a valid param_name"
+ );
+ // A non-NULL destination combined with param_value_size 0 must be rejected.
+ result = clGetContextInfo(ctx, CL_CONTEXT_REFERENCE_COUNT, 0, &context_info,
+ &size_ret);
+ fail_if(
+ result != CL_INVALID_VALUE,
+ "param_value_size is too small to contain a cl_uint"
+ );
+ // Size-only query: no destination buffer, only size_ret is requested.
+ result = clGetContextInfo(ctx, CL_CONTEXT_REFERENCE_COUNT, 0, 0, &size_ret);
+ fail_if(
+ result != CL_SUCCESS || size_ret != sizeof(cl_uint),
+ "we must succeed and say that we'll return a cl_uint"
+ );
+
+ // Use a real context and check the return values
+ clReleaseContext(ctx);
+
+ cl_platform_id platform = 0;
+ cl_uint num_platforms = 0;
+ clGetPlatformIDs(1, &platform, &num_platforms);
+
+ context_info.properties.prop_platform = CL_CONTEXT_PLATFORM;
+ context_info.properties.platform = platform;
+ context_info.properties.null = 0;
+
+ ctx = clCreateContextFromType(properties, CL_DEVICE_TYPE_DEFAULT, 0, 0,
+ &result);
+ fail_if(
+ result != CL_SUCCESS || ctx == 0,
+ "unable to create a valid context with a device of type default"
+ );
+
+ // This call clobbers context_info.properties, so we also check that
+ // they are properly std::memcpy'ed by Coal::Context.
+ result = clGetContextInfo(ctx, CL_CONTEXT_REFERENCE_COUNT, sizeof(cl_uint),
+ &context_info, &size_ret);
+ fail_if(
+ result != CL_SUCCESS || context_info.refcount != 1,
+ "context's reference count must be 1 here"
+ );
+
+ clRetainContext(ctx);
+
+ result = clGetContextInfo(ctx, CL_CONTEXT_REFERENCE_COUNT, sizeof(cl_uint),
+ &context_info, &size_ret);
+ fail_if(
+ result != CL_SUCCESS || size_ret != sizeof(cl_uint) ||
+ context_info.refcount != 2,
+ "context's reference count must be 2 here"
+ );
+
+ result = clGetContextInfo(ctx, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint),
+ &context_info, &size_ret);
+ fail_if(
+ result != CL_SUCCESS || size_ret != sizeof(cl_uint) ||
+ context_info.num_devices != 1,
+ "we currently support only one device : CPU"
+ );
+
+ result = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, sizeof(cl_device_id),
+ &context_info, &size_ret);
+ fail_if(
+ result != CL_SUCCESS || size_ret != sizeof(cl_device_id) ||
+ context_info.device == 0,
+ "this context must have a device"
+ );
+
+ result = clGetContextInfo(ctx, CL_CONTEXT_PROPERTIES,
+ sizeof(context_info.properties), &context_info,
+ &size_ret);
+ fail_if(
+ result != CL_SUCCESS || size_ret != sizeof(context_info.properties) ||
+ context_info.properties.prop_platform != CL_CONTEXT_PLATFORM,
+ "this context must have a valid CL_CONTEXT_PLATFORM property"
+ );
+ // Two releases: one for the creation reference, one for the clRetainContext call above.
+ clReleaseContext(ctx);
+ clReleaseContext(ctx);
+}
+END_TEST
+
+TCase *cl_context_tcase_create(void)
+{
+ // Assemble the "context" test case from the three tests defined earlier in this file.
+ TCase *context_case = tcase_create("context");
+ tcase_add_test(context_case, test_create_context);
+ tcase_add_test(context_case, test_create_context_from_type);
+ tcase_add_test(context_case, test_get_context_info);
+ return context_case;
+}
diff --git a/tests/test_context.h b/tests/test_context.h
new file mode 100644
index 0000000..701db41
--- /dev/null
+++ b/tests/test_context.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __UTEST_CONTEXT__
+#define __UTEST_CONTEXT__
+
+#include <check.h>
+
+#ifdef __cplusplus
+extern "C" { /* give the declaration below C linkage when included from C++ */
+#endif
+/* Returns the Check TCase named "context"; defined in tests/test_context.cpp. */
+TCase *cl_context_tcase_create(void);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* __UTEST_CONTEXT__ */
+
diff --git a/tests/test_device.cpp b/tests/test_device.cpp
new file mode 100644
index 0000000..e86a7e2
--- /dev/null
+++ b/tests/test_device.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "test_device.h"
+#include "CL/cl.h"
+
+START_TEST (test_get_device_ids)
+{
+ cl_device_id device;
+ cl_uint num_devices;
+ cl_int result;
+ // Exercises clGetDeviceIDs: argument validation first, then the expected single default device.
+ cl_platform_id platform = 0;
+ cl_uint num_platforms = 0;
+ clGetPlatformIDs(1, &platform, &num_platforms);
+
+ result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 0, &device, &num_devices);
+ fail_if(
+ result != CL_INVALID_VALUE,
+ "num_entries cannot be 0 when devices is not NULL"
+ );
+
+ result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 0, 0, 0);
+ fail_if(
+ result != CL_INVALID_VALUE,
+ "num_devices and devices cannot be NULL at the same time"
+ );
+
+ result = clGetDeviceIDs((cl_platform_id)1337, CL_DEVICE_TYPE_DEFAULT, 1, &device, &num_devices);
+ fail_if(
+ result != CL_INVALID_PLATFORM,
+ "1337 is not a valid platform"
+ );
+
+ result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
+ fail_if(
+ result != CL_DEVICE_NOT_FOUND,
+ "there are no GPU devices currently available"
+ );
+ // Only when SHAMROCK_BUILD is defined is the platform also expected to expose no ACCELERATOR device.
+#ifdef SHAMROCK_BUILD
+ result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ACCELERATOR, 1, &device, &num_devices);
+ fail_if(
+ result != CL_DEVICE_NOT_FOUND,
+ "there are no ACCELERATOR devices currently available"
+ );
+#endif
+
+ result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, 0, &num_devices);
+ fail_if(
+ result != CL_SUCCESS || num_devices != 1,
+ "we must succeed and say that we have one device"
+ );
+
+ result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, &num_devices);
+ fail_if(
+ result != CL_SUCCESS || num_devices != 1 || device == 0,
+ "we must succeed and have one device"
+ );
+}
+END_TEST
+
+START_TEST (test_get_device_info)
+{
+ cl_device_id device;
+ cl_uint num_devices;
+ cl_int result;
+ // Exercises clGetDeviceInfo: argument validation, then the device type and vendor string.
+ cl_platform_id platform = 0;
+ cl_uint num_platforms = 0;
+ clGetPlatformIDs(1, &platform, &num_platforms);
+
+ size_t size_ret;
+ char value[500];
+
+ result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, &num_devices);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to get a device"
+ );
+
+ result = clGetDeviceInfo(0, CL_DEVICE_TYPE, 500, value, &size_ret);
+ fail_if(
+ result != CL_INVALID_DEVICE,
+ "0 is not a valid device"
+ );
+
+ result = clGetDeviceInfo(device, 13376334, 500, value, &size_ret);
+ fail_if(
+ result != CL_INVALID_VALUE,
+ "13376334 is not a valid param_name"
+ );
+
+ result = clGetDeviceInfo(device, CL_DEVICE_TYPE, 1, value, &size_ret);
+ fail_if(
+ result != CL_INVALID_VALUE,
+ "1 is too small to contain a cl_device_type"
+ );
+
+ result = clGetDeviceInfo(device, CL_DEVICE_TYPE, 0, 0, &size_ret);
+ fail_if(
+ result != CL_SUCCESS || size_ret != sizeof(cl_device_type),
+ "we have to succeed and to say that the result is a cl_device_type"
+ );
+ // Type and vendor expectations both key off #ifdef SHAMROCK_BUILD so they always pick the same branch.
+ result = clGetDeviceInfo(device, CL_DEVICE_TYPE, 500, value, &size_ret);
+#ifdef SHAMROCK_BUILD
+ fail_if(
+ result != CL_SUCCESS || *(cl_device_type*)(value) != CL_DEVICE_TYPE_CPU,
+ "we have to say the device is a CPU"
+ );
+#else
+ fail_if(
+ result != CL_SUCCESS || *(cl_device_type*)(value) != CL_DEVICE_TYPE_ACCELERATOR,
+ "we have to say the device is a ACCELERATOR"
+ );
+#endif
+
+ result = clGetDeviceInfo(device, CL_DEVICE_VENDOR, 500, value, &size_ret);
+ fail_if(
+ result != CL_SUCCESS,
+ "we must succeed"
+ );
+#ifdef SHAMROCK_BUILD
+ fail_if(
+ strncmp(value, "Generic", size_ret) != 0,
+ "the device vendor must be \"Generic\""
+ );
+#else
+ fail_if(
+ strncmp(value, "Texas Instruments, Inc.", size_ret) != 0,
+ "the device vendor must be \"Texas Instruments, Inc.\""
+ );
+#endif
+}
+END_TEST
+
+TCase *cl_device_tcase_create(void)
+{
+ // Assemble the "device" test case from the two tests defined earlier in this file.
+ TCase *device_case = tcase_create("device");
+ tcase_add_test(device_case, test_get_device_ids);
+ tcase_add_test(device_case, test_get_device_info);
+ return device_case;
+}
diff --git a/tests/test_device.h b/tests/test_device.h
new file mode 100644
index 0000000..2aab3a8
--- /dev/null
+++ b/tests/test_device.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __UTEST_DEVICE__
+#define __UTEST_DEVICE__
+
+#include <check.h>
+
+#ifdef __cplusplus
+extern "C" { /* give the declaration below C linkage when included from C++ */
+#endif
+/* Returns the Check TCase named "device"; defined in tests/test_device.cpp. */
+TCase *cl_device_tcase_create(void);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* __UTEST_DEVICE__ */
+
diff --git a/tests/test_kernel.cpp b/tests/test_kernel.cpp
new file mode 100644
index 0000000..bbb8d28
--- /dev/null
+++ b/tests/test_kernel.cpp
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <iostream>
+
+#include "test_kernel.h"
+#include "CL/cl.h"
+
+static const char source[] = // OpenCL C program text passed to clCreateProgramWithSource by test_compiled_kernel
+ "float simple_function(float a) {\n" // plain helper without __kernel; the test expects clCreateKernel to reject it
+ " return a * 2.5f;\n"
+ "}\n"
+ "\n"
+ "__kernel void kernel1(__global float *a, __global float *b, float f) {\n" // a[i] = simple_function(f) * b[i]
+ " size_t i = get_global_id(0);\n"
+ "\n"
+ " a[i] = simple_function(f) * b[i];\n"
+ "}\n"
+ "\n"
+ "__kernel void kernel2(__global unsigned int *buf) {\n" // every work-item writes 2 * (i % 256) into slot i % 256
+ " size_t i = get_global_id(0);\n"
+ "\n"
+ " buf[i % 256] = 2 * (i % 256);\n"
+ "}\n";
+
+static void native_kernel(void *args)
+{
+ // Shape of the argument block expected behind args.
+ struct byte_span {
+ size_t buffer_size;
+ char *buffer;
+ };
+ struct byte_span *span = (struct byte_span *)args;
+
+ // Bitwise NOT of every byte, in place.
+ for (size_t pos = 0; pos != span->buffer_size; ++pos)
+ {
+ char *cell = span->buffer + pos;
+ *cell = ~*cell;
+ }
+}
+
+START_TEST (test_compiled_kernel)
+{
+ // Builds the test program, checks kernel creation, then enqueues kernels[1] and checks the host buffer.
+ cl_platform_id platform = 0;
+ cl_device_id device;
+ cl_context ctx;
+ cl_command_queue queue;
+ cl_program program;
+ cl_int result;
+ cl_kernel kernels[2];
+ cl_uint num_kernels;
+ cl_mem buf;
+
+ const char *src = source;
+ size_t program_len = sizeof(source);
+
+ unsigned int buffer[256];
+
+ result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to get the default device"
+ );
+
+ ctx = clCreateContext(0, 1, &device, 0, 0, &result);
+ fail_if(
+ result != CL_SUCCESS || ctx == 0,
+ "unable to create a valid context"
+ );
+
+ queue = clCreateCommandQueue(ctx, device, 0, &result);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to create a command queue"
+ );
+
+ program = clCreateProgramWithSource(ctx, 1, &src, &program_len, &result);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot create a program from source with sane arguments"
+ );
+
+ result = clBuildProgram(program, 1, &device, "", 0, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot build a valid program"
+ );
+
+ kernels[0] = clCreateKernel(program, "simple_function", &result);
+ fail_if(
+ result != CL_INVALID_KERNEL_NAME,
+ "simple_function is not a kernel"
+ );
+
+ kernels[0] = clCreateKernel(program, "kernel1", &result);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to create a valid kernel"
+ );
+
+ clReleaseKernel(kernels[0]); // Just born and already killed...
+
+ result = clCreateKernelsInProgram(program, 0, 0, &num_kernels);
+ fail_if(
+ result != CL_SUCCESS || num_kernels != 2,
+ "unable to get the number of kernels"
+ );
+
+ result = clCreateKernelsInProgram(program, 2, kernels, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to get the two kernels of the program"
+ );
+
+ // Try to run kernel2
+ buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+ sizeof(buffer), buffer, &result);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot create a valid CL_MEM_USE_HOST_PTR read-write buffer"
+ );
+
+ result = clSetKernelArg(kernels[1], 0, sizeof(cl_mem), &buf);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot set kernel argument"
+ );
+
+ size_t local_size = sizeof(buffer) / sizeof(buffer[0]);
+ size_t global_size = 100000 * local_size;
+ cl_event event;
+
+ result = clEnqueueNDRangeKernel(queue, kernels[1], 1, 0, &global_size, 0, 0, 0, &event);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to queue a NDRange kernel with local work size guessed"
+ );
+
+ result = clWaitForEvents(1, &event);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to wait for event"
+ );
+
+ bool ok = true;
+ for (size_t i=0; ok && i<local_size; ++i)
+ {
+ ok = (buffer[i] == 2 * i);
+ }
+
+ fail_if(
+ ok == false,
+ "the kernel hasn't done its job, the buffer is wrong"
+ );
+
+ // The event, buffer and queue are released too; they were previously leaked.
+ clReleaseEvent(event);
+ clReleaseKernel(kernels[0]);
+ clReleaseKernel(kernels[1]);
+ clReleaseProgram(program);
+ clReleaseMemObject(buf);
+ clReleaseCommandQueue(queue);
+ clReleaseContext(ctx);
+}
+END_TEST
+
+START_TEST (test_native_kernel)
+{
+ cl_platform_id platform = 0;
+ cl_device_id device;
+ cl_context ctx;
+ cl_command_queue queue;
+ cl_int result;
+ cl_event events[2];
+ cl_mem buf1, buf2;
+
+ char s1[] = "Lorem ipsum dolor sit amet";
+ char s2[] = "I want to tell you that you rock";
+
+ result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to get the default device"
+ );
+
+ ctx = clCreateContext(0, 1, &device, 0, 0, &result);
+ fail_if(
+ result != CL_SUCCESS || ctx == 0,
+ "unable to create a valid context"
+ );
+
+ queue = clCreateCommandQueue(ctx, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &result);
+ fail_if(
+ result != CL_SUCCESS || queue == 0,
+ "cannot create a command queue"
+ );
+
+ buf1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(s1), (void *)&s1, &result);
+ fail_if(
+ result != CL_SUCCESS || buf1 == 0,
+ "cannot create a buffer"
+ );
+
+ buf2 = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(s2), (void *)&s2, &result);
+ fail_if(
+ result != CL_SUCCESS || buf2 == 0,
+ "cannot create a buffer"
+ );
+
+ struct {
+ size_t buffer_size;
+ char *buffer;
+ } args;
+ // args.buffer stays NULL here; mem_loc names that field, presumably so the runtime can fill it in (see args_mem_loc).
+ args.buffer_size = sizeof(s1);
+ args.buffer = 0;
+
+ const void *mem_loc = (const void *)&args.buffer;
+
+ result = clEnqueueNativeKernel(queue, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ fail_if(
+ result != CL_INVALID_VALUE,
+ "user_func cannot be NULL"
+ );
+
+ result = clEnqueueNativeKernel(queue, &native_kernel, 0, sizeof(args),
+ 1, &buf1, &mem_loc, 0, 0,
+ &events[0]);
+ fail_if(
+ result != CL_INVALID_VALUE,
+ "args cannot be NULL when cb_args != 0"
+ );
+
+ result = clEnqueueNativeKernel(queue, &native_kernel, &args, sizeof(args),
+ 1, 0, &mem_loc, 0, 0,
+ &events[0]);
+ fail_if(
+ result != CL_INVALID_VALUE,
+ "mem_list cannot be NULL when num_mem_objects != 0"
+ );
+
+ result = clEnqueueNativeKernel(queue, &native_kernel, &args, sizeof(args),
+ 1, &buf1, 0, 0, 0, &events[0]);
+ fail_if(
+ result != CL_INVALID_VALUE,
+ "args_mem_loc cannot be NULL when num_mem_objects != 0"
+ );
+
+ result = clEnqueueNativeKernel(queue, &native_kernel, &args, sizeof(args),
+ 1, &buf1, &mem_loc, 0, 0,
+ &events[0]);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to enqueue native kernel nr 1"
+ );
+
+ args.buffer_size = sizeof(s2);
+
+ result = clEnqueueNativeKernel(queue, &native_kernel, &args, sizeof(args),
+ 1, &buf2, &mem_loc, 0, 0,
+ &events[1]);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to enqueue native kernel nr 2"
+ );
+
+ // Wait for events
+ result = clWaitForEvents(2, events);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to wait for events"
+ );
+
+ fail_if(
+ s1[0] != (char)~'L' || s2[0] != (char)~'I',
+ "the native kernel hasn't done its job"
+ );
+ // Both events and both buffers are released too; they were previously leaked.
+ clReleaseEvent(events[0]);
+ clReleaseEvent(events[1]);
+ clReleaseMemObject(buf1);
+ clReleaseMemObject(buf2);
+ clReleaseCommandQueue(queue);
+ clReleaseContext(ctx);
+}
+END_TEST
+
+TCase *cl_kernel_tcase_create(void)
+{
+ // Assemble the "kernel" test case; the native-kernel test is registered before the compiled one.
+ TCase *kernel_case = tcase_create("kernel");
+ tcase_add_test(kernel_case, test_native_kernel);
+ tcase_add_test(kernel_case, test_compiled_kernel);
+ return kernel_case;
+}
diff --git a/tests/test_kernel.h b/tests/test_kernel.h
new file mode 100644
index 0000000..333221a
--- /dev/null
+++ b/tests/test_kernel.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __UTEST_KERNEL__
+#define __UTEST_KERNEL__
+
+#include <check.h>
+
+#ifdef __cplusplus
+extern "C" { /* give the declaration below C linkage when included from C++ */
+#endif
+/* Returns the Check TCase named "kernel"; defined in tests/test_kernel.cpp. */
+TCase *cl_kernel_tcase_create(void);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* __UTEST_KERNEL__ */
+
diff --git a/tests/test_mem.cpp b/tests/test_mem.cpp
new file mode 100644
index 0000000..a4fdfdb
--- /dev/null
+++ b/tests/test_mem.cpp
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <iostream>
+
+#include "test_mem.h"
+#include "CL/cl.h"
+
+START_TEST (test_create_buffer)
+{
+    // Walks clCreateBuffer through a series of rejected argument combinations
+    // and finishes with one well-formed call whose buffer is released again.
+    char hello[] = "Hello, world !";
+    cl_int err;
+    cl_mem mem;
+
+    cl_context context =
+        clCreateContextFromType(NULL, CL_DEVICE_TYPE_DEFAULT, NULL, NULL, &err);
+    fail_if(err != CL_SUCCESS,
+            "unable to create a valid context");
+
+    // Missing context.
+    mem = clCreateBuffer(NULL, CL_MEM_READ_WRITE, sizeof(hello), NULL, &err);
+    fail_if(err != CL_INVALID_CONTEXT,
+            "0 is not a valid context");
+
+    // Garbage flag bits.
+    mem = clCreateBuffer(context, 1337, sizeof(hello), NULL, &err);
+    fail_if(err != CL_INVALID_VALUE,
+            "1337 is not a valid cl_mem_flags");
+
+    // Host-pointer flags without a host pointer.
+    mem = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(hello), NULL, &err);
+    fail_if(err != CL_INVALID_HOST_PTR,
+            "host_ptr cannot be NULL if flags is CL_MEM_USE_HOST_PTR");
+
+    mem = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(hello), NULL, &err);
+    fail_if(err != CL_INVALID_HOST_PTR,
+            "host_ptr cannot be NULL if flags is CL_MEM_COPY_HOST_PTR");
+
+    // A host pointer without any host-pointer flag.
+    mem = clCreateBuffer(context, 0, sizeof(hello), hello, &err);
+    fail_if(err != CL_INVALID_HOST_PTR,
+            "host_ptr must be NULL if flags is not CL_MEM_{COPY/USE}_HOST_PTR");
+
+    // Zero-sized buffer.
+    mem = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, 0, hello, &err);
+    fail_if(err != CL_INVALID_BUFFER_SIZE,
+            "size cannot be 0");
+
+    // Finally, a request that has to be accepted.
+    mem = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                         sizeof(hello), hello, &err);
+    fail_if(err != CL_SUCCESS,
+            "cannot create a valid CL_MEM_COPY_HOST_PTR read-write buffer");
+
+    clReleaseMemObject(mem);
+    clReleaseContext(context);
+}
+END_TEST
+
+// Checks the error codes clCreateSubBuffer reports for invalid arguments,
+// using a 5-byte region at offset 7 of a host-pointer-backed buffer, and then
+// creates one valid sub-buffer.
+// This test is currently not registered: cl_mem_tcase_create() keeps it
+// inside an "#if 0" block.
+START_TEST (test_create_sub_buffer)
+{
+ cl_context ctx;
+ cl_mem buf, subbuf;
+ cl_int result;
+ char s[] = "Hello, world !";
+
+ cl_buffer_region create_info; // "Hello, [world] !"
+
+ create_info.origin = 7;
+ create_info.size = 5;
+
+ ctx = clCreateContextFromType(0, CL_DEVICE_TYPE_DEFAULT, 0, 0, &result);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to create a valid context"
+ );
+
+ // Parent buffer backed directly by s.
+ // NOTE(review): the message below says "read-write" although the flags are
+ // CL_MEM_WRITE_ONLY - confirm which one is intended.
+ buf = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(s), s, &result);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot create a valid CL_MEM_USE_HOST_PTR read-write buffer"
+ );
+
+ // A null parent must be rejected.
+ subbuf = clCreateSubBuffer(0, CL_MEM_WRITE_ONLY,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ (void *)&create_info, &result);
+ fail_if(
+ result != CL_INVALID_MEM_OBJECT,
+ "0 is not a valid mem object"
+ );
+
+ // Access flags that contradict the WRITE_ONLY parent.
+ subbuf = clCreateSubBuffer(buf, CL_MEM_READ_ONLY,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ (void *)&create_info, &result);
+ fail_if(
+ result != CL_INVALID_VALUE,
+ "READ_ONLY is not compatible with WRITE_ONLY"
+ );
+
+ // Unknown buffer_create_type.
+ subbuf = clCreateSubBuffer(buf, CL_MEM_WRITE_ONLY, 1337, (void *)&create_info,
+ &result);
+ fail_if(
+ result != CL_INVALID_VALUE,
+ "1337 is not a valid buffer_create_type"
+ );
+
+ // Missing region description.
+ subbuf = clCreateSubBuffer(buf, CL_MEM_WRITE_ONLY,
+ CL_BUFFER_CREATE_TYPE_REGION, 0, &result);
+ fail_if(
+ result != CL_INVALID_VALUE,
+ "buffer_create_info cannot be NULL"
+ );
+
+ // Temporarily zero the region size to provoke CL_INVALID_BUFFER_SIZE.
+ create_info.size = 0;
+
+ subbuf = clCreateSubBuffer(buf, CL_MEM_WRITE_ONLY,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ (void *)&create_info, &result);
+ fail_if(
+ result != CL_INVALID_BUFFER_SIZE,
+ "create_info.size cannot be 0"
+ );
+
+ // Restore the 5-byte region for the valid creation below.
+ create_info.size = 5;
+
+ subbuf = clCreateSubBuffer(buf, CL_MEM_WRITE_ONLY,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ (void *)&create_info, &result);
+ fail_if(
+ result != CL_SUCCESS || subbuf == 0,
+ "cannot create a valid sub-buffer"
+ );
+
+ // Nesting check: the return value is discarded, only the error code is
+ // inspected.
+ clCreateSubBuffer(subbuf, CL_MEM_WRITE_ONLY,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ (void *)&create_info, &result);
+ fail_if(
+ result != CL_INVALID_MEM_OBJECT,
+ "we cannot create a sub-buffer of a sub-buffer"
+ );
+
+ clReleaseMemObject(subbuf);
+ clReleaseMemObject(buf);
+ clReleaseContext(ctx);
+}
+END_TEST
+
+// Writes "world" through a 5-byte sub-buffer at offset 7 of a buffer that
+// was initialised from "Hello, Denis !", then reads back both the sub-buffer
+// and the parent buffer and compares their contents.
+// This test is currently not registered: cl_mem_tcase_create() keeps it
+// inside an "#if 0" block.
+START_TEST (test_read_write_subbuf)
+{
+ cl_context ctx;
+ cl_mem buf, subbuf;
+ cl_command_queue queue;
+ cl_device_id device;
+ cl_int result;
+ char s[] = "Hello, Denis !";
+
+ cl_buffer_region create_info;
+
+ create_info.origin = 7; // "Hello, [denis] !"
+ create_info.size = 5;
+
+ result = clGetDeviceIDs(0, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot get a device"
+ );
+
+ ctx = clCreateContext(0, 1, &device, 0, 0, &result);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to create a valid context"
+ );
+
+ queue = clCreateCommandQueue(ctx, device, 0, &result);
+ fail_if(
+ result != CL_SUCCESS || queue == 0,
+ "cannot create a command queue"
+ );
+
+ // NOTE(review): the buffer is created with CL_MEM_COPY_HOST_PTR, yet the
+ // message below names CL_MEM_USE_HOST_PTR and the host-pointer check further
+ // down expects the sub-buffer to point into s. Confirm which flag is
+ // intended before re-enabling this test.
+ buf = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+ sizeof(s), s, &result);
+ fail_if(
+ result != CL_SUCCESS,
+ "cannot create a valid CL_MEM_USE_HOST_PTR read-write buffer"
+ );
+
+ subbuf = clCreateSubBuffer(buf, CL_MEM_WRITE_ONLY,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ (void *)&create_info, &result);
+ fail_if(
+ result != CL_SUCCESS || subbuf == 0,
+ "cannot create a valid sub-buffer"
+ );
+
+ ////
+ // Expected host pointer of the sub-buffer: start of s plus the region origin.
+ char *hostptr;
+ char *valid_hostptr = s;
+
+ valid_hostptr += create_info.origin;
+
+ result = clGetMemObjectInfo(subbuf, CL_MEM_HOST_PTR, sizeof(char *),
+ (void *)&hostptr, 0);
+ fail_if(
+ result != CL_SUCCESS || hostptr != valid_hostptr,
+ "the host ptr of a subbuffer must point to a subportion of its parent buffer"
+ );
+
+ // Blocking write (third argument is 1) of 5 bytes at offset 0 of the sub-buffer.
+ result = clEnqueueWriteBuffer(queue, subbuf, 1, 0, 5, "world", 0, 0, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to write to the sub buffer"
+ );
+
+ // Scratch space for both read-backs; 16 bytes is at least sizeof(s).
+ char data[16];
+
+ result = clEnqueueReadBuffer(queue, subbuf, 1, 0, 5, data, 0, 0, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to read the sub buffer"
+ );
+ fail_if(
+ strncmp(data, "world", 5),
+ "the subbuffer must contain \"world\""
+ );
+
+ // The write through the sub-buffer must be visible in the parent buffer.
+ result = clEnqueueReadBuffer(queue, buf, 1, 0, sizeof(s), data, 0, 0, 0);
+ fail_if(
+ result != CL_SUCCESS,
+ "unable to read the buffer"
+ );
+ fail_if(
+ strncmp(data, "Hello, world !", sizeof(s)),
+ "the buffer must contain \"Hello, world !\""
+ );
+
+ clReleaseCommandQueue(queue);
+ clReleaseMemObject(subbuf);
+ clReleaseMemObject(buf);
+ clReleaseContext(ctx);
+}
+END_TEST
+
+START_TEST (test_images)
+{
+    // Creates a 2x2 2D image and a 2x2x2 3D image over host memory, after
+    // checking that an invalid row pitch is rejected.
+    cl_context ctx;
+    cl_mem image2d, image3d;
+    cl_int result;
+
+    // Pixel format used throughout: four 8-bit channels, i.e. 4 bytes per
+    // pixel. The arrays below are sized accordingly (2*2*4 and 2*2*2*4 bytes).
+    cl_image_format fmt;
+
+    fmt.image_channel_data_type = CL_UNORM_INT8;
+    fmt.image_channel_order = CL_RGBA;
+
+    unsigned char image2d_data_rgba8[] = {
+        255, 0, 0, 0,       0, 255, 0, 0,
+        0, 0, 255, 0,       255, 255, 0, 0
+    };
+
+    unsigned char image3d_data_rgba8[] = {
+        255, 0, 0, 0,       0, 255, 0, 0,
+        0, 0, 255, 0,       255, 255, 0, 0,
+
+        128, 0, 0, 0,       0, 128, 0, 0,
+        0, 0, 128, 0,       128, 128, 0, 0
+    };
+
+    ctx = clCreateContextFromType(0, CL_DEVICE_TYPE_DEFAULT, 0, 0, &result);
+    fail_if(
+        result != CL_SUCCESS,
+        "unable to create a valid context"
+    );
+
+    // A row of two 4-byte pixels needs at least 8 bytes, and a non-zero pitch
+    // has to be a multiple of the pixel size; 7 violates both rules.
+    image2d = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &fmt,
+                              2, 2, 7, image2d_data_rgba8, &result);
+    fail_if(
+        result != CL_INVALID_IMAGE_SIZE,
+        "7 is not a valid row pitch for 2 RGBA8 pixels: it is below 8 bytes and not a multiple of 4"
+    );
+
+    // Row pitch 0 lets the implementation derive the pitch itself.
+    image2d = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &fmt,
+                              2, 2, 0, image2d_data_rgba8, &result);
+    fail_if(
+        result != CL_SUCCESS || image2d == 0,
+        "cannot create a valid 2x2 image2D"
+    );
+
+    image3d = clCreateImage3D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &fmt,
+                              2, 2, 2, 0, 0, image3d_data_rgba8, &result);
+    fail_if(
+        result != CL_SUCCESS || image3d == 0,
+        "cannot create a valid 2x2x2 image3D"
+    );
+
+    clReleaseMemObject(image3d);
+    clReleaseMemObject(image2d);
+    clReleaseContext(ctx);
+}
+END_TEST
+
+TCase *cl_mem_tcase_create(void)
+{
+    // Assemble the "mem" test case from the tests defined in this file.
+    TCase *mem_case = tcase_create("mem");
+
+    tcase_add_test(mem_case, test_create_buffer);
+#if 0 // subbuffer tests need to be rewritten, they assume subbuffer alignment of 0!
+    tcase_add_test(mem_case, test_create_sub_buffer);
+    tcase_add_test(mem_case, test_read_write_subbuf);
+#endif
+    tcase_add_test(mem_case, test_images);
+
+    return mem_case;
+}
diff --git a/tests/test_mem.h b/tests/test_mem.h
new file mode 100644
index 0000000..342b9b2
--- /dev/null
+++ b/tests/test_mem.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Include guard for the "mem" test-case declaration. */
+#ifndef __UTEST_MEM__
+#define __UTEST_MEM__
+
+#include <check.h>
+
+/* C linkage: the factory is defined in test_mem.cpp and called from tests.c. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Returns the Check test case named "mem". tests.c adds it to a suite when
+ * "mem" is given on the command line.
+ */
+TCase *cl_mem_tcase_create(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/tests/test_platform.cpp b/tests/test_platform.cpp
new file mode 100644
index 0000000..516c652
--- /dev/null
+++ b/tests/test_platform.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "test_platform.h"
+
+#include "CL/cl.h"
+
+START_TEST (test_get_platform_ids)
+{
+    // Covers the argument combinations of clGetPlatformIDs: two rejected
+    // calls, a count-only query, and finally the ordinary one-entry query.
+    cl_platform_id first_platform = NULL;
+    cl_uint platform_count = 0;
+    cl_int err;
+
+    // An output array with zero entries.
+    err = clGetPlatformIDs(0, &first_platform, &platform_count);
+    fail_if(err != CL_INVALID_VALUE,
+            "num_entries cannot be NULL when *platforms is provided");
+
+    // Nothing requested at all.
+    err = clGetPlatformIDs(0, NULL, NULL);
+    fail_if(err != CL_INVALID_VALUE,
+            "Both num_platforms and *platforms cannot be NULL at the same time");
+
+    // Count-only query.
+    err = clGetPlatformIDs(1, NULL, &platform_count);
+    fail_if(err != CL_SUCCESS || platform_count == 0,
+            "When *platforms is NULL, success and put the number of platforms in num_platforms");
+
+    // The everyday call.
+    err = clGetPlatformIDs(1, &first_platform, &platform_count);
+    fail_if(err != CL_SUCCESS,
+            "It's bad to fail when the function is used in the most common sense");
+}
+END_TEST
+
+START_TEST (test_get_platform_info)
+{
+    // Checks the error codes of clGetPlatformInfo for invalid arguments and
+    // then queries CL_PLATFORM_PROFILE with and without an output buffer.
+    cl_platform_id platform = 0;
+    cl_uint num_platforms = 0;
+    cl_int result = 0;
+    // Character storage for the queried value. This used to be an array of
+    // 100 char pointers, which only worked because the size passed alongside
+    // it was computed from the same (wrongly typed) array.
+    char buf[100];
+    size_t buf_len = 0;
+
+    result = clGetPlatformIDs(1, &platform, &num_platforms);
+    fail_if(
+        result != CL_SUCCESS,
+        "It's bad to fail when the function is used in the most common sense"
+    );
+
+    // A bogus, non-null platform handle.
+    result = clGetPlatformInfo((_cl_platform_id *) -1, CL_PLATFORM_PROFILE, sizeof(buf), buf, &buf_len);
+    fail_if(
+        result != CL_INVALID_PLATFORM,
+        "-1 is not a valid platform"
+    );
+
+    result = clGetPlatformInfo(platform, 1337, sizeof(buf), buf, &buf_len);
+    fail_if(
+        result != CL_INVALID_VALUE,
+        "1337 is not a valid param_name"
+    );
+
+    // A buffer is supplied but its size is given as 0.
+    result = clGetPlatformInfo(platform, CL_PLATFORM_PROFILE, 0, buf, &buf_len);
+    fail_if(
+        result != CL_INVALID_VALUE,
+        "param_value_size cannot be NULL when a buffer is supplied"
+    );
+
+    // Size-only query: no buffer, only the required length is returned.
+    result = clGetPlatformInfo(platform, CL_PLATFORM_PROFILE, 0, 0, &buf_len);
+    fail_if(
+        result != CL_SUCCESS || buf_len == 0,
+        "We are allowed not to pass a buffer. The function must fill param_value_size_ret"
+    );
+
+    result = clGetPlatformInfo(platform, CL_PLATFORM_PROFILE, sizeof(buf), buf, &buf_len);
+    fail_if(
+        result != CL_SUCCESS || buf_len == 0,
+        "It's bad to fail when the function is used in the most common sense"
+    );
+}
+END_TEST
+
+TCase *cl_platform_tcase_create(void)
+{
+    // Assemble the "platform" test case from the tests defined in this file.
+    TCase *platform_case = tcase_create("platform");
+
+    tcase_add_test(platform_case, test_get_platform_ids);
+    tcase_add_test(platform_case, test_get_platform_info);
+
+    return platform_case;
+}
diff --git a/tests/test_platform.h b/tests/test_platform.h
new file mode 100644
index 0000000..96b61ce
--- /dev/null
+++ b/tests/test_platform.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Include guard for the "platform" test-case declaration. */
+#ifndef __UTEST_PLATFORM__
+#define __UTEST_PLATFORM__
+
+#include <check.h>
+
+/* C linkage: the factory is defined in test_platform.cpp and called from tests.c. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Returns the Check test case named "platform". tests.c adds it to a suite
+ * when "platform" is given on the command line.
+ */
+TCase *cl_platform_tcase_create(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/tests/test_program.cpp b/tests/test_program.cpp
new file mode 100644
index 0000000..80215a9
--- /dev/null
+++ b/tests/test_program.cpp
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "test_program.h"
+#include "CL/cl.h"
+
+#include <cstdlib>
+#include <cstring>
+
+// OpenCL C source shared by the program tests in this file. The leading
+// #warning line matters: test_program_build_info looks for its text
+// ("We need that line") in the build log.
+const char program_source[] =
+ "#warning We need that line\n"
+ "\n"
+ "__kernel void test(__global float4 *a, __global float4 *b) {\n"
+ " int i = get_global_id(0);\n"
+ "\n"
+ " a[i].xwyz = 3.1415926f * b[0].xyzw * b[0].wzyx;\n"
+ "}\n";
+
+START_TEST (test_create_program)
+{
+    // Checks the argument validation of clCreateProgramWithSource, then
+    // creates a program with and without explicit lengths and builds it.
+    cl_platform_id platform = 0;
+    cl_device_id device;
+    cl_context ctx;
+    cl_program program;
+    cl_int result;
+
+    const char *src = program_source;
+    // Length in characters, excluding the terminating NUL that sizeof counts.
+    size_t program_len = sizeof(program_source) - 1;
+
+    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
+    fail_if(
+        result != CL_SUCCESS,
+        "unable to get the default device"
+    );
+
+    ctx = clCreateContext(0, 1, &device, 0, 0, &result);
+    fail_if(
+        result != CL_SUCCESS || ctx == 0,
+        "unable to create a valid context"
+    );
+
+    program = clCreateProgramWithSource(ctx, 0, &src, 0, &result);
+    fail_if(
+        result != CL_INVALID_VALUE,
+        "count cannot be 0"
+    );
+
+    program = clCreateProgramWithSource(ctx, 1, 0, 0, &result);
+    fail_if(
+        result != CL_INVALID_VALUE,
+        "strings cannot be NULL"
+    );
+
+    // Explicit length.
+    program = clCreateProgramWithSource(ctx, 1, &src, &program_len,
+                                        &result);
+    fail_if(
+        result != CL_SUCCESS,
+        "cannot create a program from source with sane arguments"
+    );
+
+    // Drop the first program before the handle is reused below.
+    clReleaseProgram(program);
+
+    // No lengths array: the source is passed as a NUL-terminated string.
+    program = clCreateProgramWithSource(ctx, 1, &src, 0, &result);
+    fail_if(
+        result != CL_SUCCESS,
+        "lengths can be NULL and it must work"
+    );
+
+    result = clBuildProgram(program, 1, &device, "", 0, 0);
+    fail_if(
+        result != CL_SUCCESS,
+        "cannot build a valid program"
+    );
+
+    clReleaseProgram(program);
+    clReleaseContext(ctx);
+}
+END_TEST
+
+START_TEST (test_program_binary)
+{
+    // Builds a program from source, fetches its binary, and checks that a
+    // new program can be created from that binary.
+    cl_platform_id platform = 0;
+    cl_device_id device;
+    cl_context ctx;
+    cl_program program;
+    cl_int result, binary_status;
+
+    const char *src = program_source;
+    // Length in characters, excluding the terminating NUL that sizeof counts.
+    size_t program_len = sizeof(program_source) - 1;
+
+    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
+    fail_if(
+        result != CL_SUCCESS,
+        "unable to get the default device"
+    );
+
+    ctx = clCreateContext(0, 1, &device, 0, 0, &result);
+    fail_if(
+        result != CL_SUCCESS || ctx == 0,
+        "unable to create a valid context"
+    );
+
+    program = clCreateProgramWithSource(ctx, 1, &src, &program_len, &result);
+    fail_if(
+        result != CL_SUCCESS,
+        "cannot create a program from source with sane arguments"
+    );
+
+    result = clBuildProgram(program, 1, &device, "", 0, 0);
+    fail_if(
+        result != CL_SUCCESS,
+        "cannot build a valid program"
+    );
+
+    size_t binary_size = 0;
+    unsigned char *binary_data = 0;
+
+    // One device, so the sizes array has exactly one element.
+    result = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t),
+                              (void *)&binary_size, 0);
+    fail_if(
+        result != CL_SUCCESS || binary_size == 0,
+        "cannot get the binary size of the program"
+    );
+
+    binary_data = (unsigned char *)std::malloc(binary_size);
+    fail_if(
+        binary_data == 0,
+        "cannot allocate memory for the program's binary"
+    );
+
+    // CL_PROGRAM_BINARIES takes an array of destination pointers (one here)
+    // and copies the binary into the storage they point to.
+    result = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char *),
+                              (void *)&binary_data, 0);
+    fail_if(
+        result != CL_SUCCESS,
+        "cannot get the program's binary"
+    );
+
+    clReleaseProgram(program);
+
+    program = clCreateProgramWithBinary(ctx, 1, &device, &binary_size,
+                                        (const unsigned char **)&binary_data,
+                                        &binary_status, &result);
+    fail_if(
+        result != CL_SUCCESS || binary_status != CL_SUCCESS,
+        "cannot create a program from a previously-built binary"
+    );
+
+    clReleaseProgram(program);
+    // The binary is released only after the program created from it is gone.
+    std::free(binary_data);
+    clReleaseContext(ctx);
+}
+END_TEST
+
+START_TEST (test_program_build_info)
+{
+    // Builds program_source and checks that the text of its #warning line
+    // shows up in the build log.
+    cl_platform_id platform = 0;
+    cl_device_id device;
+    cl_context ctx;
+    cl_program program;
+    cl_int result;
+
+    const char *src = program_source;
+
+    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
+    fail_if(
+        result != CL_SUCCESS,
+        "unable to get the default device"
+    );
+
+    ctx = clCreateContext(0, 1, &device, 0, 0, &result);
+    fail_if(
+        result != CL_SUCCESS || ctx == 0,
+        "unable to create a valid context"
+    );
+
+    program = clCreateProgramWithSource(ctx, 1, &src, 0, &result);
+    fail_if(
+        result != CL_SUCCESS,
+        "lengths can be NULL and it must work"
+    );
+
+    result = clBuildProgram(program, 1, &device, "", 0, 0);
+    fail_if(
+        result != CL_SUCCESS,
+        "cannot build a valid program"
+    );
+
+    char *log = 0;
+    size_t log_len;
+
+    // First query: only the size of the log.
+    result = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
+                                   0, 0, &log_len);
+    fail_if(
+        result != CL_SUCCESS,
+        "cannot get the build log len"
+    );
+    fail_if(
+        log_len == 0,
+        "we put a warning in the source, log cannot be of lentgth 0"
+    );
+
+    // One spare byte so the buffer can be terminated locally before it is
+    // handed to strstr, whatever the implementation wrote into it.
+    log = (char *)std::malloc(log_len + 1);
+    fail_if(
+        log == 0,
+        "cannot allocate memory for the build log"
+    );
+
+    result = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
+                                   log_len, log, 0);
+    fail_if(
+        result != CL_SUCCESS,
+        "cannot get the build log"
+    );
+    log[log_len] = '\0';
+    fail_if(
+        !std::strstr(log, "We need that line"),
+        "the build log doesn't contain the warning found in the source"
+    );
+
+    std::free(log);
+    clReleaseProgram(program);
+    clReleaseContext(ctx);
+}
+END_TEST
+
+TCase *cl_program_tcase_create(void)
+{
+    // Assemble the "program" test case from the tests defined in this file.
+    TCase *program_case = tcase_create("program");
+
+    tcase_add_test(program_case, test_create_program);
+    tcase_add_test(program_case, test_program_binary);
+    tcase_add_test(program_case, test_program_build_info);
+
+    return program_case;
+}
diff --git a/tests/test_program.h b/tests/test_program.h
new file mode 100644
index 0000000..9c00039
--- /dev/null
+++ b/tests/test_program.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Include guard for the "program" test-case declaration. */
+#ifndef __UTEST_PROGRAM__
+#define __UTEST_PROGRAM__
+
+#include <check.h>
+
+/* C linkage: the factory is defined in test_program.cpp and called from tests.c. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Returns the Check test case named "program". tests.c adds it to a suite
+ * when "program" is given on the command line.
+ */
+TCase *cl_program_tcase_create(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/tests/tests.c b/tests/tests.c
new file mode 100644
index 0000000..4db057b
--- /dev/null
+++ b/tests/tests.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the copyright holder nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "test_platform.h"
+#include "test_device.h"
+#include "test_context.h"
+#include "test_commandqueue.h"
+#include "test_mem.h"
+#include "test_kernel.h"
+#include "test_program.h"
+#include "test_builtins.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Test driver: "test <test> [nofork]".
+ *
+ * argv[1] selects one of the test cases listed below by name. An optional
+ * second argument "nofork" makes Check run the tests without forking.
+ * Returns EXIT_SUCCESS only when the selected test case exists and none of
+ * its tests failed.
+ */
+int main(int argc, char **argv)
+{
+    int n_failed_tests;
+    Suite *s = NULL;
+
+    if (argc < 2) {
+        printf("test <test> [nofork]\n");
+        return EXIT_FAILURE;
+    }
+
+/*
+ * Creates the suite for `name` when its string form matches argv[1].
+ * Wrapped in do/while(0) so that each use behaves as a single statement.
+ */
+#define TESTSUITE(name, string) \
+    do { \
+        if (!strcmp(string, argv[1])) { \
+            s = suite_create(string); \
+            suite_add_tcase(s, cl_##name##_tcase_create()); \
+        } \
+    } while (0)
+
+    TESTSUITE(platform, "platform");
+    TESTSUITE(device, "device");
+    TESTSUITE(context, "context");
+    TESTSUITE(commandqueue, "commandqueue");
+    TESTSUITE(mem, "mem");
+    TESTSUITE(kernel, "kernel");
+    TESTSUITE(program, "program");
+    TESTSUITE(builtins, "builtins");
+
+#undef TESTSUITE
+
+    /* No name matched, so no suite was created. */
+    if (s == NULL) {
+        printf("test case %s does not exist\n", argv[1]);
+        return EXIT_FAILURE;
+    }
+
+    SRunner *sr = srunner_create(s);
+
+    if (argc == 3 && !strcmp("nofork", argv[2])) {
+        srunner_set_fork_status(sr, CK_NOFORK);
+    }
+
+    srunner_run_all(sr, CK_NORMAL);
+
+    n_failed_tests = srunner_ntests_failed(sr);
+    srunner_free(sr);
+
+    return (n_failed_tests == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/tests/vector_args.bc b/tests/vector_args.bc
new file mode 100644
index 0000000..b2d6578
--- /dev/null
+++ b/tests/vector_args.bc
Binary files differ
diff --git a/tests/vector_args.cl b/tests/vector_args.cl
new file mode 100644
index 0000000..f8b80c0
--- /dev/null
+++ b/tests/vector_args.cl
@@ -0,0 +1,38 @@
+// RUN:
+// % clang -c -emit-llvm -x cl -O2 -nostdinc -fno-builtin -I/usr/include vector_args.cl -o vector_args.bc
+// % llvm-dis vector_args.bc; more vector_args.ll
+
+
+#include <CL/clc.h>
+
+/*
+__kernel void test_kernel%s(char%s c, uchar%s uc, short%s s, ushort%s us, int%s i, uint%s ui, float%s f,
+ __global float%s *result)
+{
+}
+*/
+
+// The five kernels below have empty bodies and differ only in the vector
+// width (1, 2, 4, 8, 16) of their arguments; each takes one argument per
+// element type plus a __global float pointer of the same width.
+// Presumably they exist so the generated signatures can be inspected with
+// the RUN commands at the top of this file - confirm.
+
+// Scalar arguments.
+__kernel void test_kernel(char c, uchar uc, short s, ushort us, int i, uint ui, float f,
+ __global float *result)
+{
+}
+
+// 2-element vectors.
+__kernel void test_kernel2(char2 c, uchar2 uc, short2 s, ushort2 us, int2 i, uint2 ui, float2 f,
+ __global float2 *result)
+{
+}
+
+// 4-element vectors.
+__kernel void test_kernel4(char4 c, uchar4 uc, short4 s, ushort4 us, int4 i, uint4 ui, float4 f,
+ __global float4 *result)
+{
+}
+
+// 8-element vectors.
+__kernel void test_kernel8(char8 c, uchar8 uc, short8 s, ushort8 us, int8 i, uint8 ui, float8 f,
+ __global float8 *result)
+{
+}
+
+// 16-element vectors.
+__kernel void test_kernel16(char16 c, uchar16 uc, short16 s, ushort16 us, int16 i, uint16 ui, float16 f,
+ __global float16 *result)
+{
+}
diff --git a/tests/vector_args.spir.ll b/tests/vector_args.spir.ll
new file mode 100644
index 0000000..9054f68
--- /dev/null
+++ b/tests/vector_args.spir.ll
@@ -0,0 +1,45 @@
+; ModuleID = 'vector_args.bc'
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir-unknown-unknown-unkown"
+
+; Function Attrs: nounwind readnone
+define void @test_kernel(i8 signext %c, i8 zeroext %uc, i16 signext %s, i16 zeroext %us, i32 %i, i32 %ui, float %f, float addrspace(1)* nocapture %result) #0 {
+entry:
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+define void @test_kernel2(<2 x i8> %c, <2 x i8> %uc, <2 x i16> %s, <2 x i16> %us, <2 x i32> %i, <2 x i32> %ui, <2 x float> %f, <2 x float> addrspace(1)* nocapture %result) #0 {
+entry:
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+define void @test_kernel4(<4 x i8> %c, <4 x i8> %uc, <4 x i16> %s, <4 x i16> %us, <4 x i32> %i, <4 x i32> %ui, <4 x float> %f, <4 x float> addrspace(1)* nocapture %result) #0 {
+entry:
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+define void @test_kernel8(<8 x i8> %c, <8 x i8> %uc, <8 x i16> %s, <8 x i16> %us, <8 x i32> %i, <8 x i32> %ui, <8 x float> %f, <8 x float> addrspace(1)* nocapture %result) #0 {
+entry:
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+define void @test_kernel16(<16 x i8> %c, <16 x i8> %uc, <16 x i16> %s, <16 x i16> %us, <16 x i32> %i, <16 x i32> %ui, <16 x float> %f, <16 x float> addrspace(1)* nocapture %result) #0 {
+entry:
+ ret void
+}
+
+attributes #0 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!opencl.kernels = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+!0 = metadata !{void (i8, i8, i16, i16, i32, i32, float, float addrspace(1)*)* @test_kernel}
+!1 = metadata !{void (<2 x i8>, <2 x i8>, <2 x i16>, <2 x i16>, <2 x i32>, <2 x i32>, <2 x float>, <2 x float> addrspace(1)*)* @test_kernel2}
+!2 = metadata !{void (<4 x i8>, <4 x i8>, <4 x i16>, <4 x i16>, <4 x i32>, <4 x i32>, <4 x float>, <4 x float> addrspace(1)*)* @test_kernel4}
+!3 = metadata !{void (<8 x i8>, <8 x i8>, <8 x i16>, <8 x i16>, <8 x i32>, <8 x i32>, <8 x float>, <8 x float> addrspace(1)*)* @test_kernel8}
+!4 = metadata !{void (<16 x i8>, <16 x i8>, <16 x i16>, <16 x i16>, <16 x i32>, <16 x i32>, <16 x float>, <16 x float> addrspace(1)*)* @test_kernel16}
+!5 = metadata !{metadata !"clang version 3.6.0 (http://llvm.org/git/clang.git 01adae8f440672196da28be6fce2bb4acf8ab40b) (http://llvm.org/git/llvm.git f7be7f15c1ff2882719f823fbe270e48bb0f4340)"}
diff --git a/tests/vector_args.x86.ll b/tests/vector_args.x86.ll
new file mode 100644
index 0000000..c388b5a
--- /dev/null
+++ b/tests/vector_args.x86.ll
@@ -0,0 +1,45 @@
+; ModuleID = 'vector_args.bc'
+; Module-level target description: this variant of the vector-argument
+; fixture is built for the x86_64-unknown-linux-gnu triple, with the matching
+; data layout string.
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readnone uwtable
+; test_kernel: scalar variant. The i8/i16 parameters carry signext (%c, %s)
+; or zeroext (%uc, %us) extension attributes; %result is a plain nocapture
+; float pointer. The body is empty and returns immediately.
+define void @test_kernel(i8 signext %c, i8 zeroext %uc, i16 signext %s, i16 zeroext %us, i32 %i, i32 %ui, float %f, float* nocapture %result) #0 {
+entry:
+ ret void
+}
+
+; Function Attrs: nounwind readnone uwtable
+; test_kernel2: the two <2 x i8> arguments arrive as unnamed byval pointers
+; (align 8); %s.coerce/%us.coerce are i32 and %i.coerce/%ui.coerce/%f.coerce
+; are double. The ".coerce" names suggest these stand in for 2-element
+; vector arguments after ABI lowering -- inferred from the names, confirm
+; against the kernel source. The body is empty and returns immediately.
+define void @test_kernel2(<2 x i8>* byval nocapture align 8, <2 x i8>* byval nocapture align 8, i32 %s.coerce, i32 %us.coerce, double %i.coerce, double %ui.coerce, double %f.coerce, <2 x float>* nocapture %result) #0 {
+entry:
+ ret void
+}
+
+; Function Attrs: nounwind readnone uwtable
+; test_kernel4: %c.coerce/%uc.coerce are i32 and %s.coerce/%us.coerce are
+; double, while the <4 x i32> and <4 x float> arguments are passed directly.
+; %result is a plain nocapture <4 x float> pointer. The body is empty and
+; returns immediately.
+define void @test_kernel4(i32 %c.coerce, i32 %uc.coerce, double %s.coerce, double %us.coerce, <4 x i32> %i, <4 x i32> %ui, <4 x float> %f, <4 x float>* nocapture %result) #0 {
+entry:
+ ret void
+}
+
+; Function Attrs: nounwind readnone uwtable
+; test_kernel8: %c.coerce/%uc.coerce are double, the <8 x i16> arguments are
+; passed directly, and the <8 x i32>/<8 x float> arguments arrive as unnamed
+; byval pointers aligned to 32. The body is empty and returns immediately.
+define void @test_kernel8(double %c.coerce, double %uc.coerce, <8 x i16> %s, <8 x i16> %us, <8 x i32>* byval nocapture align 32, <8 x i32>* byval nocapture align 32, <8 x float>* byval nocapture align 32, <8 x float>* nocapture %result) #0 {
+entry:
+ ret void
+}
+
+; Function Attrs: nounwind readnone uwtable
+; test_kernel16: the <16 x i8> arguments are passed directly; the <16 x i16>
+; arguments arrive as byval pointers aligned to 32 and the <16 x i32> /
+; <16 x float> arguments as byval pointers aligned to 64. The body is empty
+; and returns immediately.
+define void @test_kernel16(<16 x i8> %c, <16 x i8> %uc, <16 x i16>* byval nocapture align 32, <16 x i16>* byval nocapture align 32, <16 x i32>* byval nocapture align 64, <16 x i32>* byval nocapture align 64, <16 x float>* byval nocapture align 64, <16 x float>* nocapture %result) #0 {
+entry:
+ ret void
+}
+
+; Attribute group #0: applied to every function definition above through
+; its trailing "#0" reference.
+attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+; Named metadata: !opencl.kernels lists the five kernel nodes (!0 - !4) and
+; !llvm.ident points at the producer identification string (!5).
+!opencl.kernels = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+; Each of !0 - !4 wraps a typed pointer to one kernel function; the function
+; types match the lowered parameter lists of the definitions above (byval
+; pointers and coerced scalars rather than the original vector types).
+!0 = metadata !{void (i8, i8, i16, i16, i32, i32, float, float*)* @test_kernel}
+!1 = metadata !{void (<2 x i8>*, <2 x i8>*, i32, i32, double, double, double, <2 x float>*)* @test_kernel2}
+!2 = metadata !{void (i32, i32, double, double, <4 x i32>, <4 x i32>, <4 x float>, <4 x float>*)* @test_kernel4}
+!3 = metadata !{void (double, double, <8 x i16>, <8 x i16>, <8 x i32>*, <8 x i32>*, <8 x float>*, <8 x float>*)* @test_kernel8}
+!4 = metadata !{void (<16 x i8>, <16 x i8>, <16 x i16>*, <16 x i16>*, <16 x i32>*, <16 x i32>*, <16 x float>*, <16 x float>*)* @test_kernel16}
+; Version string of the clang/llvm build that emitted this module.
+!5 = metadata !{metadata !"clang version 3.6.0 (http://llvm.org/git/clang.git 01adae8f440672196da28be6fce2bb4acf8ab40b) (http://llvm.org/git/llvm.git f7be7f15c1ff2882719f823fbe270e48bb0f4340)"}
diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt
new file mode 100644
index 0000000..a93eba7
--- /dev/null
+++ b/util/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Build rules for ocl_util, a static library of small OpenCL helper routines.
+
+# Make the top-level include/ directory of the source tree visible.
+include_directories( ${CMAKE_SOURCE_DIR}/include )
+
+# When HAWKING_CROSS_COMPILE is set, additionally search the find-root path
+# and the HOST_USR_INCLUDE_PATH location (both defined outside this file).
+if (HAWKING_CROSS_COMPILE)
+ include_directories( ${CMAKE_FIND_ROOT_PATH} ${HOST_USR_INCLUDE_PATH} )
+endif()
+
+# NOTE(review): the C++ flags are rebuilt from CMAKE_C_FLAGS rather than
+# CMAKE_CXX_FLAGS, so any C++ flags configured earlier are replaced here.
+# Confirm this is intentional and not a typo.
+set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -O3 -fPIC")
+
+# Sources that make up the library.
+set(util_src report_timing.cpp decode_error.cpp read_binary.cpp)
+
+add_library(ocl_util STATIC ${util_src})
+
+# Requests <build dir>/lib as the library output location.
+SET(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}/lib)
+
+# Install the archive and its public header. OCL_FPERMS is defined outside
+# this file; presumably it expands to install permission arguments -- TODO
+# confirm against the top-level CMakeLists.txt.
+install(TARGETS ocl_util DESTINATION lib ${OCL_FPERMS})
+install(FILES ocl_util.h DESTINATION include ${OCL_FPERMS})
diff --git a/util/decode_error.cpp b/util/decode_error.cpp
new file mode 100644
index 0000000..39a87eb
--- /dev/null
+++ b/util/decode_error.cpp
@@ -0,0 +1,84 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+/******************************************************************************
+* OCL_DECODE_ERROR - Translate a numeric OpenCL status code into a short,
+*    human readable description.
+*
+*    code    : status value to describe (0, or one of the negative codes
+*              listed in the table below).
+*    returns : pointer to a constant string; "Unknown" when the code is not
+*              in the table. The string must not be modified or freed.
+******************************************************************************/
+const char* ocl_decode_error(int code)
+{
+    /*-------------------------------------------------------------------------
+    * Status value / description pairs. Codes -15..-29 are intentionally
+    * absent: they are not described here and decode as "Unknown".
+    *------------------------------------------------------------------------*/
+    static const struct { int status; const char *text; } messages[] =
+    {
+        {   0, "Success" },
+        {  -1, "Device not found" },
+        {  -2, "Device not available" },
+        {  -3, "Compiler not available" },
+        {  -4, "Mem object allocation failure" },
+        {  -5, "Out of resources" },
+        {  -6, "Out of host memory" },
+        {  -7, "Profiling info not available" },
+        {  -8, "Mem copy overlap" },
+        {  -9, "Image format mismatch" },
+        { -10, "Image format not supported" },
+        { -11, "Build program failure" },
+        { -12, "Map failure" },
+        { -13, "Misaligned sub buffer offset" },
+        { -14, "Exec status error for events in wait list" },
+        { -30, "Invalid value" },
+        { -31, "Invalid device type" },
+        { -32, "Invalid platform" },
+        { -33, "Invalid device" },
+        { -34, "Invalid context" },
+        { -35, "Invalid queue properties" },
+        { -36, "Invalid command queue" },
+        { -37, "Invalid host ptr" },
+        { -38, "Invalid mem object" },
+        { -39, "Invalid image format descriptor" },
+        { -40, "Invalid image size" },
+        { -41, "Invalid sampler" },
+        { -42, "Invalid binary" },
+        { -43, "Invalid build options" },
+        { -44, "Invalid program" },
+        { -45, "Invalid program executable" },
+        { -46, "Invalid kernel name" },
+        { -47, "Invalid kernel definition" },
+        { -48, "Invalid kernel" },
+        { -49, "Invalid arg index" },
+        { -50, "Invalid arg value" },
+        { -51, "Invalid arg size" },
+        { -52, "Invalid kernel args" },
+        { -53, "Invalid work dimension" },
+        { -54, "Invalid work group size" },
+        { -55, "Invalid work item size" },
+        { -56, "Invalid global offset" },
+        { -57, "Invalid event wait list" },
+        { -58, "Invalid event" },
+        { -59, "Invalid operation" },
+        { -60, "Invalid gl object" },
+        { -61, "Invalid buffer size" },
+        { -62, "Invalid mip level" },
+        { -63, "Invalid global work size" },
+        { -64, "Invalid property" }
+    };
+    const int count = (int)(sizeof(messages) / sizeof(messages[0]));
+
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (messages[idx].status == code) return messages[idx].text;
+    }
+
+    return "Unknown";
+}
diff --git a/util/ocl_util.h b/util/ocl_util.h
new file mode 100644
index 0000000..08e70be
--- /dev/null
+++ b/util/ocl_util.h
@@ -0,0 +1,40 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#ifndef _OCL_UTIL_H_
+#define _OCL_UTIL_H_
+
+/*-----------------------------------------------------------------------------
+* Pull in the OpenCL C++ bindings. __CL_ENABLE_EXCEPTIONS has to be defined
+* before the include so that it is seen by <CL/cl.hpp>.
+*
+* NOTE(review): the using-directive below exposes the whole cl namespace to
+* every file including this header; it is kept because existing users may
+* rely on the unqualified names.
+*----------------------------------------------------------------------------*/
+#define __CL_ENABLE_EXCEPTIONS
+#include <CL/cl.hpp>
+using namespace cl;
+
+/*-----------------------------------------------------------------------------
+* ocl_decode_error - describe an OpenCL status code. Returns a pointer to a
+*    constant string ("Unknown" for codes it does not recognize). The result
+*    is const-qualified to match the definition in decode_error.cpp, which
+*    returns string literals; callers must not write through or free it.
+*----------------------------------------------------------------------------*/
+const char *ocl_decode_error (int code);
+
+/*-----------------------------------------------------------------------------
+* ocl_read_binary - read the file called filename in binary mode into a
+*    buffer allocated with new[]. The buffer address is stored in buffer and
+*    the number of bytes read is returned; the caller releases the buffer
+*    with delete[].
+*----------------------------------------------------------------------------*/
+int ocl_read_binary (const char *filename, char* &buffer);
+
+/*-----------------------------------------------------------------------------
+* ocl_event_times - print to stdout the queue-to-submit, submit-to-start and
+*    start-to-end intervals of ev in microseconds, each line prefixed by name.
+*----------------------------------------------------------------------------*/
+void ocl_event_times (const Event &ev, const char* name);
+
+/*-----------------------------------------------------------------------------
+* ocl_relative_times - print to stdout, on one line prefixed by name, the
+*    queued/submit/start/end times of ev in microseconds, each relative to
+*    reference (given in the same unit as the raw profiling values).
+*----------------------------------------------------------------------------*/
+void ocl_relative_times(const Event &ev, const char* name, cl_ulong reference);
+
+#endif // _OCL_UTIL_H_
diff --git a/util/read_binary.cpp b/util/read_binary.cpp
new file mode 100644
index 0000000..3756b3c
--- /dev/null
+++ b/util/read_binary.cpp
@@ -0,0 +1,46 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#include <iostream>
+#include <fstream>
+
+/******************************************************************************
+* OCL_READ_BINARY - Read an entire file, in binary mode, into a newly
+*    allocated buffer.
+*
+*    filename : path of the file to read.
+*    buffer   : receives a new[]-allocated array holding the file contents;
+*               the caller owns it and releases it with delete[]. Set to a
+*               null pointer when the read fails.
+*    returns  : number of bytes read. On any failure (null filename, file
+*               cannot be opened, size cannot be determined or does not fit
+*               in an int, allocation failure, short read) the message
+*               "Binary read function failure" is written to stdout and 0
+*               is returned.
+******************************************************************************/
+int ocl_read_binary(const char *filename, char* &buffer)
+{
+    char *data = 0;
+    buffer     = 0;
+
+    try
+    {
+        std::ifstream is;
+        if (filename) is.open(filename, std::ios::binary);
+
+        if (is.is_open())
+        {
+            is.seekg(0, std::ios::end);
+            const std::streamoff size = is.tellg();   // -1 when tellg fails
+            is.seekg(0, std::ios::beg);
+
+            /*-----------------------------------------------------------------
+            * The result is reported as an int, so reject sizes that do not
+            * survive the round trip through int as well as failed queries.
+            *----------------------------------------------------------------*/
+            const int length = static_cast<int>(size);
+            const bool fits  = length >= 0 &&
+                               static_cast<std::streamoff>(length) == size;
+
+            if (is.good() && fits)
+            {
+                data = new char [length];
+                is.read(data, length);
+
+                // Only hand the buffer over when every byte was delivered.
+                if (is.gcount() == length)
+                {
+                    buffer = data;
+                    return length;
+                }
+            }
+        }
+    }
+    catch(...) { /* e.g. allocation failure: reported below */ }
+
+    delete [] data;
+    std::cout << "Binary read function failure" << std::endl;
+    return 0;
+}
diff --git a/util/report_timing.cpp b/util/report_timing.cpp
new file mode 100644
index 0000000..1f3e54a
--- /dev/null
+++ b/util/report_timing.cpp
@@ -0,0 +1,90 @@
+/******************************************************************************
+ * Copyright (c) 2013-2014, Texas Instruments Incorporated - http://www.ti.com/
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Texas Instruments Incorporated nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+#define __CL_ENABLE_EXCEPTIONS
+#include <CL/cl.hpp>
+#include <iostream>
+#include <fstream>
+#include <cassert>
+
+using namespace cl;
+using namespace std;
+
+/******************************************************************************
+* OCL_EVENT_TIMES - Given an OpenCL Event, print to stdout how long the
+*    command spent between its profiling milestones: queued -> submitted,
+*    submitted -> started and started -> ended.
+*
+*    ev   : event whose profiling info is queried.
+*    name : prefix printed on every line; a null pointer prints as "".
+******************************************************************************/
+void ocl_event_times(const Event &ev, const char* name)
+{
+    cl_ulong queued, submitted, started, finished;
+
+    ev.getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &queued);
+    ev.getProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, &submitted);
+    ev.getProfilingInfo(CL_PROFILING_COMMAND_START,  &started);
+    ev.getProfilingInfo(CL_PROFILING_COMMAND_END,    &finished);
+
+    /*----------------------------------------------------------------------
+    * Scale each raw counter down to microseconds before differencing
+    *--------------------------------------------------------------------*/
+    const cl_ulong scale = 1000;
+    queued    /= scale;
+    submitted /= scale;
+    started   /= scale;
+    finished  /= scale;
+
+    const char *label = name ? name : "";
+
+    std::cout << label << " : Queue to Submit: " << submitted - queued
+              << " us" << std::endl;
+    std::cout << label << " : Submit to Start : " << started - submitted
+              << " us" << std::endl;
+    std::cout << label << " : Start to End : " << finished - started
+              << " us" << std::endl;
+    std::cout << std::endl;
+}
+
+/******************************************************************************
+* OCL_RELATIVE_TIMES - Given an OpenCL Event, print to stdout a single line
+*    holding name followed by the queued, submit, start and end times of the
+*    event. Every value is scaled to microseconds and then offset by the
+*    (equally scaled) reference.
+*
+*    ev        : event whose profiling info is queried.
+*    name      : prefix printed first; a null pointer prints as "".
+*    reference : origin subtracted from each time, in raw counter units.
+******************************************************************************/
+void ocl_relative_times(const Event &ev, const char* name, cl_ulong reference)
+{
+    static const cl_profiling_info milestones[4] =
+    {
+        CL_PROFILING_COMMAND_QUEUED,
+        CL_PROFILING_COMMAND_SUBMIT,
+        CL_PROFILING_COMMAND_START,
+        CL_PROFILING_COMMAND_END
+    };
+    cl_ulong stamps[4];
+
+    // Query all four milestones first, in queued/submit/start/end order.
+    for (int idx = 0; idx < 4; ++idx)
+    {
+        ev.getProfilingInfo(milestones[idx], &stamps[idx]);
+    }
+
+    /*----------------------------------------------------------------------
+    * Scale to microseconds, then shift so the reference becomes time zero
+    *--------------------------------------------------------------------*/
+    const cl_ulong origin = reference / 1000;
+    for (int idx = 0; idx < 4; ++idx)
+    {
+        stamps[idx] = stamps[idx] / 1000 - origin;
+    }
+
+    const char *label = name ? name : "";
+
+    std::cout << label;
+    for (int idx = 0; idx < 4; ++idx)
+    {
+        std::cout << " " << stamps[idx];
+    }
+    std::cout << std::endl;
+}